In [1]:
# 📦 Step 1: Import libraries
import pandas as pd
import numpy as np
from sklearn.preprocessing import OneHotEncoder, StandardScaler, QuantileTransformer
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.model_selection import cross_val_score
from xgboost import XGBRegressor
from lightgbm import LGBMRegressor
from sklearn.ensemble import StackingRegressor
from sklearn.linear_model import ElasticNet
from IPython.display import FileLink

# ✅ Step 2: Load the competition data
# Read the train and test CSVs with the same reader, in that order.
train, test = map(pd.read_csv, (
    '/kaggle/input/mc-datathon-2025-phone-addiction/train.csv',
    '/kaggle/input/mc-datathon-2025-phone-addiction/test.csv',
))

# 🧠 Step 3: Feature engineering
def _add_engineered_features(frame):
    """Add the six derived feature columns to ``frame`` in place.

    Columns are appended in a fixed order, because later steps derive the
    numerical feature list from the frame's column order.
    """
    social_time = frame['Time_on_Social_Media']
    # Shared denominator term: daily usage shifted by one so it is never zero
    # for non-negative usage values.
    usage_plus_one = frame['Daily_Usage_Hours'] + 1

    # Log-scaled ratios; log1p(x) is log(1 + x), so the small offsets keep the
    # denominators strictly positive when the raw value is zero.
    frame['Social_Edu_Ratio'] = (
        np.log1p(social_time) / np.log1p(frame['Time_on_Education'] + 0.01)
    )
    frame['Gaming_Usage_Ratio'] = (
        np.log1p(frame['Time_on_Gaming'] + 1) / np.log1p(usage_plus_one)
    )
    # Equal-weight blend of the two mental-health scores.
    frame['Mental_Health_Index'] = 0.5 * frame['Anxiety_Level'] + 0.5 * frame['Depression_Level']
    frame['Parental_Sleep_Effect'] = frame['Parental_Control'] * np.sqrt(frame['Sleep_Hours'])
    frame['Academic_Efficiency'] = frame['Academic_Performance'] / usage_plus_one
    frame['Social_Productivity'] = frame['Social_Interactions'] / (social_time + 0.1)


# Apply the identical transformations to both datasets.
for df in (train, test):
    _add_engineered_features(df)

# 🔢 Step 4: Feature types
categorical_cols = ['Gender', 'Location', 'School_Grade', 'Phone_Usage_Purpose']
# Every remaining column of `train` that is not an identifier, the target, or
# categorical is treated as numerical; the order follows `train.columns`.
non_numerical = {'id', 'Name', 'Addiction_Level', *categorical_cols}
numerical_cols = [name for name in train.columns if name not in non_numerical]

# 🧼 Step 5: Preprocessing
# Numerical columns are mapped onto a normal distribution by quantiles and then
# standardized. Categorical columns are one-hot encoded into a dense array, and
# categories not seen during fit are ignored (handle_unknown='ignore').
preprocessor = ColumnTransformer([
    ('num', Pipeline([
        # random_state pins the subsampling used when estimating quantiles, so
        # repeated runs produce the same transform (same seed as the models).
        ('quantile', QuantileTransformer(output_distribution='normal', random_state=42)),
        ('scaler', StandardScaler())
    ]), numerical_cols),
    ('cat', OneHotEncoder(handle_unknown='ignore', sparse_output=False), categorical_cols)
])

# 🤖 Step 6: Model setup
# Hyperparameters are kept in plain dicts so each learner's configuration can
# be read (and tweaked) separately from its construction.
xgb_params = {
    'n_estimators': 1200,
    'learning_rate': 0.008,
    'max_depth': 4,
    'subsample': 0.75,
    'colsample_bytree': 0.65,
    'gamma': 0.2,
    'random_state': 42,
    'tree_method': 'hist',
}

lgbm_params = {
    'n_estimators': 1800,
    'learning_rate': 0.006,
    'num_leaves': 25,
    'feature_fraction': 0.65,
    'bagging_fraction': 0.8,
    'bagging_freq': 6,
    'min_data_in_leaf': 20,
    'random_state': 42,
    'verbose': -1,
}

# (name, estimator) pairs in the form expected by StackingRegressor.
base_models = [
    ('xgb', XGBRegressor(**xgb_params)),
    ('lgbm', LGBMRegressor(**lgbm_params)),
]

# Meta-learner that combines the base models' predictions.
final_estimator = ElasticNet(
    alpha=0.0005,
    l1_ratio=0.75,
    random_state=42,
    max_iter=2000,
)

# passthrough=True feeds the preprocessed features to the meta-learner
# alongside the base models' out-of-fold predictions (7 internal folds).
stacked_ensemble = StackingRegressor(
    estimators=base_models,
    final_estimator=final_estimator,
    cv=7,
    n_jobs=-1,
    passthrough=True,
)

# Full pipeline: preprocessing followed by the stacked ensemble.
model = Pipeline(steps=[
    ('preprocessor', preprocessor),
    ('ensemble', stacked_ensemble),
])

# 🎯 Step 7: Training
y = train['Addiction_Level']
X = train.drop(columns=['id', 'Name', 'Addiction_Level'])

# The scorer returns negated MSE per fold; flip the sign to get plain MSE.
neg_mse_per_fold = cross_val_score(
    model, X, y,
    cv=7,
    scoring='neg_mean_squared_error',
    n_jobs=-1,
)
scores = -neg_mse_per_fold
print(f"Cross-validated RMSE: {np.mean(np.sqrt(scores)):.5f} ± {np.std(np.sqrt(scores)):.5f}")

# Refit on the full training set before predicting on the test set.
model.fit(X, y)

# 📈 Step 8: Predictions
X_test = test.drop(columns=['id', 'Name'])

# Clip predictions to the observed target range with a 5% multiplicative margin.
# NOTE(review): the margin only widens the range when the bounds are positive;
# confirm Addiction_Level is strictly positive.
target = train['Addiction_Level']
lower_bound = target.min() * 0.95
upper_bound = target.max() * 1.05
preds = np.clip(model.predict(X_test), lower_bound, upper_bound)

# 💾 Step 9: Create submission
# Two columns only: the test ids and the clipped predictions, written without
# the DataFrame index.
submission = test[['id']].assign(Addiction_Level=preds)
submission.to_csv('submission.csv', index=False)

# 🔗 Step 10: Download link
print("✅ Submission ready! Click below to download:")
# Left as the final bare expression so the notebook renders it as a clickable link.
download_link = FileLink('submission.csv')
download_link


Cross-validated RMSE: 0.33886 ± 0.01455
✅ Submission ready! Click below to download:
