In [4]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here are several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Walk the input tree and echo the full path of every file found,
# one path per line, in the order os.walk visits the directories.
input_file_paths = (
    os.path.join(folder, name)
    for folder, _subdirs, names in os.walk('/kaggle/input')
    for name in names
)
for file_path in input_file_paths:
    print(file_path)

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

/kaggle/input/phone-addiction-challenge/train.csv
/kaggle/input/phone-addiction-challenge/test.csv


In [5]:
import numpy as np
import pandas as pd
from IPython.display import FileLink
from lightgbm import LGBMRegressor
from sklearn.compose import ColumnTransformer
from sklearn.ensemble import StackingRegressor
from sklearn.linear_model import ElasticNet
from sklearn.model_selection import cross_val_score
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import OneHotEncoder, StandardScaler, QuantileTransformer
from xgboost import XGBRegressor

# Load data: read the train and test CSV files into DataFrames
train, test = (
    pd.read_csv(csv_path)
    for csv_path in (
        '/kaggle/input/phone-addiction-challenge/train.csv',
        '/kaggle/input/phone-addiction-challenge/test.csv',
    )
)

# Advanced feature engineering
def _add_engineered_features(frame):
    """Add six derived columns to ``frame`` in place.

    Columns are appended in a fixed order: two log-scaled time ratios, two
    interaction terms, then two per-unit rates. The small constant added to
    each denominator keeps it strictly positive for non-negative inputs, so
    a zero value cannot cause a division by zero.
    """
    social_media = frame['Time_on_Social_Media']
    daily_usage = frame['Daily_Usage_Hours']

    # Time-based ratios (log-scaled)
    log_social = np.log1p(social_media)
    log_education = np.log1p(frame['Time_on_Education'] + 0.01)
    frame['Social_Edu_Ratio'] = log_social / log_education

    log_gaming = np.log1p(frame['Time_on_Gaming'] + 1)
    log_usage = np.log1p(daily_usage + 1)
    frame['Gaming_Usage_Ratio'] = log_gaming / log_usage

    # Behavioral interactions: equal-weight blend of the two mental-health
    # scores, and parental control scaled by the square root of sleep hours
    frame['Mental_Health_Index'] = 0.5*frame['Anxiety_Level'] + 0.5*frame['Depression_Level']
    frame['Parental_Sleep_Effect'] = frame['Parental_Control'] * np.sqrt(frame['Sleep_Hours'])

    # Productivity metrics: performance / interactions per unit of screen time
    frame['Academic_Efficiency'] = frame['Academic_Performance'] / (daily_usage + 1)
    frame['Social_Productivity'] = frame['Social_Interactions'] / (social_media + 0.1)


# Apply the identical transformations to both tables so their columns match.
for frame in (train, test):
    _add_engineered_features(frame)

# Define feature types
# The four listed columns are categorical; every other training column is
# collected as numerical, except the identifier columns and the target.
categorical_cols = ['Gender', 'Location', 'School_Grade', 'Phone_Usage_Purpose']
excluded_from_numeric = {'id', 'Name', 'Addiction_Level', *categorical_cols}
numerical_cols = [name for name in train.columns if name not in excluded_from_numeric]

# Enhanced preprocessing (updated to remove warnings)
# Numerical columns: map each feature to a normal distribution through its
# quantiles, then standardize. random_state pins the transformer's internal
# subsampling so repeated runs build the same transform; it uses the same
# seed (42) as the models in this cell.
numeric_pipeline = Pipeline([
    ('quantile', QuantileTransformer(output_distribution='normal', random_state=42)),
    ('scaler', StandardScaler())
])

# Categorical columns: dense one-hot encoding; a category not seen during fit
# is encoded as all zeros instead of raising. Columns not named in either
# list are dropped.
preprocessor = ColumnTransformer([
    ('num', numeric_pipeline, numerical_cols),
    ('cat', OneHotEncoder(handle_unknown='ignore', sparse_output=False), categorical_cols)
], remainder='drop')

# Optimized ensemble models
# Each base learner's hyperparameters live in a plain dict so the settings
# can be read and compared side by side before the estimators are built.
xgb_params = {
    'n_estimators': 1200,
    'learning_rate': 0.008,
    'max_depth': 4,
    'subsample': 0.75,
    'colsample_bytree': 0.65,
    'gamma': 0.2,
    'random_state': 42,
    'tree_method': 'hist',
}

lgbm_params = {
    'n_estimators': 1800,
    'learning_rate': 0.006,
    'num_leaves': 25,
    'feature_fraction': 0.65,
    'bagging_fraction': 0.8,
    'bagging_freq': 6,
    'min_data_in_leaf': 20,
    'random_state': 42,
    'verbose': -1,
}

# (name, estimator) pairs in the form StackingRegressor expects.
base_models = [
    ('xgb', XGBRegressor(**xgb_params)),
    ('lgbm', LGBMRegressor(**lgbm_params)),
]

# Meta-learner: an ElasticNet used as the final estimator of the stack
meta_params = {
    'alpha': 0.0005,
    'l1_ratio': 0.75,
    'random_state': 42,
    'max_iter': 2000,
}
final_estimator = ElasticNet(**meta_params)

# Full pipeline: preprocessing followed by the stacked ensemble
# passthrough=True feeds the preprocessed features to the final estimator
# alongside the base models' out-of-fold predictions (7 internal folds).
stacked_ensemble = StackingRegressor(
    estimators=base_models,
    final_estimator=final_estimator,
    cv=7,
    n_jobs=-1,
    passthrough=True,
)

model = Pipeline(steps=[
    ('preprocessor', preprocessor),
    ('ensemble', stacked_ensemble),
])

# Train with cross-validation
# Features are every column except the identifiers and the target.
X = train.drop(columns=['id', 'Name', 'Addiction_Level'])
y = train['Addiction_Level']

# The scorer reports negated MSE per fold, so flip the sign to get plain MSE.
neg_mse_per_fold = cross_val_score(
    model, X, y,
    cv=7,
    scoring='neg_mean_squared_error',
    n_jobs=-1,
)
scores = -neg_mse_per_fold

# Take the root fold by fold, then summarize the per-fold RMSE values.
rmse_per_fold = np.sqrt(scores)
rmse_mean = np.mean(rmse_per_fold)
rmse_std = np.std(rmse_per_fold)
print(f"Cross-validated RMSE: {rmse_mean:.5f} ± {rmse_std:.5f}")

# Final training: refit the whole pipeline on the full training set
model.fit(X, y)

# Generate predictions with clipping
test_preds = model.predict(test.drop(['id', 'Name'], axis=1))

# Clip to the observed target range, widened by 5% of each bound's magnitude.
# abs() keeps the margin pointing outward whatever the sign of the bound:
# a plain `min * 0.95` would move a negative minimum inward and clip values
# the training target actually takes. For positive bounds this equals the
# original `min * 0.95` / `max * 1.05` up to float rounding.
CLIP_MARGIN = 0.05
target_min = train['Addiction_Level'].min()
target_max = train['Addiction_Level'].max()
test_preds = np.clip(
    test_preds,
    target_min - CLIP_MARGIN * abs(target_min),
    target_max + CLIP_MARGIN * abs(target_max),
)

# Create submission: pair each test id with its prediction and write the
# two-column table to submission.csv without the index column.
submission_columns = {
    'id': test['id'],
    'Addiction_Level': test_preds,
}
submission = pd.DataFrame(submission_columns)
submission.to_csv('submission.csv', index=False)

# Download link
# FileLink is imported with the cell's other dependencies at the top.
# It must remain the cell's last expression: the notebook displays the value
# of the final expression, which is what renders the clickable link.
print("✅ Submission ready! Click below to download:")
FileLink('submission.csv')

Cross-validated RMSE: 0.33886 ± 0.01455
✅ Submission ready! Click below to download:
