In [15]:
import os
import joblib
import pandas as pd
import numpy as np
from sklearn.ensemble import RandomForestRegressor
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error, r2_score
# Load preprocessed data
preprocessing_objects = joblib.load("../outputs/preprocessing_all.pkl")

df_final_encoded = preprocessing_objects["data"]
binary_encoder = preprocessing_objects["binary_encoder"]
multi_encoder = preprocessing_objects["multi_encoder"]

print("Preprocessed data loaded successfully!")

# Load model training data with feature selection
training_data = joblib.load("../outputs/model_training_data_with_features.pkl")

X_final = training_data["X_final"]  # Only feature-selected columns
y = training_data["y"]              # Target variable

print("Feature-selected data loaded successfully!")
print("Selected features:", X_final.columns.tolist())

Preprocessed data loaded successfully!
Feature-selected data loaded successfully!
Selected features: ['Mental_Health_Score', 'Sleep_Hours_Per_Night', 'Avg_Daily_Usage_Hours', 'Affects_Academic_Performance_Yes']


In [16]:
df_final_encoded

Unnamed: 0,Age,Avg_Daily_Usage_Hours,Sleep_Hours_Per_Night,Mental_Health_Score,Conflicts_Over_Social_Media,Addicted_Score,Gender_Male,Affects_Academic_Performance_Yes,Academic_Level_High School,Academic_Level_Undergraduate,...,Most_Used_Platform_LinkedIn,Most_Used_Platform_Snapchat,Most_Used_Platform_TikTok,Most_Used_Platform_Twitter,Most_Used_Platform_VKontakte,Most_Used_Platform_WeChat,Most_Used_Platform_WhatsApp,Most_Used_Platform_YouTube,Relationship_Status_In Relationship,Relationship_Status_Single
0,19,5.2,6.5,6,3,8,0.0,1.0,0.0,1.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0
1,22,2.1,7.5,8,0,3,1.0,0.0,0.0,0.0,...,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0
2,20,6.0,5.0,5,4,9,0.0,1.0,0.0,1.0,...,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,18,3.0,7.0,7,1,4,1.0,0.0,1.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,1.0
4,21,4.5,6.0,6,2,7,1.0,1.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
700,20,4.7,7.2,7,2,5,0.0,0.0,0.0,1.0,...,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0
701,23,6.8,5.9,4,5,9,1.0,1.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
702,21,5.6,6.7,6,3,7,0.0,1.0,0.0,1.0,...,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0
703,24,4.3,7.5,8,2,4,1.0,0.0,0.0,0.0,...,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0


In [17]:
X_train.info()

<class 'pandas.core.frame.DataFrame'>
Index: 564 entries, 24 to 102
Data columns (total 22 columns):
 #   Column                               Non-Null Count  Dtype  
---  ------                               --------------  -----  
 0   Age                                  564 non-null    int64  
 1   Avg_Daily_Usage_Hours                564 non-null    float64
 2   Sleep_Hours_Per_Night                564 non-null    float64
 3   Mental_Health_Score                  564 non-null    int64  
 4   Conflicts_Over_Social_Media          564 non-null    int64  
 5   Gender_Male                          564 non-null    float64
 6   Affects_Academic_Performance_Yes     564 non-null    float64
 7   Academic_Level_High School           564 non-null    float64
 8   Academic_Level_Undergraduate         564 non-null    float64
 9   Most_Used_Platform_Instagram         564 non-null    float64
 10  Most_Used_Platform_KakaoTalk         564 non-null    float64
 11  Most_Used_Platform_LINE             

In [18]:
X_train, X_test, y_train, y_test = train_test_split(
    X_final, y, test_size=0.2, random_state=42
)

In [19]:
from sklearn.ensemble import GradientBoostingRegressor

# Initialize Gradient Boosting model
gb_model = GradientBoostingRegressor(n_estimators=200, random_state=42)

# Train the model
gb_model.fit(X_train, y_train)

# Predict on test set
y_pred_gb = gb_model.predict(X_test)

# Evaluate performance
mse_gb = mean_squared_error(y_test, y_pred_gb)
rmse_gb = np.sqrt(mse_gb)
r2_gb = r2_score(y_test, y_pred_gb)

print(f"Gradient Boosting Results:")
print(f"MSE: {mse_gb:.4f}")
print(f"RMSE: {rmse_gb:.4f}")
print(f"R² Score: {r2_gb:.4f}")
X_train.info()

Gradient Boosting Results:
MSE: 0.1372
RMSE: 0.3704
R² Score: 0.9452
<class 'pandas.core.frame.DataFrame'>
Index: 564 entries, 24 to 102
Data columns (total 4 columns):
 #   Column                            Non-Null Count  Dtype  
---  ------                            --------------  -----  
 0   Mental_Health_Score               564 non-null    int64  
 1   Sleep_Hours_Per_Night             564 non-null    float64
 2   Avg_Daily_Usage_Hours             564 non-null    float64
 3   Affects_Academic_Performance_Yes  564 non-null    float64
dtypes: float64(3), int64(1)
memory usage: 22.0 KB


**Code Explination**
- n_estimators=200: Number of trees in the ensemble. More trees → potentially better learning but slower.
- random_state=42: Ensures **reproducibility** of results.
- The model **learns patterns** from the training data (X_train, y_train).
- Each tree is built sequentially, **correcting the errors of the previous tree** (gradient boosting).
- The trained model predicts addiction scores for the **unseen test data** (X_test).
- y_pred_gb is an array of predicted values corresponding to each test sample.
- **MSE (Mean Squared Error):** Average of squared differences between actual and predicted values.
- **RMSE (Root Mean Squared Error):** Square root of MSE, gives error in the same scale as the target.
- **R² Score (Coefficient of Determination):** Measures how well the model explains the variance in the target.
    - R² = 1 → perfect prediction
    - R² = 0 → model cannot explain variance
  
**Output Explanation**

- **MSE = 0.1372:** On average, the squared difference between the predicted and actual addiction scores is 0.1372, which indicates the error magnitude in squared units.
- **RMSE = 0.3704:** The predictions deviate from the actual scores by about 0.37 points on average, giving an interpretable error in the same units as the target.
- **R² = 0.9452:** The model explains approximately 94.52% of the variance in addiction scores, showing that it captures most of the patterns in the data and is highly accurate.

## Hypertuning GradientBoostingRegressor

In [20]:

# Base Model

gb_model = GradientBoostingRegressor(random_state=42)


# Hyperparameter Search Space

param_dist = {
    "n_estimators": [100, 200, 300, 400],
    "learning_rate": [0.01, 0.05, 0.1, 0.2],
    "max_depth": [3, 4, 5, 6],
    "min_samples_split": [2, 5, 10],
    "min_samples_leaf": [1, 2, 4],
    "subsample": [0.7, 0.8, 0.9, 1.0],
    "max_features": ["sqrt", "log2", None]
}


# Cross-Validation Setup

cv = KFold(n_splits=5, shuffle=True, random_state=42)


# Randomized Search

random_search = RandomizedSearchCV(
    estimator=gb_model,
    param_distributions=param_dist,
    n_iter=50,                      # try 50 random combinations
    scoring="neg_mean_squared_error",
    cv=cv,
    n_jobs=-1,
    verbose=2,
    random_state=42
)

# Fit to training data
random_search.fit(X_train, y_train)

# Best Model

print("Best Hyperparameters:", random_search.best_params_)

best_gb = random_search.best_estimator_


# Evaluate on Test Set
y_pred_tuned = best_gb.predict(X_test)

mse = mean_squared_error(y_test, y_pred_tuned)
rmse = np.sqrt(mse)
r2 = r2_score(y_test, y_pred_tuned)

print("\nGradient Boosting (Tuned) Results:")
print(f"MSE: {mse:.4f}")
print(f"RMSE: {rmse:.4f}")
print(f"R² Score: {r2:.4f}")


Fitting 5 folds for each of 50 candidates, totalling 250 fits
Best Hyperparameters: {'subsample': 0.7, 'n_estimators': 400, 'min_samples_split': 10, 'min_samples_leaf': 4, 'max_features': None, 'max_depth': 4, 'learning_rate': 0.01}

Gradient Boosting (Tuned) Results:
MSE: 0.1321
RMSE: 0.3635
R² Score: 0.9472


**Code Explination**
- `random_state=42` ensures reproducibility.
- At this point, the model is using **default hyperparameters**.
- Each key is a hyperparameter of Gradient Boosting:
    - **`n_estimators`**: Number of trees. More trees → better fit but slower.
    - **`learning_rate`**: Shrinks contribution of each tree; smaller → slower but often more accurate.
    - **`max_depth`**: Maximum depth of each tree. Controls complexity.
    - **`min_samples_split`**: Minimum samples needed to split a node.
    - **`min_samples_leaf`**: Minimum samples required at a leaf node.
    - **`subsample`**: Fraction of samples used per tree. Helps regularization.
    - **`max_features`**: Number of features considered per split. Helps reduce overfitting.
- 5-fold CV: splits data into 5 parts, trains on 4, tests on 1 → repeated 5 times.
- **shuffle=True** ensures random distribution of samples in folds.
- Randomly selects **50 parameter combinations** from the grid.
- Evaluates each combination using 5-fold cross-validation.
- **Scoring**: `neg_mean_squared_error` → RandomizedSearchCV minimizes MSE (scikit-learn uses negative MSE internally).
- **n_jobs=-1** → parallel processing on all cores.
- Trains the model on **training data** with all 50 sampled hyperparameter combinations.
- Uses 5-fold CV to evaluate each combination.
- Returns the **best performing set of hyperparameters**.
- `best_params_`: The combination of parameters that gave the lowest MSE.
- `best_estimator_`: The trained Gradient Boosting model with those best parameters.
- Uses the tuned model to predict addicted scores for unseen test data.
- **MSE**: Average squared error → smaller is better.
- **RMSE**: Error in same units as target → intuitive measure of prediction error.
- **R² Score**: How much variance in addicted score is explained by the features → closer to 1 is better.


### **Summary in words**

### 1. **Mean Squared Error (MSE)**

- **Before Tuning**: `0.1372`
- **After Tuning**: `0.1321`
    
    ✅ Tuned model has slightly **lower error**, meaning predictions are on average closer to the actual values.
    

### 2. **Root Mean Squared Error (RMSE)**

- **Before Tuning**: `0.3704`
- **After Tuning**: `0.3635`
    
    ✅ RMSE improved a bit, showing the tuned model’s predictions deviate **less from the actual values**.
    

### 3. **R² Score (Explained Variance)**

- **Before Tuning**: `0.9452`
- **After Tuning**: `0.9472`
    
    ✅ R² got slightly higher — tuned model explains **more variance** in your target variable.