In [3]:
import os

import joblib
import numpy as np
import pandas as pd
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error, r2_score
from sklearn.model_selection import train_test_split


# Reload the bundle written by the preprocessing step.
# NOTE: joblib.load unpickles the file, so only load artifacts you trust.
preprocessing_objects = joblib.load("../outputs/preprocessing_all.pkl")

# Unpack the encoded frame and the two fitted encoders in one step
df_final_encoded, binary_encoder, multi_encoder = (
    preprocessing_objects[name]
    for name in ("data", "binary_encoder", "multi_encoder")
)

print("Preprocessed data and encoders loaded successfully!")
df_final_encoded.head()


Preprocessed data and encoders loaded successfully!


Unnamed: 0,Age,Avg_Daily_Usage_Hours,Sleep_Hours_Per_Night,Mental_Health_Score,Conflicts_Over_Social_Media,Addicted_Score,Gender_Male,Affects_Academic_Performance_Yes,Academic_Level_High School,Academic_Level_Undergraduate,...,Most_Used_Platform_LinkedIn,Most_Used_Platform_Snapchat,Most_Used_Platform_TikTok,Most_Used_Platform_Twitter,Most_Used_Platform_VKontakte,Most_Used_Platform_WeChat,Most_Used_Platform_WhatsApp,Most_Used_Platform_YouTube,Relationship_Status_In Relationship,Relationship_Status_Single
0,19,5.2,6.5,6,3,8,0.0,1.0,0.0,1.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0
1,22,2.1,7.5,8,0,3,1.0,0.0,0.0,0.0,...,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0
2,20,6.0,5.0,5,4,9,0.0,1.0,0.0,1.0,...,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,18,3.0,7.0,7,1,4,1.0,0.0,1.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,1.0
4,21,4.5,6.0,6,2,7,1.0,1.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0


# Training and testing Model

In [4]:
# Target (y) is the Addicted_Score column; every other column is a feature (X)
y = df_final_encoded['Addicted_Score']
X = df_final_encoded.drop(columns=['Addicted_Score'])

# Optional sanity check: list the dtype of every feature column
print(X.dtypes)


Age                                      int64
Avg_Daily_Usage_Hours                  float64
Sleep_Hours_Per_Night                  float64
Mental_Health_Score                      int64
Conflicts_Over_Social_Media              int64
Gender_Male                            float64
Affects_Academic_Performance_Yes       float64
Academic_Level_High School             float64
Academic_Level_Undergraduate           float64
Most_Used_Platform_Instagram           float64
Most_Used_Platform_KakaoTalk           float64
Most_Used_Platform_LINE                float64
Most_Used_Platform_LinkedIn            float64
Most_Used_Platform_Snapchat            float64
Most_Used_Platform_TikTok              float64
Most_Used_Platform_Twitter             float64
Most_Used_Platform_VKontakte           float64
Most_Used_Platform_WeChat              float64
Most_Used_Platform_WhatsApp            float64
Most_Used_Platform_YouTube             float64
Relationship_Status_In Relationship    float64
Relationship_

**Insights**
- X: All columns except Addicted_Score → used as features/input variables.
- y: Addicted_Score column → used as target/output variable.
- Prepares the dataset for training a regression model (Addicted_Score is a numeric target).

In [5]:
# Hold out 20% of the rows for testing; the fixed seed keeps the split reproducible
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=42, test_size=0.2)


**Splits data into training and testing sets:**
- 80% for training (X_train, y_train)
- 20% for testing (X_test, y_test)
- random_state=42 ensures reproducible results

In [6]:
# Train Random Forest Regressor
# (RandomForestRegressor is imported with the other dependencies in the
# notebook's import cell, so this cell no longer carries its own import.)
# 100 trees with no depth limit; random_state is fixed so that re-running
# the notebook rebuilds the same forest.
model = RandomForestRegressor(n_estimators=100, max_depth=None, random_state=42)

# Fit on the training split only; the fitted estimator is the cell's output.
model.fit(X_train, y_train)


0,1,2
,n_estimators,100
,criterion,'squared_error'
,max_depth,
,min_samples_split,2
,min_samples_leaf,1
,min_weight_fraction_leaf,0.0
,max_features,1.0
,max_leaf_nodes,
,min_impurity_decrease,0.0
,bootstrap,True


1. **Initializes Random Forest Regressor with:**
- 100 trees (n_estimators=100)
- Unlimited depth (max_depth=None)
- A fixed seed (random_state=42) for reproducible results
2. **Trains the model on the training data.**

In [7]:
# Predict on both splits so training and test error can be compared later
y_train_pred, y_test_pred = map(model.predict, (X_train, X_test))

- Uses the trained model to predict Addicted_Score values for both the training set and the test set.

In [8]:
# Evaluate the Model

# Compute every metric first, then report the two splits one after the other.
mse_train = mean_squared_error(y_train, y_train_pred)
mse_test = mean_squared_error(y_test, y_test_pred)

# RMSE is the square root of MSE, which puts the error on the target's scale.
rmse_train, rmse_test = np.sqrt(mse_train), np.sqrt(mse_test)

r2_train = r2_score(y_train, y_train_pred)
r2_test = r2_score(y_test, y_test_pred)

# Report for the training split
print("\nRandom Forest Train Results:")
print(f"Train MSE: {mse_train:.2f}")
print(f"Train RMSE: {rmse_train:.2f}")
print(f"Train R² Score: {r2_train:.2f}")

# Report for the held-out test split
print("\nRandom Forest Test Results:")
print(f"Test MSE: {mse_test:.2f}")
print(f"Test RMSE: {rmse_test:.2f}")
print(f"Test R² Score: {r2_test:.2f}")


Random Forest Train Results:
Train MSE: 0.01
Train RMSE: 0.08
Train R² Score: 1.00

Random Forest Test Results:
Test MSE: 0.05
Test RMSE: 0.22
Test R² Score: 0.98


**Code Explanation**
1. mean_squared_error(y_test, y_test_pred) → calculates the **average squared difference** between the actual values (y_test) and the predicted values (y_test_pred). Lower MSE means better predictions. The same call on (y_train, y_train_pred) gives the training error.
2. rmse_test = np.sqrt(mse_test) → takes the square root of MSE to get **Root Mean Squared Error (RMSE)**, which is in the **same unit as the target variable**.
3. r2_test = r2_score(y_test, y_test_pred) → calculates the **R² score**, which tells how well the model **explains the variance** in the target:
    - 1 → perfect prediction
    - 0 → model is no better than mean
    - Negative → model is worse than just predicting the mean
4. print(...) → displays the **MSE, RMSE, and R² score** so you can see the model’s performance.

**Output explanation** (test-set results)
### **1️⃣ Mean Squared Error (MSE): 0.05**

- The **average squared difference** between the predicted and actual values is **very small (0.05)**.
- Indicates that your predictions are **very close to the actual scores**.

---

### **2️⃣ Root Mean Squared Error (RMSE): 0.22**

- RMSE is the square root of MSE, so on the **same scale as your target** (e.g., 0–10 for addiction score).
- A **low RMSE (0.22)** means your model’s predictions are typically only about **0.22 units off** from the actual score (RMSE weights large errors more heavily than a plain average error would).

---

### **3️⃣ R² Score: 0.98**

- R² (coefficient of determination) indicates **how much of the variance in the target is explained by the model**.
- **0.98 → 98% of the variance** in addiction scores is captured by your model.
- Very close to **1**, meaning an **excellent fit**.

---

### **✅ Interpretation**

- Your Random Forest regression model is **highly accurate**.
- Predicted addiction scores are **very close to actual scores**.
- Model is **reliable for prediction** on similar data.
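- The model explains 98% of the variance in Addicted_Score on the test set.
- Results this good can be too good to be true: they might indicate **overfitting** or features that almost directly encode the target, so confirm them with cross-validation or fresh data.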

In [9]:
# Re-check the feature dtypes (same inspection as right after X was built)
print(X.dtypes)

Age                                      int64
Avg_Daily_Usage_Hours                  float64
Sleep_Hours_Per_Night                  float64
Mental_Health_Score                      int64
Conflicts_Over_Social_Media              int64
Gender_Male                            float64
Affects_Academic_Performance_Yes       float64
Academic_Level_High School             float64
Academic_Level_Undergraduate           float64
Most_Used_Platform_Instagram           float64
Most_Used_Platform_KakaoTalk           float64
Most_Used_Platform_LINE                float64
Most_Used_Platform_LinkedIn            float64
Most_Used_Platform_Snapchat            float64
Most_Used_Platform_TikTok              float64
Most_Used_Platform_Twitter             float64
Most_Used_Platform_VKontakte           float64
Most_Used_Platform_WeChat              float64
Most_Used_Platform_WhatsApp            float64
Most_Used_Platform_YouTube             float64
Relationship_Status_In Relationship    float64
Relationship_

In [12]:
# joblib and os come from the notebook's import cell; the duplicate
# mid-notebook imports were removed from this cell.

# Make sure the output folder exists before writing into it.
os.makedirs("../outputs", exist_ok=True)

# Save everything needed for feature selection
joblib.dump({
    "X": X,                 # all features before selection
    "y": y,                 # target
    "X_train": X_train,      # train features
    "X_test": X_test,        # test features
    "y_train": y_train,      # train target
    "y_test": y_test,        # test target
    "rf_model": model        # trained Random Forest
}, "../outputs/model_training_data.pkl")

print("All training data saved from modeltrainingwithoutfeatureselection.ipynb!")


All training data saved from modeltrainingwithoutfeatureselection.ipynb!
