In [2]:
# train_improved_ml_labeled.py
import pandas as pd
import numpy as np
import joblib

from sklearn.preprocessing import StandardScaler
from sklearn.ensemble import RandomForestRegressor, RandomForestClassifier
from sklearn.metrics import mean_absolute_error, mean_squared_error, classification_report
from sklearn.cluster import KMeans, DBSCAN
from sklearn.mixture import GaussianMixture
from sklearn.metrics import silhouette_score
from math import sqrt

# -------------------------------
# 1. Load Dataset
# -------------------------------
print("Loading dataset...")
df = pd.read_csv("digital_diet_mental_health.csv")
print(f"Dataset shape: {df.shape}")

# -------------------------------
# 2. Define Risk Label (ground truth)
# -------------------------------
def assign_risk_rule(row):
    if row["stress_level"] > 7 or row["sleep_duration_hours"] < 5:
        return "High"
    elif row["stress_level"] < 4 and row["sleep_duration_hours"] >= 7:
        return "Low"
    else:
        return "Medium"

df["risk_level"] = df.apply(assign_risk_rule, axis=1)

# -------------------------------
# 3. Features for Classifier & Regressor
# -------------------------------
clf_features = [
    "daily_screen_time_hours",
    "sleep_duration_hours",
    "stress_level",
    "sleep_quality",
    "physical_activity_hours_per_week"
]

# -------------------------------
# 4. Train Classifier (RandomForest)
# -------------------------------
print("\nTraining classifier (RandomForest)...")
X_clf = df[clf_features]
y_clf = df["risk_level"]

scaler_clf = StandardScaler()
X_clf_scaled = scaler_clf.fit_transform(X_clf)

clf_model = RandomForestClassifier(n_estimators=200, random_state=42)
clf_model.fit(X_clf_scaled, y_clf)

y_pred_clf = clf_model.predict(X_clf_scaled)
print("Classification report (train set):")
print(classification_report(y_clf, y_pred_clf))

joblib.dump(clf_model, "risk_model.pkl")
joblib.dump(scaler_clf, "scaler_clf.pkl")

# -------------------------------
# 5. Regression (Mood Rating)
# -------------------------------
print("\nTraining nonlinear regressor...")

# Feature engineering
df["screen_sleep_ratio"] = df["daily_screen_time_hours"] / (df["sleep_duration_hours"] + 1)
df["stress_x_sleep"] = df["stress_level"] * df["sleep_quality"]
df["activity_balance"] = df["physical_activity_hours_per_week"] / (df["daily_screen_time_hours"] + 1)

reg_features = clf_features + ["screen_sleep_ratio", "stress_x_sleep", "activity_balance"]
X_reg = df[reg_features]
y_reg = df["mood_rating"]

scaler_reg = StandardScaler()
X_reg_scaled = scaler_reg.fit_transform(X_reg)

reg_model = RandomForestRegressor(n_estimators=200, random_state=42)
reg_model.fit(X_reg_scaled, y_reg)

# Evaluate
y_pred_reg = reg_model.predict(X_reg_scaled)
mae = mean_absolute_error(y_reg, y_pred_reg)
rmse = sqrt(mean_squared_error(y_reg, y_pred_reg))

print(f"Regression MAE: {mae:.3f}")
print(f"Regression RMSE: {rmse:.3f}")

joblib.dump(reg_model, "mood_model.pkl")
joblib.dump(scaler_reg, "scaler_reg.pkl")

# -------------------------------
# 6. Clustering with labels
# -------------------------------
print("\nClustering experiments...")
cluster_features = ["social_media_hours", "gaming_hours", "entertainment_hours", "work_related_hours"]
X_cluster = df[cluster_features]

scaler_cluster = StandardScaler()
X_cluster_scaled = scaler_cluster.fit_transform(X_cluster)

best_model = None
best_score = -1
best_method = ""

# KMeans 2–6 clusters
for k in range(2, 7):
    km = KMeans(n_clusters=k, random_state=42, n_init=10)
    labels = km.fit_predict(X_cluster_scaled)
    score = silhouette_score(X_cluster_scaled, labels)
    print(f"KMeans k={k} → silhouette {score:.3f}")
    if score > best_score:
        best_score = score
        best_model = km
        best_method = f"KMeans(k={k})"

# Gaussian Mixture
gmm = GaussianMixture(n_components=3, random_state=42)
labels = gmm.fit_predict(X_cluster_scaled)
score = silhouette_score(X_cluster_scaled, labels)
print(f"GMM (3 comps) → silhouette {score:.3f}")
if score > best_score:
    best_score = score
    best_model = gmm
    best_method = "GaussianMixture(3)"

# DBSCAN
dbscan = DBSCAN(eps=1.2, min_samples=10)
labels = dbscan.fit_predict(X_cluster_scaled)
if len(set(labels)) > 1 and -1 not in set(labels):
    score = silhouette_score(X_cluster_scaled, labels)
    print(f"DBSCAN → silhouette {score:.3f}")
    if score > best_score:
        best_score = score
        best_model = dbscan
        best_method = "DBSCAN"

print(f"\n✅ Best clustering method: {best_method} (silhouette={best_score:.3f})")

# Save scaler & model
joblib.dump(best_model, "cluster_model.pkl")
joblib.dump(scaler_cluster, "scaler_cluster.pkl")

# -------------------------------
# 6a. Assign descriptive labels to clusters
# -------------------------------
if "KMeans" in best_method:
    centers_scaled = best_model.cluster_centers_
    centers = scaler_cluster.inverse_transform(centers_scaled)
    centers_df = pd.DataFrame(centers, columns=cluster_features)

    # Simple heuristic labeling based on dominant activity
    def label_cluster(row):
        dominant = row.idxmax()
        if dominant == "social_media_hours":
            return "Social Media Heavy"
        elif dominant == "gaming_hours":
            return "Gaming Focused"
        elif dominant == "entertainment_hours":
            return "Entertainment Lover"
        elif dominant == "work_related_hours":
            return "Work Focused"
        else:
            return "Balanced"

    centers_df["cluster_label"] = centers_df.apply(label_cluster, axis=1)
    print("\nCluster centers with labels:")
    print(centers_df)

    # Save mapping of cluster number → label
    cluster_label_map = dict(zip(range(len(centers_df)), centers_df["cluster_label"]))
    joblib.dump(cluster_label_map, "cluster_labels.pkl")
    print("\n✅ Cluster labels saved as 'cluster_labels.pkl'")

# -------------------------------
# 7. Save Feature Lists
# -------------------------------
artifacts = {
    "clf_features": clf_features,
    "reg_features": reg_features,
    "cluster_features": cluster_features,
    "best_cluster_method": best_method
}
joblib.dump(artifacts, "artifacts.pkl")

print("\n✅ All models and scalers saved successfully!")


Loading dataset...
Dataset shape: (2000, 25)

Training classifier (RandomForest)...
Classification report (train set):
              precision    recall  f1-score   support

        High       1.00      1.00      1.00       735
         Low       1.00      1.00      1.00       237
      Medium       1.00      1.00      1.00      1028

    accuracy                           1.00      2000
   macro avg       1.00      1.00      1.00      2000
weighted avg       1.00      1.00      1.00      2000


Training nonlinear regressor...
Regression MAE: 0.955
Regression RMSE: 1.122

Clustering experiments...
KMeans k=2 → silhouette 0.175
KMeans k=3 → silhouette 0.169
KMeans k=4 → silhouette 0.173
KMeans k=5 → silhouette 0.178
KMeans k=6 → silhouette 0.176
GMM (3 comps) → silhouette 0.133

✅ Best clustering method: KMeans(k=5) (silhouette=0.178)

Cluster centers with labels:
   social_media_hours  gaming_hours  entertainment_hours  work_related_hours  \
0            2.103385      1.034635         

In [15]:
# test_models_updated.py
import pandas as pd
import joblib
import numpy as np

# -------------------------------
# Load Models & Scalers
# -------------------------------
classification_model = joblib.load("risk_model.pkl")
scaler_clf = joblib.load("scaler_clf.pkl")

regression_model = joblib.load("mood_model.pkl")
scaler_reg = joblib.load("scaler_reg.pkl")

clustering_model = joblib.load("cluster_model.pkl")
scaler_cluster = joblib.load("scaler_cluster.pkl")

artifacts = joblib.load("artifacts.pkl")
clf_features = artifacts["clf_features"]
reg_features = artifacts["reg_features"]
cluster_features = artifacts["cluster_features"]

# -------------------------------
# Minimal User Input
# -------------------------------
sample_user = pd.DataFrame({
    "daily_screen_time_hours": [6],
    "sleep_duration_hours": [7],
    "stress_level": [5],
    "sleep_quality": [6],
    "physical_activity_hours_per_week": [3],
    "social_media_hours": [3],
    "gaming_hours": [2],
    "entertainment_hours": [2],
    "work_related_hours": [3]
})

# -------------------------------
# 1. Classification (Risk Level)
# -------------------------------
X_clf = scaler_clf.transform(sample_user[clf_features])
risk_prediction = classification_model.predict(X_clf)[0]
print(f"Predicted Risk Level: {risk_prediction}")

# -------------------------------
# 2. Regression (Mood Rating)
# -------------------------------
# Compute engineered features
sample_user["screen_sleep_ratio"] = sample_user["daily_screen_time_hours"] / (sample_user["sleep_duration_hours"] + 1)
sample_user["stress_x_sleep"] = sample_user["stress_level"] * sample_user["sleep_quality"]
sample_user["activity_balance"] = sample_user["physical_activity_hours_per_week"] / (sample_user["daily_screen_time_hours"] + 1)

X_reg = scaler_reg.transform(sample_user[reg_features])
mood_prediction = regression_model.predict(X_reg)[0]
print(f"Predicted Mood Rating: {mood_prediction:.2f}")

# -------------------------------
# 3. Clustering (Usage Segment)
# -------------------------------
# Compute ratios if needed (your cluster used raw hours, so already fine)
X_cluster = scaler_cluster.transform(sample_user[cluster_features])
cluster_prediction = clustering_model.predict(X_cluster)[0]
print(f"Predicted Usage Cluster: {cluster_prediction}")


Predicted Risk Level: Medium
Predicted Mood Rating: 6.26
Predicted Usage Cluster: 1


Through Test

In [16]:
# thorough_test_models_v2.py
import pandas as pd
import numpy as np
import joblib
from sklearn.metrics import classification_report, accuracy_score, mean_absolute_error, mean_squared_error, silhouette_score

# ---------- Helper utilities ----------
def load_artifacts():
    clf = joblib.load("risk_model.pkl")
    scaler_clf = joblib.load("scaler_clf.pkl")
    reg = joblib.load("mood_model.pkl")
    scaler_reg = joblib.load("scaler_reg.pkl")
    kmeans = joblib.load("cluster_model.pkl")
    scaler_cluster = joblib.load("scaler_cluster.pkl")
    return {
        "clf": clf, "scaler_clf": scaler_clf,
        "reg": reg, "scaler_reg": scaler_reg,
        "kmeans": kmeans, "scaler_cluster": scaler_cluster
    }

def fill_defaults_for_features(df, required_features, defaults):
    out = df.copy()
    for feat in required_features:
        if feat not in out.columns:
            out[feat] = defaults.get(feat, 0)
    return out

def print_cluster_centers(kmeans, scaler_cluster, feature_names):
    centers_scaled = kmeans.cluster_centers_
    centers = scaler_cluster.inverse_transform(centers_scaled)
    centers_df = pd.DataFrame(centers, columns=feature_names)
    print("\nCluster centers (original scale):")
    print(centers_df)
    return centers_df

# ---------- Main Test Runner ----------
def main():
    print("Loading artifacts...")
    artifacts = load_artifacts()
    clf = artifacts["clf"]
    scaler_clf = artifacts["scaler_clf"]
    reg = artifacts["reg"]
    scaler_reg = artifacts["scaler_reg"]
    kmeans = artifacts["kmeans"]
    scaler_cluster = artifacts["scaler_cluster"]

    # Features trained on
    clf_features = list(scaler_clf.feature_names_in_)
    reg_features = list(scaler_reg.feature_names_in_)
    cluster_features = list(scaler_cluster.feature_names_in_)

    # Load dataset
    df = pd.read_csv("digital_diet_mental_health.csv")
    print("Dataset loaded:", df.shape)

    # ---------- Test 1: Single sample ----------
    print("\n--- Test 1: Single sample (UI-style) ---")
    sample_user = pd.DataFrame([{
        "daily_screen_time_hours": 6,
        "sleep_duration_hours": 7,
        "stress_level": 5,
        "sleep_quality": 6,
        "physical_activity_hours_per_week": 3,
        "social_media_hours": 3,
        "gaming_hours": 2,
        "entertainment_hours": 2,
        "work_related_hours": 3
    }])

    defaults = {
        "daily_screen_time_hours": 6,
        "sleep_duration_hours": 7,
        "stress_level": 5,
        "sleep_quality": 6,
        "physical_activity_hours_per_week": 3,
        "social_media_hours": 0,
        "gaming_hours": 0,
        "entertainment_hours": 0,
        "work_related_hours": 0
    }
    sample_user = fill_defaults_for_features(sample_user, set(clf_features + cluster_features), defaults)

    # Compute clustering ratios
    if "social_ratio" in cluster_features:
        sample_user["social_ratio"] = sample_user["social_media_hours"] / (sample_user["daily_screen_time_hours"] + 0.1)
    if "gaming_ratio" in cluster_features:
        sample_user["gaming_ratio"] = sample_user["gaming_hours"] / (sample_user["daily_screen_time_hours"] + 0.1)

    # Classification
    Xc = scaler_clf.transform(sample_user[clf_features])
    pred_risk = clf.predict(Xc)[0]
    print("Single sample predicted risk:", pred_risk)

    # Regression
    # Ensure all engineered features exist
    for f in reg_features:
        if f not in sample_user.columns:
            if f == "screen_sleep_ratio":
                sample_user[f] = sample_user["daily_screen_time_hours"] / (sample_user["sleep_duration_hours"] + 1)
            elif f == "stress_x_sleep":
                sample_user[f] = sample_user["stress_level"] * sample_user["sleep_quality"]
            elif f == "activity_balance":
                sample_user[f] = sample_user["physical_activity_hours_per_week"] / (sample_user["daily_screen_time_hours"] + 1)
            else:
                sample_user[f] = defaults.get(f, 0)

    Xr = scaler_reg.transform(sample_user[reg_features])
    pred_mood = reg.predict(Xr)[0]
    print("Single sample predicted mood:", round(float(pred_mood), 3))

    # Clustering
    Xk = scaler_cluster.transform(sample_user[cluster_features])
    pred_cluster = kmeans.predict(Xk)[0]
    print("Single sample predicted cluster:", pred_cluster)

    # ---------- Test 2: Batch evaluation ----------
    print("\n--- Test 2: Batch evaluation (N=400) ---")
    n = min(400, len(df))
    df_batch = df.sample(n, random_state=42).reset_index(drop=True)

    # Ensure engineered features exist for regression
    df_batch["screen_sleep_ratio"] = df_batch["daily_screen_time_hours"] / (df_batch["sleep_duration_hours"] + 1)
    df_batch["stress_x_sleep"] = df_batch["stress_level"] * df_batch["sleep_quality"]
    df_batch["activity_balance"] = df_batch["physical_activity_hours_per_week"] / (df_batch["daily_screen_time_hours"] + 1)

    # Ensure clustering ratios
    if "social_ratio" in cluster_features:
        df_batch["social_ratio"] = df_batch["social_media_hours"] / (df_batch["daily_screen_time_hours"] + 0.1)
    if "gaming_ratio" in cluster_features:
        df_batch["gaming_ratio"] = df_batch["gaming_hours"] / (df_batch["daily_screen_time_hours"] + 0.1)

    # Ensure risk_level exists
    if "risk_level" not in df_batch.columns:
        def assign_risk_rule(row):
            if row["stress_level"] > 7 or row["sleep_duration_hours"] < 5:
                return "High"
            elif row["stress_level"] < 4 and row["sleep_duration_hours"] >= 7:
                return "Low"
            else:
                return "Medium"
        df_batch["risk_level"] = df_batch.apply(assign_risk_rule, axis=1)
        print("Derived 'risk_level' column in batch for evaluation")

    # Classification
    Xc_batch = scaler_clf.transform(df_batch[clf_features])
    yc_true = df_batch["risk_level"]
    yc_pred = clf.predict(Xc_batch)
    print("\nClassification report (batch):")
    print(classification_report(yc_true, yc_pred))
    print("Accuracy:", accuracy_score(yc_true, yc_pred))

    # Regression
    Xr_batch = scaler_reg.transform(df_batch[reg_features])
    yr_true = df_batch["mood_rating"]
    yr_pred = reg.predict(Xr_batch)
    print("\nRegression MAE:", mean_absolute_error(yr_true, yr_pred))
    print("Regression RMSE:", np.sqrt(mean_squared_error(yr_true, yr_pred)))

    # Clustering
    Xk_batch = scaler_cluster.transform(df_batch[cluster_features])
    cluster_labels = kmeans.predict(Xk_batch)
    print("\nClustering silhouette score (batch):", silhouette_score(Xk_batch, cluster_labels))
    print("Cluster counts:", pd.Series(cluster_labels).value_counts().to_dict())

    # Print cluster centers
    try:
        centers_df = print_cluster_centers(kmeans, scaler_cluster, cluster_features)
    except Exception as ex:
        print("Could not print cluster centers:", ex)

    # ---------- Test 3: Edge cases ----------
    print("\n--- Test 3: Edge cases (zero / very-high values) ---")
    edge_cases = pd.DataFrame([
        {"daily_screen_time_hours": 0, "sleep_duration_hours": 8, "stress_level": 0, "sleep_quality": 9, "physical_activity_hours_per_week": 7,
         "social_media_hours": 0, "gaming_hours": 0, "entertainment_hours": 0, "work_related_hours": 0},
        {"daily_screen_time_hours": 16, "sleep_duration_hours": 3, "stress_level": 10, "sleep_quality": 1, "physical_activity_hours_per_week": 0,
         "social_media_hours": 10, "gaming_hours": 8, "entertainment_hours": 8, "work_related_hours": 6}
    ])
    edge_cases = fill_defaults_for_features(edge_cases, clf_features + cluster_features, defaults)

    # Regression engineered features
    edge_cases["screen_sleep_ratio"] = edge_cases["daily_screen_time_hours"] / (edge_cases["sleep_duration_hours"] + 1)
    edge_cases["stress_x_sleep"] = edge_cases["stress_level"] * edge_cases["sleep_quality"]
    edge_cases["activity_balance"] = edge_cases["physical_activity_hours_per_week"] / (edge_cases["daily_screen_time_hours"] + 1)

    Xe_clf = scaler_clf.transform(edge_cases[clf_features])
    print("Edge-case classification preds:", clf.predict(Xe_clf))
    Xe_reg = scaler_reg.transform(edge_cases[reg_features])
    print("Edge-case regression preds:", reg.predict(Xe_reg))

    # ---------- Test 4: Missing-feature robustness ----------
    print("\n--- Test 4: Missing-feature test ---")
    sample_missing = pd.DataFrame([{
        "daily_screen_time_hours": 6,
        "sleep_duration_hours": 7,
        "stress_level": 5,
        "sleep_quality": 6,
        "social_media_hours": 3,
        "gaming_hours": 2,
        "entertainment_hours": 2,
        "work_related_hours": 3
    }])
    # Fill missing features
    sample_filled = fill_defaults_for_features(sample_missing, clf_features, defaults)
    Xf_clf = scaler_clf.transform(sample_filled[clf_features])
    print("Prediction after filling defaults (risk):", clf.predict(Xf_clf)[0])

    print("\n✅ All tests completed successfully.")

if __name__ == "__main__":
    main()


Loading artifacts...
Dataset loaded: (2000, 25)

--- Test 1: Single sample (UI-style) ---
Single sample predicted risk: Medium
Single sample predicted mood: 6.26
Single sample predicted cluster: 1

--- Test 2: Batch evaluation (N=400) ---
Derived 'risk_level' column in batch for evaluation

Classification report (batch):
              precision    recall  f1-score   support

        High       1.00      1.00      1.00       152
         Low       1.00      1.00      1.00        50
      Medium       1.00      1.00      1.00       198

    accuracy                           1.00       400
   macro avg       1.00      1.00      1.00       400
weighted avg       1.00      1.00      1.00       400

Accuracy: 1.0

Regression MAE: 0.9456866666666667
Regression RMSE: 1.1177438766948755

Clustering silhouette score (batch): 0.16099184064317143
Cluster counts: {4: 87, 0: 81, 2: 80, 3: 78, 1: 74}

Cluster centers (original scale):
   social_media_hours  gaming_hours  entertainment_hours  work_re

In [17]:
# user_friendly_test_v2.py
import pandas as pd
import joblib

# -------------------------------
# Load trained models & scalers
# -------------------------------
classification_model = joblib.load("risk_model.pkl")
scaler_clf = joblib.load("scaler_clf.pkl")

regression_model = joblib.load("mood_model.pkl")
scaler_reg = joblib.load("scaler_reg.pkl")

clustering_model = joblib.load("cluster_model.pkl")
scaler_cluster = joblib.load("scaler_cluster.pkl")

# -------------------------------
# Minimal user input (only key fields)
# -------------------------------
user_input = {
    "daily_screen_time_hours": 5,
    "sleep_duration_hours": 7,
    "stress_level": 5,
    "sleep_quality": 6,
    "physical_activity_hours_per_week": 3,
    "social_media_hours": 3,
    "gaming_hours": 2
}

sample_user = pd.DataFrame([user_input])

# -------------------------------
# 1️⃣ Compute derived features for regression
# -------------------------------
sample_user["screen_sleep_ratio"] = sample_user["daily_screen_time_hours"] / (sample_user["sleep_duration_hours"] + 1)
sample_user["stress_x_sleep"] = sample_user["stress_level"] * sample_user["sleep_quality"]
sample_user["activity_balance"] = sample_user["physical_activity_hours_per_week"] / (sample_user["daily_screen_time_hours"] + 1)

# -------------------------------
# 2️⃣ Compute derived features for clustering
# -------------------------------
sample_user["social_ratio"] = sample_user["social_media_hours"] / (sample_user["daily_screen_time_hours"] + 0.1)
sample_user["gaming_ratio"] = sample_user["gaming_hours"] / (sample_user["daily_screen_time_hours"] + 0.1)

# Provide default values for features not collected from user
sample_user["entertainment_hours"] = 2
sample_user["work_related_hours"] = 3

# -------------------------------
# 3️⃣ Classification: Risk Level
# -------------------------------
clf_features = scaler_clf.feature_names_in_
X_clf = scaler_clf.transform(sample_user[clf_features])
risk_prediction = classification_model.predict(X_clf)[0]

# -------------------------------
# 4️⃣ Regression: Mood Rating
# -------------------------------
reg_features = scaler_reg.feature_names_in_
X_reg = scaler_reg.transform(sample_user[reg_features])
mood_prediction = regression_model.predict(X_reg)[0]

# -------------------------------
# 5️⃣ Clustering: Usage Segment
# -------------------------------
cluster_features = scaler_cluster.feature_names_in_
X_cluster = scaler_cluster.transform(sample_user[cluster_features])
cluster_prediction = clustering_model.predict(X_cluster)[0]

# -------------------------------
# Display results
# -------------------------------
print("---------- User Predictions ----------")
print(f"Predicted Risk Level       : {risk_prediction}")
print(f"Predicted Mood Rating      : {mood_prediction:.2f}")
print(f"Predicted Usage Cluster    : {cluster_prediction}")


---------- User Predictions ----------
Predicted Risk Level       : Medium
Predicted Mood Rating      : 5.72
Predicted Usage Cluster    : 1


In [18]:
# overfitting_check.py
import pandas as pd
import numpy as np
import joblib
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report, accuracy_score, mean_absolute_error, mean_squared_error
from math import sqrt

# -------------------------------
# 1️⃣ Load Dataset
# -------------------------------
df = pd.read_csv("digital_diet_mental_health.csv")
print(f"Dataset loaded: {df.shape}")

# -------------------------------
# 2️⃣ Prepare Risk Labels
# -------------------------------
def assign_risk_rule(row):
    if row["stress_level"] > 7 or row["sleep_duration_hours"] < 5:
        return "High"
    elif row["stress_level"] < 4 and row["sleep_duration_hours"] >= 7:
        return "Low"
    else:
        return "Medium"

df["risk_level"] = df.apply(assign_risk_rule, axis=1)

# -------------------------------
# 3️⃣ Feature Engineering
# -------------------------------
df["screen_sleep_ratio"] = df["daily_screen_time_hours"] / (df["sleep_duration_hours"] + 1)
df["stress_x_sleep"] = df["stress_level"] * df["sleep_quality"]
df["activity_balance"] = df["physical_activity_hours_per_week"] / (df["daily_screen_time_hours"] + 1)

# -------------------------------
# 4️⃣ Features for models
# -------------------------------
clf_features = ["daily_screen_time_hours", "sleep_duration_hours", "stress_level",
                "sleep_quality", "physical_activity_hours_per_week"]
reg_features = clf_features + ["screen_sleep_ratio", "stress_x_sleep", "activity_balance"]

# -------------------------------
# 5️⃣ Split dataset
# -------------------------------
X_clf = df[clf_features]
y_clf = df["risk_level"]

X_reg = df[reg_features]
y_reg = df["mood_rating"]

X_train_clf, X_test_clf, y_train_clf, y_test_clf = train_test_split(
    X_clf, y_clf, test_size=0.2, random_state=42, stratify=y_clf)

X_train_reg, X_test_reg, y_train_reg, y_test_reg = train_test_split(
    X_reg, y_reg, test_size=0.2, random_state=42)

# -------------------------------
# 6️⃣ Load saved scalers and models
# -------------------------------
clf_model = joblib.load("risk_model.pkl")
scaler_clf = joblib.load("scaler_clf.pkl")

reg_model = joblib.load("mood_model.pkl")
scaler_reg = joblib.load("scaler_reg.pkl")

# -------------------------------
# 7️⃣ Scale features
# -------------------------------
X_train_clf_scaled = scaler_clf.transform(X_train_clf)
X_test_clf_scaled = scaler_clf.transform(X_test_clf)

X_train_reg_scaled = scaler_reg.transform(X_train_reg)
X_test_reg_scaled = scaler_reg.transform(X_test_reg)

# -------------------------------
# 8️⃣ Evaluate Classifier
# -------------------------------
y_train_pred_clf = clf_model.predict(X_train_clf_scaled)
y_test_pred_clf = clf_model.predict(X_test_clf_scaled)

print("\n--- Classification Performance ---")
print("Train Accuracy:", accuracy_score(y_train_clf, y_train_pred_clf))
print("Test Accuracy :", accuracy_score(y_test_clf, y_test_pred_clf))
print("\nClassification Report (Test Set):")
print(classification_report(y_test_clf, y_test_pred_clf))

# -------------------------------
# 9️⃣ Evaluate Regressor
# -------------------------------
y_train_pred_reg = reg_model.predict(X_train_reg_scaled)
y_test_pred_reg = reg_model.predict(X_test_reg_scaled)

train_mae = mean_absolute_error(y_train_reg, y_train_pred_reg)
test_mae = mean_absolute_error(y_test_reg, y_test_pred_reg)
train_rmse = sqrt(mean_squared_error(y_train_reg, y_train_pred_reg))
test_rmse = sqrt(mean_squared_error(y_test_reg, y_test_pred_reg))

print("\n--- Regression Performance ---")
print(f"Train MAE: {train_mae:.3f}, Train RMSE: {train_rmse:.3f}")
print(f"Test  MAE: {test_mae:.3f}, Test  RMSE: {test_rmse:.3f}")


Dataset loaded: (2000, 25)

--- Classification Performance ---
Train Accuracy: 1.0
Test Accuracy : 1.0

Classification Report (Test Set):
              precision    recall  f1-score   support

        High       1.00      1.00      1.00       147
         Low       1.00      1.00      1.00        47
      Medium       1.00      1.00      1.00       206

    accuracy                           1.00       400
   macro avg       1.00      1.00      1.00       400
weighted avg       1.00      1.00      1.00       400


--- Regression Performance ---
Train MAE: 0.957, Train RMSE: 1.123
Test  MAE: 0.946, Test  RMSE: 1.118
