In [1]:
# train_improved.py
import pandas as pd
import numpy as np
import joblib

from sklearn.preprocessing import StandardScaler
from sklearn.metrics import mean_absolute_error, mean_squared_error
from sklearn.cluster import KMeans, DBSCAN
from sklearn.mixture import GaussianMixture
from sklearn.metrics import silhouette_score
from math import sqrt

import sys
print("Python executable:", sys.executable)

from xgboost import XGBRegressor

# -------------------------------
# 1. Load Dataset
# -------------------------------
print("Loading dataset...")
df = pd.read_csv("digital_diet_mental_health.csv")
print(f"Dataset shape: {df.shape}")

# -------------------------------
# 2. Rule-based Risk Label
# -------------------------------
def assign_risk_rule(row):
    if row["stress_level"] > 7 or row["sleep_duration_hours"] < 5:
        return "High"
    elif row["stress_level"] < 4 and row["sleep_duration_hours"] >= 7:
        return "Low"
    else:
        return "Medium"

df["risk_level"] = df.apply(assign_risk_rule, axis=1)

# -------------------------------
# 3. Features for Classifier
# -------------------------------
clf_features = [
    "daily_screen_time_hours",
    "sleep_duration_hours",
    "stress_level",
    "sleep_quality",
    "physical_activity_hours_per_week"
]

scaler_clf = StandardScaler()
scaler_clf.fit(df[clf_features])
joblib.dump(scaler_clf, "scaler_clf.pkl")

print("✅ Classifier will use rule-based risk rules (no ML training).")

# -------------------------------
# 4. Regression (Mood Rating)
# -------------------------------
print("\nTraining improved mood regression model...")

# --- Feature Engineering ---
df["screen_sleep_ratio"] = df["daily_screen_time_hours"] / (df["sleep_duration_hours"] + 1)
df["stress_x_sleep"] = df["stress_level"] * df["sleep_quality"]
df["activity_balance"] = df["physical_activity_hours_per_week"] / (df["daily_screen_time_hours"] + 1)
df["wellness_score"] = (
    (df["sleep_quality"] + df["physical_activity_hours_per_week"]) / 2
    - (df["stress_level"] + df["daily_screen_time_hours"] / 2)
)

reg_features = clf_features + [
    "screen_sleep_ratio",
    "stress_x_sleep",
    "activity_balance",
    "wellness_score"
]

X_reg = df[reg_features]
y_reg = df["mood_rating"]

scaler_reg = StandardScaler()
X_reg_scaled = scaler_reg.fit_transform(X_reg)

# --- Model Training (XGBoost) ---
reg_model = XGBRegressor(
    n_estimators=300,
    learning_rate=0.05,
    max_depth=5,
    subsample=0.8,
    colsample_bytree=0.8,
    random_state=42
)
reg_model.fit(X_reg_scaled, y_reg)

# --- Evaluation ---
y_pred_reg = reg_model.predict(X_reg_scaled)
mae = mean_absolute_error(y_reg, y_pred_reg)
rmse = sqrt(mean_squared_error(y_reg, y_pred_reg))

print(f"Regression MAE: {mae:.3f}")
print(f"Regression RMSE: {rmse:.3f}")

# --- Save Regression Artifacts ---
joblib.dump(reg_model, "mood_model.pkl")
joblib.dump(scaler_reg, "scaler_reg.pkl")

print("✅ Improved mood model (XGBoost) saved successfully!")

# -------------------------------
# 5. Clustering
# -------------------------------
print("\nRunning clustering experiments...")
cluster_features = ["social_media_hours", "gaming_hours", "entertainment_hours", "work_related_hours"]
X_cluster = df[cluster_features]

scaler_cluster = StandardScaler()
X_cluster_scaled = scaler_cluster.fit_transform(X_cluster)

best_model = None
best_score = -1
best_method = ""

# Try KMeans
for k in range(2, 7):
    km = KMeans(n_clusters=k, random_state=42, n_init=10)
    labels = km.fit_predict(X_cluster_scaled)
    score = silhouette_score(X_cluster_scaled, labels)
    print(f"KMeans k={k} → silhouette {score:.3f}")
    if score > best_score:
        best_score = score
        best_model = km
        best_method = f"KMeans(k={k})"

# Try Gaussian Mixture
gmm = GaussianMixture(n_components=3, random_state=42)
labels = gmm.fit_predict(X_cluster_scaled)
score = silhouette_score(X_cluster_scaled, labels)
print(f"GMM (3 comps) → silhouette {score:.3f}")
if score > best_score:
    best_score = score
    best_model = gmm
    best_method = "GaussianMixture(3)"

# Try DBSCAN
dbscan = DBSCAN(eps=1.2, min_samples=10)
labels = dbscan.fit_predict(X_cluster_scaled)
if len(set(labels)) > 1 and -1 not in set(labels):
    score = silhouette_score(X_cluster_scaled, labels)
    print(f"DBSCAN → silhouette {score:.3f}")
    if score > best_score:
        best_score = score
        best_model = dbscan
        best_method = "DBSCAN"

print(f"\n✅ Best clustering method: {best_method} (silhouette={best_score:.3f})")

joblib.dump(best_model, "cluster_model.pkl")
joblib.dump(scaler_cluster, "scaler_cluster.pkl")

# -------------------------------
# 6. Save Feature Lists
# -------------------------------
artifacts = {
    "clf_features": clf_features,
    "reg_features": reg_features,
    "cluster_features": cluster_features,
    "best_cluster_method": best_method
}
joblib.dump(artifacts, "artifacts.pkl")

print("\n✅ All improved models and scalers saved successfully!")


Python executable: c:\Users\Chirag chaudhari\Codes\Python\ScreenAware\.venv\Scripts\python.exe
Loading dataset...
Dataset shape: (2000, 25)
✅ Classifier will use rule-based risk rules (no ML training).

Training improved mood regression model...
Regression MAE: 1.454
Regression RMSE: 1.756
✅ Improved mood model (XGBoost) saved successfully!

Running clustering experiments...
KMeans k=2 → silhouette 0.175
KMeans k=3 → silhouette 0.169
KMeans k=4 → silhouette 0.173
KMeans k=5 → silhouette 0.178
KMeans k=6 → silhouette 0.176
GMM (3 comps) → silhouette 0.133

✅ Best clustering method: KMeans(k=5) (silhouette=0.178)

✅ All improved models and scalers saved successfully!


In [4]:
import joblib
import numpy as np
import pandas as pd

# -------------------------------
# 1. Load Artifacts
# -------------------------------
print("Loading trained artifacts...")
reg_model = joblib.load("mood_model.pkl")
scaler_reg = joblib.load("scaler_reg.pkl")
artifacts = joblib.load("artifacts.pkl")

reg_features = artifacts["reg_features"]
print(f"✅ Loaded regression features: {reg_features}")

# -------------------------------
# 2. Test Samples (you can modify/add more)
# -------------------------------
test_data = [
    {
        "daily_screen_time_hours": 3.5,
        "sleep_duration_hours": 7.0,
        "stress_level": 4,
        "sleep_quality": 8,
        "physical_activity_hours_per_week": 5,
    },
    {
        "daily_screen_time_hours": 8.0,
        "sleep_duration_hours": 4.5,
        "stress_level": 9,
        "sleep_quality": 3,
        "physical_activity_hours_per_week": 1,
    },
    {
        "daily_screen_time_hours": 5.0,
        "sleep_duration_hours": 6.0,
        "stress_level": 5,
        "sleep_quality": 6,
        "physical_activity_hours_per_week": 3,
    },
    {
        "daily_screen_time_hours": 0,
        "sleep_duration_hours": 10,
        "stress_level": 0,
        "sleep_quality": 9,
        "physical_activity_hours_per_week": 8,
    },
    {
        "daily_screen_time_hours": 12,
        "sleep_duration_hours": 4.0,
        "stress_level": 9,
        "sleep_quality": 3,
        "physical_activity_hours_per_week": 1,
    }
]

df = pd.DataFrame(test_data)

# -------------------------------
# 3. Feature Engineering (same as training)
# -------------------------------
df["screen_sleep_ratio"] = df["daily_screen_time_hours"] / (df["sleep_duration_hours"] + 1)
df["stress_x_sleep"] = df["stress_level"] * df["sleep_quality"]
df["activity_balance"] = df["physical_activity_hours_per_week"] / (df["daily_screen_time_hours"] + 1)
df["wellness_score"] = (
    (df["sleep_quality"] + df["physical_activity_hours_per_week"]) / 2
    - (df["stress_level"] + df["daily_screen_time_hours"] / 2)
)

X_reg = df[reg_features]
X_reg_scaled = scaler_reg.transform(X_reg)

# -------------------------------
# 4. Predict Mood Rating
# -------------------------------
predictions = reg_model.predict(X_reg_scaled)

# -------------------------------
# 5. Display Results
# -------------------------------
for i, sample in enumerate(test_data):
    print("\n-------------------------------")
    print(f"Sample #{i + 1}")
    for key, value in sample.items():
        print(f"{key:35s}: {value}")
    print(f"Predicted Mood Rating (0–10)   : {predictions[i]:.2f}")
    if predictions[i] >= 7:
        print("Mood Status                    : 😊 Positive Mood")
    elif predictions[i] >= 4:
        print("Mood Status                    : 😐 Neutral Mood")
    else:
        print("Mood Status                    : 😞 Low Mood")
print("\n✅ Mood prediction testing complete!")


Loading trained artifacts...
✅ Loaded regression features: ['daily_screen_time_hours', 'sleep_duration_hours', 'stress_level', 'sleep_quality', 'physical_activity_hours_per_week', 'screen_sleep_ratio', 'stress_x_sleep', 'activity_balance', 'wellness_score']

-------------------------------
Sample #1
daily_screen_time_hours            : 3.5
sleep_duration_hours               : 7.0
stress_level                       : 4
sleep_quality                      : 8
physical_activity_hours_per_week   : 5
Predicted Mood Rating (0–10)   : 5.66
Mood Status                    : 😐 Neutral Mood

-------------------------------
Sample #2
daily_screen_time_hours            : 8.0
sleep_duration_hours               : 4.5
stress_level                       : 9
sleep_quality                      : 3
physical_activity_hours_per_week   : 1
Predicted Mood Rating (0–10)   : 4.84
Mood Status                    : 😐 Neutral Mood

-------------------------------
Sample #3
daily_screen_time_hours            : 5.0
sl