In [None]:
from sklearn.model_selection import GridSearchCV, train_test_split
import pandas as pd
import numpy as np

#LOAD & PREPROCESS → SPLIT → SAVE
#Read the cleaned & feature-engineered CSV.
df = pd.read_csv("Mental_Stress_and_Coping_Mechanisms_processed_final.csv")

#Collapse the three one-hot stress-level columns back into a single label
#column: idxmax returns the name of the column holding the row maximum, and
#stripping the shared prefix leaves the bare label text.
dummy_prefix = "Stress Level Category_"
dummy_cols = [dummy_prefix + level for level in ("Low", "Medium", "High")]
hot_column = df[dummy_cols].idxmax(axis=1)
df['Stress Level Category'] = hot_column.str.replace(dummy_prefix, "", regex=False)

#Remove the dummies plus the columns that are not wanted downstream.
#errors="ignore" lets the drop succeed when a listed column is absent.
cols_to_drop = (
    ["Mental Stress Level"]
    + dummy_cols
    + ["Stress Coping Mechanisms", "Unnamed: 0"]
)
df = df.drop(columns=cols_to_drop, errors="ignore")

#ENGINEER FEATURES
#Gender_Other is 1 when any of the three listed gender dummies is set.
other_gender_cols = ['Gender_Agender', 'Gender_Bigender', 'Gender_Genderfluid']
other_gender_count = df[other_gender_cols].sum(axis=1)
df['Gender_Other'] = other_gender_count.clip(upper=1)

#The small epsilon keeps the denominator non-zero when Family Support is 0.
family_support_eps = df["Family Support"] + 1e-5
df["Stress_Ratio"] = df["Financial Stress"] / family_support_eps

#Daily hours scaled to a weekly figure.
df['Social_Media_Usage_per_week'] = df['Social Media Usage (Hours per day)'] * 7

#SELECT FEATURES & TARGET
selected_features = [
    'Age',
    'Academic Performance (GPA)',
    'Study Hours Per Week',
    'Social_Media_Usage_per_week',
    'Sleep Duration (Hours per night)',
    'Physical Exercise (Hours per week)',
    'Family Support',
    'Financial Stress',
    'Peer Pressure',
    'Relationship Stress',
    'Counseling Attendance',
    'Diet Quality',
    'Cognitive Distortions',
    'Family Mental Health History',
    'Medical Condition',
    'Substance Use',
    'Gender_Female',
    'Gender_Male',
    'Gender_Other',
    'Stress_Ratio',
]
X = df[selected_features]
y = df['Stress Level Category']

#OUTLIER REMOVAL
#Keep rows whose study hours fall inside the 1.5*IQR fences (bounds inclusive).
study_hours = X['Study Hours Per Week']
Q1 = study_hours.quantile(0.25)
Q3 = study_hours.quantile(0.75)
IQR = Q3 - Q1
mask = study_hours.between(Q1 - 1.5 * IQR, Q3 + 1.5 * IQR)
X = X[mask]
y = y[mask]

#SPLIT & SAVE
#70/30 split, stratified on the label, with a fixed seed.
split_kwargs = dict(test_size=0.30, random_state=42, stratify=y)
X_train, X_test, y_train, y_test = train_test_split(X, y, **split_kwargs)

#Re-attach the label as the last column of each split and write both to disk.
train_df = X_train.assign(**{'Stress Level Category': y_train})
test_df = X_test.assign(**{'Stress Level Category': y_test})

for frame, path in ((train_df, "train_data.csv"), (test_df, "test_data.csv")):
    frame.to_csv(path, index=False)

In [None]:
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import StratifiedKFold, cross_val_score
from imblearn.over_sampling import SMOTE
from sklearn.ensemble import RandomForestClassifier, VotingClassifier
from sklearn.svm import SVC
from sklearn.impute import SimpleImputer


#joblib is needed below to persist the imputer; import it here so the cell
#also works on a fresh kernel (Restart & Run All).
import joblib

#Reload BOTH halves of the split from disk so this cell does not depend on
#variables left behind in the kernel by an earlier cell.
train_df = pd.read_csv("train_data.csv")
test_df  = pd.read_csv("test_data.csv")

#EXTRACT FEATURES & TARGET
#(Assumes 'Stress Level Category' is your target column)
X_train_raw = train_df.drop(columns=["Stress Level Category"])
y_train_raw = train_df["Stress Level Category"]
#Select the test features in exactly the training column order.
X_test_raw  = test_df[X_train_raw.columns]

#LABEL‐MAP YOUR TARGET (optional, for consistent ordering)
label_map = {'Low':0, 'Medium':1, 'High':2}
y_train = y_train_raw.map(label_map).values

#STANDARDIZE FEATURES
#Fit on the training rows only, then apply the same transform to the test
#rows so both sets go through identical preprocessing.
scaler = StandardScaler()
X_train = scaler.fit_transform(X_train_raw)
X_test  = scaler.transform(X_test_raw)

#IMPUTE
#The imputer is fitted on the *scaled* training matrix, so it must always be
#applied after the scaler (scale → impute), at inference time as well.
imputer = SimpleImputer(strategy="median")
X_train = imputer.fit_transform(X_train)
X_test  = imputer.transform(X_test)

#persist it so inference can load it later
joblib.dump(imputer, "imputer.joblib")
print("✅ Fitted & saved imputer.joblib")

#BALANCE WITH SMOTE
#Only the training set is resampled; the test set keeps its real distribution.
smote = SMOTE(random_state=42)
X_train_resampled, y_train_resampled = smote.fit_resample(X_train, y_train)



✅ Fitted & saved imputer.joblib


In [None]:
#Step 8+: Train tuned RandomForest on train_data.csv, evaluate & save artifacts
#
#Flow: reload the split → cross-validate a scale→SMOTE→RF pipeline on the
#original training rows → fit the final scaler/SMOTE/RF on the full training
#set → score once on the held-out test set → persist the inference artifacts.

import pandas as pd
import numpy as np
from sklearn.preprocessing import StandardScaler
from imblearn.over_sampling import SMOTE
from imblearn.pipeline import Pipeline
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import StratifiedKFold, cross_val_score
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score
import joblib, json

#reloading the split data you wrote out earlier
train = pd.read_csv("train_data.csv")
test  = pd.read_csv("test_data.csv")

#extracting X/y and encode target into 0/1/2
feature_columns = [   # must match exactly what you persisted in feature_columns.json
    'Age','Academic Performance (GPA)','Study Hours Per Week',
    'Social_Media_Usage_per_week','Sleep Duration (Hours per night)',
    'Physical Exercise (Hours per week)','Family Support','Financial Stress',
    'Peer Pressure','Relationship Stress','Counseling Attendance','Diet Quality',
    'Cognitive Distortions','Family Mental Health History','Medical Condition',
    'Substance Use','Gender_Female','Gender_Male','Gender_Other','Stress_Ratio'
]
label_map = {'Low':0, 'Medium':1, 'High':2}

X_train = train[feature_columns].values
y_train = train["Stress Level Category"].map(label_map).values

X_test  = test[feature_columns].values
y_test  = test["Stress Level Category"].map(label_map).values

#Tuned RandomForest hyperparameters, shared by the CV pipeline and final model
rf_params = dict(
    n_estimators=300,
    max_depth=None,
    min_samples_split=2,
    min_samples_leaf=1,
    class_weight='balanced',
    random_state=42
)

#evaluating via 5-fold CV on the ORIGINAL training rows.
#Scaling and SMOTE live inside the pipeline, so they are re-fitted on the
#training part of every fold and the validation part contains only real,
#untouched samples. Resampling before splitting lets synthetic neighbours of
#validation rows leak into the training folds and inflates the CV score.
cv_pipeline = Pipeline([
    ("scale", StandardScaler()),
    ("smote", SMOTE(random_state=42)),
    ("rf",    RandomForestClassifier(**rf_params)),
])
kf = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)
cv_scores = cross_val_score(cv_pipeline, X_train, y_train, cv=kf, scoring='accuracy')
print(f"✅ RF Mean CV Accuracy: {cv_scores.mean():.4f}")
print(f"🔁 Fold-wise: {np.round(cv_scores,4)}")

#Final model: scale → SMOTE → fit on the entire training set.
#`scaler` and `rf` are kept as separate objects because they are persisted
#individually below.
scaler = StandardScaler()
X_train_s = scaler.fit_transform(X_train)
X_test_s  = scaler.transform(X_test)

sm = SMOTE(random_state=42)
X_train_res, y_train_res = sm.fit_resample(X_train_s, y_train)

rf = RandomForestClassifier(**rf_params)
rf.fit(X_train_res, y_train_res)

#Evaluate on your held-out test set (scaled with the training scaler, never resampled)
y_pred_test = rf.predict(X_test_s)
print("\n=== Hold-out Test Classification ===")
print(f"Accuracy: {accuracy_score(y_test, y_pred_test):.4f}")
print(classification_report(y_test, y_pred_test, target_names=['Low','Medium','High']))
print("Confusion Matrix:\n", confusion_matrix(y_test, y_pred_test))

#Persist all inference artifacts
#NOTE(review): label_map.json is written here as label→int; a later cell in
#this notebook rewrites the same file as int→label. Confirm which direction
#the inference code expects.
joblib.dump(scaler,    "scaler.joblib")
joblib.dump(rf,        "rf_model.joblib")
with open("label_map.json","w") as f:
    json.dump(label_map, f)
with open("feature_columns.json","w") as f:
    json.dump(feature_columns, f)

print("✅ Saved scaler.joblib, rf_model.joblib, label_map.json, feature_columns.json")


✅ RF Mean CV Accuracy: 0.5102
🔁 Fold-wise: [0.4989 0.4905 0.5206 0.5185 0.5227]

=== Hold-out Test Classification ===
Accuracy: 0.3462
              precision    recall  f1-score   support

         Low       0.28      0.20      0.24       517
      Medium       0.29      0.21      0.25       523
        High       0.39      0.56      0.46       676

    accuracy                           0.35      1716
   macro avg       0.32      0.32      0.31      1716
weighted avg       0.33      0.35      0.33      1716

Confusion Matrix:
 [[104 112 301]
 [128 112 283]
 [133 165 378]]
✅ Saved scaler.joblib, rf_model.joblib, label_map.json, feature_columns.json


In [None]:
#existing imports
from sklearn.impute import SimpleImputer
import joblib
import json


#Persist every artifact the inference side needs.
#This cell relies on `scaler`, `imputer` and `rf` having been created by the
#earlier cells of this notebook.

#persist the StandardScaler
joblib.dump(scaler,     "scaler.joblib")

#persist the median‐imputer
joblib.dump(imputer,    "imputer.joblib")

#persist the tuned RandomForest
joblib.dump(rf, "rf_model.joblib")

#persist the label‐map (int→string)
#JSON object keys must be strings, so the integer class ids are stringified.
label_map = {'Low':0, 'Medium':1, 'High':2}
inv_map   = {str(v): k for k,v in label_map.items()}
with open("label_map.json","w") as f:
    json.dump(inv_map, f)


#persist the feature order the model was trained with
feature_columns = [
    'Age',
    'Academic Performance (GPA)',
    'Study Hours Per Week',
    'Social_Media_Usage_per_week',
    'Sleep Duration (Hours per night)',
    'Physical Exercise (Hours per week)',
    'Family Support',
    'Financial Stress',
    'Peer Pressure',
    'Relationship Stress',
    'Counseling Attendance',
    'Diet Quality',
    'Cognitive Distortions',
    'Family Mental Health History',
    'Medical Condition',
    'Substance Use',
    'Gender_Female',
    'Gender_Male',
    'Gender_Other',
    'Stress_Ratio'
]

with open("feature_columns.json","w") as f:
    json.dump(feature_columns, f)

#The message lists exactly the files written above.
print("✅ Persisted scaler.joblib, imputer.joblib, rf_model.joblib, label_map.json, feature_columns.json")


✅ Persisted scaler.joblib, svm_model.joblib, label_map.json, feature_columns.json


In [None]:
#Preprocessing and splitting data for Recommendation Model
#new cell in ML Proj-Class Pipeline.ipynb

import pandas as pd
from sklearn.model_selection import train_test_split

#Read the processed dataset; the ID and coping-mechanism columns are kept here.
df = pd.read_csv("Mental_Stress_and_Coping_Mechanisms_processed_final.csv")

#Rebuild the categorical target from its one-hot columns when it is absent,
#then discard those one-hot columns.
if "Stress Level Category" not in df.columns:
    dummies = [col for col in df.columns if col.startswith("Stress Level Category_")]
    hot_column = df[dummies].idxmax(axis=1)
    df["Stress Level Category"] = hot_column.str.replace(
        "Stress Level Category_", "", regex=False
    )
    df = df.drop(columns=dummies)

#Drop the unused columns; any that are not present are skipped.
to_drop = ["Mental Stress Level", "Unnamed: 0"]
df = df.drop(columns=to_drop, errors="ignore")

#Derived features, computed the same way as in the classification cell.
other_gender_cols = ['Gender_Agender', 'Gender_Bigender', 'Gender_Genderfluid']
df["Gender_Other"] = df[other_gender_cols].sum(axis=1).clip(upper=1)

family_support_eps = df["Family Support"] + 1e-5
df["Stress_Ratio"] = df["Financial Stress"] / family_support_eps

df["Social_Media_Usage_per_week"] = df["Social Media Usage (Hours per day)"] * 7

#Feature list used for training.
selected_features = [
    'Age',
    'Academic Performance (GPA)',
    'Study Hours Per Week',
    'Social_Media_Usage_per_week',
    'Sleep Duration (Hours per night)',
    'Physical Exercise (Hours per week)',
    'Family Support',
    'Financial Stress',
    'Peer Pressure',
    'Relationship Stress',
    'Counseling Attendance',
    'Diet Quality',
    'Cognitive Distortions',
    'Family Mental Health History',
    'Medical Condition',
    'Substance Use',
    'Gender_Female',
    'Gender_Male',
    'Gender_Other',
    'Stress_Ratio',
]

#70/30 stratified split that carries the ID, coping and target columns along
#with the features.
carried_columns = ["Student_id", "Stress Coping Mechanisms", "Stress Level Category"]
recs_frame = df[carried_columns + selected_features]
train_df, test_df = train_test_split(
    recs_frame,
    test_size=0.30,
    stratify=df["Stress Level Category"],
    random_state=42,
)

#Write both halves to disk without the index column.
train_df.to_csv("train_recs.csv", index=False)
test_df.to_csv("test_recs.csv", index=False)

print("✅ Wrote train_recs.csv and test_recs.csv with ID + coping + features + target")


✅ Wrote train_recs.csv and test_recs.csv with ID + coping + features + target
