In [1]:
from sklearn.model_selection import GridSearchCV, train_test_split
import pandas as pd
import numpy as np

# 1. LOAD & PREPROCESS → SPLIT → SAVE
# --------------------------------------------------
df = pd.read_csv("Mental_Stress_and_Coping_Mechanisms_processed_final.csv")


# Load the cleaned & feature-engineered CSV
dummy_cols = [
    "Stress Level Category_Low",
    "Stress Level Category_Medium",
    "Stress Level Category_High"
]
df['Stress Level Category'] = (
    df[dummy_cols]
      .idxmax(axis=1)  # picks the dummy with a 1
      .str.replace("Stress Level Category_", "", regex=False)
)

# 3. NOW DROP those dummies and any other unwanted columns
cols_to_drop = [
    "Mental Stress Level",
    *dummy_cols,
    "Stress Coping Mechanisms",
    "Unnamed: 0"
]
df = df.drop(columns=cols_to_drop, errors="ignore")

# 4. ENGINEER FEATURES (unchanged)
df['Gender_Other'] = (
    df[['Gender_Agender','Gender_Bigender','Gender_Genderfluid']]
      .sum(axis=1).clip(upper=1)
)
df["Stress_Ratio"] = df["Financial Stress"] / (df["Family Support"] + 1e-5)
df['Social_Media_Usage_per_week'] = df['Social Media Usage (Hours per day)'] * 7

# 5. SELECT FEATURES & TARGET
selected_features = [
    'Age','Academic Performance (GPA)','Study Hours Per Week',
    'Social_Media_Usage_per_week','Sleep Duration (Hours per night)',
    'Physical Exercise (Hours per week)','Family Support','Financial Stress',
    'Peer Pressure','Relationship Stress','Counseling Attendance','Diet Quality',
    'Cognitive Distortions','Family Mental Health History','Medical Condition',
    'Substance Use','Gender_Female','Gender_Male','Gender_Other','Stress_Ratio'
]
X = df[selected_features]
y = df['Stress Level Category']

# 6. OPTIONAL OUTLIER REMOVAL
Q1, Q3 = X['Study Hours Per Week'].quantile([0.25,0.75])
IQR = Q3 - Q1
mask = X['Study Hours Per Week'].between(Q1-1.5*IQR, Q3+1.5*IQR)
X, y = X[mask], y[mask]

# 7. SPLIT & SAVE
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.30, random_state=42, stratify=y
)

train_df = X_train.copy()
train_df['Stress Level Category'] = y_train
test_df  = X_test.copy()
test_df ['Stress Level Category'] = y_test

train_df.to_csv("train_data.csv", index=False)
test_df .to_csv("test_data.csv",  index=False)

FileNotFoundError: [Errno 2] No such file or directory: 'Mental_Stress_and_Coping_Mechanisms_processed_final.csv'

In [27]:
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import StratifiedKFold, cross_val_score
from imblearn.over_sampling import SMOTE
from sklearn.ensemble import RandomForestClassifier, VotingClassifier
from sklearn.svm import SVC


train_df = pd.read_csv("train_data.csv")

# 2. EXTRACT FEATURES & TARGET
#    (Assumes 'Stress Level Category' is your target column)
X_train_raw = train_df.drop(columns=["Stress Level Category"])
y_train_raw = train_df["Stress Level Category"]

# 3. LABEL‐MAP YOUR TARGET (optional, for consistent ordering)
label_map = {'Low':0, 'Medium':1, 'High':2}
y_train = y_train_raw.map(label_map).values

# 4. STANDARDIZE FEATURES
scaler = StandardScaler()
X_train = scaler.fit_transform(X_train_raw)

# 5. BALANCE WITH SMOTE
smote = SMOTE(random_state=42)
X_train_resampled, y_train_resampled = smote.fit_resample(X_train, y_train)

In [3]:
from sklearn.model_selection import GridSearchCV
from sklearn.svm import SVC
from sklearn.metrics import accuracy_score, classification_report

# 1. Define the parameter grid for RBF-SVM
svm_param_grid = {
    'C': [0.1, 1, 10, 100],
    'gamma': ['scale', 0.01, 0.1, 1],
    'kernel': ['rbf']
}

# 2. Initialize a base SVM with balanced class weights and probability outputs
svm_base = SVC(
    class_weight='balanced',
    probability=True,
    random_state=42
)

# 3. Wrap it in GridSearchCV
svm_grid_search = GridSearchCV(
    estimator=svm_base,
    param_grid=svm_param_grid,
    cv=5,
    scoring='accuracy',
    n_jobs=-1,
    verbose=1
)

# 4. Fit on the SMOTE‐resampled training data
svm_grid_search.fit(X_train_resampled, y_train_resampled)

# 5. Pull out the best model and its CV score
svm_best = svm_grid_search.best_estimator_
svm_best_params = svm_grid_search.best_params_
svm_best_score = svm_grid_search.best_score_

print("🔧 Best SVM parameters:", svm_best_params)
print("🎯 Best CV accuracy   :", round(svm_best_score, 4))

Fitting 5 folds for each of 16 candidates, totalling 80 fits
🔧 Best SVM parameters: {'C': 10, 'gamma': 1, 'kernel': 'rbf'}
🎯 Best CV accuracy   : 0.5975


In [4]:
from xgboost import XGBClassifier
from sklearn.model_selection import cross_val_score
from sklearn.preprocessing import LabelEncoder

# Encode the y labels
le = LabelEncoder()
y_train_encoded = le.fit_transform(y_train_resampled)

xgb_model = XGBClassifier(random_state=42, use_label_encoder=False, eval_metric='mlogloss')
# Now cross-validation with encoded labels
xgb_cv_scores = cross_val_score(xgb_model, X_train_resampled, y_train_encoded, cv=5, scoring='accuracy')

print("XGBoost Mean CV Accuracy:", np.mean(xgb_cv_scores))
print("XGBoost CV Scores:", xgb_cv_scores)

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.



XGBoost Mean CV Accuracy: 0.4587607323147938
XGBoost CV Scores: [0.3871308  0.38080169 0.374868   0.468849   0.68215417]


In [5]:
from xgboost import XGBClassifier
from sklearn.model_selection import GridSearchCV
from sklearn.preprocessing import LabelEncoder

# Encode the target
le = LabelEncoder()
y_train_encoded = le.fit_transform(y_train_resampled)

# Define the XGBoost model
xgb = XGBClassifier(random_state=42, use_label_encoder=False, eval_metric='mlogloss')

# Define the parameter grid
param_grid = {
    'n_estimators': [100, 200],
    'max_depth': [3, 5, 7],
    'learning_rate': [0.05, 0.1],
    'subsample': [0.8, 1.0],
    'colsample_bytree': [0.8, 1.0]
}

# Grid Search
grid_search = GridSearchCV(
    estimator=xgb,
    param_grid=param_grid,
    cv=5,
    scoring='accuracy',
    n_jobs=-1,
    verbose=1
)

# Fit
grid_search.fit(X_train_resampled, y_train_encoded)

# Results
print("Best Parameters:", grid_search.best_params_)
print("Best Cross-Validation Accuracy:", grid_search.best_score_)

Fitting 5 folds for each of 48 candidates, totalling 240 fits


Parameters: { "use_label_encoder" } are not used.



Best Parameters: {'colsample_bytree': 1.0, 'learning_rate': 0.1, 'max_depth': 7, 'n_estimators': 200, 'subsample': 0.8}
Best Cross-Validation Accuracy: 0.4802937546504841


In [6]:
from sklearn.ensemble import GradientBoostingClassifier
# Gradient Boosting model
gb_model = GradientBoostingClassifier(random_state=42)

# Cross-validation
kf = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)
gb_cv_scores = cross_val_score(gb_model, X_train_resampled, y_train_resampled, cv=kf, scoring='accuracy')

# Output
gb_mean_accuracy = np.mean(gb_cv_scores)
gb_std_accuracy = np.std(gb_cv_scores)

gb_mean_accuracy, gb_std_accuracy, gb_cv_scores

(0.4049053417632408,
 0.01828539153180415,
 array([0.39873418, 0.37447257, 0.40760296, 0.41393875, 0.42977825]))

In [7]:
gb_param_grid = {
    'n_estimators': [100, 200],
    'learning_rate': [0.05, 0.1],
    'max_depth': [3, 5, 7],
    'min_samples_split': [2, 5],
    'min_samples_leaf': [1, 2]
}

# Set up GridSearchCV
gb_grid_search = GridSearchCV(
    estimator=GradientBoostingClassifier(random_state=42),
    param_grid=gb_param_grid,
    cv=5,
    scoring='accuracy',
    n_jobs=-1,
    verbose=1
)

# Perform the grid search on training data
gb_grid_search.fit(X_train_resampled, y_train_resampled)

# Extract best parameters and score
gb_best_params = gb_grid_search.best_params_
gb_best_score = gb_grid_search.best_score_

gb_best_params, gb_best_score

Fitting 5 folds for each of 48 candidates, totalling 240 fits


({'learning_rate': 0.1,
  'max_depth': 7,
  'min_samples_leaf': 1,
  'min_samples_split': 5,
  'n_estimators': 200},
 0.46846893810790463)

In [8]:
gb_param_grid = {
    'n_estimators': [100, 200],
    'learning_rate': [0.05, 0.1],
    'max_depth': [3, 5]
}

gb_grid_search = GridSearchCV(
    estimator=GradientBoostingClassifier(random_state=42),
    param_grid=gb_param_grid,
    cv=5,
    scoring='accuracy',
    n_jobs=-1,
    verbose=1
)

gb_grid_search.fit(X_train_resampled, y_train_resampled)

# Extract best parameters and score
gb_best_params = gb_grid_search.best_params_
gb_best_score = gb_grid_search.best_score_

gb_best_params, gb_best_score

Fitting 5 folds for each of 8 candidates, totalling 40 fits


({'learning_rate': 0.1, 'max_depth': 5, 'n_estimators': 200},
 0.44334763566046903)

In [9]:
from lightgbm import LGBMClassifier

# Train and evaluate LightGBM without hyperparameter tuning
lgb_default_model = LGBMClassifier(random_state=42)

# Cross-validation
kf = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)
lgb_default_scores = cross_val_score(lgb_default_model, X_train_resampled, y_train_resampled, cv=kf, scoring='accuracy')

# Results
lgb_default_mean = np.mean(lgb_default_scores)
lgb_default_std = np.std(lgb_default_scores)

lgb_default_mean, lgb_default_std, lgb_default_scores

[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.001184 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 3693
[LightGBM] [Info] Number of data points in the train set: 3789, number of used features: 20
[LightGBM] [Info] Start training from score -1.098612
[LightGBM] [Info] Start training from score -1.098612
[LightGBM] [Info] Start training from score -1.098612
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.000347 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 3695
[LightGBM] [Info] Number of data points in the train set: 3789, number of used features: 20
[LightGBM] [Info] Start training from score -1.098612
[LightGBM] [Info] Start training from score -1.098612
[LightGBM] [Info] Start training from score -1.098612
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.000451 secon

(0.4483857529217293,
 0.009893423236654708,
 array([0.43565401, 0.45780591, 0.46145723, 0.44033791, 0.44667371]))

In [10]:
lgb_param_grid = {
    'n_estimators': [100, 200],
    'learning_rate': [0.05, 0.1],
    'max_depth': [3, 5, 7],
    'num_leaves': [15, 31],
    'min_child_samples': [10, 20]
}

# Setup GridSearchCV
lgb_grid = GridSearchCV(
    estimator=LGBMClassifier(random_state=42),
    param_grid=lgb_param_grid,
    cv=5,
    scoring='accuracy',
    verbose=1,
    n_jobs=-1
)

# Fit on training data
lgb_grid.fit(X_train_resampled, y_train_resampled)

# Best parameters and score
lgb_best_params = lgb_grid.best_params_
lgb_best_score = lgb_grid.best_score_

print("✅ Best LightGBM Parameters:", lgb_best_params)
print("✅ Best CV Accuracy:", round(lgb_best_score, 4))

Fitting 5 folds for each of 48 candidates, totalling 240 fits
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.000905 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 3718
[LightGBM] [Info] Number of data points in the train set: 4737, number of used features: 20
[LightGBM] [Info] Start training from score -1.098612
[LightGBM] [Info] Start training from score -1.098612
[LightGBM] [Info] Start training from score -1.098612
✅ Best LightGBM Parameters: {'learning_rate': 0.1, 'max_depth': 7, 'min_child_samples': 10, 'n_estimators': 200, 'num_leaves': 31}
✅ Best CV Accuracy: 0.4539


In [11]:
from catboost import CatBoostClassifier

# Train and evaluate CatBoost without hyperparameter tuning
cat_default_model = CatBoostClassifier(verbose=0, random_state=42)

# Cross-validation
kf = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)
cat_default_scores = cross_val_score(cat_default_model, X_train_resampled, y_train_resampled, cv=kf, scoring='accuracy')

# Results
cat_default_mean = np.mean(cat_default_scores)
cat_default_std = np.std(cat_default_scores)

cat_default_mean, cat_default_std, cat_default_scores

(0.4908215595328797,
 0.014535322618368055,
 array([0.48523207, 0.47468354, 0.51531151, 0.49841605, 0.48046463]))

In [12]:
from catboost import CatBoostClassifier
from sklearn.model_selection import GridSearchCV

# Define the parameter grid for CatBoost
cat_param_grid = {
    'iterations': [100, 200],
    'learning_rate': [0.05, 0.1],
    'depth': [3, 5, 7]
}

# Setup GridSearchCV
cat_grid_search = GridSearchCV(
    estimator=CatBoostClassifier(verbose=0, random_state=42),
    param_grid=cat_param_grid,
    cv=5,
    scoring='accuracy',
    n_jobs=-1,
    verbose=1
)

# Perform the grid search
cat_grid_search.fit(X_train_resampled, y_train_resampled)

# Best parameters and score
cat_best_params = cat_grid_search.best_params_
cat_best_score = cat_grid_search.best_score_

cat_best_params, cat_best_score

Fitting 5 folds for each of 12 candidates, totalling 60 fits


({'depth': 7, 'iterations': 200, 'learning_rate': 0.1}, 0.4826046275379947)

In [13]:
from sklearn.ensemble import VotingClassifier
from sklearn.svm import SVC
from catboost import CatBoostClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import cross_val_score, StratifiedKFold
import numpy as np

# 1. Define your three tuned base-learners:
rf_best = RandomForestClassifier(
    max_depth=10,
    n_estimators=200,
    min_samples_split=2,
    min_samples_leaf=2,
    random_state=42
)

cat_best = CatBoostClassifier(
    depth=3,
    iterations=100,
    learning_rate=0.05,
    verbose=0,
    random_state=42
)

svm_best = SVC(
    kernel='rbf',
    C=10,
    gamma=1,
    probability=True,    # <-- needed for soft voting
    random_state=42
)

# 2. Build a soft-voting ensemble of all three:
voting_model = VotingClassifier(
    estimators=[
        ('rf',  rf_best),
        ('cat', cat_best),
        ('svm', svm_best)
    ],
    voting='soft'   # average their predicted probabilities
)

# 3. Evaluate with stratified 5-fold CV:
kf = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)
scores = cross_val_score(voting_model, X_train_resampled, y_train_resampled, cv=kf, scoring='accuracy')

print("✅ Voting Ensemble Accuracy:", round(np.mean(scores), 4))
print("Fold-wise scores:", np.round(scores, 4))

✅ Voting Ensemble Accuracy: 0.5858
Fold-wise scores: [0.5665 0.5665 0.5839 0.6114 0.6008]


In [14]:
from sklearn.ensemble import VotingClassifier

# Best tuned models
rf_best = RandomForestClassifier(
    max_depth=10, n_estimators=200, min_samples_split=2,
    min_samples_leaf=2, random_state=42
)

cat_best = CatBoostClassifier(
    depth=3, iterations=100, learning_rate=0.05, verbose=0, random_state=42
)

# Combine them into Voting Classifier
voting_model = VotingClassifier(
    estimators=[('rf', rf_best), ('cat', cat_best)],
    voting='soft'  # 'hard' for majority class, 'soft' for probability average
)

# Evaluate using CV
from sklearn.model_selection import cross_val_score
scores = cross_val_score(voting_model, X_train_resampled, y_train_resampled, cv=5, scoring='accuracy')

print("✅ Voting Classifier Accuracy: ", round(np.mean(scores), 6))
print("Fold-wise:", scores)

✅ Voting Classifier Accuracy:  0.463193
Fold-wise: [0.38818565 0.39135021 0.40126716 0.46356917 0.67159451]


In [15]:
from sklearn.ensemble import HistGradientBoostingClassifier
from sklearn.model_selection import StratifiedKFold, cross_val_score
import numpy as np

# Initialize HistGradientBoostingClassifier with custom hyperparameters
hist_model = HistGradientBoostingClassifier(
    learning_rate=0.1,
    max_iter=100,
    max_leaf_nodes=31,
    l2_regularization=1.0,
    random_state=42
)

# Stratified 5-Fold cross-validation setup
kf = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)

# Perform cross-validation
hist_scores = cross_val_score(hist_model, X_train_resampled, y_train_resampled, cv=kf, scoring='accuracy')

# Results
mean_accuracy = np.mean(hist_scores)
std_accuracy = np.std(hist_scores)

print(f"✅ Mean CV Accuracy: {mean_accuracy:.4f}")
print(f"✅ Std Dev CV Accuracy: {std_accuracy:.4f}")
print(f"✅ Fold-wise Scores: {np.round(hist_scores, 4)}")

✅ Mean CV Accuracy: 0.4551
✅ Std Dev CV Accuracy: 0.0199
✅ Fold-wise Scores: [0.4188 0.4536 0.4572 0.4731 0.4731]


In [16]:
from sklearn.ensemble import VotingClassifier
from sklearn.svm import SVC
from sklearn.ensemble import RandomForestClassifier

# Best tuned models from your results
rf_best = RandomForestClassifier(
    max_depth=None,
    min_samples_leaf=1,
    min_samples_split=2,
    n_estimators=300,
    random_state=42
)

svm_best = SVC(
    C=10,
    gamma=1,
    kernel='rbf',
    probability=True,  # Needed for soft voting
    random_state=42
)

# VotingClassifier setup (soft voting to combine probabilities)
voting_clf = VotingClassifier(
    estimators=[('rf', rf_best), ('svm', svm_best)],
    voting='soft'
)

# Cross-validation with the resampled data
kf = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)
voting_scores = cross_val_score(voting_clf, X_train_resampled, y_train_resampled, cv=kf, scoring='accuracy')

# Results
voting_mean_accuracy = np.mean(voting_scores)
voting_std_accuracy = np.std(voting_scores)

voting_mean_accuracy, voting_std_accuracy, voting_scores

(0.5870889194836904,
 0.01892239850284984,
 array([0.56329114, 0.57067511, 0.58394931, 0.61034847, 0.60718057]))

In [37]:
import pandas as pd
import numpy as np

from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from imblearn.pipeline import Pipeline as ImbPipeline
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.impute import SimpleImputer
from imblearn.over_sampling import SMOTE
from sklearn.svm import SVC
from sklearn.model_selection import (
    train_test_split,
    StratifiedKFold,
    GridSearchCV,
    cross_val_predict
)
from sklearn.metrics import classification_report, confusion_matrix

def svm_grid_search_from_csv(
    file_path: str,
    target_column: str = "Stress Level Category",
    label_map: dict = {'Low':0, 'Medium':1, 'High':2},
    svm_params: dict = None,
    cv: int = 5,
    test_size: float = 0.30,
    random_state: int = 42,
    n_jobs: int = -1,
    verbose: int = 1
):
    """
    1) Load CSV, rebuild y if needed
    2) Drop ID/text, map Yes/No → 0/1
    3) Split into train/test
    4) Build ColumnTransformer: (impute+scale) for nums, (impute+OHE) for cats
    5) Wrap that + SMOTE + SVC into an imblearn Pipeline
    6) GridSearchCV over SVC hyper‐params
    7) Print CV‐train & hold‐out test reports
    8) Return the fitted pipeline + reports
    """
    # 1) Load
    df = pd.read_csv(file_path)

    # 1a) Reconstruct y if only one‐hot cols exist
    if target_column not in df.columns:
        dummies = [c for c in df.columns if c.startswith(f"{target_column}_")]
        df[target_column] = (
            df[dummies].idxmax(axis=1)
                       .str.replace(f"{target_column}_","",regex=False)
        )
        df.drop(columns=dummies, inplace=True)

    # 1b) Drop unusable text/ID cols
    for c in ("Stress Coping Mechanisms","Student ID","Unnamed: 0"):
        if c in df.columns:
            df.drop(columns=c, inplace=True)

    # 1c) Map Yes/No → 0/1
    for yesno in ("Counseling Attendance",
                  "Family Mental Health History",
                  "Medical Condition"):
        if yesno in df:
            df[yesno] = df[yesno].map({"Yes":1,"No":0})

    # 2) X / y
    X = df.drop(columns=[target_column])
    y = df[target_column].map(label_map).astype(int)

    # 3) Train/test split
    X_train, X_test, y_train, y_test = train_test_split(
        X, y,
        test_size=test_size,
        stratify=y,
        random_state=random_state
    )

    # 4) Identify numeric vs. categorical
    num_feats = X_train.select_dtypes(include=["int64","float64"]).columns.tolist()
    cat_feats = X_train.select_dtypes(exclude=["int64","float64"]).columns.tolist()

    # 5) Build preprocessing sub‐pipelines
    num_pipe = Pipeline([
        ("imputer", SimpleImputer(strategy="median")),
        ("scaler", StandardScaler())
    ])
    cat_pipe = Pipeline([
        ("imputer", SimpleImputer(strategy="most_frequent")),
        ("onehot", OneHotEncoder(handle_unknown="ignore"))
    ])
    preproc = ColumnTransformer([
        ("num", num_pipe, num_feats),
        ("cat", cat_pipe, cat_feats)
    ])

    # 6) Full pipeline: preprocessing → SMOTE → SVC
    pipeline = ImbPipeline([
        ("pre", preproc),
        ("smote", SMOTE(random_state=random_state)),
        ("svc", SVC(class_weight="balanced",
                    probability=True,
                    random_state=random_state))
    ])

    # 7) Grid‐search over only the SVC step
    if svm_params is None:
        svm_params = {
            "svc__C":     [0.1, 1, 10, 100],
            "svc__gamma": ["scale", 0.01, 0.1, 1],
            "svc__kernel":["rbf"]
        }

    grid = GridSearchCV(
        pipeline,
        param_grid=svm_params,
        cv=StratifiedKFold(cv, shuffle=True, random_state=random_state),
        scoring="accuracy",
        n_jobs=n_jobs,
        verbose=verbose
    )
    grid.fit(X_train, y_train)

    best = grid.best_estimator_
    print(f"🔧 Best params: {grid.best_params_}")
    print(f"🎯 CV accuracy: {grid.best_score_:.4f}")

    # 8a) Cross‐validated TRAIN report
    y_train_cv = cross_val_predict(
        best, X_train, y_train,
        cv=StratifiedKFold(cv, shuffle=True, random_state=random_state)
    )
    print("\n📊 TRAIN (CV) Report:")
    print(classification_report(y_train, y_train_cv, target_names=list(label_map.keys())))
    print("Confusion (train):")
    print(confusion_matrix(y_train, y_train_cv))

    # 8b) HOLD‐OUT TEST report
    y_test_pred = best.predict(X_test)
    print("\n📉 TEST Report:")
    print(classification_report(y_test, y_test_pred, target_names=list(label_map.keys())))
    print("Confusion (test):")
    print(confusion_matrix(y_test, y_test_pred))

    # 9) Return fitted pipeline + report dicts
    train_rep = classification_report(
        y_train, y_train_cv,
        labels=[0,1,2],
        target_names=list(label_map.keys()),
        output_dict=True
    )
    test_rep  = classification_report(
        y_test, y_test_pred,
        labels=[0,1,2],
        target_names=list(label_map.keys()),
        output_dict=True
    )
    return best, grid.best_params_, train_rep, test_rep


In [39]:
# 1) Fit on train_data.csv
best_model, best_params, train_metrics, test_metrics = \
    svm_grid_search_from_csv("train_data.csv")

# 2) Display what was returned
print("🔧 Best params:", best_params)
print("\n📊 TRAIN (CV) report:\n")
print(pd.DataFrame(train_metrics).T)

print("\n📉 TEST (hold‐out) report:\n")
print(pd.DataFrame(test_metrics).T)

Fitting 5 folds for each of 16 candidates, totalling 80 fits
🔧 Best params: {'svc__C': 1, 'svc__gamma': 1, 'svc__kernel': 'rbf'}
🎯 CV accuracy: 0.3947

📊 TRAIN (CV) Report:
              precision    recall  f1-score   support

         Low       1.00      0.00      0.00       843
      Medium       0.00      0.00      0.00       854
        High       0.39      1.00      0.57      1105

    accuracy                           0.39      2802
   macro avg       0.46      0.33      0.19      2802
weighted avg       0.46      0.39      0.22      2802

Confusion (train):
[[   1    1  841]
 [   0    0  854]
 [   0    0 1105]]

📉 TEST Report:
              precision    recall  f1-score   support

         Low       1.00      0.01      0.01       362
      Medium       1.00      0.01      0.02       366
        High       0.40      1.00      0.57       474

    accuracy                           0.40      1202
   macro avg       0.80      0.34      0.20      1202
weighted avg       0.76      0

In [41]:
# … your existing imports …
import joblib
import json

# 1) … after you’ve fit & selected best_model and scaler …

# Persist the StandardScaler
joblib.dump(scaler,     "scaler.joblib")

# Persist the tuned SVM
joblib.dump(best_model, "svm_model.joblib")

# Persist the label‐map (int→string)
label_map = {'Low':0, 'Medium':1, 'High':2}
inv_map   = {str(v): k for k,v in label_map.items()}
with open("label_map.json","w") as f:
    json.dump(inv_map, f)

# ────────────────────────────────────────────────────────
# NEW: Persist the *exact* feature order you trained on
# Make sure `feature_columns` is the list of the columns
# you used for training X (in the same order).
# If you built X via `X = df[selected_features]`,
# simply re‐dump that list here:

feature_columns = [
    'Age',
    'Academic Performance (GPA)',
    'Study Hours Per Week',
    'Social_Media_Usage_per_week',
    'Sleep Duration (Hours per night)',
    'Physical Exercise (Hours per week)',
    'Family Support',
    'Financial Stress',
    'Peer Pressure',
    'Relationship Stress',
    'Counseling Attendance',
    'Diet Quality',
    'Cognitive Distortions',
    'Family Mental Health History',
    'Medical Condition',
    'Substance Use',
    'Gender_Female',
    'Gender_Male',
    'Gender_Other',
    'Stress_Ratio'
]

with open("feature_columns.json","w") as f:
    json.dump(feature_columns, f)

print("✅ Persisted scaler.joblib, svm_model.joblib, label_map.json, feature_columns.json")


✅ Persisted scaler.joblib, svm_model.joblib, label_map.json, feature_columns.json
