In [5]:
# Second attempt: compact pipeline, saving results.
import pandas as pd, numpy as np, os
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split, StratifiedKFold, cross_val_score
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import LabelEncoder, StandardScaler
from sklearn.pipeline import Pipeline
from sklearn.metrics import classification_report, accuracy_score, f1_score, confusion_matrix
from sklearn.inspection import permutation_importance
import pickle

file1 = "Stress_Dataset.csv"
file2 = "StressLevelDataset.csv"

def load_csv(path):
    return pd.read_csv(path)

df1 = load_csv(file1)
df2 = load_csv(file2)

def clean_cols(df):
    df = df.copy()
    df.columns = [str(c).strip().lower().replace(" ", "_").replace("/", "_").replace("-", "_") for c in df.columns]
    return df

df1 = clean_cols(df1); df2 = clean_cols(df2)

target_candidates = [c for c in set(df1.columns).union(df2.columns) if "stress" in c]
if not target_candidates:
    raise SystemExit("No target-like columns found.")
target_col = None
for c in target_candidates:
    sample_vals = pd.concat([df1.get(c, pd.Series(dtype=object)), df2.get(c, pd.Series(dtype=object))]).dropna().astype(str).str.lower().unique()
    if any(x in " ".join(sample_vals) for x in ["eustress", "distress", "no stress", "no_stress", "nostress"]):
        target_col = c; break
if target_col is None:
    target_col = target_candidates[0]
print("Using target column:", target_col)

likert_map = {"not at all":1,"slightly":2,"moderately":3,"very":4,"extremely":5,"no":1,"yes":5}
def map_likert_series(s):
    s = s.astype(str).str.strip()
    numeric = pd.to_numeric(s, errors="coerce")
    mapped = s.str.lower().map(likert_map)
    out = numeric.fillna(mapped)
    out = out.fillna(s.str.extract(r'(\d)').astype(float).iloc[:,0])
    return out

def preprocess_df(df):
    df = df.copy()
    for col in df.columns:
        if col == target_col: continue
        if df[col].dtype == object:
            sample = df[col].dropna().astype(str).str.lower().unique()[:20]
            joined = " ".join(sample)
            if any(k in joined for k in ["not at all", "slightly", "moderately", "very", "extremely"]):
                df[col] = map_likert_series(df[col])
            else:
                df[col] = pd.to_numeric(df[col], errors="ignore")
    return df

df1 = preprocess_df(df1); df2 = preprocess_df(df2)
df = pd.concat([df1, df2], ignore_index=True, sort=False)
df[target_col] = df[target_col].astype(str).str.strip().str.lower().replace({"no stress":"no_stress","nostress":"no_stress","no-stress":"no_stress"})
df = df.dropna(subset=[target_col]).reset_index(drop=True)

thresh = int(0.5*len(df))
drop_cols = [c for c in df.columns if df[c].isna().sum()>thresh]
df = df.drop(columns=drop_cols)

if "gender" in df.columns:
    if df["gender"].dtype==object:
        df["gender"] = df["gender"].astype(str).str.lower().map({"male":0,"m":0,"female":1,"f":1}).astype(float)

features = [c for c in df.columns if c != target_col]
X = df[features].copy()
y = df[target_col].astype(str).copy()
le = LabelEncoder(); y_enc = le.fit_transform(y)

for c in X.columns:
    if X[c].dtype == object or X[c].dtype.name.startswith("str"):
        if X[c].nunique(dropna=True) <= 6:
            X[c] = X[c].astype(str).fillna("missing")
            X[c] = LabelEncoder().fit_transform(X[c])
        else:
            X = X.drop(columns=[c])

imp = SimpleImputer(strategy="median")
X_imp = pd.DataFrame(imp.fit_transform(X), columns=X.columns)

rf = RandomForestClassifier(n_estimators=200, random_state=42, n_jobs=-1)
rf.fit(X_imp, y_enc)
perm = permutation_importance(rf, X_imp, y_enc, n_repeats=10, random_state=42, n_jobs=-1)
perm_importances = pd.Series(perm.importances_mean, index=X_imp.columns).sort_values(ascending=False)
top_n = 15
print("\nTop permutation importances:")
print(perm_importances.head(top_n).to_string())

selected = perm_importances.head(10).index.tolist()
X_sel = X_imp[selected]

pipe_rf = Pipeline([("scaler", StandardScaler()), ("rf", RandomForestClassifier(n_estimators=300, random_state=42, n_jobs=-1))])
cv = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)
rf_scores = cross_val_score(pipe_rf, X_sel, y_enc, cv=cv, scoring="f1_macro", n_jobs=-1)
print("\n5-fold CV F1-macro scores (RandomForest):", np.round(rf_scores,3), "mean:", np.round(rf_scores.mean(),3))

X_train, X_test, y_train, y_test = train_test_split(X_sel, y_enc, test_size=0.2, stratify=y_enc, random_state=42)
pipe_rf.fit(X_train, y_train)
y_pred = pipe_rf.predict(X_test)
print("\nHoldout Accuracy:", round(accuracy_score(y_test, y_pred),3))
print("Holdout F1-macro:", round(f1_score(y_test, y_pred, average="macro"),3))
print("\nClassification report:\n", classification_report(y_test, y_pred, target_names=le.classes_))

cm = confusion_matrix(y_test, y_pred)
print("\nConfusion matrix (rows=actual, cols=predicted):\n", cm)

out_path = "combined_stress_cleaned.csv"
df.to_csv(out_path, index=False)
print("\nCleaned combined dataset saved to:", out_path)

model_path = "random_forest_stress.pkl"
with open(model_path, "wb") as f:
    pickle.dump(rf, f)

model_path

Using target column: which_type_of_stress_do_you_primarily_experience?

Top permutation importances:
anxiety_level                   0.0
basic_needs                     0.0
bullying                        0.0
extracurricular_activities      0.0
peer_pressure                   0.0
social_support                  0.0
future_career_concerns          0.0
teacher_student_relationship    0.0
study_load                      0.0
academic_performance            0.0
safety                          0.0
self_esteem                     0.0
living_conditions               0.0
noise_level                     0.0
breathing_problem               0.0

5-fold CV F1-macro scores (RandomForest): [0.488 0.488 0.488 0.488 0.488] mean: 0.488

Holdout Accuracy: 0.961
Holdout F1-macro: 0.488

Classification report:
                                                                                  precision    recall  f1-score   support

distress (negative stress) - stress that causes anxiety and impairs well-bei

  _warn_prf(average, modifier, f"{metric.capitalize()} is", result.shape[0])
  _warn_prf(average, modifier, f"{metric.capitalize()} is", result.shape[0])
  _warn_prf(average, modifier, f"{metric.capitalize()} is", result.shape[0])


'random_forest_stress.pkl'

In [7]:
import pandas as pd

df = pd.read_csv("combined_stress_cleaned.csv")
df.info()
df.head()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1943 entries, 0 to 1942
Data columns (total 22 columns):
 #   Column                                             Non-Null Count  Dtype  
---  ------                                             --------------  -----  
 0   which_type_of_stress_do_you_primarily_experience?  843 non-null    object 
 1   anxiety_level                                      1100 non-null   float64
 2   self_esteem                                        1100 non-null   float64
 3   mental_health_history                              1100 non-null   float64
 4   depression                                         1100 non-null   float64
 5   headache                                           1100 non-null   float64
 6   blood_pressure                                     1100 non-null   float64
 7   sleep_quality                                      1100 non-null   float64
 8   breathing_problem                                  1100 non-null   float64
 9   noise_le

Unnamed: 0,which_type_of_stress_do_you_primarily_experience?,anxiety_level,self_esteem,mental_health_history,depression,headache,blood_pressure,sleep_quality,breathing_problem,noise_level,...,basic_needs,academic_performance,study_load,teacher_student_relationship,future_career_concerns,social_support,peer_pressure,extracurricular_activities,bullying,stress_level
0,eustress (positive stress) - stress that motiv...,,,,,,,,,,...,,,,,,,,,,
1,eustress (positive stress) - stress that motiv...,,,,,,,,,,...,,,,,,,,,,
2,eustress (positive stress) - stress that motiv...,,,,,,,,,,...,,,,,,,,,,
3,eustress (positive stress) - stress that motiv...,,,,,,,,,,...,,,,,,,,,,
4,eustress (positive stress) - stress that motiv...,,,,,,,,,,...,,,,,,,,,,
