<a href="https://colab.research.google.com/github/2303A51125/ADM_lab/blob/main/projectcode01.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [2]:
import os
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, LabelEncoder
from sklearn.metrics import classification_report, accuracy_score, confusion_matrix
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier, AdaBoostClassifier, ExtraTreesClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC
from sklearn.naive_bayes import GaussianNB
import warnings
# Suppress every warning category for the whole process (library deprecation
# and convergence warnings included), keeping the printed reports uncluttered.
warnings.filterwarnings("ignore")

# CSV file that main() loads via load_and_clean().
# NOTE(review): the "/content/" prefix looks like a Colab upload location --
# adjust when running elsewhere.
INPATH = "/content/academic Stress level - maintainance 1 (1).csv"
# Seed shared by train_test_split and every estimator that accepts random_state.
RANDOM_STATE = 42

def load_and_clean(path):
    """Read the CSV at ``path`` and trim whitespace around each column name.

    Only the header labels are modified; cell values are returned exactly as
    ``pandas.read_csv`` parsed them.
    """
    frame = pd.read_csv(path)
    trimmed_labels = [label.strip() for label in frame.columns]
    frame.columns = trimmed_labels
    return frame

def basic_preprocess(df):
    """Split a raw frame into a feature frame and an integer-encoded target.

    Steps:
      1. Drop columns that are entirely missing.
      2. Pick the target column: the first match among a list of well-known
         names, otherwise the first column with at most five distinct values.
      3. Drop rows whose target is missing (they cannot be used for
         supervised training and would otherwise crash the integer cast or
         become a spurious ``"nan"`` class).
      4. Normalise text feature columns (strip whitespace, blank/"nan" -> NaN).
      5. Encode the target: non-numeric targets are label-encoded, numeric
         targets with more than six distinct values are cut into (up to)
         three quantile bins, other numeric targets are cast to ``int``.

    Parameters:
        df: input DataFrame containing features and the target column.

    Returns:
        ``(X, y, target_col)`` where ``X`` is the feature DataFrame (rows with
        a missing target removed, original index labels kept), ``y`` is a
        NumPy integer array aligned with ``X``, and ``target_col`` is the name
        of the column used as target.

    Raises:
        ValueError: if no target column can be identified.
    """
    df = df.dropna(axis=1, how="all")

    known_names = ['stress', 'Stress', 'stress_level', 'StressLevel',
                   'Stress_Level', 'target', 'label']
    target_col = next((name for name in known_names if name in df.columns), None)
    if target_col is None:
        # Fallback heuristic: the first low-cardinality column is assumed to
        # be the class label.
        target_col = next(
            (col for col in df.columns if df[col].nunique() <= 5), None
        )
    if target_col is None:
        raise ValueError("Unable to identify target column automatically.")

    # Unlabelled rows must go before X and y are separated so both stay aligned.
    df = df[df[target_col].notna()]

    X = df.drop(columns=[target_col]).copy()
    y_raw = df[target_col].copy()
    for c in X.select_dtypes(include=['object']).columns:
        X[c] = X[c].astype(str).str.strip().replace({'': np.nan, 'nan': np.nan})

    # pandas' dtype helpers also understand extension dtypes, on which
    # np.issubdtype raises TypeError. Booleans keep going through the label
    # encoder, as before.
    is_plain_numeric = (
        pd.api.types.is_numeric_dtype(y_raw)
        and not pd.api.types.is_bool_dtype(y_raw)
    )
    if not is_plain_numeric:
        le = LabelEncoder()
        y = le.fit_transform(y_raw.astype(str))
    elif y_raw.nunique() > 6:
        # Many distinct numeric values: treat as a score and bin into terciles.
        y = pd.qcut(y_raw, q=3, labels=False, duplicates='drop')
        y = y.astype(int).values
    else:
        y = y_raw.astype(int).values
    return X, y, target_col

def feature_engineer(X):
    """Coerce number-like text columns to numeric and add ratio features.

    For every object-dtype column, all characters except digits, ``.`` and
    ``-`` are stripped. The column is replaced by its numeric version only
    when more than 60% of its *non-missing original values* parse as numbers
    after stripping. Measuring against the original values (rather than only
    against the values that still contain digits) keeps a categorical column
    such as ``["A", "B", "Type2"]`` from being turned into mostly-NaN numbers.

    Two derived columns are added when their source columns exist and are
    numeric:
      * ``study_sleep_ratio`` = study_hours / (sleep_hours + 1e-6)
      * ``assign_per_study``  = assignments / (study_hours + 1e-6)

    Parameters:
        X: feature DataFrame; it is not modified.

    Returns:
        A new DataFrame with converted and derived columns.
    """
    X2 = X.copy()
    for c in X2.columns:
        if X2[c].dtype != 'O':
            continue
        cleaned = X2[c].astype(str).str.replace(r'[^0-9.\-]', '', regex=True)
        n_present = int(X2[c].notna().sum())
        parsed = pd.to_numeric(cleaned[cleaned != ''], errors='coerce')
        pct_numeric = int(parsed.notna().sum()) / max(1, n_present)
        if pct_numeric > 0.6:
            X2[c] = pd.to_numeric(cleaned, errors='coerce')

    def _is_numeric_column(name):
        # Ratios are only meaningful (and only computable) on numeric data.
        return name in X2.columns and pd.api.types.is_numeric_dtype(X2[name])

    if _is_numeric_column('study_hours') and _is_numeric_column('sleep_hours'):
        # The small epsilon avoids division by zero.
        X2['study_sleep_ratio'] = X2['study_hours'] / (X2['sleep_hours'] + 1e-6)
    if _is_numeric_column('assignments') and _is_numeric_column('study_hours'):
        X2['assign_per_study'] = X2['assignments'] / (X2['study_hours'] + 1e-6)
    return X2

def prepare_pipeline_and_split(X, y, test_size=0.2):
    """Encode, split, impute and scale the features for model training.

    Processing order:
      1. One-hot encode object/category columns with at most 20 distinct
         values (a separate indicator column marks missing values).
      2. Drop the remaining (high-cardinality) object/category columns.
      3. Stratified train/test split using ``RANDOM_STATE``.
      4. Fill missing values. Numeric columns use the median of the
         *training* rows only, so no test-set statistics leak into training;
         a column with no usable training median falls back to 0. Other
         columns are filled with 0.
      5. Standardise with a ``StandardScaler`` fitted on the training rows.

    Parameters:
        X: feature DataFrame.
        y: target labels aligned with the rows of ``X``.
        test_size: fraction of rows held out for testing.

    Returns:
        ``(X_train_scaled, X_test_scaled, y_train, y_test, scaler,
        feature_names)`` where ``feature_names`` lists the columns of the
        encoded matrix in order.
    """
    X_proc = X.copy()
    categorical = X_proc.select_dtypes(include=['object', 'category']).columns
    low_card_cols = [c for c in categorical if X_proc[c].nunique() <= 20]
    if low_card_cols:
        X_proc = pd.get_dummies(X_proc, columns=low_card_cols, dummy_na=True)
    # Whatever is still object/category after encoding is high-cardinality.
    high_card_cols = list(
        X_proc.select_dtypes(include=['object', 'category']).columns
    )
    X_proc = X_proc.drop(columns=high_card_cols)

    train_df, test_df, y_train, y_test = train_test_split(
        X_proc, y, test_size=test_size, stratify=y, random_state=RANDOM_STATE
    )
    train_df = train_df.copy()
    test_df = test_df.copy()

    for c in X_proc.columns:
        if not X_proc[c].isna().any():
            continue  # nothing to impute in this column
        if X_proc[c].dtype.kind in 'biufc':
            fill_value = train_df[c].median()
            if pd.isna(fill_value):
                # No observed training value at all: use a neutral constant
                # so NaNs never reach the estimators.
                fill_value = 0.0
        else:
            fill_value = 0
        train_df[c] = train_df[c].fillna(fill_value)
        test_df[c] = test_df[c].fillna(fill_value)

    X_train = train_df.values.astype(float)
    X_test = test_df.values.astype(float)
    scaler = StandardScaler()
    X_train_scaled = scaler.fit_transform(X_train)
    X_test_scaled = scaler.transform(X_test)
    return X_train_scaled, X_test_scaled, y_train, y_test, scaler, X_proc.columns.tolist()

def _fit_and_report(name, model, X_train, X_test, y_train, y_test):
    """Fit ``model``, print its test-set metrics, and return them.

    Returns ``(accuracy, report_dict, confusion_matrix)`` computed on the
    test split.
    """
    model.fit(X_train, y_train)
    preds = model.predict(X_test)
    acc = accuracy_score(y_test, preds)
    report = classification_report(y_test, preds, output_dict=True)
    cm = confusion_matrix(y_test, preds)
    print("\n=== Model:", name, "===")
    print("Accuracy:", acc)
    print(classification_report(y_test, preds, digits=4))
    return acc, report, cm


def run_models(X_train, X_test, y_train, y_test):
    """Train a fixed set of classifiers and collect their test-set metrics.

    Every scikit-learn model is fitted on the training split and evaluated on
    the test split; accuracy and a classification report are printed per
    model. XGBoost is evaluated as well when the package can be imported. A
    missing package or a failure while training it is reported on stdout and
    skipped, so it never aborts the run but is no longer silent.

    Parameters:
        X_train, X_test: feature matrices for training and testing.
        y_train, y_test: matching label arrays.

    Returns:
        ``(results, reports, cms)``: three dicts keyed by model name holding
        the accuracy, the classification report (as a dict) and the confusion
        matrix respectively.
    """
    models = {
        "LogisticRegression": LogisticRegression(max_iter=1000, random_state=RANDOM_STATE),
        "DecisionTree": DecisionTreeClassifier(random_state=RANDOM_STATE),
        "RandomForest": RandomForestClassifier(n_estimators=300, random_state=RANDOM_STATE),
        "ExtraTrees": ExtraTreesClassifier(n_estimators=200, random_state=RANDOM_STATE),
        "GradientBoosting": GradientBoostingClassifier(n_estimators=300, random_state=RANDOM_STATE),
        "AdaBoost": AdaBoostClassifier(n_estimators=200, random_state=RANDOM_STATE),
        "KNN": KNeighborsClassifier(n_neighbors=7),
        "SVM": SVC(kernel='rbf', probability=False, random_state=RANDOM_STATE),
        "NaiveBayes": GaussianNB()
    }
    results = {}
    reports = {}
    cms = {}
    for name, model in models.items():
        results[name], reports[name], cms[name] = _fit_and_report(
            name, model, X_train, X_test, y_train, y_test
        )

    # XGBoost is an optional dependency: only its absence is expected.
    try:
        from xgboost import XGBClassifier
    except ImportError:
        XGBClassifier = None
        print("\nXGBoost is not installed; skipping it.")

    if XGBClassifier is not None:
        try:
            xgb = XGBClassifier(n_estimators=300, use_label_encoder=False, eval_metric='mlogloss', random_state=RANDOM_STATE)
            outcome = _fit_and_report(
                "XGBoost", xgb, X_train, X_test, y_train, y_test
            )
        except Exception as exc:
            # Best effort: keep the other models' results, but say why
            # XGBoost is missing from them.
            print("\nXGBoost failed and was skipped:", repr(exc))
        else:
            results['XGBoost'], reports['XGBoost'], cms['XGBoost'] = outcome
    return results, reports, cms

def main():
    """Run the full experiment and persist its metrics as CSV files.

    Loads the CSV at ``INPATH``, preprocesses it, engineers features, splits
    and scales the data, trains every model, prints the accuracies from best
    to worst, and writes the accuracy summary plus one classification report
    and one confusion matrix per model into the ``supervised_results`` folder.
    """
    raw_frame = load_and_clean(INPATH)
    X, y, target_col = basic_preprocess(raw_frame)
    engineered = feature_engineer(X)
    X_train, X_test, y_train, y_test, scaler, feature_names = prepare_pipeline_and_split(engineered, y)
    results, reports, cms = run_models(X_train, X_test, y_train, y_test)

    print("\n=== Summary Accuracies ===")
    ranked = sorted(results.items(), key=lambda item: item[1], reverse=True)
    for model_name, accuracy in ranked:
        print(f"{model_name:15s} : {accuracy:.4f}")

    out_dir = "supervised_results"
    os.makedirs(out_dir, exist_ok=True)

    def in_out_dir(filename):
        # Build the path of a result file inside the output folder.
        return os.path.join(out_dir, filename)

    summary = pd.DataFrame.from_dict(results, orient='index', columns=['accuracy'])
    summary.to_csv(in_out_dir("accuracy_summary.csv"))

    for model_name, rep in reports.items():
        report_table = pd.DataFrame(rep).transpose()
        report_table.to_csv(in_out_dir(f"classification_report_{model_name}.csv"))

    for model_name, cm in cms.items():
        matrix_table = pd.DataFrame(cm)
        matrix_table.to_csv(in_out_dir(f"confusion_matrix_{model_name}.csv"))

    print("\nSaved results under folder:", out_dir)

# Run the experiment only when this code is executed directly (script or
# notebook cell), not when it is imported as a module.
if __name__ == "__main__":
    main()



=== Model: LogisticRegression ===
Accuracy: 0.6428571428571429
              precision    recall  f1-score   support

           0     0.0000    0.0000    0.0000         6
           1     0.0000    0.0000    0.0000         2
           2     0.7200    0.9000    0.8000        20

    accuracy                         0.6429        28
   macro avg     0.2400    0.3000    0.2667        28
weighted avg     0.5143    0.6429    0.5714        28


=== Model: DecisionTree ===
Accuracy: 0.7142857142857143
              precision    recall  f1-score   support

           0     0.6667    0.6667    0.6667         6
           1     0.0000    0.0000    0.0000         2
           2     0.8000    0.8000    0.8000        20

    accuracy                         0.7143        28
   macro avg     0.4889    0.4889    0.4889        28
weighted avg     0.7143    0.7143    0.7143        28


=== Model: RandomForest ===
Accuracy: 0.8571428571428571
              precision    recall  f1-score   support

   