In [29]:
import pandas as pd

# Load dataset
data = pd.read_csv("student_health_data.csv")

print(data.shape)  
print(data.head()) 

(1000, 23)
   Student_ID  Age Gender  Heart_Rate  Blood_Pressure_Systolic  \
0           1   24      M   50.663217               122.173015   
1           2   21      F   57.926042               110.778407   
2           3   22      M   59.294219               109.375673   
3           4   24      M   76.826232               125.142227   
4           5   20      M   68.342769               107.515592   

   Blood_Pressure_Diastolic  Stress_Level_Biosensor  Stress_Level_Self_Report  \
0                 84.419860                3.137350                  9.028669   
1                 75.696145                3.699078                  5.819697   
2                 83.803814                6.785156                  5.892360   
3                 78.091587                6.408509                  6.884001   
4                 80.674937                7.264719                  4.483450   

  Physical_Activity Sleep_Quality  ... Health_Risk_Level  Family_members  \
0              High      Mode

In [30]:
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, LabelEncoder

# Drop ID and empty columns
data = data.drop(columns=["Student_ID"], errors="ignore")
data = data.dropna(axis=1, how="all")  # remove empty columns

# Define target
y = data["Health_Risk_Level"]
X = data.drop("Health_Risk_Level", axis=1)

# Encode target
le = LabelEncoder()
y = le.fit_transform(y)

# Encode categorical features
X = pd.get_dummies(X, drop_first=True)

# Handle missing values
X = X.fillna(0)

# Train-test split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Scale numerical features
scaler = StandardScaler()
X_train = scaler.fit_transform(X_train)
X_test = scaler.transform(X_test)

In [31]:
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.ensemble import RandomForestClassifier
from xgboost import XGBClassifier

models = {
    "Logistic Regression": LogisticRegression(max_iter=1000),
    "Decision Tree": DecisionTreeClassifier(),
    "KNN": KNeighborsClassifier(),
    "Naive Bayes": GaussianNB(),
    "Random Forest": RandomForestClassifier(),
    "XGBoost": XGBClassifier(eval_metric="mlogloss")
}

In [34]:
from sklearn.metrics import accuracy_score, roc_auc_score, precision_score, recall_score, f1_score, matthews_corrcoef
from sklearn.preprocessing import label_binarize
import numpy as np

# Binarize target for multi-class AUC
classes = np.unique(y_test)
y_test_bin = label_binarize(y_test, classes=classes)

results = []

for name, model in models.items():
    model.fit(X_train, y_train)
    y_pred = model.predict(X_test)
    
    # If model supports probability, use it for AUC
    if hasattr(model, "predict_proba"):
        y_prob = model.predict_proba(X_test)
        auc = roc_auc_score(y_test_bin, y_prob, average="macro", multi_class="ovr")
    else:
        auc = None  # Some models may not support probability
    
    results.append({
        "Model": name,
        "Accuracy": accuracy_score(y_test, y_pred),
        "AUC_SCORE": auc,
        "Precision": precision_score(y_test, y_pred, average="macro"),
        "Recall": recall_score(y_test, y_pred, average="macro"),
        "F1": f1_score(y_test, y_pred, average="macro"),
        "MCC": matthews_corrcoef(y_test, y_pred)
    })

results_df = pd.DataFrame(results)
print(results_df)

                 Model  Accuracy  AUC_SCORE  Precision    Recall        F1  \
0  Logistic Regression     0.795   0.935668   0.776353  0.686104  0.717568   
1        Decision Tree     0.975   0.970021   0.978136  0.958903  0.968108   
2                  KNN     0.685   0.794681   0.704873  0.492780  0.519411   
3          Naive Bayes     0.735   0.946840   0.703292  0.557053  0.583288   
4        Random Forest     0.930   0.997633   0.960415  0.860816  0.895331   
5              XGBoost     1.000   1.000000   1.000000  1.000000  1.000000   

        MCC  
0  0.611060  
1  0.954656  
2  0.364098  
3  0.483640  
4  0.874801  
5  1.000000  


In [35]:
import joblib
import os

os.makedirs("model", exist_ok=True)

for name, model in models.items():
    joblib.dump(model, f"model/{name.replace(' ', '_').lower()}.pkl")
