In [4]:
import pandas as pd
import numpy as np
import pickle
import seaborn as sns
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder, StandardScaler
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.ensemble import RandomForestClassifier
from xgboost import XGBClassifier  # pip install xgboost
from sklearn.metrics import (accuracy_score, precision_score, recall_score, 
                             f1_score, matthews_corrcoef, roc_auc_score)

# 1. Load the dataset
df = pd.read_csv("student_health_data.csv")

# 2. Preprocessing
# Drop the identifier column plus every column whose name contains 'Unnamed'
# (pandas' default label for header-less CSV columns); neither is a feature.
cols_to_drop = [col for col in df.columns if 'Unnamed' in col] + ['Student_ID']
df = df.drop(columns=cols_to_drop)

# Encode Categorical Features
# Each column gets its own LabelEncoder, and every fitted encoder is kept in
# `feature_encoders`. The string -> integer mapping is learned from this data,
# so the exact same encoders are needed to prepare raw inputs for the pickled
# models at prediction time; previously they were discarded after the loop.
categorical_cols = ['Gender', 'Physical_Activity', 'Sleep_Quality', 'Mood']
feature_encoders = {}
for col in categorical_cols:
    le = LabelEncoder()
    df[col] = le.fit_transform(df[col])
    feature_encoders[col] = le

# Persist the feature encoders (dict: column name -> fitted LabelEncoder) so
# the inference side can reproduce this encoding step.
with open('feature_encoders.pkl', 'wb') as f:
    pickle.dump(feature_encoders, f)

# Encode Target
# Kept as a separate encoder so predictions can be mapped back to the
# original risk-level labels with `le_target.inverse_transform`.
le_target = LabelEncoder()
df['Health_Risk_Level'] = le_target.fit_transform(df['Health_Risk_Level'])

# 3. Features and Target
# Everything except the encoded target column is used as a feature.
X = df.drop('Health_Risk_Level', axis=1)
y = df['Health_Risk_Level']

# 4. Split and Scale
# Hold out 20% of the rows for evaluation; the fixed seed makes the split
# reproducible between runs.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

# Fit the scaler on the training rows only, then apply that same fitted
# scaler to both partitions so no test-set statistics reach the models.
scaler = StandardScaler()
scaler.fit(X_train)
X_train_scaled = scaler.transform(X_train)
X_test_scaled = scaler.transform(X_test)

# 5. Define 6 Models
# Display name -> unfitted estimator. The display name is used as the row
# label in the comparison table and to derive each model's pickle file name.
# Decision Tree, Random Forest and XGBoost are seeded for repeatable results.
models = {
    "Logistic Regression": LogisticRegression(max_iter=2000),
    "Decision Tree": DecisionTreeClassifier(random_state=42),
    "kNN": KNeighborsClassifier(),
    "Naive Bayes": GaussianNB(),
    "Random Forest": RandomForestClassifier(random_state=42),
    # `use_label_encoder` is deliberately not passed: current xgboost releases
    # no longer accept it and report it as an unused parameter. The target is
    # already integer-encoded with LabelEncoder, which is what XGBClassifier
    # expects.
    "XGBoost": XGBClassifier(eval_metric='mlogloss', random_state=42),
}

# 6. Train and Calculate Metrics
# One dict of scores per model is collected in `results_list`; each fitted
# model is also pickled to "<display name lower-cased, spaces -> _>.pkl".
results_list = []
for name, model in models.items():
    model.fit(X_train_scaled, y_train)

    # Hard labels drive the threshold metrics; class probabilities drive AUC.
    y_pred = model.predict(X_test_scaled)
    y_prob = model.predict_proba(X_test_scaled)

    # Build the row incrementally, keeping the column order of the final table.
    metrics = {
        "ML Model Name": name,
        "Accuracy": accuracy_score(y_test, y_pred),
    }
    # One-vs-rest AUC, weighted by class support.
    metrics["AUC"] = roc_auc_score(
        y_test, y_prob, multi_class='ovr', average='weighted'
    )
    # Precision / recall / F1 share the same support-weighted averaging.
    for label, scorer in (
        ("Precision", precision_score),
        ("Recall", recall_score),
        ("F1", f1_score),
    ):
        metrics[label] = scorer(y_test, y_pred, average='weighted')
    metrics["MCC"] = matthews_corrcoef(y_test, y_pred)
    results_list.append(metrics)

    # Save model files
    model_path = name.lower().replace(' ', '_') + ".pkl"
    with open(model_path, 'wb') as f:
        pickle.dump(model, f)

# 7. Save auxiliary files
# The fitted scaler and the target encoder are pickled as well, so the
# scaling step and the label mapping can be reloaded alongside the models.
for artifact, path in ((scaler, 'scaler.pkl'), (le_target, 'le_target.pkl')):
    with open(path, 'wb') as f:
        pickle.dump(artifact, f)

# 8. Display Comparison Table
# One row per model, one column per metric; the index is omitted from the printout.
comparison_df = pd.DataFrame(results_list)
table_text = comparison_df.to_string(index=False)
print(table_text)

      ML Model Name  Accuracy      AUC  Precision  Recall       F1      MCC
Logistic Regression     0.825 0.913797   0.834747   0.825 0.812955 0.672413
      Decision Tree     0.985 0.984366   0.985082   0.985 0.985004 0.972841
                kNN     0.705 0.827817   0.726460   0.705 0.658569 0.414926
        Naive Bayes     0.775 0.957640   0.803858   0.775 0.742775 0.578414
      Random Forest     0.960 0.999904   0.962481   0.960 0.958980 0.928183
            XGBoost     1.000 1.000000   1.000000   1.000 1.000000 1.000000


Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)
