# ML Assignment 2 - Model Training

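Train six classification models (Logistic Regression, Decision Tree, k-Nearest Neighbors, Naive Bayes, Random Forest, and XGBoost), evaluate each on a held-out test set, and save the fitted models along with their evaluation metrics.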

**Author:** Abhishek Anand (2024DC04179)


## 1. Import Libraries


In [None]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, LabelEncoder
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.ensemble import RandomForestClassifier
from xgboost import XGBClassifier
from sklearn.metrics import (
    accuracy_score, roc_auc_score, precision_score, 
    recall_score, f1_score, matthews_corrcoef, confusion_matrix
)
import joblib
import os


## 2. Setup and Load Data


In [None]:
# Resolve the project root and create the output directory for saved models.
#
# A notebook has no __file__, so the current working directory stands in for
# the script location. If the notebook is launched from a sub-directory whose
# own name contains 'model' (e.g. <project>/model), the project root is that
# directory's parent; otherwise the working directory is the project root.
# Only the last path component is inspected, so an unrelated ancestor folder
# that happens to contain 'model' cannot shift the root.
script_dir = os.path.abspath('')
if 'model' in os.path.basename(script_dir):
    project_root = os.path.dirname(script_dir)
else:
    project_root = script_dir

saved_models_dir = os.path.join(project_root, 'saved_models')
# exist_ok=True keeps re-running the cell harmless.
os.makedirs(saved_models_dir, exist_ok=True)

print(f"Project root: {project_root}")
print(f"Saved models directory: {saved_models_dir}")


In [None]:
# Locate and read the dataset.
# Candidate locations are probed in order; the first one that exists wins.
possible_paths = [
    'data/Student_performance_data.csv',
    '../data/Student_performance_data.csv',
    os.path.join(project_root, 'data', 'Student_performance_data.csv'),
    os.path.join(project_root, 'data', 'Student_Performance_data.csv'),
]

df = None
path = next(
    (candidate for candidate in possible_paths if os.path.exists(candidate)),
    None,
)

# Fail loudly, listing every location that was tried.
if path is None:
    raise FileNotFoundError(f"Dataset not found. Tried paths: {possible_paths}")

df = pd.read_csv(path)
print(f"Dataset loaded from: {path}")

# Quick overview of what was loaded.
print(f"\nDataset shape: {df.shape}")
print(f"Columns: {list(df.columns)}")
print(f"\nFirst few rows:")
df.head()


## 3. Data Preprocessing

In [None]:
# Target is the 'GradeClass' column; every other column is a feature.
y = df['GradeClass']
X = df.drop(columns='GradeClass')

# Columns with object dtype are treated as categorical and integer-encoded.
categorical_cols = list(X.select_dtypes(include=['object']).columns)

if categorical_cols:
    print(f"Encoding categorical features: {categorical_cols}")
    for col in categorical_cols:
        # Values are cast to str before encoding, so mixed-type or missing
        # entries are encoded by their string form.
        le = LabelEncoder()
        X[col] = le.fit_transform(X[col].astype(str))

# 80/20 split, stratified on the target, with a fixed seed.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    stratify=y,
    test_size=0.2,
    random_state=42,
)

n_train, n_features = X_train.shape
print(f"Training set: {n_train} samples")
print(f"Test set: {X_test.shape[0]} samples")
print(f"Features: {n_features}")
print(f"Target classes: {sorted(y.unique())}")


In [None]:
# Standardize the features. The scaler is fitted on the training split only
# and then applied to both splits; the scaled arrays feed the Logistic
# Regression, KNN, and Naive Bayes cells below.
scaler = StandardScaler().fit(X_train)
X_train_scaled = scaler.transform(X_train)
X_test_scaled = scaler.transform(X_test)

# Persist the fitted scaler next to the models so it can be reused later.
scaler_path = os.path.join(saved_models_dir, 'scaler.pkl')
joblib.dump(scaler, scaler_path)
print(f"Scaler saved to {scaler_path}")


## 4. Define Metrics Calculation Function


In [None]:
def calculate_metrics(y_true, y_pred, y_pred_proba):
    """Compute the evaluation metrics for one classifier.

    Args:
        y_true: Ground-truth class labels.
        y_pred: Predicted class labels, aligned with ``y_true``.
        y_pred_proba: Predicted scores. Either a 2-D array with one column
            per class (as returned by ``predict_proba``) or a 1-D array of
            positive-class probabilities for a binary problem.

    Returns:
        dict with keys ``accuracy``, ``auc``, ``precision``, ``recall``,
        ``f1`` and ``mcc`` (in that order). Precision, recall and F1 are
        macro-averaged; AUC is macro-averaged one-vs-rest for multiclass
        scores and the plain ROC AUC for binary scores.

    Raises:
        ValueError: Propagated from scikit-learn when the inputs are
            inconsistent (e.g. score columns do not match the classes in
            ``y_true``).
    """
    y_score = np.asarray(y_pred_proba)

    # For a binary problem roc_auc_score expects a 1-D score for the positive
    # class, so a two-column probability matrix is reduced to its second
    # column instead of being passed (or rebuilt) as an (n, 2) array.
    if y_score.ndim == 2 and y_score.shape[1] == 2:
        y_score = y_score[:, 1]

    if y_score.ndim == 1:
        auc = roc_auc_score(y_true, y_score)
    else:
        auc = roc_auc_score(y_true, y_score, multi_class='ovr', average='macro')

    # Key order matters: the results table assigns its column names by position.
    metrics = {
        'accuracy': accuracy_score(y_true, y_pred),
        'auc': auc,
        'precision': precision_score(y_true, y_pred, average='macro', zero_division=0),
        'recall': recall_score(y_true, y_pred, average='macro', zero_division=0),
        'f1': f1_score(y_true, y_pred, average='macro', zero_division=0),
        'mcc': matthews_corrcoef(y_true, y_pred)
    }
    return metrics


## 5. Train Logistic Regression


In [None]:
# Train, evaluate and save a one-vs-rest Logistic Regression on the scaled
# features.
#
# The `multi_class='ovr'` argument of LogisticRegression is deprecated in
# recent scikit-learn releases; wrapping the estimator in OneVsRestClassifier
# is the documented way to keep one-vs-rest behaviour. Imported here so the
# cell is self-contained.
from sklearn.multiclass import OneVsRestClassifier

print("Training Logistic Regression...")
lr_model = OneVsRestClassifier(
    LogisticRegression(random_state=42, max_iter=1000)
)
lr_model.fit(X_train_scaled, y_train)

# Hard predictions feed the label-based metrics; probabilities feed the AUC.
y_pred_lr = lr_model.predict(X_test_scaled)
y_pred_proba_lr = lr_model.predict_proba(X_test_scaled)

lr_metrics = calculate_metrics(y_test, y_pred_lr, y_pred_proba_lr)
print(f"Accuracy: {lr_metrics['accuracy']:.4f}")
print(f"AUC: {lr_metrics['auc']:.4f}")

joblib.dump(lr_model, os.path.join(saved_models_dir, 'logistic_regression.pkl'))
print("Model saved!")


## 6. Train Decision Tree

In [None]:
# Decision Tree: fitted on the unscaled features, depth capped at 10.
print("Training Decision Tree...")
dt_model = DecisionTreeClassifier(max_depth=10, random_state=42).fit(
    X_train, y_train
)

# Class probabilities (for AUC) and hard labels (for the other metrics).
y_pred_proba_dt = dt_model.predict_proba(X_test)
y_pred_dt = dt_model.predict(X_test)

dt_metrics = calculate_metrics(y_test, y_pred_dt, y_pred_proba_dt)
print(f"Accuracy: {dt_metrics['accuracy']:.4f}")
print(f"AUC: {dt_metrics['auc']:.4f}")

# Persist the fitted tree alongside the other models.
joblib.dump(dt_model, os.path.join(saved_models_dir, 'decision_tree.pkl'))
print("Model saved!")


## 7. Train K-Nearest Neighbors


In [None]:
# k-Nearest Neighbors (k=5): fitted on the standardized features.
print("Training KNN...")
knn_model = KNeighborsClassifier(n_neighbors=5).fit(X_train_scaled, y_train)

# Class probabilities (for AUC) and hard labels (for the other metrics).
y_pred_proba_knn = knn_model.predict_proba(X_test_scaled)
y_pred_knn = knn_model.predict(X_test_scaled)

knn_metrics = calculate_metrics(y_test, y_pred_knn, y_pred_proba_knn)
print(f"Accuracy: {knn_metrics['accuracy']:.4f}")
print(f"AUC: {knn_metrics['auc']:.4f}")

# Persist the fitted model alongside the other models.
joblib.dump(knn_model, os.path.join(saved_models_dir, 'knn.pkl'))
print("Model saved!")


## 8. Train Naive Bayes

In [None]:
# Gaussian Naive Bayes: fitted on the standardized features.
print("Training Naive Bayes...")
nb_model = GaussianNB().fit(X_train_scaled, y_train)

# Class probabilities (for AUC) and hard labels (for the other metrics).
y_pred_proba_nb = nb_model.predict_proba(X_test_scaled)
y_pred_nb = nb_model.predict(X_test_scaled)

nb_metrics = calculate_metrics(y_test, y_pred_nb, y_pred_proba_nb)
print(f"Accuracy: {nb_metrics['accuracy']:.4f}")
print(f"AUC: {nb_metrics['auc']:.4f}")

# Persist the fitted model alongside the other models.
joblib.dump(nb_model, os.path.join(saved_models_dir, 'naive_bayes.pkl'))
print("Model saved!")


## 9. Train Random Forest

In [None]:
# Random Forest: 100 trees, depth capped at 10, fitted on the unscaled features.
print("Training Random Forest...")
rf_model = RandomForestClassifier(
    max_depth=10,
    n_estimators=100,
    random_state=42,
).fit(X_train, y_train)

# Class probabilities (for AUC) and hard labels (for the other metrics).
y_pred_proba_rf = rf_model.predict_proba(X_test)
y_pred_rf = rf_model.predict(X_test)

rf_metrics = calculate_metrics(y_test, y_pred_rf, y_pred_proba_rf)
print(f"Accuracy: {rf_metrics['accuracy']:.4f}")
print(f"AUC: {rf_metrics['auc']:.4f}")

# Persist the fitted forest alongside the other models.
joblib.dump(rf_model, os.path.join(saved_models_dir, 'random_forest.pkl'))
print("Model saved!")


## 10. Train XGBoost



In [None]:
# XGBoost: library defaults apart from the seed and evaluation metric,
# fitted on the unscaled features.
# NOTE(review): 'logloss' is the binary log-loss metric name; if the target
# has more than two classes, 'mlogloss' may be what is intended - confirm.
print("Training XGBoost...")
xgb_model = XGBClassifier(eval_metric='logloss', random_state=42).fit(
    X_train, y_train
)

# Class probabilities (for AUC) and hard labels (for the other metrics).
y_pred_proba_xgb = xgb_model.predict_proba(X_test)
y_pred_xgb = xgb_model.predict(X_test)

xgb_metrics = calculate_metrics(y_test, y_pred_xgb, y_pred_proba_xgb)
print(f"Accuracy: {xgb_metrics['accuracy']:.4f}")
print(f"AUC: {xgb_metrics['auc']:.4f}")

# Persist the fitted booster alongside the other models.
joblib.dump(xgb_model, os.path.join(saved_models_dir, 'xgboost.pkl'))
print("Model saved!")


## 11. Results Summary


In [None]:
# Gather the per-model metric dictionaries under their display names.
results = {
    'Logistic Regression': lr_metrics,
    'Decision Tree': dt_metrics,
    'kNN': knn_metrics,
    'Naive Bayes': nb_metrics,
    'Random Forest': rf_metrics,
    'XGBoost': xgb_metrics,
}

# One row per model, one column per metric. Column labels are assigned by
# position, so they rely on the key order of the metric dictionaries.
results_df = (
    pd.DataFrame(results)
    .T
    .set_axis(['Accuracy', 'AUC', 'Precision', 'Recall', 'F1', 'MCC'], axis=1)
    .rename_axis('Model')
)

print("="*60)
print("MODEL PERFORMANCE SUMMARY")
print("="*60)
print(results_df.round(4))

# The "best" model is the one with the highest test accuracy.
best_model = results_df['Accuracy'].idxmax()
best_accuracy = results_df.loc[best_model, 'Accuracy']
print(f"\nBest Model: {best_model} (Accuracy: {best_accuracy:.4f})")


## 12. Save Results to Excel


In [None]:
# Write the metrics table (model names kept as the index column) to an Excel
# workbook in the project root.
results_path = os.path.join(project_root, 'ML_Assignment_2_Results.xlsx')
results_df.to_excel(results_path)
print(f"Results saved to {results_path}")
print("\nTraining complete! All models saved to saved_models/ directory.")
