In [8]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import (
    accuracy_score, precision_score, recall_score, f1_score,
    confusion_matrix, classification_report
)
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier

import warnings
# Silence every warning category for the rest of the session so library
# notices do not clutter the printed results. This also hides warnings that
# may matter while debugging (e.g. solver or deprecation notices).
warnings.simplefilter("ignore")

# Load the semicolon-delimited dataset. The path is relative to the working
# directory, so the CSV must be at ../data/student-mat.csv.
df = pd.read_csv("../data/student-mat.csv", sep=";")


# Binary target: 1 when G3 is at least 10, otherwise 0. A vectorised
# comparison replaces the per-row Python lambda and yields the same values.
df["pass_fail"] = (df["G3"] >= 10).astype(int)

# Columns with object dtype are the ones treated as categorical.
categorical = df.select_dtypes(include=["object"]).columns

# One-hot encode them; drop_first=True removes one indicator per column so a
# k-level column becomes k-1 indicator columns.
df_encoded = pd.get_dummies(df, columns=categorical, drop_first=True)

# Features are every column except G3 and the target derived from it, since
# either one would hand the label to the model.
# NOTE: G1 and G2 are kept as predictors here (only G3 and pass_fail are
# removed). Add them to the list below if the model should not see the other
# grade columns.
X = df_encoded.drop(columns=["G3", "pass_fail"])
y = df_encoded["pass_fail"]


# Hold out 20% of the rows for evaluation. Stratifying on y keeps the
# pass/fail proportions approximately equal in both partitions, and the fixed
# seed makes the split reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    stratify=y,
    random_state=42,
)


# Fit the scaler on the training rows only, then apply that same
# transformation to both partitions so no test-set statistics enter the fit.
scaler = StandardScaler().fit(X_train)
X_train_scaled = scaler.transform(X_train)
X_test_scaled = scaler.transform(X_test)


# Fit the logistic-regression model on the standardised training features and
# predict the held-out rows.
log_reg = LogisticRegression(max_iter=200).fit(X_train_scaled, y_train)
y_pred_lr = log_reg.predict(X_test_scaled)

# Score the held-out predictions. The four scalar metrics share the
# (y_true, y_pred) call signature, so they are computed in one pass.
lr_accuracy, lr_precision, lr_recall, lr_f1 = (
    score(y_test, y_pred_lr)
    for score in (accuracy_score, precision_score, recall_score, f1_score)
)
lr_cm = confusion_matrix(y_test, y_pred_lr)

# Print the summary in a fixed order: scalars, confusion matrix, full report.
print("=== LOGISTIC REGRESSION RESULTS ===")
for _label, _score in (
    ("Accuracy:", lr_accuracy),
    ("Precision:", lr_precision),
    ("Recall:", lr_recall),
    ("F1 Score:", lr_f1),
):
    print(_label, _score)
print("\nConfusion Matrix:\n", lr_cm)
print("\nClassification Report:\n", classification_report(y_test, y_pred_lr))


# Fit a 300-tree random forest on the unscaled training features (the scaled
# copies are not used for this model) and predict the held-out rows.
rf = RandomForestClassifier(n_estimators=300, random_state=42).fit(X_train, y_train)
y_pred_rf = rf.predict(X_test)

# Score the held-out predictions with the same four scalar metrics used for
# the logistic-regression model, plus the confusion matrix.
rf_accuracy, rf_precision, rf_recall, rf_f1 = (
    score(y_test, y_pred_rf)
    for score in (accuracy_score, precision_score, recall_score, f1_score)
)
rf_cm = confusion_matrix(y_test, y_pred_rf)

# Print the summary in a fixed order: scalars, confusion matrix, full report.
print("\n=== RANDOM FOREST RESULTS ===")
for _label, _score in (
    ("Accuracy:", rf_accuracy),
    ("Precision:", rf_precision),
    ("Recall:", rf_recall),
    ("F1 Score:", rf_f1),
):
    print(_label, _score)
print("\nConfusion Matrix:\n", rf_cm)
print("\nClassification Report:\n", classification_report(y_test, y_pred_rf))


# Pair each feature column with the fitted forest's importance score, then
# order the table from most to least important.
importances = pd.DataFrame(
    {"feature": X.columns, "importance": rf.feature_importances_}
)
importances = importances.sort_values("importance", ascending=False)

print("\n=== TOP 15 IMPORTANT FEATURES ===")
print(importances.head(15))



# The figures are written to ../graph. Create the folder before the first
# save so savefig does not raise FileNotFoundError when it is missing.
from pathlib import Path

Path("../graph").mkdir(parents=True, exist_ok=True)

# Grade Distribution: histogram of G3 with a KDE curve overlaid.
fig, ax = plt.subplots(figsize=(6, 4))
sns.histplot(df["G3"], kde=True, ax=ax)
ax.set_title("Final Grade Distribution")
ax.set_xlabel("G3 (Final Grade)")
ax.set_ylabel("Count")
fig.savefig("../graph/grade_dist.png", dpi=300)
# Close the figure so it is released and not rendered again later.
plt.close(fig)

# Heatmap: pairwise correlations between all columns of the encoded frame
# (features, G3 and the pass/fail target), drawn without cell annotations.
fig, ax = plt.subplots(figsize=(12, 10))
sns.heatmap(df_encoded.corr(), cmap="coolwarm", ax=ax)
ax.set_title("Correlation Heatmap")
fig.savefig("../graph/heatmap.png", dpi=300)
plt.close(fig)

# Feature Importance Plot: horizontal bars for the 15 highest-ranked rows of
# the already-sorted importances table.
plt.figure(figsize=(10, 6))
plt.barh(importances["feature"][:15], importances["importance"][:15])
# The title previously contained a mis-decoded en dash; it is written as an
# escape so re-saving the file in another encoding cannot garble it again.
plt.title("Top 15 Feature Importances \u2013 Random Forest")
# barh draws the first row at the bottom; flip the axis so the most
# important feature is at the top.
plt.gca().invert_yaxis()
plt.savefig("../graph/feature_importance.png", dpi=300)
plt.close()

# Confusion Matrix LR: annotated heatmap of the logistic-regression counts;
# fmt="d" prints each cell as an integer.
fig, ax = plt.subplots(figsize=(5, 4))
sns.heatmap(lr_cm, annot=True, fmt="d", cmap="Blues", ax=ax)
ax.set_title("Logistic Regression Confusion Matrix")
ax.set_xlabel("Predicted")
ax.set_ylabel("Actual")
fig.savefig("../graph/confusion_lr.png", dpi=300)
plt.close(fig)

# Confusion Matrix RF: annotated heatmap of the random-forest counts;
# fmt="d" prints each cell as an integer.
fig, ax = plt.subplots(figsize=(5, 4))
sns.heatmap(rf_cm, annot=True, fmt="d", cmap="Greens", ax=ax)
ax.set_title("Random Forest Confusion Matrix")
ax.set_xlabel("Predicted")
ax.set_ylabel("Actual")
fig.savefig("../graph/confusion_rf.png", dpi=300)
plt.close(fig)

# Final status line, printed once the last figure has been written.
print("\nAll figures saved in the /graph folder.")


=== LOGISTIC REGRESSION RESULTS ===
Accuracy: 0.8481012658227848
Precision: 0.9361702127659575
Recall: 0.8301886792452831
F1 Score: 0.88

Confusion Matrix:
 [[23  3]
 [ 9 44]]

Classification Report:
               precision    recall  f1-score   support

           0       0.72      0.88      0.79        26
           1       0.94      0.83      0.88        53

    accuracy                           0.85        79
   macro avg       0.83      0.86      0.84        79
weighted avg       0.86      0.85      0.85        79


=== RANDOM FOREST RESULTS ===
Accuracy: 0.8734177215189873
Precision: 0.9574468085106383
Recall: 0.8490566037735849
F1 Score: 0.9

Confusion Matrix:
 [[24  2]
 [ 8 45]]

Classification Report:
               precision    recall  f1-score   support

           0       0.75      0.92      0.83        26
           1       0.96      0.85      0.90        53

    accuracy                           0.87        79
   macro avg       0.85      0.89      0.86        79
weigh