In [1]:
import pandas as pd
import numpy as np

# Load the student records. The path is relative, so it is resolved against
# the kernel's current working directory.
STUDENT_DATA_CSV = "../data/student-data.csv"
df = pd.read_csv(STUDENT_DATA_CSV)

In [3]:
# Build the modelling inputs.
# Target: binary "pass" flag stored on the frame, 1 when G3 is at least 10, else 0.
passed = df["G3"] >= 10
df["pass"] = passed.astype(int)

# Predictor columns, kept in a list so later cells can reuse the same order.
features = ["studytime", "absences", "G1", "G2"]

# Plain NumPy arrays: feature matrix and label vector.
X = df[features].to_numpy()
y = df["pass"].to_numpy()


In [4]:
# Hold out a test set, then standardise the features
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler

# 20% of the rows go to the test set; the fixed seed keeps the partition
# the same on every run.
split = train_test_split(X, y, test_size=0.2, random_state=42)
X_train, X_test, y_train, y_test = split

# The scaler is fitted on the training rows only and the same transform is
# then applied to both partitions, so no test-set statistics are used.
scaler = StandardScaler().fit(X_train)
X_train_scaled = scaler.transform(X_train)
X_test_scaled = scaler.transform(X_test)


In [5]:
# Logistic regression on the standardised features

from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, confusion_matrix

# Default hyperparameters; fit() returns the estimator, so it can be chained.
log_reg = LogisticRegression().fit(X_train_scaled, y_train)

# Predictions for the held-out rows (scaled with the same scaler as training).
y_pred_lr = log_reg.predict(X_test_scaled)

# Cell output: (test accuracy, confusion matrix) as a tuple.
(
    accuracy_score(y_test, y_pred_lr),
    confusion_matrix(y_test, y_pred_lr),
)

(0.8987341772151899,
 array([[25,  2],
        [ 6, 46]]))

In [6]:
# Decision tree trained on the unscaled features

from sklearn.tree import DecisionTreeClassifier

# The fixed seed keeps the fitted tree the same on every run.
dt = DecisionTreeClassifier(random_state=42).fit(X_train, y_train)

# Predictions for the held-out rows, also unscaled.
y_pred_dt = dt.predict(X_test)

# Cell output: (test accuracy, confusion matrix) as a tuple.
(
    accuracy_score(y_test, y_pred_dt),
    confusion_matrix(y_test, y_pred_dt),
)

(0.8987341772151899,
 array([[23,  4],
        [ 4, 48]]))

In [7]:
# Random forest trained on the unscaled features

from sklearn.ensemble import RandomForestClassifier

# 100 trees with a fixed seed; fit() returns the estimator, so it can be chained.
rf = RandomForestClassifier(n_estimators=100, random_state=42).fit(X_train, y_train)

# Predictions for the held-out rows, also unscaled.
y_pred_rf = rf.predict(X_test)

# Cell output: (test accuracy, confusion matrix) as a tuple.
(
    accuracy_score(y_test, y_pred_rf),
    confusion_matrix(y_test, y_pred_rf),
)

(0.8860759493670886,
 array([[23,  4],
        [ 5, 47]]))

In [8]:
# Feature importance

import pandas as pd

# Pair each predictor name with the random forest's importance score,
# then order the rows from most to least important.
importance_unsorted = pd.DataFrame(
    {"Feature": features, "Importance": rf.feature_importances_}
)
importance = importance_unsorted.sort_values("Importance", ascending=False)

importance

Unnamed: 0,Feature,Importance
3,G2,0.53428
2,G1,0.299923
1,absences,0.121686
0,studytime,0.044111
