# Student Performance Prediction
Python | pandas | scikit-learn | matplotlib | seaborn

This notebook is a ready-to-run project for predicting student performance using the UCI Student Performance dataset.

Features:
- Regression: predict final grade (G3)
- Classification: predict pass/fail (G3 >= 10)
- Models: Linear Regression, Random Forest, Logistic Regression, Random Forest Classifier
- Save best models with joblib


## Table of contents
1. Setup & imports
2. Load dataset
3. Quick inspection
4. EDA (extra visuals)
5. Preprocessing
6. Train/test split & scaling
7. Regression models
8. Classification models
9. Save models
10. Conclusions


In [None]:
# Notebook-wide settings; every later cell reads these names.
RANDOM_STATE = 42   # seed passed to sampling, splitting and the models
TEST_SIZE = 0.2     # fraction of rows held out as the test set
USE_G1_G2 = True    # set to False to exclude previous grades (G1, G2)


In [None]:
# Uncomment to install packages into the running kernel's environment if needed
# %pip install pandas numpy scikit-learn matplotlib seaborn joblib requests


In [None]:
import os
import io
import zipfile
import requests
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LinearRegression, LogisticRegression
from sklearn.ensemble import RandomForestRegressor, RandomForestClassifier
from sklearn.metrics import (
    mean_absolute_error, mean_squared_error, r2_score,
    accuracy_score, classification_report, confusion_matrix
)
import joblib

sns.set(style='whitegrid', palette='muted', font_scale=1.1)
%matplotlib inline


In [None]:
DATA_URL = "https://archive.ics.uci.edu/ml/machine-learning-databases/00320/student.zip"

def load_student_data(timeout=30):
    """Load ``student-mat.csv`` as a DataFrame, preferring a fresh download.

    The zip archive at ``DATA_URL`` is downloaded and ``student-mat.csv`` is
    read from it (semicolon-separated). If any step of that attempt fails,
    a ``student-mat.csv`` in the current working directory is used instead.

    Parameters
    ----------
    timeout : float, optional
        Seconds to wait for the HTTP request before giving up, so an
        unreachable server cannot hang the notebook indefinitely.

    Returns
    -------
    pandas.DataFrame

    Raises
    ------
    FileNotFoundError
        If the download fails and no local ``student-mat.csv`` exists.
    """
    try:
        r = requests.get(DATA_URL, timeout=timeout)
        r.raise_for_status()
        # Context managers make sure the archive and member handles are closed.
        with zipfile.ZipFile(io.BytesIO(r.content)) as archive:
            with archive.open('student-mat.csv') as member:
                frame = pd.read_csv(member, sep=';')
        print("Loaded student-mat.csv from UCI repository.")
        return frame
    except Exception as e:
        # Deliberately broad: any failure (network, bad zip, missing member,
        # parse error) should fall through to the local copy.
        print("Download failed:", e)
        if os.path.exists('student-mat.csv'):
            print("Loading local student-mat.csv")
            return pd.read_csv('student-mat.csv', sep=';')
        # Chain the original failure so the root cause stays in the traceback.
        raise FileNotFoundError(
            "Dataset not found. Please download student-mat.csv to working directory."
        ) from e

df = load_student_data()
df.head()


In [None]:
# Structural overview of the loaded frame: size, column names, dtypes,
# summary statistics and per-column null counts.
print("Shape:", df.shape)
print("\nColumns:\n", list(df.columns))

print("\nInfo:")
df.info()

print("\nDescription (numeric columns):")
display(df.describe().transpose())

print("\nMissing values per column:")
print(df.isna().sum())


In [None]:
# Histogram of the final grade with a KDE overlay
fig, ax = plt.subplots(figsize=(8, 4))
sns.histplot(data=df, x='G3', bins=15, kde=True, ax=ax)
ax.set_title("Distribution of final grade (G3)")
ax.set_xlabel("G3")
plt.show()

# Binary pass flag (1 when G3 >= 10) stored on df, then its class counts
df['pass'] = df['G3'].ge(10).astype(int)
fig, ax = plt.subplots(figsize=(5, 3))
sns.countplot(data=df, x='pass', ax=ax)
ax.set_xticks([0, 1])
ax.set_xticklabels(['Fail', 'Pass'])
ax.set_title("Pass vs Fail")
plt.show()

# Correlation heatmap (numeric columns only)
# The raw frame still holds string columns at this point; DataFrame.corr()
# on pandas >= 2.0 raises on non-numeric data, so select numeric columns first.
plt.figure(figsize=(12,10))
corr = df.select_dtypes(include='number').corr()
sns.heatmap(corr, annot=True, fmt=".2f", cmap='coolwarm', square=True)
plt.title("Correlation matrix")
plt.show()

# Final grade spread for each study-time level
fig, ax = plt.subplots(figsize=(8, 5))
sns.boxplot(data=df, x='studytime', y='G3', ax=ax)
ax.set_title("Study time vs final grade")
plt.show()


In [None]:
# Pairwise scatter/histogram grid on a few numeric columns; at most 200 rows
# are drawn (seeded) to keep rendering quick.
cols = ['G1', 'G2', 'G3', 'studytime', 'absences']
n_pair_rows = min(200, len(df))
sample = df.loc[:, cols].sample(n=n_pair_rows, random_state=RANDOM_STATE)
sns.pairplot(sample)
plt.suptitle('Pairplot (sample)')
plt.show()


In [None]:
# Preprocessing
# Work on a copy so the raw frame stays available to earlier cells.
data = df.copy()

# yes/no columns become 1/0; any other value would map to NaN.
binary_cols = ['schoolsup','famsup','paid','activities','nursery','higher','internet','romantic']
yes_no = {'yes': 1, 'no': 0}
for col in binary_cols:
    if col in data.columns:
        data[col] = data[col].map(yes_no)

# One-hot encode the remaining non-numeric columns, dropping the first level of each.
data = pd.get_dummies(data, drop_first=True)
print("Processed shape:", data.shape)

# Prepare features and targets
target = 'G3'
if not USE_G1_G2:
    # errors='ignore' drops whichever of G1/G2 is present instead of
    # requiring both, so neither can slip through as a feature.
    data = data.drop(columns=['G1', 'G2'], errors='ignore')

# 'pass' only exists if the EDA cell added it to df; tolerate its absence so
# this cell does not depend on that cell having been run.
X = data.drop(columns=[target]).drop(columns=['pass'], errors='ignore')
y_reg = data[target]
y_clf = (y_reg >= 10).astype(int)  # pass/fail label: 1 when G3 >= 10
print("Features shape:", X.shape)


In [None]:
# Split features and both targets in ONE call so the regression and
# classification labels are guaranteed to be aligned with the same rows,
# rather than relying on two separate calls using identical arguments.
(
    X_train, X_test,
    y_train_reg, y_test_reg,
    y_train_clf, y_test_clf,
) = train_test_split(
    X, y_reg, y_clf, test_size=TEST_SIZE, random_state=RANDOM_STATE
)
print("Train shape:", X_train.shape, "Test shape:", X_test.shape)

# Fit the scaler on the training rows only, then apply it to the test rows,
# so no test-set statistics enter the scaled features.
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)


In [None]:
def reg_metrics(y_true, y_pred):
    """Return MAE, MSE, RMSE and R2 for the given predictions as a dict."""
    squared_error = mean_squared_error(y_true, y_pred)
    return {
        'MAE': mean_absolute_error(y_true, y_pred),
        'MSE': squared_error,
        'RMSE': np.sqrt(squared_error),  # root of the MSE computed above
        'R2': r2_score(y_true, y_pred),
    }

# Linear regression on the standardized features
lr = LinearRegression().fit(X_train_scaled, y_train_reg)
y_pred_lr = lr.predict(X_test_scaled)

print("Linear Regression metrics:")
reg_metrics(y_test_reg, y_pred_lr)


In [None]:
# Grid-search a random forest regressor (4-fold CV, scored by R2) on the
# unscaled training features, then evaluate the best estimator on the test set.
rfr = RandomForestRegressor(random_state=RANDOM_STATE)
param_grid = {'n_estimators': [100, 200], 'max_depth': [None, 10]}
grid_rfr = GridSearchCV(
    estimator=rfr,
    param_grid=param_grid,
    scoring='r2',
    cv=4,
    n_jobs=-1,
    verbose=1,
)
grid_rfr.fit(X_train, y_train_reg)
print("Best params:", grid_rfr.best_params_)

best_rfr = grid_rfr.best_estimator_
y_pred_rfr = best_rfr.predict(X_test)

print("Random Forest metrics:")
reg_metrics(y_test_reg, y_pred_rfr)


In [None]:
# Predicted vs actual grades; the dashed red diagonal marks perfect predictions
fig, ax = plt.subplots(figsize=(6, 6))
ax.scatter(y_test_reg, y_pred_rfr, alpha=0.6)
ax.plot([0, 20], [0, 20], 'r--')
ax.set_xlabel("Actual G3")
ax.set_ylabel("Predicted G3")
ax.set_title("Predicted vs Actual (Random Forest)")
plt.show()

# Fifteen largest feature importances of the tuned forest
importances = pd.Series(best_rfr.feature_importances_, index=X.columns)
top_imp = importances.sort_values(ascending=False).iloc[:15]
fig, ax = plt.subplots(figsize=(8, 6))
sns.barplot(x=top_imp.values, y=top_imp.index, ax=ax)
ax.set_title("Top 15 feature importances (Random Forest)")
plt.show()


In [None]:
# Logistic regression (liblinear solver) on the standardized features
logreg = LogisticRegression(solver='liblinear', random_state=RANDOM_STATE)
logreg = logreg.fit(X_train_scaled, y_train_clf)  # fit returns the estimator itself
y_pred_log = logreg.predict(X_test_scaled)

acc_log = accuracy_score(y_test_clf, y_pred_log)
report_log = classification_report(y_test_clf, y_pred_log)
print("Logistic Regression Accuracy:", acc_log)
print("Classification report:\n", report_log)


In [None]:
# Grid-search a random forest classifier (4-fold CV, scored by accuracy) on
# the unscaled training features and evaluate the best estimator.
rfc = RandomForestClassifier(random_state=RANDOM_STATE)
param_grid_clf = {'n_estimators': [100, 200], 'max_depth': [None, 10]}
grid_rfc = GridSearchCV(
    estimator=rfc,
    param_grid=param_grid_clf,
    scoring='accuracy',
    cv=4,
    n_jobs=-1,
    verbose=1,
)
grid_rfc.fit(X_train, y_train_clf)

best_rfc = grid_rfc.best_estimator_
y_pred_rfc = best_rfc.predict(X_test)
print("Random Forest Classifier Accuracy:", accuracy_score(y_test_clf, y_pred_rfc))
print("Classification report:\n", classification_report(y_test_clf, y_pred_rfc))

# Confusion matrix heatmap: rows are actual classes, columns are predictions
cm = confusion_matrix(y_test_clf, y_pred_rfc)
class_labels = ['Fail', 'Pass']
fig, ax = plt.subplots(figsize=(5, 4))
sns.heatmap(
    cm, annot=True, fmt='d', cmap='Blues',
    xticklabels=class_labels, yticklabels=class_labels, ax=ax,
)
ax.set_xlabel("Predicted")
ax.set_ylabel("Actual")
ax.set_title("Confusion Matrix (Random Forest)")
plt.show()


In [None]:
# Persist the two tuned forests and the fitted scaler to the working directory
artifacts = {
    'best_rfr_student.pkl': best_rfr,
    'best_rfc_student.pkl': best_rfc,
    'scaler_student.pkl': scaler,
}
for artifact_path, artifact in artifacts.items():
    joblib.dump(artifact, artifact_path)
print('Models saved: best_rfr_student.pkl, best_rfc_student.pkl, scaler_student.pkl')


In [None]:
# Round-trip check: reload the saved regressor and predict the first five
# test rows (unscaled, matching how the forest was trained).
loaded_rf_reg = joblib.load('best_rfr_student.pkl')
sample = X_test.head(5)
preds = loaded_rf_reg.predict(sample)
print('Sample predictions (regression):', preds)
print('Actual G3:', list(y_test_reg.head(5)))


## Conclusions & next steps
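- Including the previous grades G1 and G2 makes final-grade prediction easier but less realistic for early intervention, because those grades are not yet known early on; set `USE_G1_G2 = False` to evaluate that scenario.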
- Random Forest is a strong baseline for both regression and classification.
- Next: SHAP for explainability, handle class imbalance, add more features, deploy with Streamlit/FastAPI.
