# 🎓 Student Performance Prediction (Enhanced Visuals)
Predict student math scores using multiple regression models with evaluation and visualization.
Includes model comparison, residual analysis, and feature importance plots.

In [None]:
import os
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import OrdinalEncoder, OneHotEncoder
from sklearn.metrics import mean_squared_error, mean_absolute_error
from sklearn.linear_model import LinearRegression, Ridge, Lasso
from sklearn.neighbors import KNeighborsRegressor
from sklearn.ensemble import RandomForestRegressor
from sklearn import tree
import warnings
warnings.filterwarnings('ignore')

In [None]:
# Build the dataset location relative to the working directory:
# one level up, inside the "data" folder.
csv_parts = ('..', 'data', 'c9a4f67d-125f-415a-b439-52a37bbd7c2f.csv')
data_path = os.path.join(*csv_parts)

# Read the CSV into a DataFrame and preview its first rows.
df = pd.read_csv(data_path)
df.head()

In [None]:
# Columns that are mapped to integer codes. OrdinalEncoder assigns each value
# the index of its position in the matching `categories` list, so the lists
# below must be written from lowest to highest.
ordinal_cols = ['parental level of education', 'lunch', 'test preparation course']
categories = [
    # Education levels in ascending order of attainment. The previous ordering
    # placed 'some college' below 'some high school' and 'high school', which
    # made the encoded rank non-monotonic in actual education level.
    ['some high school', 'high school', 'some college',
     "associate's degree", "bachelor's degree", "master's degree"],
    ['free/reduced', 'standard'],
    ['none', 'completed'],
]
enc_ord = OrdinalEncoder(categories=categories)
df[ordinal_cols] = enc_ord.fit_transform(df[ordinal_cols])

# One-hot encode the remaining categorical columns into dense indicator
# columns, append them to the frame, then drop the original text columns.
enc_oh = OneHotEncoder(sparse_output=False)
encoded = enc_oh.fit_transform(df[['gender', 'race/ethnicity']])
df[enc_oh.get_feature_names_out()] = encoded
df.drop(columns=['gender','race/ethnicity'], inplace=True)

In [None]:
# Target is the math score; all three score columns are removed from the
# feature matrix so no exam score is used as a predictor.
score_cols = ['math score', 'reading score', 'writing score']
y = df['math score']
X = df.drop(columns=score_cols)

# Hold out 20% of the rows for testing, with a fixed seed for repeatability.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

In [None]:
# Candidate regressors, keyed by the display name used in the results table.
# Every estimator that accepts a seed is given one so that repeated runs of
# this cell produce the same scores and the same ranking.
models = {
    'Linear Regression': LinearRegression(),
    'Ridge Regression': Ridge(alpha=0.1),
    'Lasso Regression': Lasso(alpha=0.1),
    # Seeded: an unseeded tree can break ties between equally good splits
    # differently on each run, which makes its test error fluctuate.
    'Decision Tree': tree.DecisionTreeRegressor(random_state=42),
    'Random Forest': RandomForestRegressor(random_state=42),
    'KNN': KNeighborsRegressor(n_neighbors=5)
}

# results: model name -> {'MSE': ..., 'MAE': ...} measured on the test split.
# predictions: model name -> array of test-set predictions.
results = {}
predictions = {}
for name, model in models.items():
    model.fit(X_train, y_train)
    preds = model.predict(X_test)
    predictions[name] = preds
    mse = mean_squared_error(y_test, preds)
    mae = mean_absolute_error(y_test, preds)
    results[name] = {'MSE': mse, 'MAE': mae}

# One row per model, best (lowest MSE) first; displayed as the cell output.
results_df = pd.DataFrame(results).T.sort_values('MSE')
results_df

In [None]:
# Grouped bar chart of both error metrics for every model.
ax = results_df.plot(kind='bar', figsize=(10,5))
ax.set_title('Model Comparison: MSE & MAE')
ax.set_ylabel('Error')
# Tilt the model names so they do not overlap.
plt.xticks(rotation=45)
# Horizontal grid lines only, to help read bar heights.
ax.grid(axis='y')
plt.tight_layout()
plt.show()

In [None]:
# Fit a fresh linear regression and predict on the held-out rows.
model = LinearRegression()
model.fit(X_train, y_train)
y_pred = model.predict(X_test)

# Scatter of actual vs predicted scores.
fig, ax = plt.subplots(figsize=(8,6))
ax.scatter(y_test, y_pred, alpha=0.7)

# Dashed red diagonal spanning the full range of the target: points on this
# line are perfect predictions.
score_range = [y.min(), y.max()]
ax.plot(score_range, score_range, '--r')

ax.set_xlabel('Actual')
ax.set_ylabel('Predicted')
ax.set_title('Predicted vs Actual (Linear Regression)')
ax.grid(True)
plt.tight_layout()
plt.show()

In [None]:
# Residual = actual minus predicted, for each test row.
residuals = y_test - y_pred

# Histogram of the residuals with a kernel density estimate overlaid.
fig, ax = plt.subplots(figsize=(8,6))
sns.histplot(residuals, kde=True, ax=ax)
ax.set_title('Residual Distribution')
ax.set_xlabel('Residuals')
ax.grid(True)
plt.tight_layout()
plt.show()

In [None]:
# Train a seeded random forest and read its per-feature importance scores.
rf = RandomForestRegressor(random_state=42)
rf.fit(X_train, y_train)

# Label each importance with its feature column name.
importances = pd.Series(rf.feature_importances_, index=X.columns)

# Horizontal bars sorted ascending, so the most important feature is on top.
ranked = importances.sort_values()
ax = ranked.plot(kind='barh', figsize=(8,6))
ax.set_title('Feature Importance (Random Forest)')
ax.set_xlabel('Importance Score')
ax.grid(True)
plt.tight_layout()
plt.show()