In [2]:
# 🎓 Student Performance Prediction with Regression and Classification

# This notebook explores simple, multiple, and polynomial regression, as well as logistic regression, to analyze and predict student performance from the `student-mat.csv` dataset.


In [3]:
import pandas as pd
from sklearn.linear_model import LinearRegression, LogisticRegression
from sklearn.metrics import confusion_matrix, accuracy_score
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import OrdinalEncoder, PolynomialFeatures, StandardScaler

# Read the semicolon-separated student dataset and preview the first rows.
df = pd.read_csv('student-mat.csv', sep=';')
df.head()


Unnamed: 0,school,sex,age,address,famsize,Pstatus,Medu,Fedu,Mjob,Fjob,...,famrel,freetime,goout,Dalc,Walc,health,absences,G1,G2,G3
0,GP,F,18,U,GT3,A,4,4,at_home,teacher,...,4,3,4,1,1,3,6,5,6,6
1,GP,F,17,U,GT3,T,1,1,at_home,other,...,5,3,3,1,1,3,4,5,5,6
2,GP,F,15,U,LE3,T,1,1,at_home,other,...,4,3,2,2,3,3,10,7,8,10
3,GP,F,15,U,GT3,T,4,2,health,services,...,3,2,2,1,1,5,2,15,14,15
4,GP,F,16,U,GT3,T,3,3,other,other,...,4,3,2,1,2,5,4,6,10,10


In [4]:
# SIMPLE LINEAR REGRESSION: model the final grade G3 from the single feature G2

y = df['G3']
X = df[['G2']]

# Hold out 20% of the rows for evaluation; the fixed seed keeps the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=1
)

model = LinearRegression().fit(X_train, y_train)

y_pred_train = model.predict(X_train)
y_pred_test = model.predict(X_test)

# Squared residuals per split; their totals are the SSE values reported below.
sq_err_train = (y_train - y_pred_train) ** 2
sq_err_test = (y_test - y_pred_test) ** 2

print("🔹 Simple Linear Regression")
print(f'SSE (train): {sum(sq_err_train)}')
print(f'SSE (test): {sum(sq_err_test)}')
print(f'R² (train): {model.score(X_train, y_train)}')
print(f'R² (test): {model.score(X_test, y_test)}')


🔹 Simple Linear Regression
SSE (train): 1263.5902872303611
SSE (test): 235.5809636772549
R² (train): 0.8242442660656965
R² (test): 0.7818848166971013


In [5]:
# MULTIPLE LINEAR REGRESSION: Use more features

# Encode Pstatus as an integer, following the category order given to the
# encoder ('T' -> 0, 'A' -> 1).
encoder = OrdinalEncoder(dtype=int, categories=[['T', 'A']])
pstatus_encoded = encoder.fit_transform(df[['Pstatus']]).ravel()

# Put the encoded values into the feature frame only. Writing them back into
# `df` would replace the raw 'T'/'A' labels, and a second run of this cell would
# then hand integers to an encoder that only knows 'T' and 'A' and fail.
X = df[['G1', 'G2', 'famrel', 'studytime', 'Pstatus']].assign(Pstatus=pstatus_encoded)
y = df['G3']

# Hold out 20% of the rows; the fixed seed keeps the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=1, test_size=0.2)

model = LinearRegression()
model.fit(X_train, y_train)

y_pred_train = model.predict(X_train)
y_pred_test = model.predict(X_test)

# SSE is the sum of squared residuals; R² comes from the estimator's own score().
print("🔹 Multiple Linear Regression")
print(f'SSE (train): {sum((y_train - y_pred_train)**2)}')
print(f'SSE (test): {sum((y_test - y_pred_test)**2)}')
print(f'R² (train): {model.score(X_train, y_train)}')
print(f'R² (test): {model.score(X_test, y_test)}')


🔹 Multiple Linear Regression
SSE (train): 1189.6711382014437
SSE (test): 234.6145571181106
R² (train): 0.8345258537137408
R² (test): 0.7827795746626968


In [6]:
# POLYNOMIAL REGRESSION: Use scaled G1 and add polynomial terms

X = df[['G1']]
y = df['G3']

# Split BEFORE scaling so the held-out rows never influence preprocessing.
X_train_raw, X_test_raw, y_train, y_test = train_test_split(
    X, y, random_state=1, test_size=0.2
)

# Learn the mean and standard deviation from the training rows only, then
# apply that same transformation to the test rows.
scaler = StandardScaler()
X_train = scaler.fit_transform(X_train_raw)
X_test = scaler.transform(X_test_raw)

# Expand the scaled feature to [x, x^2]; the bias column is omitted because
# LinearRegression fits its own intercept.
transformer = PolynomialFeatures(degree=2, include_bias=False)
X_train_trans = transformer.fit_transform(X_train)
X_test_trans = transformer.transform(X_test)

model = LinearRegression()
model.fit(X_train_trans, y_train)

y_pred_train = model.predict(X_train_trans)
y_pred_test = model.predict(X_test_trans)

# NOTE: a quadratic fit with an intercept is unaffected by shifting or rescaling
# its single input, so these metrics should match the scale-then-split version
# up to floating-point rounding; the change is about not leaking test data.
print("🔹 Polynomial Regression (degree=2)")
print(f'SSE (train): {sum((y_train - y_pred_train)**2)}')
print(f'SSE (test): {sum((y_test - y_pred_test)**2)}')
print(f'R² (train): {model.score(X_train_trans, y_train)}')
print(f'R² (test): {model.score(X_test_trans, y_test)}')


🔹 Polynomial Regression (degree=2)
SSE (train): 2517.9890608322144
SSE (test): 444.62209176965655
R² (train): 0.649767001299818
R² (test): 0.5883418272296503


In [7]:
# CLASSIFICATION: Logistic Regression to predict pass/fail

# Binary target: 1 when the final grade G3 is at least 10, otherwise 0.
df['passed'] = df['G3'].ge(10).astype(int)

y = df['passed']
X = df[['G1', 'G2']]

# Hold out 20% of the rows; the fixed seed keeps the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=1
)

# Model with default C
model = LogisticRegression(random_state=1, solver='liblinear').fit(X_train, y_train)
y_pred_test = model.predict(X_test)

# Evaluate on the held-out rows only.
test_confusion = confusion_matrix(y_test, y_pred_test)
test_accuracy = accuracy_score(y_test, y_pred_test)

print("🔹 Logistic Regression (C=1.0)")
print(f'Confusion matrix:\n{test_confusion}')
print(f'Accuracy: {test_accuracy}')


🔹 Logistic Regression (C=1.0)
Confusion matrix:
[[16  8]
 [ 1 54]]
Accuracy: 0.8860759493670886


In [8]:
# Logistic Regression with C = 10 (less regularization)
# Uses the X_train / X_test / y_train / y_test split created in the previous cell.

model = LogisticRegression(C=10.0, random_state=1, solver='liblinear').fit(X_train, y_train)
y_pred_test = model.predict(X_test)

# Evaluate on the held-out rows only.
test_confusion = confusion_matrix(y_test, y_pred_test)
test_accuracy = accuracy_score(y_test, y_pred_test)

print("🔹 Logistic Regression (C=10.0)")
print(f'Confusion matrix:\n{test_confusion}')
print(f'Accuracy: {test_accuracy}')


🔹 Logistic Regression (C=10.0)
Confusion matrix:
[[21  3]
 [ 4 51]]
Accuracy: 0.9113924050632911


In [None]:
## ✅ Conclusions

- **G2** is a strong predictor of final grade (`G3`) even in a simple linear model.
- Adding more features improves regression performance slightly.
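- Polynomial regression on `G1` alone scores lower (test R² ≈ 0.59) than the models that use `G2`; because the inputs differ, the drop mainly reflects the weaker predictor rather than the polynomial terms, and the modest train/test gap (0.65 vs 0.59) is not strong evidence of overfitting.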
- Logistic regression classifies students as pass/fail with high accuracy, and tuning `C` improves results.
