In [32]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sb
import statsmodels as sm
from sklearn.model_selection import train_test_split, cross_val_score,GridSearchCV
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score
from sklearn.linear_model import LinearRegression, Ridge
from sklearn.cluster import KMeans
import joblib
sb.set()

In [13]:
df_ProbStat = pd.read_csv('Datasets/ProbStat.csv')
df_ProbStat.head()

Unnamed: 0,Year,Semester,Exam 1,Exam 2,Exam 3,Homework,Attendance,FinalExam,FinalGrade
0,2016,Fall,70,87,70,63.0,99.0,90,84
1,2016,Fall,60,61,45,46.0,30.0,10,47
2,2016,Fall,80,68,30,56.0,91.0,88,73
3,2016,Fall,70,70,55,54.0,98.0,60,70
4,2016,Fall,95,98,93,75.0,100.0,81,94


In [None]:
df_ProbStat = df_ProbStat.rename(columns={'Exam 1':'Participation'})
df_ProbStat['Exam'] = (df_ProbStat['Exam 2'] + df_ProbStat['Exam 3'] + df_ProbStat['FinalExam'])/3
df_ProbStat['Year'] = (df_ProbStat['Year'] - 2015).clip(upper=6)
df_ProbStat.describe(include='all')

Unnamed: 0,Year,Semester,Participation,Exam 2,Exam 3,Homework,Attendance,FinalExam,FinalGrade,Exam
count,738.0,738,738.0,738.0,738.0,738.0,738.0,738.0,738.0,738.0
unique,,2,,,,,,,,
top,,Spring,,,,,,,,
freq,,372,,,,,,,,
mean,2018.628726,,74.253388,70.50542,70.390244,82.386009,86.387127,74.752033,77.49729,71.882565
std,2.05034,,17.43597,22.120077,21.358655,16.939144,16.214947,20.296634,13.522808,16.801626
min,2016.0,,0.0,0.0,0.0,0.0,0.0,0.0,24.0,0.0
25%,2017.0,,62.0,56.0,58.0,77.0,82.0,65.0,70.0,63.083333
50%,2018.0,,75.0,74.0,75.0,88.0,92.0,80.0,79.5,73.333333
75%,2020.0,,87.0,90.0,88.0,93.0,97.0,90.0,89.0,85.0


In [17]:
eliminar = ['Semester','Exam 2','Exam 3','FinalExam']
df = df_ProbStat.drop(columns=eliminar)
df.isnull().mean()

Year             0.0
Participation    0.0
Homework         0.0
Attendance       0.0
FinalGrade       0.0
Exam             0.0
dtype: float64

In [18]:
df.head()

Unnamed: 0,Year,Participation,Homework,Attendance,FinalGrade,Exam
0,2016,70,63.0,99.0,84,82.333333
1,2016,60,46.0,30.0,47,38.666667
2,2016,80,56.0,91.0,73,62.0
3,2016,70,54.0,98.0,70,61.666667
4,2016,95,75.0,100.0,94,90.666667


In [19]:
# 10. Preparar variables predictoras y objetivo
X = df.drop(columns=['FinalGrade'])
y = df['FinalGrade']

# 11. Separar en entrenamiento y prueba
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)


In [20]:
# 12. Entrenar modelo
model = LinearRegression()
model.fit(X_train, y_train)

In [21]:
# 13. Evaluar
y_pred = model.predict(X_test)
mae = mean_absolute_error(y_test, y_pred)
rmse = mean_squared_error(y_test, y_pred)
r2 = r2_score(y_test, y_pred)

# 14. Resultados
print(f'MAE: {mae:.2f}')
print(f'RMSE: {rmse:.2f}')
print(f'R2 Score: {r2:.2f}')

MAE: 1.88
RMSE: 6.13
R2 Score: 0.96


In [24]:
scores = cross_val_score(model, X, y, scoring='r2', cv=10)
print(f'R2 promedio: {scores.mean():.2f} ± {scores.std():.2f}')

R2 promedio: 0.97 ± 0.02


In [25]:
param_grid = {'alpha': [0.01, 0.1, 1, 10, 100]}
ridge = Ridge()
grid = GridSearchCV(ridge, param_grid, cv=5, scoring='r2')
grid.fit(X, y)

print(f"Mejor alpha: {grid.best_params_['alpha']}")
print(f"R2 con Ridge: {grid.best_score_:.2f}")

Mejor alpha: 100
R2 con Ridge: 0.97


In [27]:
lr = LinearRegression()
lr.fit(X_train, y_train)

# Modelo con regularización (Ridge)
ridge = Ridge(alpha=100)
ridge.fit(X_train, y_train)

In [28]:
coef_lr = lr.coef_
coef_ridge = ridge.coef_
features = X.columns


In [29]:
coef_comparison = pd.DataFrame({
    'Feature': features,
    'LinearRegression': coef_lr,
    'Ridge (α=100)': coef_ridge
})
print(coef_comparison)

         Feature  LinearRegression  Ridge (α=100)
0           Year         -0.145885      -0.138972
1  Participation          0.168235       0.168312
2       Homework          0.160890       0.160737
3     Attendance          0.087221       0.087595
4           Exam          0.571708       0.571164


In [33]:
# Guardar el modelo
joblib.dump(ridge, 'modelo_segundo_parcial.pkl')

['modelo_segundo_parcial.pkl']