In [92]:
import pandas as pd
import numpy as np

# Pré-processamento

In [93]:
df = pd.read_csv('student_math_clean.csv')
df

Unnamed: 0,student_id,school,sex,age,address_type,family_size,parent_status,mother_education,father_education,mother_job,...,family_relationship,free_time,social,weekday_alcohol,weekend_alcohol,health,absences,grade_1,grade_2,final_grade
0,1,GP,F,18,Urban,Greater than 3,Apart,higher education,higher education,at_home,...,4,3,4,1,1,3,6,5,6,6
1,2,GP,F,17,Urban,Greater than 3,Living together,primary education (4th grade),primary education (4th grade),at_home,...,5,3,3,1,1,3,4,5,5,6
2,3,GP,F,15,Urban,Less than or equal to 3,Living together,primary education (4th grade),primary education (4th grade),at_home,...,4,3,2,2,3,3,10,7,8,10
3,4,GP,F,15,Urban,Greater than 3,Living together,higher education,5th to 9th grade,health,...,3,2,2,1,1,5,2,15,14,15
4,5,GP,F,16,Urban,Greater than 3,Living together,secondary education,secondary education,other,...,4,3,2,1,2,5,4,6,10,10
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
390,391,MS,M,20,Urban,Less than or equal to 3,Apart,5th to 9th grade,5th to 9th grade,services,...,5,5,4,4,5,4,11,9,9,9
391,392,MS,M,17,Urban,Less than or equal to 3,Living together,secondary education,primary education (4th grade),services,...,2,4,5,3,4,2,3,14,16,16
392,393,MS,M,21,Rural,Greater than 3,Living together,primary education (4th grade),primary education (4th grade),other,...,5,5,3,3,3,3,3,10,8,7
393,394,MS,M,18,Rural,Less than or equal to 3,Living together,secondary education,5th to 9th grade,services,...,4,4,1,3,4,5,0,11,12,10


In [94]:
df = df.drop(
    columns=[
        'student_id',
        'age',
        'sex',
        'address_type',
        'family_size',
        'mother_job',
        'father_job',
        'school_choice_reason',
        'guardian',
        'romantic_relationship',
        # 'grade_1',
        # 'grade_2',
    ]
)

In [95]:
df = pd.get_dummies(df, columns=['school', 'parent_status'])

df['mother_education'] = df['mother_education'].map({'none': 0, 'primary education (4th grade)': 1, 'secondary education': 2, '5th to 9th grade': 3, 'higher education': 4})
df['father_education'] = df['father_education'].map({'none': 0, 'primary education (4th grade)': 1, 'secondary education': 2, '5th to 9th grade': 3, 'higher education': 4})
df['travel_time'] = df['travel_time'].map({'<15 min.': 0, '15 to 30 min.': 1, '30 min. to 1 hour': 2, '>1 hour': 3})
df['study_time'] = df['study_time'].map({'<2 hours': 0, '2 to 5 hours': 1, '5 to 10 hours': 2, '>10 hours': 3})

df['school_support'] = df['school_support'].map({'yes': 1, 'no': 0})
df['family_support'] = df['family_support'].map({'yes': 1, 'no': 0})
df['extra_paid_classes'] = df['extra_paid_classes'].map({'yes': 1, 'no': 0})
df['activities'] = df['activities'].map({'yes': 1, 'no': 0})
df['nursery_school'] = df['nursery_school'].map({'yes': 1, 'no': 0})
df['higher_ed'] = df['higher_ed'].map({'yes': 1, 'no': 0})
df['internet_access'] = df['internet_access'].map({'yes': 1, 'no': 0})

df

Unnamed: 0,mother_education,father_education,travel_time,study_time,class_failures,school_support,family_support,extra_paid_classes,activities,nursery_school,...,weekend_alcohol,health,absences,grade_1,grade_2,final_grade,school_GP,school_MS,parent_status_Apart,parent_status_Living together
0,4,4,1,1,0,1,0,0,0,1,...,1,3,6,5,6,6,True,False,True,False
1,1,1,0,1,0,0,1,0,0,0,...,1,3,4,5,5,6,True,False,False,True
2,1,1,0,1,3,1,0,1,0,1,...,3,3,10,7,8,10,True,False,False,True
3,4,3,0,2,0,0,1,1,1,1,...,1,5,2,15,14,15,True,False,False,True
4,2,2,0,1,0,0,1,1,0,1,...,2,5,4,6,10,10,True,False,False,True
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
390,3,3,0,1,2,0,1,1,0,1,...,5,4,11,9,9,9,False,True,True,False
391,2,1,1,0,0,0,0,0,0,0,...,4,2,3,14,16,16,False,True,False,True
392,1,1,0,0,3,0,0,0,0,0,...,3,3,3,10,8,7,False,True,False,True
393,2,3,2,0,0,0,0,0,0,0,...,4,5,0,11,12,10,False,True,False,True


In [96]:
features = [col for col in df.columns if col != 'final_grade']

X = df[features].values
y = df['final_grade'].values

# Modelagem

In [97]:
from sklearn.linear_model import LinearRegression, Ridge, Lasso, ElasticNet
from sklearn.preprocessing import PolynomialFeatures

In [98]:
lin_reg_model = LinearRegression()

poly = PolynomialFeatures(degree=2)
X_poly = poly.fit_transform(X)
poly_reg_model = LinearRegression()

lasso_reg_model = Lasso(alpha=0.1)

ridge_reg_model = Ridge(alpha=0.1)

elastic_reg_model = ElasticNet(alpha=0.1, l1_ratio=0.5)

# Avaliação dos modelos

In [99]:
from sklearn.model_selection import cross_validate
from sklearn.model_selection import KFold

In [100]:
kf = KFold(n_splits=10, shuffle=True, random_state=42)

In [103]:
metrics = ['neg_mean_absolute_error', 'neg_mean_squared_error', 'r2']

# Métricas de avaliação do modelo de regressão linear
lin_reg_model_metrics = cross_validate(lin_reg_model, X, y, cv=kf, scoring=metrics)

mae_lin_reg_model = -lin_reg_model_metrics['test_neg_mean_absolute_error'].mean().round(3)
mse_lin_reg_model = -lin_reg_model_metrics['test_neg_mean_squared_error'].mean().round(3)
r2_lin_reg_model = lin_reg_model_metrics['test_r2'].mean().round(3)

# Métricas de avaliação do modelo de regressão polinomial
poly_reg_model_metrics = cross_validate(poly_reg_model, X_poly, y, cv=kf, scoring=metrics)

mae_poly_reg_model = -poly_reg_model_metrics['test_neg_mean_absolute_error'].mean().round(3)
mse_poly_reg_model = -poly_reg_model_metrics['test_neg_mean_squared_error'].mean().round(3)
r2_poly_reg_model = poly_reg_model_metrics['test_r2'].mean().round(3)

# Métricas de avaliação do modelo de regressão LassoLars
lasso_reg_model_metrics = cross_validate(lasso_reg_model, X, y, cv=kf, scoring=metrics)

mae_lasso_reg_model = -lasso_reg_model_metrics['test_neg_mean_absolute_error'].mean().round(3)
mse_lasso_reg_model = -lasso_reg_model_metrics['test_neg_mean_squared_error'].mean().round(3)
r2_lasso_reg_model = lasso_reg_model_metrics['test_r2'].mean().round(3)

# Métricas de avaliação do modelo de regressão Ridge
ridge_reg_model_metrics = cross_validate(ridge_reg_model, X, y, cv=kf, scoring=metrics)

mae_ridge_reg_model = -ridge_reg_model_metrics['test_neg_mean_absolute_error'].mean().round(3)
mse_ridge_reg_model = -ridge_reg_model_metrics['test_neg_mean_squared_error'].mean().round(3)
r2_ridge_reg_model = ridge_reg_model_metrics['test_r2'].mean().round(3)

# Métricas de avaliação do modelo de regressão ElasticNet
elastic_reg_model_metrics = cross_validate(elastic_reg_model, X, y, cv=kf, scoring=metrics)

mae_elastic_reg_model = -elastic_reg_model_metrics['test_neg_mean_absolute_error'].mean().round(3)
mse_elastic_reg_model = -elastic_reg_model_metrics['test_neg_mean_squared_error'].mean().round(3)
r2_elastic_reg_model = elastic_reg_model_metrics['test_r2'].mean().round(3)

In [104]:
print('Linear Regression Model')
print(f'MAE: {mae_lin_reg_model}')
print(f'MSE: {mse_lin_reg_model}')
print(f'R2: {r2_lin_reg_model}')

print('\nPolynomial Regression Model')
print(f'MAE: {mae_poly_reg_model}')
print(f'MSE: {mse_poly_reg_model}')
print(f'R2: {r2_poly_reg_model}')

print('\nLassoLars Regression Model')
print(f'MAE: {mae_lasso_reg_model}')
print(f'MSE: {mse_lasso_reg_model}')
print(f'R2: {r2_lasso_reg_model}')

print('\nRidge Regression Model')
print(f'MAE: {mae_ridge_reg_model}')
print(f'MSE: {mse_ridge_reg_model}')
print(f'R2: {r2_ridge_reg_model}')

print('\nElasticNet Regression Model')
print(f'MAE: {mae_elastic_reg_model}')
print(f'MSE: {mse_elastic_reg_model}')
print(f'R2: {r2_elastic_reg_model}')

Linear Regression Model
MAE: 1.255
MSE: 3.805
R2: 0.812

Polynomial Regression Model
MAE: 5.002
MSE: 56.744
R2: -2.108

LassoLars Regression Model
MAE: 1.157
MSE: 3.657
R2: 0.823

Ridge Regression Model
MAE: 1.255
MSE: 3.804
R2: 0.812

ElasticNet Regression Model
MAE: 1.172
MSE: 3.656
R2: 0.822
