In [138]:
import pandas as pd
import numpy as np

In [139]:
# Headers derrived from: https://doi.org/10.7910/DVN/O35FW8
column_headers = ([
  "gender", 
  "race", 
  "physics", 
  "biology", 
  "history", 
  "foreign language", 
  "geography", 
  "literature", 
  "portuguese",
  "math",
  "chemistry",
  "mean GPA"
  ])

students = pd.read_csv("data/UFRGS_exam_gpa.csv", header = 0, names = column_headers)
students.head()

Unnamed: 0,gender,race,physics,biology,history,foreign language,geography,literature,portuguese,math,chemistry,mean GPA
0,1,White,538.0,490.58,406.59,529.05,532.28,447.23,527.58,379.14,488.64,2.98333
1,1,White,455.18,440.0,570.86,417.54,453.53,425.87,475.63,476.11,407.15,1.97333
2,0,White,756.91,679.62,531.28,583.63,534.42,521.4,592.41,783.76,588.26,2.53333
3,1,White,584.54,649.84,637.43,609.06,670.46,515.38,572.52,581.25,529.04,1.58667
4,1,White,325.99,466.74,597.06,554.43,535.77,717.03,477.6,503.82,422.92,1.66667


In [140]:
from sklearn.model_selection import train_test_split
train_set, test_set = train_test_split(students, test_size=0.2, random_state=2024)

In [141]:
students = train_set.copy()

In [142]:
corr_matrix = students.corr(numeric_only=True)
corr_matrix['mean GPA'].sort_values(ascending=False)


mean GPA            1.000000
literature          0.328419
portuguese          0.272709
biology             0.238810
history             0.224719
foreign language    0.214333
chemistry           0.201278
geography           0.177406
physics             0.172503
math                0.142776
gender             -0.213094
Name: mean GPA, dtype: float64

In [143]:
students = train_set.copy().drop(columns=['mean GPA'])
students_labels = train_set['mean GPA'].copy()

In [144]:
from sklearn.impute import SimpleImputer
imputer = SimpleImputer(strategy="median")

In [145]:
students_num = students.select_dtypes(include=[np.number])

In [146]:
imputer.fit(students_num)
X = imputer.transform(students_num)


In [147]:
students_tr = pd.DataFrame(X,columns=students_num.columns,index=students_num.index)

In [148]:
students_cat = students[['gender', 'race']]
students_cat.head()

Unnamed: 0,gender,race
29335,1,White
11787,0,White
3335,0,Black
10774,0,White
25812,1,White


In [149]:
from sklearn.preprocessing import OneHotEncoder

cat_encoder = OneHotEncoder()
students_cat_1hot = cat_encoder.fit_transform(students_cat.drop(columns=['gender']))

In [150]:
cat_encoder.categories_

[array(['Asian', 'Black', 'Indigenous', 'Parda', 'White'], dtype=object)]

In [151]:
from sklearn.preprocessing import StandardScaler

std_scaler = StandardScaler()
students_num_std_scaled = std_scaler.fit_transform(students_num)

In [152]:
#Data Transformation Pipeline

from sklearn.pipeline import Pipeline

num_attribs = ["physics", "biology", "history", "foreign language", "geography", "literature", "portuguese","math","chemistry"]

num_pipeline = Pipeline([
  ("impute", SimpleImputer(strategy="median")),
  ("standardize", StandardScaler())
])

from sklearn.pipeline import make_pipeline

cat_attribs = ["gender", "race"]

cat_pipeline = make_pipeline(
  SimpleImputer(strategy="most_frequent"),
  OneHotEncoder(handle_unknown="ignore")
)

from sklearn.compose import ColumnTransformer

preprocessing = ColumnTransformer([
  ("num", num_pipeline, num_attribs),
  ("cat", cat_pipeline, cat_attribs)
])

In [157]:
students_prepared = preprocessing.fit_transform(students)
students_prepared.shape

(34641, 16)

In [158]:
preprocessing.get_feature_names_out()

array(['num__physics', 'num__biology', 'num__history',
       'num__foreign language', 'num__geography', 'num__literature',
       'num__portuguese', 'num__math', 'num__chemistry', 'cat__gender_0',
       'cat__gender_1', 'cat__race_Asian', 'cat__race_Black',
       'cat__race_Indigenous', 'cat__race_Parda', 'cat__race_White'],
      dtype=object)

In [155]:
from sklearn.linear_model import LinearRegression
lin_reg = make_pipeline(preprocessing, LinearRegression())
lin_reg.fit(students, students_labels)

In [159]:
students_predictions = lin_reg.predict(students)

In [160]:
students_predictions[:5].round(-2)

array([0., 0., 0., 0., 0.])

In [161]:
students_labels.iloc[:5].values

array([2.89   , 3.75333, 3.08667, 3.24   , 2.5    ])

In [162]:
from sklearn.metrics import mean_squared_error

lin_rmse = mean_squared_error(students_labels, students_predictions, squared=False)

lin_rmse



0.7435410426396815

In [163]:
from sklearn.tree import DecisionTreeRegressor

tree_reg = make_pipeline(preprocessing, DecisionTreeRegressor(random_state=2024))
tree_reg.fit(students, students_labels)

In [164]:
students_predictions = tree_reg.predict(students)

In [165]:
tree_rmse = mean_squared_error(students_labels, students_predictions, squared=False)

tree_rmse



0.0

In [166]:
from sklearn.model_selection import cross_val_score

tree_rmses = -cross_val_score(tree_reg,students,students_labels, scoring="neg_root_mean_squared_error",cv=10)

In [167]:
pd.Series(tree_rmses).describe()

count    10.000000
mean      1.067262
std       0.016276
min       1.037282
25%       1.056335
50%       1.068954
75%       1.081284
max       1.087024
dtype: float64

In [168]:
lin_rmses =  -cross_val_score(lin_reg,students,students_labels, scoring="neg_root_mean_squared_error",cv=10)

pd.Series(lin_rmses).describe()

count    10.000000
mean      0.743820
std       0.012161
min       0.727114
25%       0.733845
50%       0.743838
75%       0.752912
max       0.764841
dtype: float64

In [169]:
from sklearn.ensemble import RandomForestRegressor

forest_reg = make_pipeline(preprocessing, RandomForestRegressor(random_state=2024))

forest_rmses = -cross_val_score(forest_reg,students,students_labels, scoring="neg_root_mean_squared_error",cv=10)

In [170]:
pd.Series(forest_rmses).describe()

count    10.000000
mean      0.749840
std       0.012067
min       0.732367
25%       0.741246
50%       0.751803
75%       0.755250
max       0.774029
dtype: float64