In [104]:
import pandas as pd
import numpy as np

In [105]:
# Headers derived from: https://doi.org/10.7910/DVN/O35FW8
column_headers = [
    "gender",
    "race",
    "physics",
    "biology",
    "history",
    "foreign language",
    "geography",
    "literature",
    "portuguese",
    "math",
    "chemistry",
    "mean GPA",
]

# header=0 tells pandas the file has its own header row, which is replaced
# by the names listed in `column_headers`.
students = pd.read_csv(
    "data/UFRGS_exam_gpa.csv",
    header=0,
    names=column_headers,
)
students.head()

Unnamed: 0,gender,race,physics,biology,history,foreign language,geography,literature,portuguese,math,chemistry,mean GPA
0,1,White,538.0,490.58,406.59,529.05,532.28,447.23,527.58,379.14,488.64,2.98333
1,1,White,455.18,440.0,570.86,417.54,453.53,425.87,475.63,476.11,407.15,1.97333
2,0,White,756.91,679.62,531.28,583.63,534.42,521.4,592.41,783.76,588.26,2.53333
3,1,White,584.54,649.84,637.43,609.06,670.46,515.38,572.52,581.25,529.04,1.58667
4,1,White,325.99,466.74,597.06,554.43,535.77,717.03,477.6,503.82,422.92,1.66667


In [106]:
from sklearn.model_selection import train_test_split

# Hold out 20% of the rows as the test set; the fixed random_state makes the
# split reproducible across runs.
train_set, test_set = train_test_split(
    students,
    test_size=0.2,
    random_state=2024,
)

In [107]:
# Rebind `students` to a copy of the training split: from here on `students`
# no longer refers to the full dataset loaded from the CSV.
students = train_set.copy()

In [108]:
# Pairwise correlations between the numeric columns of the training copy.
corr_matrix = students.corr(numeric_only=True)

# Rank every numeric column by its correlation with the target, strongest first.
corr_matrix.loc[:, "mean GPA"].sort_values(ascending=False)


mean GPA            1.000000
literature          0.328419
portuguese          0.272709
biology             0.238810
history             0.224719
foreign language    0.214333
chemistry           0.201278
geography           0.177406
physics             0.172503
math                0.142776
gender             -0.213094
Name: mean GPA, dtype: float64

In [109]:
# Separate the target from the predictors of the training split.
# The labels are copied so they are independent of `train_set`; `drop`
# already returns a new frame, leaving `train_set` itself unchanged.
students_labels = train_set["mean GPA"].copy()
students = train_set.drop(columns=["mean GPA"])

In [110]:
# Imputer configured with the "median" strategy; it is only constructed here
# and gets fitted on the numeric columns in a later cell.
from sklearn.impute import SimpleImputer
imputer = SimpleImputer(strategy="median")

In [112]:
# Keep only the numeric columns of the training features.
# NOTE: `gender` is stored as 0/1, so it is selected here along with the scores.
students_num = students.select_dtypes(include="number")

In [115]:
# Learn each column's median and fill the missing values in a single call;
# `imputer` is left fitted, and X is a plain array without column labels.
X = imputer.fit_transform(students_num)


In [119]:
# Wrap the imputed array back into a DataFrame, restoring the column names
# and row index of the frame it was computed from.
students_tr = pd.DataFrame(
    data=X,
    index=students_num.index,
    columns=students_num.columns,
)

In [116]:
# Pull out the two categorical attributes of the training features.
students_cat = students.loc[:, ["gender", "race"]]
students_cat.head()

Unnamed: 0,gender,race
29335,1,White
11787,0,White
3335,0,Black
10774,0,White
25812,1,White


In [125]:
from sklearn.preprocessing import OneHotEncoder

# One-hot encode `race` only; `gender` is left out of this encoding step.
# Double brackets keep the selection a one-column DataFrame, as the encoder
# expects 2-D input.
cat_encoder = OneHotEncoder()
students_cat_1hot = cat_encoder.fit_transform(students_cat[["race"]])

In [126]:
# Show the category values the fitted encoder found for each encoded column.
cat_encoder.categories_

[array(['Asian', 'Black', 'Indigenous', 'Parda', 'White'], dtype=object)]

In [127]:
from sklearn.preprocessing import StandardScaler

# Standardize the numeric features column by column.
# The scaler is fitted on the median-imputed frame `students_tr` instead of the
# raw `students_num`: StandardScaler leaves missing values as NaN in its
# output, so scaling the un-imputed frame would let them through and would
# not follow the impute -> standardize order used by the preprocessing pipeline.
std_scaler = StandardScaler()
students_num_std_scaled = std_scaler.fit_transform(students_tr)

In [135]:
# Data transformation pipeline
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline, make_pipeline

# Columns routed to each branch of the transformer.
num_attribs = [
    "physics",
    "biology",
    "history",
    "foreign language",
    "geography",
    "literature",
    "portuguese",
    "math",
    "chemistry",
]
cat_attribs = ["gender", "race"]

# Numeric branch: fill gaps with the column median, then standardize.
num_pipeline = Pipeline(steps=[
    ("impute", SimpleImputer(strategy="median")),
    ("standardize", StandardScaler()),
])

# Categorical branch: fill gaps with the most frequent value, then one-hot
# encode; categories unseen during fitting are ignored instead of raising.
cat_pipeline = make_pipeline(
    SimpleImputer(strategy="most_frequent"),
    OneHotEncoder(handle_unknown="ignore"),
)

# Apply each branch to its own columns and concatenate the results.
preprocessing = ColumnTransformer(transformers=[
    ("num", num_pipeline, num_attribs),
    ("cat", cat_pipeline, cat_attribs),
])

In [136]:
# Fit the ColumnTransformer on the training features and transform them in
# the same call.
students_prepared = preprocessing.fit_transform(students)