**GPAPredictor: Data Analysis & Regression Models**

*This notebook contains a basic analysis of "UFRGS Entrance Exam and GPA Data", as provided in the Harvard Dataverse (https://doi.org/10.7910/DVN/O35FW8).*
*Format adapted from the example notebooks in "Hands-On Machine Learning with Scikit-Learn, Keras, and TensorFlow, 3rd Edition" by Aurélien Géron.*

##### Import Pandas & NumPy

In [317]:
import pandas as pd
import numpy as np

##### Download Data

In [318]:
# Column names for the dataset, in file order (from: https://doi.org/10.7910/DVN/O35FW8).
column_headers = [
    "gender",
    "race",
    "physics",
    "biology",
    "history",
    "foreign language",
    "geography",
    "literature",
    "portuguese",
    "math",
    "chemistry",
    "mean GPA",
]

# header=0 marks the file's first row as a header row; `names` then replaces
# whatever labels that row contained with the list above.
students = pd.read_csv(
    "data/UFRGS_exam_gpa.csv",
    header=0,
    names=column_headers,
)

In [319]:
# Fix dataset inconsistency: "gender" is the only categorical variable stored as a
# numeric indicator, so it is converted to a text label.
def num_to_gender(x):
    """Convert the dataset's numeric gender indicator to a text label.

    Args:
        x: A raw indicator value, or a label previously returned by this function.

    Returns:
        "female" when ``x`` equals 0, ``x`` itself when it is already "female" or
        "male", and "male" for any other value.
    """
    # Pass existing labels through so that re-running the conversion cell is a
    # no-op. Without this guard "female" != 0, so a second run would relabel
    # every row as "male".
    if isinstance(x, str) and x in ("female", "male"):
        return x
    if x == 0:
        return "female"
    return "male"

# Replace the numeric indicator column with its text labels, element by element.
students["gender"] = students["gender"].apply(num_to_gender)

##### Summary Statistics for Dataset

In [320]:
# Preview the first rows to confirm the column names and the gender relabelling.
students.head()


Unnamed: 0,gender,race,physics,biology,history,foreign language,geography,literature,portuguese,math,chemistry,mean GPA
0,male,White,538.0,490.58,406.59,529.05,532.28,447.23,527.58,379.14,488.64,2.98333
1,male,White,455.18,440.0,570.86,417.54,453.53,425.87,475.63,476.11,407.15,1.97333
2,female,White,756.91,679.62,531.28,583.63,534.42,521.4,592.41,783.76,588.26,2.53333
3,male,White,584.54,649.84,637.43,609.06,670.46,515.38,572.52,581.25,529.04,1.58667
4,male,White,325.99,466.74,597.06,554.43,535.77,717.03,477.6,503.82,422.92,1.66667


In [321]:
# Column dtypes and non-null counts: a quick check for missing values.
students.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 43302 entries, 0 to 43301
Data columns (total 12 columns):
 #   Column            Non-Null Count  Dtype  
---  ------            --------------  -----  
 0   gender            43302 non-null  object 
 1   race              43302 non-null  object 
 2   physics           43302 non-null  float64
 3   biology           43302 non-null  float64
 4   history           43302 non-null  float64
 5   foreign language  43302 non-null  float64
 6   geography         43302 non-null  float64
 7   literature        43302 non-null  float64
 8   portuguese        43302 non-null  float64
 9   math              43302 non-null  float64
 10  chemistry         43302 non-null  float64
 11  mean GPA          43302 non-null  float64
dtypes: float64(10), object(2)
memory usage: 4.0+ MB


In [322]:
# Summary statistics for the numeric columns (exam scores and mean GPA).
students.describe()

Unnamed: 0,physics,biology,history,foreign language,geography,literature,portuguese,math,chemistry,mean GPA
count,43302.0,43302.0,43302.0,43302.0,43302.0,43302.0,43302.0,43302.0,43302.0,43302.0
mean,576.121593,568.661923,580.830137,573.98996,574.492491,583.302685,551.03731,579.191492,571.712809,2.785761
std,115.154414,101.444116,94.213121,86.87083,90.757163,92.896964,87.147365,114.681658,112.171239,0.820736
min,299.34,262.99,265.02,222.71,224.87,239.11,151.59,297.99,300.47,0.0
25%,482.79,492.4,516.1,517.7,510.23,516.77,491.88,489.41,484.54,2.28
50%,565.61,566.44,578.94,580.28,575.47,587.07,553.57,571.89,565.51,2.92
75%,662.8,634.78,650.19,640.56,637.27,648.67,613.06,665.16,655.42,3.43
max,952.09,966.57,925.76,858.44,941.84,904.77,825.53,1072.12,1001.9,4.0


##### Create Training Set & Test Set

In [323]:
from sklearn.model_selection import train_test_split

# Hold out 20% of the rows as a test set; the fixed seed keeps the split reproducible.
train_set, test_set = train_test_split(
    students,
    test_size=0.2,
    random_state=2024,
)

##### Initial Data Analysis

In [324]:
# Explore a copy of the training set so that train_set itself is left untouched.
# NOTE(review): this rebinds `students`, which held the full dataset until now.
# Re-running the train/test split cell after this one would split the training
# subset rather than the full data; a distinct name would avoid that.
students = train_set.copy()

In [325]:
# Pearson correlation between every pair of numeric columns, then the column of
# correlations with the target, strongest first.
corr_matrix = students.corr(method="pearson", numeric_only=True)
corr_matrix.loc[:, "mean GPA"].sort_values(ascending=False)

mean GPA            1.000000
literature          0.328419
portuguese          0.272709
biology             0.238810
history             0.224719
foreign language    0.214333
chemistry           0.201278
geography           0.177406
physics             0.172503
math                0.142776
Name: mean GPA, dtype: float64

##### Data Preparation

In [326]:
# Numerical Data

# Separate the target ("mean GPA") from the predictors of the training set.
students_labels = train_set["mean GPA"].copy()
students = train_set.copy().drop(columns=["mean GPA"])

In [327]:
# Keep only the numeric columns; "number" selects the same dtypes as np.number.
students_num = students.select_dtypes(include="number")

students_num.head()

Unnamed: 0,physics,biology,history,foreign language,geography,literature,portuguese,math,chemistry
29335,728.62,626.78,630.09,662.98,575.47,533.92,542.98,703.57,682.95
11787,550.64,633.67,655.98,609.32,627.47,622.67,607.45,615.31,569.69
3335,592.51,518.06,623.63,488.25,535.77,669.2,430.5,541.01,496.39
10774,480.19,419.95,386.96,540.72,474.34,533.15,377.11,467.43,390.87
25812,638.12,440.56,508.03,646.77,635.05,454.4,551.47,661.89,726.61


In [328]:
from sklearn.impute import SimpleImputer

# Fill any missing value with the median of its column.
imputer = SimpleImputer(strategy="median")

# Learn the per-column medians and apply them in a single step.
# NOTE(review): X is not referenced again in the visible cells.
X = imputer.fit_transform(students_num)


In [329]:
# Data Scaling

from sklearn.preprocessing import StandardScaler
std_scaler = StandardScaler()

# The scaler is fitted on students_num directly, not on the imputed array X.
students_num_std_scaled = std_scaler.fit_transform(students_num)

In [330]:
# Categorical Data

# Select the two text-valued columns for separate encoding.
students_cat = students.loc[:, ["gender", "race"]]

students_cat.head()

Unnamed: 0,gender,race
29335,male,White
11787,female,White
3335,female,Black
10774,female,White
25812,male,White


In [331]:
from sklearn.preprocessing import OneHotEncoder
# Encoder with default settings, used here for exploration only; the pipeline
# built later creates its own encoder with handle_unknown="ignore".
cat_encoder = OneHotEncoder()

# Learn the categories of "gender" and "race" and encode them in one step.
students_cat_1hot = cat_encoder.fit_transform(students_cat)

In [332]:
# Categories learned for each encoded column, in the column order of students_cat.
cat_encoder.categories_

[array(['female', 'male'], dtype=object),
 array(['Asian', 'Black', 'Indigenous', 'Parda', 'White'], dtype=object)]

##### Transformation Pipeline

In [333]:
from sklearn.pipeline import Pipeline
from sklearn.pipeline import make_pipeline

In [334]:
# Exam-score columns handled by the numeric pipeline.
num_attribs = [
    "physics",
    "biology",
    "history",
    "foreign language",
    "geography",
    "literature",
    "portuguese",
    "math",
    "chemistry",
]

# Median-impute missing values, then standardize each column.
num_pipeline = Pipeline(
    steps=[
        ("impute", SimpleImputer(strategy="median")),
        ("standardize", StandardScaler()),
    ]
)


In [335]:
# Text-valued columns handled by the categorical pipeline.
cat_attribs = [
    "gender",
    "race",
]

# Fill missing categories with the most frequent one, then one-hot encode.
# handle_unknown="ignore" is set so categories absent at fit time do not raise.
cat_pipeline = make_pipeline(
    SimpleImputer(strategy="most_frequent"),
    OneHotEncoder(handle_unknown="ignore"),
)

In [336]:
from sklearn.compose import ColumnTransformer

# Route each group of columns through its own pipeline and concatenate the results.
preprocessing = ColumnTransformer(
    transformers=[
        ("num", num_pipeline, num_attribs),
        ("cat", cat_pipeline, cat_attribs),
    ]
)

In [337]:
# Fit the full preprocessing step on the training features and transform them.
students_prepared = preprocessing.fit_transform(students)

In [338]:
# Expect (n_training_rows, 16): 9 numeric columns plus 2 gender and 5 race indicators.
students_prepared.shape

(34641, 16)

In [339]:
# Output column names, prefixed with the transformer ("num" / "cat") that produced them.
preprocessing.get_feature_names_out()

array(['num__physics', 'num__biology', 'num__history',
       'num__foreign language', 'num__geography', 'num__literature',
       'num__portuguese', 'num__math', 'num__chemistry',
       'cat__gender_female', 'cat__gender_male', 'cat__race_Asian',
       'cat__race_Black', 'cat__race_Indigenous', 'cat__race_Parda',
       'cat__race_White'], dtype=object)

##### Linear-Regression Model

In [340]:
from sklearn.linear_model import LinearRegression

# Chain preprocessing and a linear regressor, then fit on the training data.
lin_reg = make_pipeline(
    preprocessing,
    LinearRegression(),
)
lin_reg.fit(students, students_labels)

##### Decision-Tree Model

In [341]:
from sklearn.tree import DecisionTreeRegressor

# Chain preprocessing and a decision tree; the seed is fixed for reproducibility.
tree_reg = make_pipeline(
    preprocessing,
    DecisionTreeRegressor(random_state=2024),
)
tree_reg.fit(students, students_labels)

##### Random-Forest Model

In [342]:
from sklearn.ensemble import RandomForestRegressor

# Chain preprocessing and a random forest. Unlike lin_reg and tree_reg, this
# pipeline is only constructed here; the cell does not call fit on it.
forest_reg = make_pipeline(
    preprocessing,
    RandomForestRegressor(random_state=2024),
)


##### Model Evaluation using Cross-Validation

In [343]:
from sklearn.model_selection import cross_val_score

In [344]:
# 10-fold cross-validated RMSE for the decision tree. The scorer reports negated
# RMSE, hence the leading minus sign.
tree_rmses = -cross_val_score(
    tree_reg,
    students,
    students_labels,
    scoring="neg_root_mean_squared_error",
    cv=10,
)

pd.Series(tree_rmses).describe()

count    10.000000
mean      1.067262
std       0.016276
min       1.037282
25%       1.056335
50%       1.068954
75%       1.081284
max       1.087024
dtype: float64

In [345]:
# 10-fold cross-validated RMSE for the linear model. The scorer reports negated
# RMSE, hence the leading minus sign.
lin_rmses = -cross_val_score(
    lin_reg,
    students,
    students_labels,
    scoring="neg_root_mean_squared_error",
    cv=10,
)

pd.Series(lin_rmses).describe()

count    10.000000
mean      0.743820
std       0.012161
min       0.727114
25%       0.733845
50%       0.743838
75%       0.752912
max       0.764841
dtype: float64

In [348]:
# 10-fold cross-validated RMSE for the random forest. The scorer reports negated
# RMSE, hence the leading minus sign.
forest_rmses = -cross_val_score(
    forest_reg,
    students,
    students_labels,
    scoring="neg_root_mean_squared_error",
    cv=10,
)

pd.Series(forest_rmses).describe()

count    10.000000
mean      0.749840
std       0.012067
min       0.732367
25%       0.741246
50%       0.751803
75%       0.755250
max       0.774029
dtype: float64