## Import 

In [7]:
from merged import merge_dataframes
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import math
import seaborn as sns
import warnings
warnings.filterwarnings('ignore')

## Description

**school** - student's school (binary: 'GP' - Gabriel Pereira or 'MS' - Mousinho da Silveira)  
**sex** - student's sex (binary: 'F' - female or 'M' - male)  
**age** - student's age (numeric: from 15 to 22)  
**address** - student's home address type (binary: 'U' - urban or 'R' - rural)  
**famsize** - family size (binary: 'LE3' - less or equal to 3 or 'GT3' - greater than 3)  
**Pstatus** - parent's cohabitation status (binary: 'T' - living together or 'A' - apart)  
**Medu** - mother's education (numeric: 0 - none, 1 - primary education (4th grade), 2 – 5th to 9th grade, 3 – secondary education or 4 – higher education)  
**Fedu** - father's education (numeric: 0 - none, 1 - primary education (4th grade), 2 – 5th to 9th grade, 3 – secondary education or 4 – higher education)  
**Mjob** - mother's job (nominal: 'teacher', 'health' care related, civil 'services' (e.g. administrative or police), 'at_home' or 'other')  
**Fjob** - father's job (nominal: 'teacher', 'health' care related, civil 'services' (e.g. administrative or police), 'at_home' or 'other')  
**reason** - reason to choose this school (nominal: close to 'home', school 'reputation', 'course' preference or 'other')  
**guardian** - student's guardian (nominal: 'mother', 'father' or 'other')  
**traveltime** - home to school travel time (numeric: 1 - <15 min., 2 - 15 to 30 min., 3 - 30 min. to 1 hour, or 4 - >1 hour)  
**studytime** - weekly study time (numeric: 1 - <2 hours, 2 - 2 to 5 hours, 3 - 5 to 10 hours, or 4 - >10 hours)  
**failures** - number of past class failures (numeric: n if 1<=n<3, else 4)  
**schoolsup** - extra educational support (binary: yes or no)  
**famsup** - family educational support (binary: yes or no)  
**paid** - extra paid classes within the course subject (Math or Portuguese) (binary: yes or no)  
**activities** - extra-curricular activities (binary: yes or no)  
**nursery** - attended nursery school (binary: yes or no)  
**higher** - wants to take higher education (binary: yes or no)  
**internet** - Internet access at home (binary: yes or no)  
**romantic** - with a romantic relationship (binary: yes or no)  
**famrel** - quality of family relationships (numeric: from 1 - very bad to 5 - excellent)  
**freetime** - free time after school (numeric: from 1 - very low to 5 - very high)  
**goout** - going out with friends (numeric: from 1 - very low to 5 - very high)  
**Dalc** - workday alcohol consumption (numeric: from 1 - very low to 5 - very high)  
**Walc** - weekend alcohol consumption (numeric: from 1 - very low to 5 - very high)  
**health** - current health status (numeric: from 1 - very bad to 5 - very good)  
**absences** - number of school absences (numeric: from 0 to 93)  

These grades are related to the course subject, Math or Portuguese:

**G1** - first period grade (numeric: from 0 to 20)  
**G2** - second period grade (numeric: from 0 to 20)  
**G3** - final grade (numeric: from 0 to 20, output target)


## Machine Learning process

We import the dataframes and use our function merge_dataframes() to merge the two dataframes.

In [250]:
# Load the two raw course datasets (Math and Portuguese) from the local data/ folder.
df_mat = pd.read_csv('data/student-mat.csv')
df_por = pd.read_csv('data/student-por.csv')
# Combine both courses into a single frame with the project helper. Judging by
# the columns shown below, the result keeps one final grade per course
# (G3_mat, G3_por); see merged.merge_dataframes for the exact join logic.
df = merge_dataframes(df_mat, df_por)
df.head(2)

Unnamed: 0,school,sex,age,address,famsize,Pstatus,Medu,Fedu,Mjob,Fjob,...,internet,romantic,famrel,freetime,goout,Dalc,Walc,health,G3_mat,G3_por
0,GP,F,18,U,GT3,A,4,4,at_home,teacher,...,no,no,4.0,3.0,4.0,1.0,1.0,3.0,6,11
1,GP,F,17,U,GT3,T,1,1,at_home,other,...,yes,no,5.0,3.0,3.0,1.0,1.0,3.0,6,11


In [9]:
# List every column of the merged frame (features plus the G3_mat / G3_por targets).
df.columns

Index(['school', 'sex', 'age', 'address', 'famsize', 'Pstatus', 'Medu', 'Fedu',
       'Mjob', 'Fjob', 'reason', 'guardian', 'traveltime', 'studytime',
       'failures', 'schoolsup', 'famsup', 'paid', 'activities', 'nursery',
       'higher', 'internet', 'romantic', 'famrel', 'freetime', 'goout', 'Dalc',
       'Walc', 'health', 'G3_mat', 'G3_por'],
      dtype='object')

In [49]:
from sklearn import set_config
from sklearn.base import clone
from sklearn.compose import make_column_transformer
from sklearn.ensemble import RandomForestClassifier, RandomForestRegressor
from sklearn.impute import SimpleImputer
from sklearn.model_selection import GridSearchCV, train_test_split
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import OneHotEncoder, OrdinalEncoder, StandardScaler

set_config(display='diagram')

We separate our X from our Y.

In [184]:
# Features: everything except the two final-grade columns.
X = df.drop(['G3_mat', "G3_por"], axis='columns')
# Targets: final Math grade (y1) and final Portuguese grade (y2).
y1, y2 = df['G3_mat'], df['G3_por']

Setting up lists to sort our variables based on their description.

In [41]:
# Ordered numeric scales (education level, time buckets, 1-5 ratings, ...).
ordinal_cols = [
    'Medu', 'Fedu',
    'traveltime', 'studytime', 'failures',
    'famrel', 'freetime', 'goout',
    'Dalc', 'Walc', 'health',
]

# Unordered categories: binary flags first, then the multi-valued nominals.
categorical_cols = [
    # binary
    'school', 'sex', 'address', 'famsize', 'Pstatus',
    'schoolsup', 'famsup', 'paid', 'activities', 'nursery',
    'higher', 'internet', 'romantic',
    # nominal
    'Mjob', 'Fjob', 'reason', 'guardian',
]

# Plain numeric feature(s) that get standardised.
numerical_cols = ['age']

###### Ordinal Pipeline

In [95]:
# Ordinal branch: fill missing values with the column mode, then map each
# level to an integer code. Levels not seen during fit are encoded as -1.
ordinal_pipeline = make_pipeline(
    SimpleImputer(strategy='most_frequent'),
    OrdinalEncoder(
        unknown_value=-1,
        handle_unknown='use_encoded_value',
    ),
)

In [96]:
# Sanity check: fit the ordinal branch on its own columns and look at the codes.
ordi = ordinal_pipeline.fit_transform(X.loc[:, ordinal_cols])
pd.DataFrame(
    data=ordi,
    columns=ordinal_pipeline.get_feature_names_out(),
)

Unnamed: 0,Medu,Fedu,traveltime,studytime,failures,famrel,freetime,goout,Dalc,Walc,health
0,4.0,4.0,1.0,1.0,0.0,3.0,2.0,3.0,0.0,0.0,2.0
1,1.0,1.0,0.0,1.0,0.0,4.0,2.0,2.0,0.0,0.0,2.0
2,1.0,1.0,0.0,1.0,0.0,3.0,2.0,1.0,1.0,2.0,2.0
3,4.0,2.0,0.0,2.0,0.0,2.0,1.0,1.0,0.0,0.0,4.0
4,3.0,3.0,0.0,1.0,0.0,3.0,2.0,1.0,0.0,1.0,4.0
...,...,...,...,...,...,...,...,...,...,...,...
377,3.0,1.0,0.0,1.0,0.0,3.0,2.0,3.0,0.0,0.0,0.0
378,1.0,1.0,0.0,1.0,0.0,3.0,2.0,2.0,0.0,0.0,4.0
379,1.0,1.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,4.0
380,3.0,1.0,1.0,0.0,0.0,1.0,3.0,4.0,2.0,3.0,1.0


###### Categorical Pipeline

In [97]:
# One-hot branch for the binary/nominal columns: fill gaps with the most
# frequent category, then expand every category into its own 0/1 column.
# handle_unknown='ignore' encodes categories unseen during fit as all zeros.
# NOTE(review): `sparse=False` was renamed to `sparse_output=False` in
# scikit-learn 1.2 and the old keyword was removed in a later release; the
# warning is hidden by warnings.filterwarnings('ignore') above. Confirm the
# pinned scikit-learn version, or switch the keyword when upgrading.
categorical_pipeline = make_pipeline(
    SimpleImputer(strategy='most_frequent'),
    OneHotEncoder(handle_unknown='ignore', sparse=False))

In [98]:
# Sanity check: fit the one-hot branch alone and keep the result as a labelled frame.
cat = categorical_pipeline.fit_transform(X.loc[:, categorical_cols])
test_cat = pd.DataFrame(
    data=cat,
    columns=categorical_pipeline.get_feature_names_out(),
)

In [99]:
# Show the one-hot encoded preview (one 0/1 column per category).
test_cat

Unnamed: 0,school_GP,school_MS,sex_F,sex_M,address_R,address_U,famsize_GT3,famsize_LE3,Pstatus_A,Pstatus_T,...,Fjob_other,Fjob_services,Fjob_teacher,reason_course,reason_home,reason_other,reason_reputation,guardian_father,guardian_mother,guardian_other
0,1.0,0.0,1.0,0.0,0.0,1.0,1.0,0.0,1.0,0.0,...,0.0,0.0,1.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0
1,1.0,0.0,1.0,0.0,0.0,1.0,1.0,0.0,0.0,1.0,...,1.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0
2,1.0,0.0,1.0,0.0,0.0,1.0,0.0,1.0,0.0,1.0,...,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0
3,1.0,0.0,1.0,0.0,0.0,1.0,1.0,0.0,0.0,1.0,...,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0
4,1.0,0.0,1.0,0.0,0.0,1.0,1.0,0.0,0.0,1.0,...,1.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
377,0.0,1.0,1.0,0.0,0.0,1.0,0.0,1.0,0.0,1.0,...,0.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0
378,0.0,1.0,1.0,0.0,0.0,1.0,1.0,0.0,0.0,1.0,...,1.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0
379,0.0,1.0,1.0,0.0,0.0,1.0,1.0,0.0,0.0,1.0,...,1.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0
380,0.0,1.0,0.0,1.0,0.0,1.0,0.0,1.0,0.0,1.0,...,0.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0


###### Numerical Pipeline

In [100]:
# Numeric branch: fill missing values with the most frequent value, then
# standardise to zero mean / unit variance.
numerical_pipeline = make_pipeline(
    SimpleImputer(strategy='most_frequent'),
    StandardScaler(),
)

In [101]:
# Sanity check: fit the numeric branch alone and keep the scaled values as a frame.
num = numerical_pipeline.fit_transform(X.loc[:, numerical_cols])
test_num = pd.DataFrame(
    data=num,
    columns=numerical_pipeline.get_feature_names_out(),
)

In [102]:
# Show the standardised `age` preview.
test_num

Unnamed: 0,age
0,1.206223
1,0.352932
2,-1.353650
3,-1.353650
4,-0.500359
...,...
377,1.206223
378,1.206223
379,1.206223
380,0.352932


###### Preprocessing Pipeline

In [103]:
# Route each column group to its own branch. Any column not listed in one of
# the three groups is passed through unchanged (remainder='passthrough').
column_branches = [
    (ordinal_pipeline, ordinal_cols),
    (categorical_pipeline, categorical_cols),
    (numerical_pipeline, numerical_cols),
]
preprocess = make_column_transformer(*column_branches, remainder='passthrough')
preprocess

In [104]:
# Preview of the full preprocessing output. Column names are prefixed with the
# branch that produced them (pipeline-1__, pipeline-2__, pipeline-3__).
test_dataf = preprocess.fit_transform(X)

pd.DataFrame(
    data=test_dataf,
    columns=preprocess.get_feature_names_out(),
)

Unnamed: 0,pipeline-1__Medu,pipeline-1__Fedu,pipeline-1__traveltime,pipeline-1__studytime,pipeline-1__failures,pipeline-1__famrel,pipeline-1__freetime,pipeline-1__goout,pipeline-1__Dalc,pipeline-1__Walc,...,pipeline-2__Fjob_services,pipeline-2__Fjob_teacher,pipeline-2__reason_course,pipeline-2__reason_home,pipeline-2__reason_other,pipeline-2__reason_reputation,pipeline-2__guardian_father,pipeline-2__guardian_mother,pipeline-2__guardian_other,pipeline-3__age
0,4.0,4.0,1.0,1.0,0.0,3.0,2.0,3.0,0.0,0.0,...,0.0,1.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,1.206223
1,1.0,1.0,0.0,1.0,0.0,4.0,2.0,2.0,0.0,0.0,...,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.352932
2,1.0,1.0,0.0,1.0,0.0,3.0,2.0,1.0,1.0,2.0,...,0.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,-1.353650
3,4.0,2.0,0.0,2.0,0.0,2.0,1.0,1.0,0.0,0.0,...,1.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,-1.353650
4,3.0,3.0,0.0,1.0,0.0,3.0,2.0,1.0,0.0,1.0,...,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,-0.500359
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
377,3.0,1.0,0.0,1.0,0.0,3.0,2.0,3.0,0.0,0.0,...,1.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,1.206223
378,1.0,1.0,0.0,1.0,0.0,3.0,2.0,2.0,0.0,0.0,...,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,1.206223
379,1.0,1.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,1.206223
380,3.0,1.0,1.0,0.0,0.0,1.0,3.0,4.0,2.0,3.0,...,1.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.352932


###### Model reg
First, we will attempt to predict the exact grade using a random forest regression model.

In [117]:
# Regression pipeline: shared preprocessing followed by a random forest.
# GridSearchCV and RandomForestRegressor are imported in the scikit-learn
# import cell near the top of the notebook.
# random_state is pinned (same seed as the train/test splits) so that the grid
# search and the reported scores are reproducible from one run to the next.
model_reg = make_pipeline(preprocess, RandomForestRegressor(random_state=42))
model_reg

Train/test split for both targets (`y1`: Math, `y2`: Portuguese).

In [106]:
# Same 70/30 split and seed for both targets, so MAT and POR are evaluated on
# identical rows.
split_opts = {'test_size': 0.3, 'random_state': 42}
X_train_mat, X_test_mat, y_train_mat, y_test_mat = train_test_split(X, y1, **split_opts)
X_train_por, X_test_por, y_train_por, y_test_por = train_test_split(X, y2, **split_opts)

# Shape report: feature shapes on one line, target shapes on the next.
shape_report = [
    'MAT:',
    f'{X_train_mat.shape} {X_test_mat.shape}',
    f'{y_train_mat.shape} {y_test_mat.shape}',
    '',
    'POR:',
    f'{X_train_por.shape} {X_test_por.shape}',
    f'{y_train_por.shape} {y_test_por.shape}',
]
print('\n'.join(shape_report))


MAT:
(267, 29) (115, 29)
(267,) (115,)

POR:
(267, 29) (115, 29)
(267,) (115,)


Reassigning the model to the two targets.

In [118]:
# Give each target its own unfitted copy of the regression pipeline.
# Plain assignment would make both names point at the SAME object, so fitting
# one directly would silently overwrite the other.
model_mat = clone(model_reg)
model_por = clone(model_reg)

GridSearchCV 

In [109]:
# Search space for the forest. make_pipeline names the final step
# 'randomforestregressor', hence the '<step>__<param>' keys.
params = {
    f'randomforestregressor__{name}': candidates
    for name, candidates in {
        'n_estimators': [100, 200, 500],
        'max_depth': [None, 5, 10],
        'min_samples_split': [2, 5, 10],
        'min_samples_leaf': [1, 2, 4],
    }.items()
}

# One 4-fold grid search per target.
grid_mat, grid_por = (
    GridSearchCV(estimator, param_grid=params, cv=4)
    for estimator in (model_mat, model_por)
)

In [110]:
# Run the cross-validated grid search for the Math grade on the training split only.
grid_mat.fit(X_train_mat, y_train_mat)

In [111]:
# Hyper-parameter combination with the best mean cross-validation score (Math).
grid_mat.best_params_

{'randomforestregressor__max_depth': 5,
 'randomforestregressor__min_samples_leaf': 2,
 'randomforestregressor__min_samples_split': 2,
 'randomforestregressor__n_estimators': 100}

In [112]:
# Held-out test score of the best Math model. No `scoring` was passed to the
# grid, so this is the regressor's default metric (R² in scikit-learn).
grid_mat.score(X_test_mat,y_test_mat)

0.11816353020481096

In [114]:
# Run the cross-validated grid search for the Portuguese grade on the training split only.
grid_por.fit(X_train_por, y_train_por)

In [115]:
# Hyper-parameter combination with the best mean cross-validation score (Portuguese).
grid_por.best_params_

{'randomforestregressor__max_depth': None,
 'randomforestregressor__min_samples_leaf': 1,
 'randomforestregressor__min_samples_split': 10,
 'randomforestregressor__n_estimators': 100}

In [116]:
# Held-out test score of the best Portuguese model. No `scoring` was passed to
# the grid, so this is the regressor's default metric (R² in scikit-learn).
grid_por.score(X_test_por,y_test_por)

0.2639063919332302

**Our results are very poor: the test-set R² is only about 0.12 for Math and 0.26 for Portuguese. The nature of our variables seems to be one of the reasons for these low scores. Trying to improve these models appears to be futile given their performance. Therefore, we convert our Y values into classes and switch to classification models.**

Modification of our Y values to binary.

In [119]:
def convert_to_binary(note, pass_mark=10):
    """Turn a numeric grade into a binary class label.

    Parameters
    ----------
    note : int or float
        Grade to classify (the G3 grades in this notebook range from 0 to 20).
    pass_mark : int or float, default 10
        Lowest grade that is mapped to class 1. The default keeps the
        original hard-coded threshold.

    Returns
    -------
    int
        1 if ``note >= pass_mark``, otherwise 0. A NaN grade compares false
        and therefore yields 0.
    """
    return 1 if note >= pass_mark else 0

In [121]:
# Binary version of each target: one label per student, same index as y1 / y2.
y1_clf, y2_clf = (grades.apply(convert_to_binary) for grades in (y1, y2))

New Train Test Split with new Y.

In [125]:
# Re-split with the binary targets. Same proportion and seed as before, so the
# rows in each split are unchanged.
split_opts = {'test_size': 0.3, 'random_state': 42}
X_train_mat, X_test_mat, y_train_mat, y_test_mat = train_test_split(X, y1_clf, **split_opts)
X_train_por, X_test_por, y_train_por, y_test_por = train_test_split(X, y2_clf, **split_opts)

# Shape report: feature shapes on one line, target shapes on the next.
shape_report = [
    'MAT:',
    f'{X_train_mat.shape} {X_test_mat.shape}',
    f'{y_train_mat.shape} {y_test_mat.shape}',
    '',
    'POR:',
    f'{X_train_por.shape} {X_test_por.shape}',
    f'{y_train_por.shape} {y_test_por.shape}',
]
print('\n'.join(shape_report))


MAT:
(267, 29) (115, 29)
(267,) (115,)

POR:
(267, 29) (115, 29)
(267,) (115,)


We readjust our pipelines to include classification models.

##### Classifier

In [126]:
# Classification pipeline: shared preprocessing followed by a random forest.
# RandomForestClassifier is imported in the scikit-learn import cell near the
# top of the notebook.
# random_state is pinned (same seed as the train/test splits) so that the grid
# search and the reported accuracies are reproducible.
model_clf = make_pipeline(preprocess, RandomForestClassifier(random_state=42))
model_clf

In [None]:
# Give each target its own unfitted copy of the classification pipeline.
# Plain assignment would make both names refer to one shared object.
mat_clf = clone(model_clf)
por_clf = clone(model_clf)

In [None]:
# Search space for the classifier. make_pipeline names the final step
# 'randomforestclassifier', hence the '<step>__<param>' keys.
params = {
    f'randomforestclassifier__{name}': candidates
    for name, candidates in {
        'n_estimators': [100, 200, 500],
        'max_depth': [None, 5, 10],
        'min_samples_split': [2, 5, 10],
        'min_samples_leaf': [1, 2, 4],
    }.items()
}

# One 4-fold grid search per target, selecting on accuracy.
grid_mat_clf, grid_por_clf = (
    GridSearchCV(estimator, param_grid=params, cv=4, scoring='accuracy')
    for estimator in (mat_clf, por_clf)
)

###### Grid mat

In [None]:
# Run the cross-validated grid search for the Math pass/fail label on the training split only.
grid_mat_clf.fit(X_train_mat, y_train_mat)

In [163]:
# Keep the winning hyper-parameters for the Math classifier under a name, then display them.
mat_params = grid_mat_clf.best_params_
mat_params

{'randomforestclassifier__max_depth': 5,
 'randomforestclassifier__min_samples_leaf': 2,
 'randomforestclassifier__min_samples_split': 2,
 'randomforestclassifier__n_estimators': 500}

In [164]:
# Held-out test accuracy of the best Math classifier (the grid was built with scoring='accuracy').
grid_mat_clf.score(X_test_mat, y_test_mat)

0.7304347826086957

###### Grid por

In [162]:
# Run the cross-validated grid search for the Portuguese pass/fail label on the training split only.
grid_por_clf.fit(X_train_por, y_train_por)

In [165]:
# Keep the winning hyper-parameters for the Portuguese classifier under a name, then display them.
por_params = grid_por_clf.best_params_
por_params

{'randomforestclassifier__max_depth': None,
 'randomforestclassifier__min_samples_leaf': 1,
 'randomforestclassifier__min_samples_split': 2,
 'randomforestclassifier__n_estimators': 100}

In [166]:
# Held-out test accuracy of the best Portuguese classifier (the grid was built with scoring='accuracy').
grid_por_clf.score(X_test_por, y_test_por)

0.8956521739130435

### Final models

In [167]:
# Split the features and BOTH binary targets in a single call. train_test_split
# applies the same shuffled indices to every array it receives, so the two
# models share one X_train / X_test instead of overwriting them with a second,
# redundant split.
(X_train, X_test,
 y_train_mat, y_test_mat,
 y_train_por, y_test_por) = train_test_split(
    X, y1_clf, y2_clf, test_size=0.3, random_state=42)

# Report the shapes of the variables created in THIS cell (the previous version
# printed the stale X_train_mat / X_train_por from an earlier cell).
print('MAT:')
print(X_train.shape, X_test.shape)
print(y_train_mat.shape, y_test_mat.shape)
print('')
print('POR:')
print(X_train.shape, X_test.shape)
print(y_train_por.shape, y_test_por.shape)

MAT:
(267, 29) (115, 29)
(267,) (115,)

POR:
(267, 29) (115, 29)
(267,) (115,)


**We add best params:**

In [179]:
# Build the final classifiers straight from the grid-search winners
# (mat_params / por_params) instead of re-typing the values by hand, so the
# final models can never drift from what the search actually selected.
# Each pipeline gets its own clone of `preprocess`: sharing one transformer
# instance would let fitting one model refit the other's preprocessing step.
# random_state is pinned so the reported accuracies are reproducible.
model_mat = make_pipeline(
    clone(preprocess),
    RandomForestClassifier(random_state=42),
).set_params(**mat_params)

model_por = make_pipeline(
    clone(preprocess),
    RandomForestClassifier(random_state=42),
).set_params(**por_params)

In [186]:
# Fit the final Math classifier on the training split. Pipeline.fit returns the
# pipeline itself, so clf_mat and model_mat refer to the same fitted object.
clf_mat = model_mat.fit(X_train, y_train_mat)

In [187]:
# Test-set score of the final Math classifier (a classifier's default score is mean accuracy).
clf_mat.score(X_test, y_test_mat)

0.7304347826086957

In [240]:
# Fit the final Portuguese classifier on the training split. Pipeline.fit
# returns the pipeline itself, so clf_por and model_por are the same object.
clf_por = model_por.fit(X_train, y_train_por)

In [241]:
# Test-set score of the final Portuguese classifier (a classifier's default score is mean accuracy).
clf_por.score(X_test, y_test_por)

0.8956521739130435

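**We obtain much better results with classification models (about 73% test accuracy for Math and 90% for Portuguese) than with regression models. Note, however, that accuracy and R² are not directly comparable, and these accuracies should still be checked against a majority-class baseline (always predicting the most frequent class).**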

**Other treatments can be performed to improve this score, but we will reserve them for another time. Some examples of improvements include Feature Selection, testing more model parameters, and so on.**

**Thanks for reading!**