# Creating the ML Model for CMU’s admission system

## Load the data

In [18]:
import pandas as pd
import numpy as np
# Read the semicolon-separated student records from disk into `df`.
# The bare `df` expression on the last line renders a preview of the frame.
df = pd.read_csv(filepath_or_buffer='data/student-mat.csv', delimiter=';')
df

Unnamed: 0,school,sex,age,address,famsize,Pstatus,Medu,Fedu,Mjob,Fjob,...,famrel,freetime,goout,Dalc,Walc,health,absences,G1,G2,G3
0,GP,F,18,U,GT3,A,4,4,at_home,teacher,...,4,3,4,1,1,3,6,5,6,6
1,GP,F,17,U,GT3,T,1,1,at_home,other,...,5,3,3,1,1,3,4,5,5,6
2,GP,F,15,U,LE3,T,1,1,at_home,other,...,4,3,2,2,3,3,10,7,8,10
3,GP,F,15,U,GT3,T,4,2,health,services,...,3,2,2,1,1,5,2,15,14,15
4,GP,F,16,U,GT3,T,3,3,other,other,...,4,3,2,1,2,5,4,6,10,10
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
390,MS,M,20,U,LE3,A,2,2,services,services,...,5,5,4,4,5,4,11,9,9,9
391,MS,M,17,U,LE3,T,3,1,services,services,...,2,4,5,3,4,2,3,14,16,16
392,MS,M,21,R,GT3,T,1,1,other,other,...,5,5,3,3,3,3,3,10,8,7
393,MS,M,18,R,LE3,T,3,2,services,other,...,4,4,1,3,4,5,0,11,12,10


## Preprocess the data
### Remove unneeded attributes

In [19]:
# Remove the attributes that are not relevant to the CMU setting:
# `school` is specific to the source data set, and the earlier grades
# `G1`/`G2` are not relevant either. The last grade (`G3`) is kept for now
# because the expected prediction value is derived from it.
#
# `drop` returns a new frame that is bound back to `df` instead of mutating
# the existing object in place, so a frame that an earlier cell displayed is
# left untouched. The drop stays strict: a missing column raises `KeyError`.
df = df.drop(columns=['school', 'G1', 'G2'])
df

Unnamed: 0,sex,age,address,famsize,Pstatus,Medu,Fedu,Mjob,Fjob,reason,...,internet,romantic,famrel,freetime,goout,Dalc,Walc,health,absences,G3
0,F,18,U,GT3,A,4,4,at_home,teacher,course,...,no,no,4,3,4,1,1,3,6,6
1,F,17,U,GT3,T,1,1,at_home,other,course,...,yes,no,5,3,3,1,1,3,4,6
2,F,15,U,LE3,T,1,1,at_home,other,other,...,yes,no,4,3,2,2,3,3,10,10
3,F,15,U,GT3,T,4,2,health,services,home,...,yes,yes,3,2,2,1,1,5,2,15
4,F,16,U,GT3,T,3,3,other,other,home,...,no,no,4,3,2,1,2,5,4,10
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
390,M,20,U,LE3,A,2,2,services,services,course,...,no,no,5,5,4,4,5,4,11,9
391,M,17,U,LE3,T,3,1,services,services,course,...,yes,no,2,4,5,3,4,2,3,16
392,M,21,R,GT3,T,1,1,other,other,course,...,no,no,5,5,3,3,3,3,3,7
393,M,18,R,LE3,T,3,2,services,other,course,...,yes,no,4,4,1,3,4,5,0,10


### Convert binary attributes to numerical attributes

In [20]:
# For each two-valued attribute, the value that is encoded as 1.
# Any other value of that attribute is encoded as 0.
positive_values = {
    'sex': 'F',
    'address': 'U',
    'famsize': 'LE3',
    'Pstatus': 'T',
}
for yes_no_attribute in ['schoolsup', 'famsup', 'paid', 'activities', 'nursery', 'higher', 'internet', 'romantic']:
    positive_values[yes_no_attribute] = 'yes'

for binary_attribute, positive_value in positive_values.items():
    column = df[binary_attribute]
    # Skip columns that are already numeric. Without this guard, running the
    # cell a second time would compare the 0/1 codes against the original
    # string value and silently overwrite the whole column with 0.
    if pd.api.types.is_numeric_dtype(column):
        continue
    df[binary_attribute] = np.where(column == positive_value, 1, 0)

df

Unnamed: 0,sex,age,address,famsize,Pstatus,Medu,Fedu,Mjob,Fjob,reason,...,internet,romantic,famrel,freetime,goout,Dalc,Walc,health,absences,G3
0,1,18,1,0,0,4,4,at_home,teacher,course,...,0,0,4,3,4,1,1,3,6,6
1,1,17,1,0,1,1,1,at_home,other,course,...,1,0,5,3,3,1,1,3,4,6
2,1,15,1,1,1,1,1,at_home,other,other,...,1,0,4,3,2,2,3,3,10,10
3,1,15,1,0,1,4,2,health,services,home,...,1,1,3,2,2,1,1,5,2,15
4,1,16,1,0,1,3,3,other,other,home,...,0,0,4,3,2,1,2,5,4,10
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
390,0,20,1,1,0,2,2,services,services,course,...,0,0,5,5,4,4,5,4,11,9
391,0,17,1,1,1,3,1,services,services,course,...,1,0,2,4,5,3,4,2,3,16
392,0,21,0,0,1,1,1,other,other,course,...,0,0,5,5,3,3,3,3,3,7
393,0,18,0,1,1,3,2,services,other,course,...,1,0,4,4,1,3,4,5,0,10


### Convert nominal attributes to numerical attributes

In [21]:
# Allowed values of each nominal attribute. Every value is replaced by its
# position in the list (for `Mjob`: 'teacher' -> 0, ..., 'other' -> 4).
# NOTE(review): integer codes impose an arbitrary order on nominal values;
# a different encoding would change the feature columns of the exported
# model, so the scheme is kept as is here.
categories = {
    'Mjob': ['teacher', 'health', 'services', 'at_home', 'other'],
    'Fjob': ['teacher', 'health', 'services', 'at_home', 'other'],
    'reason': ['home', 'reputation', 'course', 'other'],
    'guardian': ['mother', 'father', 'other'],
}

for category, values in categories.items():
    column = df[category]
    # Already encoded (e.g. the cell was run before): leave the column alone
    # instead of failing with "0 is not in list".
    if pd.api.types.is_numeric_dtype(column):
        continue

    # Fail with a message that names the column and the offending values
    # when the data contains something outside the declared lists.
    unexpected = set(column.unique()) - set(values)
    if unexpected:
        raise ValueError(
            f"Unexpected values in column {category!r}: {sorted(unexpected, key=str)}"
        )

    codes = {value: code for code, value in enumerate(values)}
    df[category] = column.map(codes)

df

Unnamed: 0,sex,age,address,famsize,Pstatus,Medu,Fedu,Mjob,Fjob,reason,...,internet,romantic,famrel,freetime,goout,Dalc,Walc,health,absences,G3
0,1,18,1,0,0,4,4,3,0,2,...,0,0,4,3,4,1,1,3,6,6
1,1,17,1,0,1,1,1,3,4,2,...,1,0,5,3,3,1,1,3,4,6
2,1,15,1,1,1,1,1,3,4,3,...,1,0,4,3,2,2,3,3,10,10
3,1,15,1,0,1,4,2,1,2,0,...,1,1,3,2,2,1,1,5,2,15
4,1,16,1,0,1,3,3,4,4,0,...,0,0,4,3,2,1,2,5,4,10
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
390,0,20,1,1,0,2,2,2,2,2,...,0,0,5,5,4,4,5,4,11,9
391,0,17,1,1,1,3,1,2,2,2,...,1,0,2,4,5,3,4,2,3,16
392,0,21,0,0,1,1,1,4,4,2,...,0,0,5,5,3,3,3,3,3,7
393,0,18,0,1,1,3,2,2,4,2,...,1,0,4,4,1,3,4,5,0,10


### Create the prediction column

In [22]:
# Minimum last grade (`G3`) for a student to be labelled 1 in `qual_student`.
qualifying_grade = 15

# Derive the binary label from `G3` and remove `G3` itself, so the grade the
# label is computed from is no longer a column of the frame. The new column
# is appended at the end.
# The guard makes the cell safe to re-run: once the label exists, `G3` has
# already been removed and there is nothing left to do. On a first run with
# `G3` missing, the lookup below still raises `KeyError`.
if 'qual_student' not in df.columns:
    df = df.assign(
        qual_student=np.where(df['G3'] >= qualifying_grade, 1, 0)
    ).drop(columns=['G3'])

df

Unnamed: 0,sex,age,address,famsize,Pstatus,Medu,Fedu,Mjob,Fjob,reason,...,internet,romantic,famrel,freetime,goout,Dalc,Walc,health,absences,qual_student
0,1,18,1,0,0,4,4,3,0,2,...,0,0,4,3,4,1,1,3,6,0
1,1,17,1,0,1,1,1,3,4,2,...,1,0,5,3,3,1,1,3,4,0
2,1,15,1,1,1,1,1,3,4,3,...,1,0,4,3,2,2,3,3,10,0
3,1,15,1,0,1,4,2,1,2,0,...,1,1,3,2,2,1,1,5,2,1
4,1,16,1,0,1,3,3,4,4,0,...,0,0,4,3,2,1,2,5,4,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
390,0,20,1,1,0,2,2,2,2,2,...,0,0,5,5,4,4,5,4,11,0
391,0,17,1,1,1,3,1,2,2,2,...,1,0,2,4,5,3,4,2,3,1
392,0,21,0,0,1,1,1,4,4,2,...,0,0,5,5,3,3,3,3,3,0
393,0,18,0,1,1,3,2,2,4,2,...,1,0,4,4,1,3,4,5,0,0


## Training the model
### Create train/test sets

In [None]:
import sklearn
import sklearn.model_selection

# Name of the label column; every other column of `df` is used as a feature.
dependent_variable = 'qual_student'

# `Index.difference` yields the column labels without the label column
# (by default in sorted order rather than the frame's original order).
feature_columns = df.columns.difference([dependent_variable])
x, y = df[feature_columns], df[dependent_variable]

# Split features and labels into train and test parts; `random_state=1`
# pins the shuffle so the split is the same on every run.
X_train, X_test, y_train, y_test = sklearn.model_selection.train_test_split(
    x,
    y,
    random_state=1,
)

In [None]:
import autosklearn.classification
import sklearn.datasets
import sklearn.metrics

# Overall time budget, in seconds, passed as `time_left_for_this_task`.
total_time_sec = 60

# Time limit, in seconds, for a single model run (`per_run_time_limit`).
time_per_job_sec = 30

# Number of jobs run in parallel (`n_jobs`).
n_jobs = 7

# Build the auto-sklearn classifier with the settings above and fit it on
# the training split. The fit call is the cell's last expression, so its
# return value is what the cell displays.
automl = autosklearn.classification.AutoSklearnClassifier(
    time_left_for_this_task=total_time_sec,
    per_run_time_limit=time_per_job_sec,
    n_jobs=n_jobs,
)
automl.fit(X_train, y_train)

## Test the model accuracy

In [None]:
# Predict labels for the held-out test features and compare them with the
# true labels.
# NOTE(review): plain accuracy can look good when one label dominates;
# confirm the balance of `qual_student` before relying on this number alone.
y_hat = automl.predict(X_test)
accuracy = sklearn.metrics.accuracy_score(y_test, y_hat)
print("Accuracy score", accuracy)

## Exporting the model

In [None]:
import joblib

# Serialize the fitted `automl` object to the path below. The directory
# `app/handlers` must already exist relative to the working directory.
# `joblib.dump` is the last expression, so its return value is displayed.
model_path = 'app/handlers/model.pkl'
joblib.dump(automl, model_path)