# Creating the ML Model for CMU’s admission system

## Load the data

In [None]:
import pandas as pd
import numpy as np
# The input file is semicolon-separated, so the delimiter is passed explicitly.
csv_path = 'data/student-mat.csv'
df = pd.read_csv(csv_path, sep=';')
df

## Preprocess the data
### Remove unneeded attributes

In [None]:
# Drop the attributes that are not relevant to the CMU setting:
# `school` is specific to the source data set, and the earlier grades
# `G1`/`G2` are not relevant either. The last grade `G3` stays for now
# because the expected prediction value is derived from it.
irrelevant_columns = ['school', 'G1', 'G2']
df.drop(irrelevant_columns, axis='columns', inplace=True)
df

### Convert binary attributes to numerical attributes

In [None]:
# Each two-valued attribute is turned into a 0/1 column: rows holding the
# label listed here become 1, every other value becomes 0.
positive_label = {
    'sex': 'F',
    'address': 'U',
    'famsize': 'LE3',
    'Pstatus': 'T',
}
# The remaining binary attributes all use 'yes' as the label encoded as 1.
positive_label.update(dict.fromkeys(
    ['schoolsup', 'famsup', 'paid', 'activities',
     'nursery', 'higher', 'internet', 'romantic'],
    'yes',
))

for column, label in positive_label.items():
    df[column] = np.where(df[column] == label, 1, 0)

df

### Convert nominal attributes to numerical attributes

In [None]:
# Known labels of every nominal attribute. A label is encoded as its position
# in the list, e.g. for `guardian`: 'mother' -> 0, 'father' -> 1, 'other' -> 2.
categories = {
    'Mjob': ['teacher', 'health', 'services', 'at_home', 'other'],
    'Fjob': ['teacher', 'health', 'services', 'at_home', 'other'],
    'reason': ['home', 'reputation', 'course', 'other'],
    'guardian': ['mother', 'father', 'other'],
}

# `list.index` is applied to each cell; it raises ValueError for a label that
# is missing from the list, so unexpected values are not encoded silently.
for column, labels in categories.items():
    df[column] = df[column].map(labels.index)

df

### Create the prediction column

In [None]:
# A student is labelled 1 in `qual_student` when the grade `G3` is at least
# this value, and 0 otherwise.
qualifying_grade = 15

# `pop` removes `G3` from the frame in place and returns the column, so the
# grade itself is no longer part of the data once the label is built.
final_grade = df.pop('G3')
df['qual_student'] = np.where(final_grade >= qualifying_grade, 1, 0)

df

## Training the model
### Create train/test sets

In [None]:
import sklearn
import sklearn.model_selection

# Name of the column the model has to predict.
dependent_variable = 'qual_student'

# Every other column is a feature. NOTE: `Index.difference` returns the
# labels sorted by default, so the feature columns end up in sorted order
# rather than in the frame's original order.
feature_columns = df.columns.difference([dependent_variable])
x = df[feature_columns]
y = df[dependent_variable]

# A fixed `random_state` makes the split reproducible between runs.
X_train, X_test, y_train, y_test = sklearn.model_selection.train_test_split(
    x,
    y,
    random_state=1,
)

In [None]:
import autosklearn.classification
import sklearn.datasets
import sklearn.metrics

# Search budget handed to auto-sklearn; both durations are in seconds.
total_time_sec = 60    # time limit for the whole model search
time_per_job_sec = 30  # time limit for a single model-fitting run
n_jobs = 7             # number of jobs run in parallel

automl = autosklearn.classification.AutoSklearnClassifier(
    time_left_for_this_task=total_time_sec,
    per_run_time_limit=time_per_job_sec,
    n_jobs=n_jobs,
)
automl.fit(X_train, y_train)

## Test the model accuracy

In [None]:
# Print the leaderboard reported by the fitted auto-sklearn classifier.
leaderboard = automl.leaderboard()
print(leaderboard)

In [None]:
# Print the description of the models returned by `show_models()`.
model_description = automl.show_models()
print(model_description)

In [None]:
# Predict the held-out test rows and compare the predictions with the true
# labels to get the accuracy score.
y_hat = automl.predict(X_test)
accuracy = sklearn.metrics.accuracy_score(y_test, y_hat)
print("Accuracy score", accuracy)

## Exporting the model

In [None]:
import joblib
# Serialize the fitted classifier to the path below.
model_path = 'app/handlers/model.pkl'
joblib.dump(automl, model_path)