# Creating the ML Model for CMU’s admission system

## Load the data

In [None]:
import pandas as pd
import numpy as np
# Load the semicolon-separated student data set into a DataFrame.
df = pd.read_csv('data/student-mat.csv', sep=';')
# Call info() (a bare `df.info` only references the bound method) so that
# the column names, dtypes and non-null counts are actually printed.
df.info()

## Preprocess the data
### Remove unneeded attributes

In [None]:
# We chose to remove all attributes not relevant to the CMU setting.
# `school` is specific to the source data set, and the grades `G1` and `G2`
# are also not relevant (we keep the last grade, `G3`, to know the expected
# prediction value).
df.drop(columns=['school', 'G1', 'G2'], inplace=True)
# Call info() (a bare `df.info` only references the bound method) so that
# the remaining columns are actually summarized.
df.info()

### Fix the format of some attributes

In [None]:
# TODO: fix the format of the yes/no attributes (16-23)

### Create the prediction column

In [None]:
# Take the last grade `G3` out of the frame and turn it into the binary
# label `qual_student`: 1 when the grade is at least 15, otherwise 0.
last_grade = df.pop('G3')
df['qual_student'] = np.where(last_grade >= 15, 1, 0)

## Training the model
### Create train/test sets

In [None]:
# Import the submodule explicitly: a bare `import sklearn` does not load
# `sklearn.model_selection`, so the attribute access below could raise
# AttributeError when this cell runs before any cell that imports it.
import sklearn.model_selection

dependent_variable = 'qual_student'
# Features are every column except the label.
# NOTE: `Index.difference` attempts to sort its result by default, so the
# feature columns may not be in the same order as in `df`.
x = df[df.columns.difference([dependent_variable])]
y = df[dependent_variable]

# A fixed random_state keeps the split identical between runs.
X_train, X_test, y_train, y_test = sklearn.model_selection.train_test_split(x, y, random_state=1)

In [None]:
import autosklearn.classification
import sklearn.model_selection
import sklearn.datasets
import sklearn.metrics

# Overall time budget for the whole AutoML task, in seconds
total_time_sec = 60 * 60

# Time limit for a single run, in seconds
time_per_job_sec = 30

# How many jobs are run in parallel
n_jobs = 7

# Collect the classifier settings in one place before building the model.
automl_settings = {
    'time_left_for_this_task': total_time_sec,
    'per_run_time_limit': time_per_job_sec,
    'n_jobs': n_jobs,
}
automl = autosklearn.classification.AutoSklearnClassifier(**automl_settings)
automl.fit(X_train, y_train)

## Test the model accuracy

In [None]:
# Predict labels for the test split and report how many match `y_test`.
y_hat = automl.predict(X_test)
accuracy = sklearn.metrics.accuracy_score(y_test, y_hat)
print("Accuracy score", accuracy)

## Exporting the model

In [None]:
# Persist the fitted AutoML model to disk with joblib.
import joblib

model_path = 'app/handlers/model.pkl'
joblib.dump(automl, model_path)