## Load data

In [1]:
# Load the original model
import joblib
model_original = joblib.load('app/handlers/model.pkl')

In [2]:
# Load the production data
import pandas as pd
import numpy as np
df = pd.read_csv('prod.csv', sep=',')
df
# We chose to remove all attributtes not relevant to the CMU setting.
# `school` is specific to the source data set, and the grades are also
# not relevant (we keep the last grade to know the expected prediction value)
df.drop(columns=['school', 'G1', 'G2'], inplace=True)
df

df['sex'] = np.where(df['sex'] == 'F', 1, 0)
df['address'] = np.where(df['address'] == 'U', 1, 0)
df['famsize'] = np.where(df['famsize'] == 'LE3', 1, 0)
df['Pstatus'] = np.where(df['Pstatus'] == 'T', 1, 0)

for binary_attribute in ['schoolsup', 'famsup', 'paid', 'activities', 'nursery', 'higher', 'internet', 'romantic']:
    df[binary_attribute] = np.where(df[binary_attribute] == 'yes', 1, 0)

df

categories = {
    'Mjob': ['teacher', 'health', 'services', 'at_home', 'other'],
    'Fjob': ['teacher', 'health', 'services', 'at_home', 'other'],
    'reason': ['home', 'reputation', 'course', 'other'],
    'guardian': ['mother', 'father', 'other'],
}

for category in categories:
    values = categories[category]
    df[category] = df[category].map(values.index)

df

df['qual_student'] = np.where(df['G3'] >= 15, 1, 0)
df.drop(columns=['G3'], inplace=True)

df


dependent_variable = 'qual_student'
x_raw = df[df.columns.difference([dependent_variable])]
x = x_raw.to_numpy()
y = df[dependent_variable].to_numpy()


## Statistics for the original model

In [3]:
y_hat = model_original.predict(x)

[Parallel(n_jobs=7)]: Using backend ThreadingBackend with 7 concurrent workers.
[Parallel(n_jobs=7)]: Done  36 tasks      | elapsed:    0.0s
[Parallel(n_jobs=7)]: Done 186 tasks      | elapsed:    0.1s
[Parallel(n_jobs=7)]: Done 436 tasks      | elapsed:    0.1s
[Parallel(n_jobs=7)]: Done 786 tasks      | elapsed:    0.2s
[Parallel(n_jobs=7)]: Done 1236 tasks      | elapsed:    0.3s
[Parallel(n_jobs=7)]: Done 1786 tasks      | elapsed:    0.4s
[Parallel(n_jobs=7)]: Done 2436 tasks      | elapsed:    0.5s
[Parallel(n_jobs=7)]: Done 3186 tasks      | elapsed:    0.7s
[Parallel(n_jobs=7)]: Done 4036 tasks      | elapsed:    0.9s
[Parallel(n_jobs=7)]: Done 4986 tasks      | elapsed:    1.1s
[Parallel(n_jobs=7)]: Done 6036 tasks      | elapsed:    1.3s
[Parallel(n_jobs=7)]: Done 7186 tasks      | elapsed:    1.5s
[Parallel(n_jobs=7)]: Done 8436 tasks      | elapsed:    1.8s
[Parallel(n_jobs=7)]: Done 9786 tasks      | elapsed:    2.1s
[Parallel(n_jobs=7)]: Done 11236 tasks      | elapsed:  

In [7]:
import sklearn

print("Accuracy score", sklearn.metrics.accuracy_score(y, y_hat))
print("Precision score", sklearn.metrics.precision_score(y, y_hat))
print("Recall score", sklearn.metrics.recall_score(y, y_hat))

Accuracy score 0.84705
Precision score 0.9364161849710982
Recall score 0.05046728971962617


## Train the second iteration of the model

In [8]:
import sklearn
import sklearn.model_selection
from sklearn.ensemble import RandomForestClassifier

dependent_variable = 'qual_student'
x = df[df.columns.difference([dependent_variable])]
y = df[dependent_variable]

X_train, X_test, y_train, y_test = sklearn.model_selection.train_test_split(
    x, y, random_state=1)

n_jobs = 7
model_new = RandomForestClassifier(criterion='gini', n_estimators=11750, max_depth=7, min_samples_split=6,
                               min_samples_leaf=6, max_features='auto', oob_score=True, random_state=42, n_jobs=7, verbose=1)
model_new.fit(X_train, y_train)


  warn(
[Parallel(n_jobs=7)]: Using backend ThreadingBackend with 7 concurrent workers.
[Parallel(n_jobs=7)]: Done  36 tasks      | elapsed:    0.1s
[Parallel(n_jobs=7)]: Done 186 tasks      | elapsed:    0.2s
[Parallel(n_jobs=7)]: Done 436 tasks      | elapsed:    0.5s
[Parallel(n_jobs=7)]: Done 786 tasks      | elapsed:    0.9s
[Parallel(n_jobs=7)]: Done 1236 tasks      | elapsed:    1.4s
[Parallel(n_jobs=7)]: Done 1786 tasks      | elapsed:    2.1s
[Parallel(n_jobs=7)]: Done 2436 tasks      | elapsed:    2.8s
[Parallel(n_jobs=7)]: Done 3186 tasks      | elapsed:    3.6s
[Parallel(n_jobs=7)]: Done 4036 tasks      | elapsed:    4.5s
[Parallel(n_jobs=7)]: Done 4986 tasks      | elapsed:    5.6s
[Parallel(n_jobs=7)]: Done 6036 tasks      | elapsed:    6.8s
[Parallel(n_jobs=7)]: Done 7186 tasks      | elapsed:    8.1s
[Parallel(n_jobs=7)]: Done 8436 tasks      | elapsed:    9.5s
[Parallel(n_jobs=7)]: Done 9786 tasks      | elapsed:   11.3s
[Parallel(n_jobs=7)]: Done 11236 tasks      | el

## Statistics for the second iteration of the model

In [9]:
y_hat = model_original.predict(X_test)

[Parallel(n_jobs=7)]: Using backend ThreadingBackend with 7 concurrent workers.
[Parallel(n_jobs=7)]: Done  36 tasks      | elapsed:    0.0s
[Parallel(n_jobs=7)]: Done 186 tasks      | elapsed:    0.0s
[Parallel(n_jobs=7)]: Done 436 tasks      | elapsed:    0.1s
[Parallel(n_jobs=7)]: Done 786 tasks      | elapsed:    0.1s
[Parallel(n_jobs=7)]: Done 1236 tasks      | elapsed:    0.2s
[Parallel(n_jobs=7)]: Done 1786 tasks      | elapsed:    0.3s
[Parallel(n_jobs=7)]: Done 2436 tasks      | elapsed:    0.4s
[Parallel(n_jobs=7)]: Done 3186 tasks      | elapsed:    0.5s
[Parallel(n_jobs=7)]: Done 4036 tasks      | elapsed:    0.6s
[Parallel(n_jobs=7)]: Done 4986 tasks      | elapsed:    0.8s
[Parallel(n_jobs=7)]: Done 6036 tasks      | elapsed:    0.9s
[Parallel(n_jobs=7)]: Done 7186 tasks      | elapsed:    1.1s
[Parallel(n_jobs=7)]: Done 8436 tasks      | elapsed:    1.3s
[Parallel(n_jobs=7)]: Done 9786 tasks      | elapsed:    1.5s
[Parallel(n_jobs=7)]: Done 11236 tasks      | elapsed:  

In [13]:
print("Accuracy score", sklearn.metrics.accuracy_score(y_test, y_hat))
print("Precision score", sklearn.metrics.precision_score(y_test, y_hat))
print("Recall score", sklearn.metrics.recall_score(y_test, y_hat))


Accuracy score 0.8456
Precision score 0.9534883720930233
Recall score 0.05055487053020962
