### Creating and Persisting an ML Model

In [216]:
import pandas as pd
import numpy as np
df = pd.read_csv('data/student-mat.csv', sep=';')

Summary of the data

In [217]:
df.describe()

Unnamed: 0,age,Medu,Fedu,traveltime,studytime,failures,famrel,freetime,goout,Dalc,Walc,health,absences,G1,G2,G3
count,395.0,395.0,395.0,395.0,395.0,395.0,395.0,395.0,395.0,395.0,395.0,395.0,395.0,395.0,395.0,395.0
mean,16.696203,2.749367,2.521519,1.448101,2.035443,0.334177,3.944304,3.235443,3.108861,1.481013,2.291139,3.55443,5.708861,10.908861,10.713924,10.41519
std,1.276043,1.094735,1.088201,0.697505,0.83924,0.743651,0.896659,0.998862,1.113278,0.890741,1.287897,1.390303,8.003096,3.319195,3.761505,4.581443
min,15.0,0.0,0.0,1.0,1.0,0.0,1.0,1.0,1.0,1.0,1.0,1.0,0.0,3.0,0.0,0.0
25%,16.0,2.0,2.0,1.0,1.0,0.0,4.0,3.0,2.0,1.0,1.0,3.0,0.0,8.0,9.0,8.0
50%,17.0,3.0,2.0,1.0,2.0,0.0,4.0,3.0,3.0,1.0,2.0,4.0,4.0,11.0,11.0,11.0
75%,18.0,4.0,3.0,2.0,2.0,0.0,5.0,4.0,4.0,2.0,3.0,5.0,8.0,13.0,13.0,14.0
max,22.0,4.0,4.0,4.0,4.0,3.0,5.0,5.0,5.0,5.0,5.0,5.0,75.0,19.0,19.0,20.0


In [218]:
df.info

<bound method DataFrame.info of     school sex  age address famsize Pstatus  Medu  Fedu      Mjob      Fjob  \
0       GP   F   18       U     GT3       A     4     4   at_home   teacher   
1       GP   F   17       U     GT3       T     1     1   at_home     other   
2       GP   F   15       U     LE3       T     1     1   at_home     other   
3       GP   F   15       U     GT3       T     4     2    health  services   
4       GP   F   16       U     GT3       T     3     3     other     other   
..     ...  ..  ...     ...     ...     ...   ...   ...       ...       ...   
390     MS   M   20       U     LE3       A     2     2  services  services   
391     MS   M   17       U     LE3       T     3     1  services  services   
392     MS   M   21       R     GT3       T     1     1     other     other   
393     MS   M   18       R     LE3       T     3     2  services     other   
394     MS   M   19       U     LE3       T     1     1     other   at_home   

     ... famrel fre

Create a subset of features as an example.

In [219]:
df.columns

Index(['school', 'sex', 'age', 'address', 'famsize', 'Pstatus', 'Medu', 'Fedu',
       'Mjob', 'Fjob', 'reason', 'guardian', 'traveltime', 'studytime',
       'failures', 'schoolsup', 'famsup', 'paid', 'activities', 'nursery',
       'higher', 'internet', 'romantic', 'famrel', 'freetime', 'goout', 'Dalc',
       'Walc', 'health', 'absences', 'G1', 'G2', 'G3'],
      dtype='object')

In [220]:
# Initial Baseline Model
# include = ['health', 'absences','age','G3']

# New Model
include = ['G1', 'G2', 'studytime','health', 'absences','age','G3']
df.drop(columns=df.columns.difference(include), inplace=True)

In [221]:
df.info

<bound method DataFrame.info of      age  studytime  health  absences  G1  G2  G3
0     18          2       3         6   5   6   6
1     17          2       3         4   5   5   6
2     15          2       3        10   7   8  10
3     15          3       5         2  15  14  15
4     16          2       5         4   6  10  10
..   ...        ...     ...       ...  ..  ..  ..
390   20          2       4        11   9   9   9
391   17          1       2         3  14  16  16
392   21          1       3         3  10   8   7
393   18          1       5         0  11  12  10
394   19          1       5         5   8   9   9

[395 rows x 7 columns]>

The goal is to predict the quality of the student. We will build a predictor based on the final grade (G3).
Becasue we are trying to find quality students. In this model we define a quality student as one who achieves a final grade of 15 or higher. 

In [222]:
df['qual_student'] = np.where(df['G3']>=15, 1, 0)

In [223]:
df.describe()

Unnamed: 0,age,studytime,health,absences,G1,G2,G3,qual_student
count,395.0,395.0,395.0,395.0,395.0,395.0,395.0,395.0
mean,16.696203,2.035443,3.55443,5.708861,10.908861,10.713924,10.41519,0.18481
std,1.276043,0.83924,1.390303,8.003096,3.319195,3.761505,4.581443,0.388636
min,15.0,1.0,1.0,0.0,3.0,0.0,0.0,0.0
25%,16.0,1.0,3.0,0.0,8.0,9.0,8.0,0.0
50%,17.0,2.0,4.0,4.0,11.0,11.0,11.0,0.0
75%,18.0,2.0,5.0,8.0,13.0,13.0,14.0,0.0
max,22.0,4.0,5.0,75.0,19.0,19.0,20.0,1.0


In [224]:
df.columns

Index(['age', 'studytime', 'health', 'absences', 'G1', 'G2', 'G3',
       'qual_student'],
      dtype='object')

Drop the G3 score

In [225]:
df.drop(columns=['G3'], inplace=True)

Import scikit-learn and build a random forest classifer

In [226]:
from sklearn.ensemble import RandomForestClassifier
import sklearn
dependent_variable = 'qual_student'
x = df[df.columns.difference([dependent_variable])]
y = df[dependent_variable]

#splitting the train and test sets
from sklearn.model_selection import train_test_split
X_train, X_test, y_train,y_test= train_test_split(x, y, test_size=0.3)

np.random.seed(2)

clf = RandomForestClassifier(n_estimators = 1000)
clf.fit(X_train, y_train)

RandomForestClassifier(n_estimators=1000)

In [227]:
pred = clf.predict(X_test)
sklearn.metrics.f1_score(y_test, pred, average='binary')

0.9583333333333334

It's not very good! We didn't even cross validate. You'll need to do better :)
Let's export this model so we can use it in a microservice (flask api)

Conduct cross validation to tune Random Forest parameters

In [228]:
from sklearn.model_selection import RandomizedSearchCV
# Number of trees in random forest
n_estimators = [int(x) for x in np.linspace(start = 200, stop = 2000, num = 10)]
# Number of features to consider at every split
max_features = ['auto', 'sqrt']
# Maximum number of levels in tree
max_depth = [int(x) for x in np.linspace(10, 110, num = 11)]
max_depth.append(None)
# Minimum number of samples required to split a node
min_samples_split = [2, 5, 10]
# Minimum number of samples required at each leaf node
min_samples_leaf = [1, 2, 4]
# Method of selecting samples for training each tree
bootstrap = [True, False]
# Create the random grid
# random_grid = {'n_estimators': n_estimators,
#                'max_features': max_features,
#                'max_depth': max_depth,
#                'min_samples_split': min_samples_split,
#                'min_samples_leaf': min_samples_leaf,
#                'bootstrap': bootstrap}
random_grid = {'n_estimators': n_estimators,
               'max_depth': max_depth,
               'max_features': max_features,
               'bootstrap': bootstrap}
print(random_grid)

# Use the random grid to search for best hyperparameters
# First create the base model to tune
rf = RandomForestClassifier()
# Random search of parameters, using 3 fold cross validation, 
# search across 100 different combinations, and use all available cores
rf_random = RandomizedSearchCV(estimator = rf, param_distributions = random_grid, n_iter = 100, cv = 3, verbose=2, random_state=42, n_jobs = -1)
# Fit the random search model
rf_random.fit(X_train, y_train)

{'n_estimators': [200, 400, 600, 800, 1000, 1200, 1400, 1600, 1800, 2000], 'max_depth': [10, 20, 30, 40, 50, 60, 70, 80, 90, 100, 110, None], 'max_features': ['auto', 'sqrt'], 'bootstrap': [True, False]}
Fitting 3 folds for each of 100 candidates, totalling 300 fits


RandomizedSearchCV(cv=3, estimator=RandomForestClassifier(), n_iter=100,
                   n_jobs=-1,
                   param_distributions={'bootstrap': [True, False],
                                        'max_depth': [10, 20, 30, 40, 50, 60,
                                                      70, 80, 90, 100, 110,
                                                      None],
                                        'max_features': ['auto', 'sqrt'],
                                        'n_estimators': [200, 400, 600, 800,
                                                         1000, 1200, 1400, 1600,
                                                         1800, 2000]},
                   random_state=42, verbose=2)

In [229]:
rf_random.best_params_

{'n_estimators': 2000,
 'max_features': 'auto',
 'max_depth': 10,
 'bootstrap': True}

In [241]:
import sklearn
best_random = rf_random.best_estimator_
pred_cv = best_random.predict(X_test)
sklearn.metrics.f1_score(y_test, pred_cv, average='binary')

0.9583333333333334

In [242]:
import joblib
# modify the file path to where you want to save the model
joblib.dump(clf, 'dockerfile/apps/model.pkl')

['dockerfile/apps/model.pkl']

In [252]:
query_df = pd.DataFrame({'G1' : pd.Series(2), 'G2' : pd.Series(2), 'absences' : pd.Series(10), 'age' : pd.Series(1) ,'health' : pd.Series(15) , 'studytime': pd.Series(2)})

In [253]:
pred = clf.predict(query_df)

In [250]:
x

Unnamed: 0,G1,G2,absences,age,health,studytime
0,5,6,6,18,3,2
1,5,5,4,17,3,2
2,7,8,10,15,3,2
3,15,14,2,15,5,3
4,6,10,4,16,5,2
...,...,...,...,...,...,...
390,9,9,11,20,4,2
391,14,16,3,17,2,1
392,10,8,3,21,3,1
393,11,12,0,18,5,1


In [255]:
type(x)

pandas.core.frame.DataFrame