### Creating and Persisting an ML Model

In [1]:
import numpy as np
import pandas as pd
import os
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn import metrics
import pandas as pd
import numpy as np
data = pd.read_csv('data/student-mat.csv', sep=';')
train = data
test = pd.read_csv('data/ProductionData.csv', sep=',')

Summary of the data

In [2]:
train.head()

Unnamed: 0,school,sex,age,address,famsize,Pstatus,Medu,Fedu,Mjob,Fjob,...,famrel,freetime,goout,Dalc,Walc,health,absences,G1,G2,G3
0,GP,F,18,U,GT3,A,4,4,at_home,teacher,...,4,3,4,1,1,3,6,5,6,6
1,GP,F,17,U,GT3,T,1,1,at_home,other,...,5,3,3,1,1,3,4,5,5,6
2,GP,F,15,U,LE3,T,1,1,at_home,other,...,4,3,2,2,3,3,10,7,8,10
3,GP,F,15,U,GT3,T,4,2,health,services,...,3,2,2,1,1,5,2,15,14,15
4,GP,F,16,U,GT3,T,3,3,other,other,...,4,3,2,1,2,5,4,6,10,10


Create a subset of features as an example.

In [3]:
test.head()

Unnamed: 0,school,sex,age,address,famsize,Pstatus,Medu,Fedu,Mjob,Fjob,...,famrel,freetime,goout,Dalc,Walc,health,absences,G1,G2,G3
0,GP,M,16,U,LE3,T,3,2,services,other,...,3,3,2,4,5,5,7,13,14,14
1,GP,F,15,R,GT3,T,2,2,other,health,...,5,4,5,1,1,4,7,8,8,9
2,GP,M,18,U,GT3,T,4,2,teacher,other,...,4,2,2,1,2,1,2,16,15,15
3,GP,M,16,U,GT3,T,1,1,at_home,other,...,5,2,3,1,2,2,1,13,13,13
4,GP,M,19,U,GT3,A,3,3,services,services,...,3,3,1,1,4,3,2,12,12,12


In [4]:
include = ['health', 'absences','age', 'failures', 'Dalc', 'internet', 'paid', 'higher',
           'studytime', 'address','G3']
train.drop(columns=train.columns.difference(include), inplace=True) 
test.drop(columns=test.columns.difference(include), inplace=True) 

The goal is to predict the quality of the student. We will build a predictor based on the final grade (G3).
Becasue we are trying to find quality students. In this model we define a quality student as one who achieves a final grade of 15 or higher. 

In [5]:
train['qual_student'] = np.where(train['G3']>=15, 1, 0)
test['qual_student'] = np.where(test['G3']>=15, 1, 0)

In [6]:
train['higher_int'] = np.where(train['higher'] == "yes", 1, 0)
test['higher_int'] = np.where(test['higher'] == "yes", 1, 0)

train['paid_int'] = np.where(train['paid'] == "yes", 1, 0)
test['paid_int'] = np.where(test['paid'] == "yes", 1, 0)

train['internet_int'] = np.where(train['internet'] == "yes", 1, 0)
test['internet_int'] = np.where(test['internet'] == "yes", 1, 0)

train['address_int'] = np.where(train['address'] == "U", 1, 0)
test['address_int'] = np.where(test['address'] == "yes", 1, 0)

train['higher_int'] = np.where(train['higher'] == "yes", 1, 0)
test['higher_int'] = np.where(test['higher'] == "yes", 1, 0)

In [7]:
train.head()

Unnamed: 0,age,address,studytime,failures,paid,higher,internet,Dalc,health,absences,G3,qual_student,higher_int,paid_int,internet_int,address_int
0,18,U,2,0,no,yes,no,1,3,6,6,0,1,0,0,1
1,17,U,2,0,no,yes,yes,1,3,4,6,0,1,0,1,1
2,15,U,2,3,yes,yes,yes,2,3,10,10,0,1,1,1,1
3,15,U,3,0,yes,yes,yes,1,5,2,15,1,1,1,1,1
4,16,U,2,0,yes,yes,no,1,5,4,10,0,1,1,0,1


Drop the G3 score

In [12]:
include = ['health', 'absences','age', 'failures', 'Dalc', 'internet_int', 'higher_int', 'paid_int',
           'studytime','address_int','qual_student']
train.drop(columns=train.columns.difference(include), inplace=True)
test.drop(columns=test.columns.difference(include), inplace=True)

Import scikit-learn and build a random forest classifer

In [14]:
from sklearn.ensemble import RandomForestClassifier as rf
import sklearn
dependent_variable = 'qual_student'
X_train = train[train.columns.difference([dependent_variable])]
Y_train = train[dependent_variable]

X_test = test[test.columns.difference([dependent_variable])]
Y_test = test[dependent_variable]

clf = rf(n_estimators = 1000)
clf.fit(X_train, Y_train)

In [15]:
pred = clf.predict(X_test)
sklearn.metrics.f1_score(Y_test, pred, average='binary')

0.1509433962264151

In [16]:
import lime
import lime.lime_tabular

In [26]:
predict_fn_rf = lambda x: clf.predict_proba(x).astype(float)
X = X_train.values
explainer = lime.lime_tabular.LimeTabularExplainer(X, feature_names = X_train.columns, class_names=['Not Qualified','Qualified'])

In [30]:
test = pd.concat([X_test, Y_test.to_frame()], axis=1)
test.head(10)

Unnamed: 0,Dalc,absences,address_int,age,failures,health,higher_int,internet_int,paid_int,studytime,qual_student
0,4,7,0,16,0,5,1,1,1,2,0
1,1,7,0,15,0,4,1,1,0,1,0
2,1,2,0,18,0,1,1,1,1,3,1
3,1,1,0,16,0,2,1,1,1,1,0
4,1,2,0,19,0,3,1,1,1,2,0
5,1,13,0,17,0,5,1,1,1,2,0
6,1,7,0,18,0,3,1,1,0,1,1
7,1,9,0,18,0,2,1,1,0,1,0
8,2,2,0,17,0,5,1,1,0,2,0
9,1,2,0,18,3,5,1,1,0,2,0


In [28]:
test.loc[[421]]

Unnamed: 0,Dalc,absences,address_int,age,failures,health,higher_int,internet_int,paid_int,studytime,qual_student
421,1,11,0,18,1,4,1,1,0,2,0


In [29]:
chosen_instance = test.loc[[421]].values[0]
exp = explainer.explain_instance(chosen_instance, predict_fn_rf, num_features=10)
exp.show_in_notebook(show_all=False)

KeyError: 10

It's not very good! We didn't even cross validate. You'll need to do better :)
Let's export this model so we can use it in a microservice (flask api)

In [83]:
import joblib
# modify the file path to where you want to save the model
joblib.dump(clf, 'app/handlers/model.pkl')

['app/handlers/model.pkl']

In [88]:
query_df = pd.DataFrame({'health' : pd.Series(15) ,'absences' : pd.Series(10), 'age' : pd.Series(1) , 'failures': pd.Series(1), 'Dalc': pd.Series(1), 'internet_int': pd.Series(1), 'higher_int': pd.Series(1), 'paid_int': pd.Series(1),
           'studytime': pd.Series(1),'address_int': pd.Series(1)})

In [89]:
pred = clf.predict(query_df)

Feature names must be in the same order as they were in fit.



In [90]:
x

Unnamed: 0,Dalc,absences,address_int,age,failures,health,higher_int,internet_int,paid_int,studytime
0,1,6,1,18,0,3,1,0,0,2
1,1,4,1,17,0,3,1,1,0,2
2,2,10,1,15,3,3,1,1,1,2
3,1,2,1,15,0,5,1,1,1,3
4,1,4,1,16,0,5,1,0,1,2
...,...,...,...,...,...,...,...,...,...,...
390,4,11,1,20,2,4,1,0,1,2
391,3,3,1,17,0,2,1,1,0,1
392,3,3,0,21,3,3,1,0,0,1
393,3,0,0,18,0,5,1,1,0,1


In [87]:
type(x)

pandas.core.frame.DataFrame