### Creating and Persisting an ML Model

In [1]:
import pandas as pd
import numpy as np
df = pd.read_csv('data/student-mat.csv', sep=';')

Summary of the data

In [2]:
df.describe()

Unnamed: 0,age,Medu,Fedu,traveltime,studytime,failures,famrel,freetime,goout,Dalc,Walc,health,absences,G1,G2,G3
count,395.0,395.0,395.0,395.0,395.0,395.0,395.0,395.0,395.0,395.0,395.0,395.0,395.0,395.0,395.0,395.0
mean,16.696203,2.749367,2.521519,1.448101,2.035443,0.334177,3.944304,3.235443,3.108861,1.481013,2.291139,3.55443,5.708861,10.908861,10.713924,10.41519
std,1.276043,1.094735,1.088201,0.697505,0.83924,0.743651,0.896659,0.998862,1.113278,0.890741,1.287897,1.390303,8.003096,3.319195,3.761505,4.581443
min,15.0,0.0,0.0,1.0,1.0,0.0,1.0,1.0,1.0,1.0,1.0,1.0,0.0,3.0,0.0,0.0
25%,16.0,2.0,2.0,1.0,1.0,0.0,4.0,3.0,2.0,1.0,1.0,3.0,0.0,8.0,9.0,8.0
50%,17.0,3.0,2.0,1.0,2.0,0.0,4.0,3.0,3.0,1.0,2.0,4.0,4.0,11.0,11.0,11.0
75%,18.0,4.0,3.0,2.0,2.0,0.0,5.0,4.0,4.0,2.0,3.0,5.0,8.0,13.0,13.0,14.0
max,22.0,4.0,4.0,4.0,4.0,3.0,5.0,5.0,5.0,5.0,5.0,5.0,75.0,19.0,19.0,20.0


In [3]:
df.info

<bound method DataFrame.info of     school sex  age address famsize Pstatus  Medu  Fedu      Mjob      Fjob  \
0       GP   F   18       U     GT3       A     4     4   at_home   teacher   
1       GP   F   17       U     GT3       T     1     1   at_home     other   
2       GP   F   15       U     LE3       T     1     1   at_home     other   
3       GP   F   15       U     GT3       T     4     2    health  services   
4       GP   F   16       U     GT3       T     3     3     other     other   
..     ...  ..  ...     ...     ...     ...   ...   ...       ...       ...   
390     MS   M   20       U     LE3       A     2     2  services  services   
391     MS   M   17       U     LE3       T     3     1  services  services   
392     MS   M   21       R     GT3       T     1     1     other     other   
393     MS   M   18       R     LE3       T     3     2  services     other   
394     MS   M   19       U     LE3       T     1     1     other   at_home   

     ... famrel fre

Create a subset of features as an example.

In [4]:
include = ['failures', 'schoolsup', 'internet', 'studytime', 'absences','Medu','Fedu','paid','famsup','G3']
df.drop(columns=df.columns.difference(include), inplace=True)

In [5]:
df.info

<bound method DataFrame.info of      Medu  Fedu  studytime  failures schoolsup famsup paid internet  absences  \
0       4     4          2         0       yes     no   no       no         6   
1       1     1          2         0        no    yes   no      yes         4   
2       1     1          2         3       yes     no  yes      yes        10   
3       4     2          3         0        no    yes  yes      yes         2   
4       3     3          2         0        no    yes  yes       no         4   
..    ...   ...        ...       ...       ...    ...  ...      ...       ...   
390     2     2          2         2        no    yes  yes       no        11   
391     3     1          1         0        no     no   no      yes         3   
392     1     1          1         3        no     no   no       no         3   
393     3     2          1         0        no     no   no      yes         0   
394     1     1          1         0        no     no   no      yes         5

The goal is to predict the quality of the student. We will build a predictor based on the final grade (G3).
Because we are trying to find quality students. In this model we define a quality student as one who achieves a final grade of 15 or higher. 

In [6]:
df['qual_student'] = np.where(df['G3']>=15, 1, 0)

In [7]:
df['schoolsup'] = np.where(df['schoolsup']=='yes', 1, 0)

In [8]:
df['paid'] = np.where(df['paid']=='yes', 1, 0)

In [9]:
df['internet'] = np.where(df['internet']=='yes', 1, 0)

In [10]:
df['famsup'] = np.where(df['famsup']=='yes', 1, 0)

In [11]:
df.describe()

Unnamed: 0,Medu,Fedu,studytime,failures,schoolsup,famsup,paid,internet,absences,G3,qual_student
count,395.0,395.0,395.0,395.0,395.0,395.0,395.0,395.0,395.0,395.0,395.0
mean,2.749367,2.521519,2.035443,0.334177,0.129114,0.612658,0.458228,0.832911,5.708861,10.41519,0.18481
std,1.094735,1.088201,0.83924,0.743651,0.335751,0.487761,0.498884,0.373528,8.003096,4.581443,0.388636
min,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
25%,2.0,2.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,8.0,0.0
50%,3.0,2.0,2.0,0.0,0.0,1.0,0.0,1.0,4.0,11.0,0.0
75%,4.0,3.0,2.0,0.0,0.0,1.0,1.0,1.0,8.0,14.0,0.0
max,4.0,4.0,4.0,3.0,1.0,1.0,1.0,1.0,75.0,20.0,1.0


Drop the G3 score

In [12]:
include = ['failures', 'schoolsup', 'internet', 'studytime', 'absences','Medu','Fedu','paid','famsup','qual_student']
df.drop(columns=df.columns.difference(include), inplace=True) 

Import scikit-learn and build a random forest classifer

In [13]:
from sklearn.ensemble import RandomForestClassifier as rf
import sklearn
from sklearn.model_selection import train_test_split
dependent_variable = 'qual_student'
x = df[df.columns.difference([dependent_variable])]
y = df[dependent_variable]
rfc = rf(criterion='gini', 
                             n_estimators=5000,
                             max_depth=7,
                             min_samples_split=6,
                             min_samples_leaf=6,
                             max_features='auto',
                             oob_score=True,
                             random_state=42,
                             n_jobs=-1,
                             verbose=1)
X_train, X_test, y_train,y_test= train_test_split(x, y, test_size=0.3)
rfc.fit(X_train, y_train)
print('****Results****')
print("Accuracy: {:.4%}".format(rfc.score(X_test, y_test)))

  warn(
[Parallel(n_jobs=-1)]: Using backend ThreadingBackend with 8 concurrent workers.
[Parallel(n_jobs=-1)]: Done  34 tasks      | elapsed:    0.1s
[Parallel(n_jobs=-1)]: Done 184 tasks      | elapsed:    0.6s
[Parallel(n_jobs=-1)]: Done 434 tasks      | elapsed:    1.3s
[Parallel(n_jobs=-1)]: Done 784 tasks      | elapsed:    2.2s
[Parallel(n_jobs=-1)]: Done 1234 tasks      | elapsed:    3.3s
[Parallel(n_jobs=-1)]: Done 1784 tasks      | elapsed:    4.8s
[Parallel(n_jobs=-1)]: Done 2434 tasks      | elapsed:    6.6s
[Parallel(n_jobs=-1)]: Done 3184 tasks      | elapsed:    8.5s
[Parallel(n_jobs=-1)]: Done 4034 tasks      | elapsed:   10.9s
[Parallel(n_jobs=-1)]: Done 4984 tasks      | elapsed:   13.3s
[Parallel(n_jobs=-1)]: Done 5000 out of 5000 | elapsed:   13.3s finished


****Results****


[Parallel(n_jobs=8)]: Using backend ThreadingBackend with 8 concurrent workers.
[Parallel(n_jobs=8)]: Done  34 tasks      | elapsed:    0.0s
[Parallel(n_jobs=8)]: Done 184 tasks      | elapsed:    0.0s
[Parallel(n_jobs=8)]: Done 434 tasks      | elapsed:    0.1s
[Parallel(n_jobs=8)]: Done 784 tasks      | elapsed:    0.2s
[Parallel(n_jobs=8)]: Done 1234 tasks      | elapsed:    0.3s
[Parallel(n_jobs=8)]: Done 1784 tasks      | elapsed:    0.3s
[Parallel(n_jobs=8)]: Done 2434 tasks      | elapsed:    0.5s
[Parallel(n_jobs=8)]: Done 3184 tasks      | elapsed:    0.6s


Accuracy: 83.1933%


[Parallel(n_jobs=8)]: Done 4034 tasks      | elapsed:    0.8s
[Parallel(n_jobs=8)]: Done 4984 tasks      | elapsed:    0.9s
[Parallel(n_jobs=8)]: Done 5000 out of 5000 | elapsed:    0.9s finished


Let's export this model so we can use it in a microservice (flask api)

In [14]:
import joblib
# modify the file path to where you want to save the model
joblib.dump(rfc, 'app/handlers/model.pkl')

['app/handlers/model.pkl']

In [15]:
query_df = pd.DataFrame({ 'failures' : pd.Series(0) ,'schoolsup' : pd.Series(1) ,'internet' : pd.Series(1), 'studytime' : pd.Series(4), 'absences' : pd.Series(0), 'Medu' : pd.Series(4), 'Fedu' : pd.Series(4), 'paid' : pd.Series(1), 'famsup' : pd.Series(0)})

In [16]:
pred = rfc.predict(query_df)
print(pred)

Feature names must be in the same order as they were in fit.

[Parallel(n_jobs=8)]: Using backend ThreadingBackend with 8 concurrent workers.
[Parallel(n_jobs=8)]: Done  34 tasks      | elapsed:    0.0s
[Parallel(n_jobs=8)]: Done 184 tasks      | elapsed:    0.1s
[Parallel(n_jobs=8)]: Done 434 tasks      | elapsed:    0.1s
[Parallel(n_jobs=8)]: Done 784 tasks      | elapsed:    0.2s
[Parallel(n_jobs=8)]: Done 1234 tasks      | elapsed:    0.3s
[Parallel(n_jobs=8)]: Done 1784 tasks      | elapsed:    0.4s
[Parallel(n_jobs=8)]: Done 2434 tasks      | elapsed:    0.5s
[Parallel(n_jobs=8)]: Done 3184 tasks      | elapsed:    0.6s


[0]


[Parallel(n_jobs=8)]: Done 4034 tasks      | elapsed:    0.8s
[Parallel(n_jobs=8)]: Done 4984 tasks      | elapsed:    1.0s
[Parallel(n_jobs=8)]: Done 5000 out of 5000 | elapsed:    1.0s finished


In [17]:
x

Unnamed: 0,Fedu,Medu,absences,failures,famsup,internet,paid,schoolsup,studytime
0,4,4,6,0,0,0,0,1,2
1,1,1,4,0,1,1,0,0,2
2,1,1,10,3,0,1,1,1,2
3,2,4,2,0,1,1,1,0,3
4,3,3,4,0,1,0,1,0,2
...,...,...,...,...,...,...,...,...,...
390,2,2,11,2,1,0,1,0,2
391,1,3,3,0,0,1,0,0,1
392,1,1,3,3,0,0,0,0,1
393,2,3,0,0,0,1,0,0,1


In [18]:
type(x)

pandas.core.frame.DataFrame