### Creating and Persisting an ML Model

In [85]:
import pandas as pd
import numpy as np
# Load the semicolon-separated student dataset from the local data folder.
DATA_PATH = 'data/student-mat.csv'
df = pd.read_csv(DATA_PATH, sep=';')

Summary of the data

In [86]:
# Summary statistics (count, mean, std, min, quartiles, max) of the numeric columns.
df.describe()

Unnamed: 0,age,Medu,Fedu,traveltime,studytime,failures,famrel,freetime,goout,Dalc,Walc,health,absences,G1,G2,G3
count,395.0,395.0,395.0,395.0,395.0,395.0,395.0,395.0,395.0,395.0,395.0,395.0,395.0,395.0,395.0,395.0
mean,16.696203,2.749367,2.521519,1.448101,2.035443,0.334177,3.944304,3.235443,3.108861,1.481013,2.291139,3.55443,5.708861,10.908861,10.713924,10.41519
std,1.276043,1.094735,1.088201,0.697505,0.83924,0.743651,0.896659,0.998862,1.113278,0.890741,1.287897,1.390303,8.003096,3.319195,3.761505,4.581443
min,15.0,0.0,0.0,1.0,1.0,0.0,1.0,1.0,1.0,1.0,1.0,1.0,0.0,3.0,0.0,0.0
25%,16.0,2.0,2.0,1.0,1.0,0.0,4.0,3.0,2.0,1.0,1.0,3.0,0.0,8.0,9.0,8.0
50%,17.0,3.0,2.0,1.0,2.0,0.0,4.0,3.0,3.0,1.0,2.0,4.0,4.0,11.0,11.0,11.0
75%,18.0,4.0,3.0,2.0,2.0,0.0,5.0,4.0,4.0,2.0,3.0,5.0,8.0,13.0,13.0,14.0
max,22.0,4.0,4.0,4.0,4.0,3.0,5.0,5.0,5.0,5.0,5.0,5.0,75.0,19.0,19.0,20.0


Create a subset of features as an example.

In [88]:
# Columns kept for this example: a handful of features plus the final grade
# (G3), from which the target label is derived in a later cell.
include = ['studytime', 'reason','activities','absences', 'higher', 'traveltime', 'failures', 'Dalc', 'Walc', 'G3']

# `columns.difference` silently ignores names that are not present, so a typo
# in `include` (or re-running this cell after G3 was dropped) would quietly
# produce the wrong feature set. Fail loudly instead.
missing_cols = [col for col in include if col not in df.columns]
if missing_cols:
    raise KeyError(f"Columns listed in `include` but not found in df: {missing_cols}")

# Rebind rather than mutate in place; dropping the complement keeps the
# surviving columns in their original order.
df = df.drop(columns=df.columns.difference(include))

The goal is to predict the quality of a student. We will build a predictor based on the final grade (G3).
Because we are trying to find quality students, in this model we define a quality student as one who achieves a final grade of 15 or higher.

In [89]:
# Binary target: 1 when the final grade G3 is at least 15, otherwise 0.
df['qual_student'] = (df['G3'] >= 15).astype(int)

In [90]:
# Re-check the summary after subsetting; the mean of qual_student is the share of positive labels.
df.describe()

Unnamed: 0,traveltime,studytime,failures,Dalc,Walc,absences,G3,qual_student
count,395.0,395.0,395.0,395.0,395.0,395.0,395.0,395.0
mean,1.448101,2.035443,0.334177,1.481013,2.291139,5.708861,10.41519,0.18481
std,0.697505,0.83924,0.743651,0.890741,1.287897,8.003096,4.581443,0.388636
min,1.0,1.0,0.0,1.0,1.0,0.0,0.0,0.0
25%,1.0,1.0,0.0,1.0,1.0,0.0,8.0,0.0
50%,1.0,2.0,0.0,1.0,2.0,4.0,11.0,0.0
75%,2.0,2.0,0.0,2.0,3.0,8.0,14.0,0.0
max,4.0,4.0,3.0,5.0,5.0,75.0,20.0,1.0


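Drop the G3 score, because the target label was derived directly from it, and one-hot encode the categorical features.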


In [91]:
# G3 must not remain a feature: qual_student was computed from it, so keeping
# it would leak the label into the model.
# dtype=int pins the indicator columns to 0/1 integers; without it, recent
# pandas versions emit True/False columns, which would no longer match the
# integer-encoded query frame built in the usage example.
df = pd.get_dummies(df.drop(columns=['G3']), dtype=int)
df

Unnamed: 0,traveltime,studytime,failures,Dalc,Walc,absences,qual_student,reason_course,reason_home,reason_other,reason_reputation,activities_no,activities_yes,higher_no,higher_yes
0,2,2,0,1,1,6,0,1,0,0,0,1,0,0,1
1,1,2,0,1,1,4,0,1,0,0,0,1,0,0,1
2,1,2,3,2,3,10,0,0,0,1,0,1,0,0,1
3,1,3,0,1,1,2,1,0,1,0,0,0,1,0,1
4,1,2,0,1,2,4,0,0,1,0,0,1,0,0,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
390,1,2,2,4,5,11,0,1,0,0,0,1,0,0,1
391,2,1,0,3,4,3,1,1,0,0,0,1,0,0,1
392,1,1,3,3,3,3,0,1,0,0,0,1,0,0,1
393,3,1,0,3,4,0,0,1,0,0,0,1,0,0,1


Import scikit-learn and build a random forest classifier

In [154]:
from sklearn.ensemble import RandomForestClassifier as rf
import sklearn
dependent_variable = 'qual_student'

# Every column except the label is a feature. Index.difference returns the
# remaining names in sorted order, so that is the column order the model is
# trained on (and the order a query frame has to follow).
x = df[df.columns.difference([dependent_variable])]
y = df[dependent_variable]

# A fixed seed makes the fitted forest, and therefore the scores and
# probabilities shown below, reproducible across kernel restarts.
clf = rf(n_estimators = 1000, random_state = 42)
clf.fit(x, y)

In [155]:
# Import the submodule explicitly: a bare `import sklearn` does not guarantee
# that `sklearn.metrics` is loaded as an attribute of the package.
import sklearn.metrics

# NOTE: predictions are made on the same rows the model was fitted on, so this
# is a training-set score and will overstate performance on unseen students.
pred = clf.predict(x)
sklearn.metrics.f1_score(y, pred, average='binary')

0.8776978417266188

Save our model

In [None]:
from pathlib import Path

import joblib

# modify the file path to where you want to save the model
MODEL_PATH = Path('app/handlers/model.pkl')

# joblib.dump does not create missing directories; make sure the target
# folder exists so the save does not fail on a fresh checkout.
MODEL_PATH.parent.mkdir(parents=True, exist_ok=True)
joblib.dump(clf, MODEL_PATH)

Example Usage

In [156]:
# A single already-encoded student record, with keys listed in the same
# (sorted) order as the training feature columns.
# NOTE(review): Dalc/Walc/traveltime of 0 and studytime of 5 fall outside the
# min/max ranges reported by df.describe() for the training data - confirm
# these example values are intended.
query_df = pd.DataFrame([{
    "Dalc": 0,
    "Walc": 0,
    "absences": 3,
    "activities_no": 0,
    "activities_yes": 1,
    "failures": 0,
    "higher_no": 0,
    "higher_yes": 1,
    "reason_course": 0,
    "reason_home": 1,
    "reason_other": 0,
    "reason_reputation": 0,
    "studytime": 5,
    "traveltime": 0,
}])

In [157]:
# One-hot encode any raw categorical fields, then align the result to the
# exact training columns: indicator columns absent from the query are filled
# with 0, unknown columns are discarded, and the order matches what the model
# was fitted on.
query_df = pd.get_dummies(query_df, dtype=int).reindex(columns=x.columns, fill_value=0)


In [158]:
# Class probabilities for the query row: one value per class (per scikit-learn,
# columns follow the order of `clf.classes_`).
pred = clf.predict_proba(query_df)
pred

array([[0.215, 0.785]])

In [151]:
# Hard class label for the same query row (1 = quality student, 0 = not).
# Note that this rebinds `pred`, replacing the probabilities from the previous cell.
pred = clf.predict(query_df)
pred

array([1])