### Creating and Persisting an ML Model

In [1]:
import pandas as pd
import numpy as np
df = pd.read_csv('data/student-mat.csv', sep=';')

Summary of the data

In [2]:
df.describe()

Unnamed: 0,age,Medu,Fedu,traveltime,studytime,failures,famrel,freetime,goout,Dalc,Walc,health,absences,G1,G2,G3
count,395.0,395.0,395.0,395.0,395.0,395.0,395.0,395.0,395.0,395.0,395.0,395.0,395.0,395.0,395.0,395.0
mean,16.696203,2.749367,2.521519,1.448101,2.035443,0.334177,3.944304,3.235443,3.108861,1.481013,2.291139,3.55443,5.708861,10.908861,10.713924,10.41519
std,1.276043,1.094735,1.088201,0.697505,0.83924,0.743651,0.896659,0.998862,1.113278,0.890741,1.287897,1.390303,8.003096,3.319195,3.761505,4.581443
min,15.0,0.0,0.0,1.0,1.0,0.0,1.0,1.0,1.0,1.0,1.0,1.0,0.0,3.0,0.0,0.0
25%,16.0,2.0,2.0,1.0,1.0,0.0,4.0,3.0,2.0,1.0,1.0,3.0,0.0,8.0,9.0,8.0
50%,17.0,3.0,2.0,1.0,2.0,0.0,4.0,3.0,3.0,1.0,2.0,4.0,4.0,11.0,11.0,11.0
75%,18.0,4.0,3.0,2.0,2.0,0.0,5.0,4.0,4.0,2.0,3.0,5.0,8.0,13.0,13.0,14.0
max,22.0,4.0,4.0,4.0,4.0,3.0,5.0,5.0,5.0,5.0,5.0,5.0,75.0,19.0,19.0,20.0


In [3]:
df.info

<bound method DataFrame.info of     school sex  age address famsize Pstatus  Medu  Fedu      Mjob      Fjob  \
0       GP   F   18       U     GT3       A     4     4   at_home   teacher   
1       GP   F   17       U     GT3       T     1     1   at_home     other   
2       GP   F   15       U     LE3       T     1     1   at_home     other   
3       GP   F   15       U     GT3       T     4     2    health  services   
4       GP   F   16       U     GT3       T     3     3     other     other   
..     ...  ..  ...     ...     ...     ...   ...   ...       ...       ...   
390     MS   M   20       U     LE3       A     2     2  services  services   
391     MS   M   17       U     LE3       T     3     1  services  services   
392     MS   M   21       R     GT3       T     1     1     other     other   
393     MS   M   18       R     LE3       T     3     2  services     other   
394     MS   M   19       U     LE3       T     1     1     other   at_home   

     ... famrel fre

In [4]:
include = ['G1', 'G2', 'G3', 'studytime', 'activities', 'absences', 'Dalc']
df.drop(columns=df.columns.difference(include), inplace=True)  # only using 3 features

The goal is to predict the quality of the student. We will build a predictor based on 'reason','studytime', 'activities', 'Dalc', 'freetime', 'absences', 'G1', 'G2', 'G3'.
Becasue we are trying to find quality students.

In [5]:
#df['qual_student'] = np.where((df['G3']>=15) & (df['G2'] >= 15) & (df['G1'] >= 15) & (df['studytime'] >= 3) & (df['activities'] == 'yes') & (df['absences'] <= 5), 1, 0)

#making standards lower so there will be more "qual_student"s
df['qual_student'] = np.where((df['G3']>=15) & (df['studytime'] >= 2) & (df['activities'] == 'yes') & (df['absences'] <= 50), 1, 0)


In [6]:
df.describe()
display(df)

Unnamed: 0,studytime,activities,Dalc,absences,G1,G2,G3,qual_student
0,2,no,1,6,5,6,6,0
1,2,no,1,4,5,5,6,0
2,2,no,2,10,7,8,10,0
3,3,yes,1,2,15,14,15,1
4,2,no,1,4,6,10,10,0
...,...,...,...,...,...,...,...,...
390,2,no,4,11,9,9,9,0
391,1,no,3,3,14,16,16,0
392,1,no,3,3,10,8,7,0
393,1,no,3,0,11,12,10,0


Drop all the columns we care about

In [7]:
include = ['G1', 'G2','Dalc','qual_student']
df.drop(columns=df.columns.difference(include), inplace=True) 

Import scikit-learn and build a random forest classifer

In [8]:
from sklearn.ensemble import RandomForestClassifier as rf
import sklearn
x = df[df.columns.difference(['qual_student'])]
y = df['qual_student']
x = pd.get_dummies(x)
clf = rf(n_estimators = 1000)
clf.fit(x, y)

RandomForestClassifier(n_estimators=1000)

In [9]:
display(x)

Unnamed: 0,Dalc,G1,G2
0,1,5,6
1,1,5,5
2,2,7,8
3,1,15,14
4,1,6,10
...,...,...,...
390,4,9,9
391,3,14,16
392,3,10,8
393,3,11,12


In [10]:
pred = clf.predict(x)
sklearn.metrics.f1_score(y, pred, average='binary')

0.7777777777777777

Let's export this model so we can use it in a microservice (flask api)

In [11]:
import joblib
# modify the file path to where you want to save the model
joblib.dump(clf, 'app/handlers/model.pkl')

['app/handlers/model.pkl']

In [48]:
query_df = pd.DataFrame({'Dalc' : pd.Series(5), 'G1' : pd.Series(20) ,'G2' : pd.Series(20)})


In [49]:
pred = clf.predict(query_df)
pred

array([0])

In [45]:
x

Unnamed: 0,Dalc,G1,G2
0,1,5,6
1,1,5,5
2,2,7,8
3,1,15,14
4,1,6,10
5,1,15,15
6,1,12,12
7,1,6,5
8,1,16,18
9,1,14,15


In [46]:
type(x)

pandas.core.frame.DataFrame