### Creating and Persisting an ML Model

In [None]:
import pandas as pd
import numpy as np
df = pd.read_csv('data/student-mat.csv', sep=';')

Summary of the data

In [None]:
df.describe()

Unnamed: 0,age,Medu,Fedu,traveltime,studytime,failures,famrel,freetime,goout,Dalc,Walc,health,absences,G1,G2,G3
count,395.0,395.0,395.0,395.0,395.0,395.0,395.0,395.0,395.0,395.0,395.0,395.0,395.0,395.0,395.0,395.0
mean,16.696203,2.749367,2.521519,1.448101,2.035443,0.334177,3.944304,3.235443,3.108861,1.481013,2.291139,3.55443,5.708861,10.908861,10.713924,10.41519
std,1.276043,1.094735,1.088201,0.697505,0.83924,0.743651,0.896659,0.998862,1.113278,0.890741,1.287897,1.390303,8.003096,3.319195,3.761505,4.581443
min,15.0,0.0,0.0,1.0,1.0,0.0,1.0,1.0,1.0,1.0,1.0,1.0,0.0,3.0,0.0,0.0
25%,16.0,2.0,2.0,1.0,1.0,0.0,4.0,3.0,2.0,1.0,1.0,3.0,0.0,8.0,9.0,8.0
50%,17.0,3.0,2.0,1.0,2.0,0.0,4.0,3.0,3.0,1.0,2.0,4.0,4.0,11.0,11.0,11.0
75%,18.0,4.0,3.0,2.0,2.0,0.0,5.0,4.0,4.0,2.0,3.0,5.0,8.0,13.0,13.0,14.0
max,22.0,4.0,4.0,4.0,4.0,3.0,5.0,5.0,5.0,5.0,5.0,5.0,75.0,19.0,19.0,20.0


In [None]:
df.info

<bound method DataFrame.info of     school sex  age address famsize Pstatus  Medu  Fedu      Mjob      Fjob  \
0       GP   F   18       U     GT3       A     4     4   at_home   teacher   
1       GP   F   17       U     GT3       T     1     1   at_home     other   
2       GP   F   15       U     LE3       T     1     1   at_home     other   
3       GP   F   15       U     GT3       T     4     2    health  services   
4       GP   F   16       U     GT3       T     3     3     other     other   
..     ...  ..  ...     ...     ...     ...   ...   ...       ...       ...   
390     MS   M   20       U     LE3       A     2     2  services  services   
391     MS   M   17       U     LE3       T     3     1  services  services   
392     MS   M   21       R     GT3       T     1     1     other     other   
393     MS   M   18       R     LE3       T     3     2  services     other   
394     MS   M   19       U     LE3       T     1     1     other   at_home   

     ... famrel fre

Create a subset of features as an example.

In [None]:
# include = ['health', 'absences','age']  
# => ~75% accuracy @ ~11% f1
# include = ['age', 'Medu', 'Fedu', 'studytime', 'failures', 'schoolsup', 'activities', 'higher', 'freetime', 'goout', 'Walc', 'health', 'absences'] 
# => ~80% accuracy @ ~18% f1
# include = ['school', 'age', 'traveltime', 'studytime', 'failures', 'schoolsup', 'famsup', 'higher', 'freetime', 'Dalc', 'Walc', 'health', 'absences']
# => ~80% accuracy @ ~20% f1

include = ['school', 'age', 'Medu', 'Fedu', 'traveltime', 'studytime', 'failures', 'schoolsup', 'famsup', 'activities', 'higher', 'freetime', 'goout', 'Dalc', 'Walc', 'health', 'absences']

# drop whatever is not in the three features!
include.append('G3')
df.drop(columns=df.columns.difference(include), inplace=True)  # only using 3 features

In [None]:
df.info

<bound method DataFrame.info of     school  age  Medu  Fedu  traveltime  studytime  failures schoolsup famsup  \
0       GP   18     4     4           2          2         0       yes     no   
1       GP   17     1     1           1          2         0        no    yes   
2       GP   15     1     1           1          2         3       yes     no   
3       GP   15     4     2           1          3         0        no    yes   
4       GP   16     3     3           1          2         0        no    yes   
..     ...  ...   ...   ...         ...        ...       ...       ...    ...   
390     MS   20     2     2           1          2         2        no    yes   
391     MS   17     3     1           2          1         0        no     no   
392     MS   21     1     1           1          1         3        no     no   
393     MS   18     3     2           3          1         0        no     no   
394     MS   19     1     1           1          1         0        no     no

The goal is to predict the quality of the student. We will build a predictor based on the final grade (G3).
Becasue we are trying to find quality students. In this model we define a quality student as one who achieves a final grade of 15 or higher. 

In [None]:
# a "quality student" is one whose G3 score is >= 15 (that is what we want to predict using other fields)
df['qual_student'] = np.where(df['G3']>=15, 1, 0)

In [None]:
df.describe()

Unnamed: 0,age,Medu,Fedu,traveltime,studytime,failures,freetime,goout,Dalc,Walc,health,absences,G3,qual_student
count,395.0,395.0,395.0,395.0,395.0,395.0,395.0,395.0,395.0,395.0,395.0,395.0,395.0,395.0
mean,16.696203,2.749367,2.521519,1.448101,2.035443,0.334177,3.235443,3.108861,1.481013,2.291139,3.55443,5.708861,10.41519,0.18481
std,1.276043,1.094735,1.088201,0.697505,0.83924,0.743651,0.998862,1.113278,0.890741,1.287897,1.390303,8.003096,4.581443,0.388636
min,15.0,0.0,0.0,1.0,1.0,0.0,1.0,1.0,1.0,1.0,1.0,0.0,0.0,0.0
25%,16.0,2.0,2.0,1.0,1.0,0.0,3.0,2.0,1.0,1.0,3.0,0.0,8.0,0.0
50%,17.0,3.0,2.0,1.0,2.0,0.0,3.0,3.0,1.0,2.0,4.0,4.0,11.0,0.0
75%,18.0,4.0,3.0,2.0,2.0,0.0,4.0,4.0,2.0,3.0,5.0,8.0,14.0,0.0
max,22.0,4.0,4.0,4.0,4.0,3.0,5.0,5.0,5.0,5.0,5.0,75.0,20.0,1.0


In [None]:
include = ['school', 'age', 'Medu', 'Fedu', 'traveltime', 'studytime', 'failures', 'schoolsup', 'famsup', 'activities', 'higher', 'freetime', 'goout', 'Dalc', 'Walc', 'health', 'absences']


# by doing inclusion w/out the G3 we are able to remove G3 from it (shouldn't be considered)
include.append('qual_student')
df.drop(columns=df.columns.difference(include), inplace=True) 
df.describe()

Unnamed: 0,age,traveltime,studytime,failures,freetime,Dalc,Walc,health,absences,qual_student
count,395.0,395.0,395.0,395.0,395.0,395.0,395.0,395.0,395.0,395.0
mean,16.696203,1.448101,2.035443,0.334177,3.235443,1.481013,2.291139,3.55443,5.708861,0.18481
std,1.276043,0.697505,0.83924,0.743651,0.998862,0.890741,1.287897,1.390303,8.003096,0.388636
min,15.0,1.0,1.0,0.0,1.0,1.0,1.0,1.0,0.0,0.0
25%,16.0,1.0,1.0,0.0,3.0,1.0,1.0,3.0,0.0,0.0
50%,17.0,1.0,2.0,0.0,3.0,1.0,2.0,4.0,4.0,0.0
75%,18.0,2.0,2.0,0.0,4.0,2.0,3.0,5.0,8.0,0.0
max,22.0,4.0,4.0,3.0,5.0,5.0,5.0,5.0,75.0,1.0


Import scikit-learn and build a random forest classifer

In [153]:
from sklearn.ensemble import RandomForestClassifier as rf
from sklearn.model_selection import train_test_split
import sklearn

# create "x" df of the features that determine the "y" df, split data into training and test
dependent_variable = 'qual_student'
df = pd.get_dummies(df)
x = df[df.columns.difference([dependent_variable])]
y = df[dependent_variable]

# benchmark the model by running 10 times to see what the average scores are
accuracy_sum = 0.0
f1_sum = 0.0
runs = 50
for i in range(runs):
    x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.2) # 80/20 train/test split
    clf = rf(n_estimators=1000)
    clf.fit(x_train, y_train)
    y_pred = clf.predict(x_test)
    accuracy_sum += sklearn.metrics.accuracy_score(y_test, y_pred)
    f1_sum += sklearn.metrics.f1_score(y_test, y_pred, average='binary')

accuracy_score = accuracy_sum / runs
f1_score = f1_sum / runs

print("Model Statistics on Test Data")
print("Accuracy: " + str(accuracy_score))
print("FScore: " + str(f1_score))


Model Statistics on Test Data
Accuracy: 0.8075949367088611
FScore: 0.19245215153073542


It's not very good! We didn't even cross validate. You'll need to do better :)
Let's export this model so we can use it in a microservice (flask api)

In [154]:
import joblib
# modify the file path to where you want to save the model
joblib.dump(clf, 'dockerfile/apps/model2.pkl')

['dockerfile/apps/model2.pkl']

In [155]:
query_df = pd.DataFrame({ 'age' : pd.Series(1) ,'health' : pd.Series(15) ,'absences' : pd.Series(10)})



In [156]:
pred = clf.predict(query_df)

ValueError: Number of features of the model must match the input. Model n_features is 17 and input n_features is 3 

In [None]:
x

Unnamed: 0,absences,age,health
0,6,18,3
1,4,17,3
2,10,15,3
3,2,15,5
4,4,16,5
...,...,...,...
390,11,20,4
391,3,17,2
392,3,21,3
393,0,18,5


In [None]:
type(x)

pandas.core.frame.DataFrame