### Creating and Persisting an ML Model

In [277]:
import pandas as pd
import numpy as np
df = pd.read_csv('data/student-mat.csv', sep=';')

Summary of the data

In [278]:
df.describe()

Unnamed: 0,age,Medu,Fedu,traveltime,studytime,failures,famrel,freetime,goout,Dalc,Walc,health,absences,G1,G2,G3
count,395.0,395.0,395.0,395.0,395.0,395.0,395.0,395.0,395.0,395.0,395.0,395.0,395.0,395.0,395.0,395.0
mean,16.696203,2.749367,2.521519,1.448101,2.035443,0.334177,3.944304,3.235443,3.108861,1.481013,2.291139,3.55443,5.708861,10.908861,10.713924,10.41519
std,1.276043,1.094735,1.088201,0.697505,0.83924,0.743651,0.896659,0.998862,1.113278,0.890741,1.287897,1.390303,8.003096,3.319195,3.761505,4.581443
min,15.0,0.0,0.0,1.0,1.0,0.0,1.0,1.0,1.0,1.0,1.0,1.0,0.0,3.0,0.0,0.0
25%,16.0,2.0,2.0,1.0,1.0,0.0,4.0,3.0,2.0,1.0,1.0,3.0,0.0,8.0,9.0,8.0
50%,17.0,3.0,2.0,1.0,2.0,0.0,4.0,3.0,3.0,1.0,2.0,4.0,4.0,11.0,11.0,11.0
75%,18.0,4.0,3.0,2.0,2.0,0.0,5.0,4.0,4.0,2.0,3.0,5.0,8.0,13.0,13.0,14.0
max,22.0,4.0,4.0,4.0,4.0,3.0,5.0,5.0,5.0,5.0,5.0,5.0,75.0,19.0,19.0,20.0


In [279]:
df.info

<bound method DataFrame.info of     school sex  age address famsize Pstatus  Medu  Fedu      Mjob      Fjob  \
0       GP   F   18       U     GT3       A     4     4   at_home   teacher   
1       GP   F   17       U     GT3       T     1     1   at_home     other   
2       GP   F   15       U     LE3       T     1     1   at_home     other   
3       GP   F   15       U     GT3       T     4     2    health  services   
4       GP   F   16       U     GT3       T     3     3     other     other   
..     ...  ..  ...     ...     ...     ...   ...   ...       ...       ...   
390     MS   M   20       U     LE3       A     2     2  services  services   
391     MS   M   17       U     LE3       T     3     1  services  services   
392     MS   M   21       R     GT3       T     1     1     other     other   
393     MS   M   18       R     LE3       T     3     2  services     other   
394     MS   M   19       U     LE3       T     1     1     other   at_home   

     ... famrel fre

Create a subset of features as an example.

In [280]:
include = ['health', 'absences', 'studytime', 'Medu', 'Fedu', 'Dalc',
           'freetime', 'schoolsup', 'higher', 'internet', 'activities','G3']
df.drop(columns=df.columns.difference(include), inplace=True)  # only using 3 features

In [281]:
df.info

<bound method DataFrame.info of      Medu  Fedu  studytime schoolsup activities higher internet  freetime  \
0       4     4          2       yes         no    yes       no         3   
1       1     1          2        no         no    yes      yes         3   
2       1     1          2       yes         no    yes      yes         3   
3       4     2          3        no        yes    yes      yes         2   
4       3     3          2        no         no    yes       no         3   
..    ...   ...        ...       ...        ...    ...      ...       ...   
390     2     2          2        no         no    yes       no         5   
391     3     1          1        no         no    yes      yes         4   
392     1     1          1        no         no    yes       no         5   
393     3     2          1        no         no    yes      yes         4   
394     1     1          1        no         no    yes      yes         2   

     Dalc  health  absences  G3  
0       1

In [282]:
df['schoolsup'] = df['schoolsup'].replace(to_replace=['no', 'yes'], value=[0, 1])
df['higher'] = df['schoolsup'].replace(to_replace=['no', 'yes'], value=[0, 1])
df['internet'] = df['schoolsup'].replace(to_replace=['no', 'yes'], value=[0, 1])
df['activities'] = df['schoolsup'].replace(to_replace=['no', 'yes'], value=[0, 1])

In [283]:
df.info

<bound method DataFrame.info of      Medu  Fedu  studytime  schoolsup  activities  higher  internet  freetime  \
0       4     4          2          1           1       1         1         3   
1       1     1          2          0           0       0         0         3   
2       1     1          2          1           1       1         1         3   
3       4     2          3          0           0       0         0         2   
4       3     3          2          0           0       0         0         3   
..    ...   ...        ...        ...         ...     ...       ...       ...   
390     2     2          2          0           0       0         0         5   
391     3     1          1          0           0       0         0         4   
392     1     1          1          0           0       0         0         5   
393     3     2          1          0           0       0         0         4   
394     1     1          1          0           0       0         0         2

The goal is to predict the quality of the student. We will build a predictor based on the final grade (G3).
Becasue we are trying to find quality students. In this model we define a quality student as one who achieves a final grade of 15 or higher. 

In [284]:
df['qual_student'] = np.where(df['G3']>=15, 1, 0)

In [285]:
df.describe()

Unnamed: 0,Medu,Fedu,studytime,schoolsup,activities,higher,internet,freetime,Dalc,health,absences,G3,qual_student
count,395.0,395.0,395.0,395.0,395.0,395.0,395.0,395.0,395.0,395.0,395.0,395.0,395.0
mean,2.749367,2.521519,2.035443,0.129114,0.129114,0.129114,0.129114,3.235443,1.481013,3.55443,5.708861,10.41519,0.18481
std,1.094735,1.088201,0.83924,0.335751,0.335751,0.335751,0.335751,0.998862,0.890741,1.390303,8.003096,4.581443,0.388636
min,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,1.0,1.0,0.0,0.0,0.0
25%,2.0,2.0,1.0,0.0,0.0,0.0,0.0,3.0,1.0,3.0,0.0,8.0,0.0
50%,3.0,2.0,2.0,0.0,0.0,0.0,0.0,3.0,1.0,4.0,4.0,11.0,0.0
75%,4.0,3.0,2.0,0.0,0.0,0.0,0.0,4.0,2.0,5.0,8.0,14.0,0.0
max,4.0,4.0,4.0,1.0,1.0,1.0,1.0,5.0,5.0,5.0,75.0,20.0,1.0


Drop the G3 score

In [286]:
include = ['health', 'absences','Dalc','studytime', 'Medu', 'Fedu', 
           'freetime', 'schoolsup', 'higher', 'internet', 'activities', 'paid','qual_student']
df.drop(columns=df.columns.difference(include), inplace=True) 

In [287]:
df.describe()
df.info

<bound method DataFrame.info of      Medu  Fedu  studytime  schoolsup  activities  higher  internet  freetime  \
0       4     4          2          1           1       1         1         3   
1       1     1          2          0           0       0         0         3   
2       1     1          2          1           1       1         1         3   
3       4     2          3          0           0       0         0         2   
4       3     3          2          0           0       0         0         3   
..    ...   ...        ...        ...         ...     ...       ...       ...   
390     2     2          2          0           0       0         0         5   
391     3     1          1          0           0       0         0         4   
392     1     1          1          0           0       0         0         5   
393     3     2          1          0           0       0         0         4   
394     1     1          1          0           0       0         0         2

Import scikit-learn and build a random forest classifer

In [288]:
from sklearn.ensemble import RandomForestClassifier as rf
import sklearn
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn import tree


dependent_variable = 'qual_student'
x = df[df.columns.difference([dependent_variable])]
y = df[dependent_variable]

#splitting the train and test sets
X_train, X_test, y_train,y_test= train_test_split(x, y, test_size=0.3)

In [289]:
clf = rf(n_estimators = 100)
clf.fit(X_train, y_train)

RandomForestClassifier()

In [290]:
from sklearn.metrics import accuracy_score
train_pred = clf.predict(X_train)
sklearn.metrics.f1_score(y_train, train_pred, average='binary')

1.0

In [291]:
accuracy_score(y_train, train_pred)

1.0

In [292]:
pred = clf.predict(X_test)
sklearn.metrics.f1_score(y_test, pred, average='binary')

0.3448275862068966

In [293]:
accuracy_score(y_test, pred)

0.8403361344537815

In [294]:
from sklearn.model_selection import cross_validate
from sklearn.model_selection import cross_val_score
cross_val_score(clf, X_test, y_test, cv=3)

array([0.85      , 0.825     , 0.87179487])

In [295]:
cv_results = cross_validate(clf, X_test, y_test, cv=3)
sorted(cv_results.keys())
['fit_time', 'score_time', 'test_score']
cv_results['test_score']

array([0.85      , 0.825     , 0.87179487])

In [296]:
import joblib
# modify the file path to where you want to save the model
joblib.dump(clf, 'home/matrix/dockerfile/apps/model.pkl')

FileNotFoundError: [Errno 2] No such file or directory: 'home/matrix/dockerfile/apps/model.pkl'

In [299]:
query_df = pd.DataFrame({ 'Medu': pd.Series(4) , 'Fedu': pd.Series(4),  'studytime' : pd.Series(10),
                         'schoolsup' : pd.Series(1),
                'activities' : pd.Series(1),
                       'higher' : pd.Series(1), 'internet' : pd.Series(1), 'freetime': pd.Series(1), 
                         'Dalc' : pd.Series(1), 'health' : pd.Series(15) , 'absences' : pd.Series(10)
                         
                         })

In [300]:
pred = clf.predict(query_df)

Feature names must be in the same order as they were in fit.



In [301]:
x

Unnamed: 0,Dalc,Fedu,Medu,absences,activities,freetime,health,higher,internet,schoolsup,studytime
0,1,4,4,6,1,3,3,1,1,1,2
1,1,1,1,4,0,3,3,0,0,0,2
2,2,1,1,10,1,3,3,1,1,1,2
3,1,2,4,2,0,2,5,0,0,0,3
4,1,3,3,4,0,3,5,0,0,0,2
...,...,...,...,...,...,...,...,...,...,...,...
390,4,2,2,11,0,5,4,0,0,0,2
391,3,1,3,3,0,4,2,0,0,0,1
392,3,1,1,3,0,5,3,0,0,0,1
393,3,2,3,0,0,4,5,0,0,0,1


In [302]:
type(x)

pandas.core.frame.DataFrame

In [303]:
pred

array([0])