### Creating and Persisting an ML Model

In [518]:
import pandas as pd
import numpy as np
df = pd.read_csv('data/student-mat.csv', sep=';')

Create a subset of features as an example.

In [519]:
include = ['health', 'absences','age','G1','G2','G3']
df.drop(columns=df.columns.difference(include), inplace=True)  # only using 3 features

Unnamed: 0,address,Pstatus,Medu,Fedu,studytime,failures,activities,higher,internet,absences,G3
0,U,A,4,4,2,0,no,yes,no,6,6
1,U,T,1,1,2,0,no,yes,yes,4,6
2,U,T,1,1,2,3,no,yes,yes,10,10
3,U,T,4,2,3,0,yes,yes,yes,2,15
4,U,T,3,3,2,0,no,yes,no,4,10


The goal is to predict the quality of the student. We will build a predictor based on the final grade (G3).
Becasue we are trying to find quality students. In this model we define a quality student as one who achieves a final grade of 15 or higher. 

In [520]:
from sklearn.utils import resample

df['qual_student'] = np.where(df['G3']>=15, 1, 0)

df_majority = df[df.qual_student == 0]
df_minority = df[df.qual_student == 1]

# df_majority_downsampled = resample(df_majority, replace=False, n_samples=df_minority.shape[0])
# df = pd.concat([df_majority_downsampled, df_minority])

df_minority_upsampled = resample(df_minority, replace=True, n_samples=df_majority.shape[0])
df = pd.concat([df_minority_upsampled, df_majority])

df.describe()
# df.qual_student.value_counts()

Unnamed: 0,Medu,Fedu,studytime,failures,absences,G3,qual_student
count,644.0,644.0,644.0,644.0,644.0,644.0,644.0
mean,2.913043,2.59472,2.124224,0.226708,5.100932,12.714286,0.5
std,1.072149,1.082321,0.892392,0.627147,6.971441,4.696948,0.500389
min,0.0,0.0,1.0,0.0,0.0,0.0,0.0
25%,2.0,2.0,1.0,0.0,0.0,10.0,0.0
50%,3.0,3.0,2.0,0.0,3.0,14.5,0.5
75%,4.0,4.0,3.0,0.0,7.0,16.0,1.0
max,4.0,4.0,4.0,3.0,75.0,20.0,1.0


Drop the G3 score

In [521]:
include = ['health', 'absences','age','G1','G2','qual_student']
df.drop(columns=df.columns.difference(include), inplace=True) 

Unnamed: 0,address,Pstatus,Medu,Fedu,studytime,failures,activities,higher,internet,absences,qual_student
376,1,0,4,2,3,2,0,1,1,4,1
8,1,1,3,2,2,0,0,1,1,0,1
323,1,0,3,1,3,0,0,1,1,1,1
31,1,0,4,4,2,0,1,1,1,0,1
47,1,0,4,3,4,0,1,1,1,4,1


Import scikit-learn and build a random forest classifer

In [526]:
from sklearn.ensemble import RandomForestClassifier as rf
from sklearn.neural_network import MLPClassifier
from sklearn.model_selection import train_test_split
import sklearn
dependent_variable = 'qual_student'
x = df[df.columns.difference([dependent_variable])]
y = df[[dependent_variable]]

#splitting the train and test sets
X_train, X_test, y_train,y_test= train_test_split(x, y, test_size=0.5)

# clf = MLPClassifier(solver='lbfgs', alpha=1e-5, hidden_layer_sizes=(5,2), max_iter=100)
clf = rf(n_estimators = 10)
clf.fit(X_train, y_train)

# Determine the training accuracy of the model
print("Training Accuracy {:.4%}".format(clf.score(X_train, y_train)))

Training Accuracy 96.2733%


  clf.fit(X_train, y_train)


In [527]:
# Determine the test accuracy of the model
print("Test Accuracy {:.4%}".format(clf.score(X_test, y_test)))

Test Accuracy 81.0559%


Let's export this model so we can use it in a microservice (flask api)

In [472]:
import joblib
# save the model and the one hot encoding
joblib.dump(clf, './dockerfile/apps/model.pkl')

['./dockerfile/apps/model.pkl']

In [477]:
query_df = pd.DataFrame({
    'address': pd.Series('R'),
    'Pstatus': pd.Series('T'),
    'Medu': pd.Series(4), 
    'Fedu': pd.Series(4), 
    'studytime': pd.Series(4), 
    'activities': pd.Series('yes'),
    'higher': pd.Series('yes'),
    'internet': pd.Series('yes'),
    'failures': pd.Series(0), 
    'absences': pd.Series(1)
})

query_df['address'] = np.where(query_df['address']=='U', 1, 0)
query_df['Pstatus'] = np.where(query_df['Pstatus']=='A', 1, 0)
query_df['activities'] = np.where(query_df['activities']=='yes', 1, 0)
query_df['higher'] = np.where(query_df['higher']=='yes', 1, 0)
query_df['internet'] = np.where(query_df['internet']=='yes', 1, 0)

In [478]:
pred = clf.predict(query_df)
pred

Feature names must be in the same order as they were in fit.



array([0])