### Creating and Persisting an ML Model

In [None]:
import pandas as pd
import numpy as np
# Load the student dataset; this CSV is read with ';' as its field separator.
df = pd.read_csv('data/student-mat.csv', sep=';')

Summary of the data

In [None]:
# Display pandas' summary statistics for the loaded frame as the cell output.
df.describe()

In [None]:
# Call info() so the column names, dtypes and non-null counts are printed;
# a bare `df.info` only evaluates to the bound method object.
df.info()

Create a subset of features as an example.

In [None]:
# Columns kept for this example: the candidate predictors plus the final grade G3.
include = ['failures', 'schoolsup', 'internet', 'studytime', 'absences','Medu','Fedu','paid','famsup','G3']
# Everything that is not in the keep-list is removed from the frame in place.
unwanted_cols = [col for col in df.columns if col not in include]
df.drop(columns=unwanted_cols, inplace=True)

In [None]:
# Call info() to print the reduced frame's columns and dtypes;
# a bare `df.info` only evaluates to the bound method object.
df.info()

The goal is to predict the quality of the student. We will build a predictor based on the final grade (G3).
Because we are trying to find quality students, in this model we define a quality student as one who achieves a final grade of 15 or higher.

In [None]:
# Binary target: 1 when the final grade G3 is at least 15, otherwise 0.
df['qual_student'] = (df['G3'] >= 15).astype(int)

In [None]:
# Encode 'yes' as 1 and anything else as 0. Accepting an existing 1 as well
# keeps the cell idempotent: re-running it no longer resets the column to 0.
df['schoolsup'] = df['schoolsup'].isin(['yes', 1]).astype(int)

In [None]:
# Encode 'yes' as 1 and anything else as 0. Accepting an existing 1 as well
# keeps the cell idempotent: re-running it no longer resets the column to 0.
df['paid'] = df['paid'].isin(['yes', 1]).astype(int)

In [None]:
# Encode 'yes' as 1 and anything else as 0. Accepting an existing 1 as well
# keeps the cell idempotent: re-running it no longer resets the column to 0.
df['internet'] = df['internet'].isin(['yes', 1]).astype(int)

In [None]:
# Encode 'yes' as 1 and anything else as 0. Accepting an existing 1 as well
# keeps the cell idempotent: re-running it no longer resets the column to 0.
df['famsup'] = df['famsup'].isin(['yes', 1]).astype(int)

In [None]:
# Display the summary statistics again, now that the yes/no columns hold 0/1 values.
df.describe()

Drop the G3 score

In [None]:
# Keep the predictors and the derived label; G3 is no longer in the keep-list,
# so it is among the columns removed below.
include = ['failures', 'schoolsup', 'internet', 'studytime', 'absences','Medu','Fedu','paid','famsup','qual_student']
unwanted_cols = [col for col in df.columns if col not in include]
df.drop(columns=unwanted_cols, inplace=True)

Import scikit-learn and build a random forest classifier

In [None]:
from sklearn.ensemble import RandomForestClassifier as rf
import sklearn
from sklearn.model_selection import train_test_split
dependent_variable = 'qual_student'
# Index.difference returns the remaining labels sorted, so the model is fitted
# on the feature columns in sorted order (not in the order of the CSV).
x = df[df.columns.difference([dependent_variable])]
y = df[dependent_variable]

# 'sqrt' replaces max_features='auto': for classifiers 'auto' meant sqrt(n_features),
# and the 'auto' spelling was removed in scikit-learn 1.3.
rfc = rf(criterion='gini',
         n_estimators=5000,
         max_depth=7,
         min_samples_split=6,
         min_samples_leaf=6,
         max_features='sqrt',
         oob_score=True,
         random_state=42,
         n_jobs=-1,
         verbose=1)

# Seed the split as well as the forest so the reported accuracy is reproducible
# across Restart & Run All.
X_train, X_test, y_train, y_test = train_test_split(x, y, test_size=0.3, random_state=42)
rfc.fit(X_train, y_train)
print('****Results****')
print("Accuracy: {:.4%}".format(rfc.score(X_test, y_test)))

Let's export this model so we can use it in a microservice (Flask API).

In [None]:
import joblib
# Adjust this path to wherever the serialized model should be written.
model_output_path = 'app/handlers/model.pkl'
joblib.dump(rfc, model_output_path)

In [None]:
# A single hypothetical student used to smoke-test the fitted model.
query_df = pd.DataFrame({
    'failures': [0],
    'schoolsup': [1],
    'internet': [1],
    'studytime': [4],
    'absences': [0],
    'Medu': [4],
    'Fedu': [4],
    'paid': [1],
    'famsup': [0],
})[x.columns]  # reorder to the training frame's column order, which predict() expects

In [None]:
# Predict the class for the single-row query frame (1 = quality student, 0 = not).
pred = rfc.predict(query_df)
print(pred)

In [None]:
# Preview the first rows of the feature matrix instead of displaying the whole frame.
x.head()

In [None]:
# Exploratory check: show the Python type of the feature matrix `x`.
type(x)

In [None]:
import pandas as pd
import numpy as np
import os
import joblib
# Load the production data that the persisted model will be explained against.
df = pd.read_csv('data/ProductionData.csv', sep=',')

# Call info() so the summary is printed; a bare `df.info` is a no-op expression.
df.info()

this_dir = "app/handlers"
model_path = os.path.join(this_dir, "model.pkl")
# NOTE: joblib.load unpickles the file, so only load model files you trust.
rfc = joblib.load(model_path)

# Encode the yes/no columns as 1/0, as was done for the training data.
for flag_col in ('schoolsup', 'paid', 'internet', 'famsup'):
    df[flag_col] = np.where(df[flag_col] == 'yes', 1, 0)

import lime.lime_tabular
features = ['failures', 'schoolsup', 'internet', 'studytime', 'absences','Medu','Fedu','paid','famsup']

# LIME passes bare NumPy arrays to the model, so columns are matched purely by
# position. Use the column order recorded by the model at fit time; when the
# estimator does not expose it, fall back to sorted order, which is what the
# training cell's Index.difference produced (assumes model.pkl comes from that cell).
feature_order = list(getattr(rfc, 'feature_names_in_', sorted(features)))
df = df[feature_order]


def predict_rfc_prob(x):
    """Return class probabilities as floats for rows given in `feature_order`."""
    # Re-attach the column names so the model receives the layout it was fitted on.
    return rfc.predict_proba(pd.DataFrame(x, columns=feature_order)).astype(float)


# feature_names must describe the columns of the array actually handed to LIME.
lime_explainer = lime.lime_tabular.LimeTabularExplainer(df.to_numpy(), mode = 'classification', feature_names = feature_order, class_names = ['Not Quality', 'Quality'])

# explain_instance is documented to take a 1-D NumPy row, not a pandas Series.
person_1_lime = lime_explainer.explain_instance(df.iloc[1978].to_numpy(), predict_rfc_prob, num_features = 9)
person_1_lime.show_in_notebook()

In [None]:
# Full data set
import pandas as pd
import numpy as np

df = pd.read_csv('data/ProductionData.csv', sep=',')

# Derive the label from G3, then turn the yes/no columns into 1/0 flags.
df['qual_student'] = (df['G3'] >= 15).astype(int)
for flag_col in ('schoolsup', 'paid', 'internet', 'famsup'):
    df[flag_col] = (df[flag_col] == 'yes').astype(int)

# Reduce the frame to the model's predictor columns only.
features = ['failures', 'schoolsup', 'internet', 'studytime', 'absences','Medu','Fedu','paid','famsup']
extra_cols = [name for name in df.columns if name not in features]
df.drop(columns=extra_cols, inplace=True)

df.describe()

In [None]:
# Subset with only quality students
import pandas as pd
import numpy as np

df = pd.read_csv('data/ProductionData.csv', sep=',')

# Label each row, then encode the yes/no columns as 1/0.
df['qual_student'] = np.where(df['G3']>=15, 1, 0)
for flag_col in ('schoolsup', 'paid', 'internet', 'famsup'):
    df[flag_col] = np.where(df[flag_col] == 'yes', 1, 0)

features = ['failures', 'schoolsup', 'internet', 'studytime', 'absences','Medu','Fedu','paid','famsup']
# Preserve the frame's own column order while restricting to the predictors.
kept_cols = [col for col in df.columns if col in features]

# Keep only the quality students (qual_student == 1). The previous code dropped
# exactly those rows, so the summary described the non-quality students instead.
dfq = df.loc[df['qual_student'] == 1, kept_cols]

dfq.describe()

In [None]:
# Subset with only NON quality students
import pandas as pd
import numpy as np
import os
import joblib

df = pd.read_csv('data/ProductionData.csv', sep=',')

this_dir = "app/handlers"
model_path = os.path.join(this_dir, "model.pkl")
# The model is not used for this summary; the load is kept so `rfc` is still
# (re)bound by this cell. joblib.load unpickles the file: only load trusted files.
rfc = joblib.load(model_path)

# Label each row, then encode the yes/no columns as 1/0.
df['qual_student'] = np.where(df['G3']>=15, 1, 0)
for flag_col in ('schoolsup', 'paid', 'internet', 'famsup'):
    df[flag_col] = np.where(df[flag_col] == 'yes', 1, 0)

features = ['failures', 'schoolsup', 'internet', 'studytime', 'absences','Medu','Fedu','paid','famsup']
# Preserve the frame's own column order while restricting to the predictors.
kept_cols = [col for col in df.columns if col in features]

# Keep only the NON quality students (qual_student == 0). The previous code dropped
# exactly those rows, so the summary described the quality students instead.
dfnq = df.loc[df['qual_student'] == 0, kept_cols]

dfnq.describe()

In [None]:
import random
import pandas as pd
import numpy as np
import os
import joblib

df = pd.read_csv('data/ProductionData.csv', sep=',')
# Untouched copy of the raw data that additionally carries the true label.
# Copying the frame avoids reading the same CSV from disk a second time.
dfTRUE = df.copy()
dfTRUE['qual_student'] = np.where(dfTRUE['G3']>=15, 1, 0)

# Load model
this_dir = "app/handlers"
model_path = os.path.join(this_dir, "model.pkl")
# NOTE: joblib.load unpickles the file, so only load model files you trust.
rfc = joblib.load(model_path)

# Modify values from string to numbers
for flag_col in ('schoolsup', 'paid', 'internet', 'famsup'):
    df[flag_col] = np.where(df[flag_col] == 'yes', 1, 0)

import lime.lime_tabular
features = ['failures', 'schoolsup', 'internet', 'studytime', 'absences','Medu','Fedu','paid','famsup']

# LIME passes bare NumPy arrays to the model, so columns are matched purely by
# position. Use the column order recorded by the model at fit time; when the
# estimator does not expose it, fall back to sorted order, which is what the
# training cell's Index.difference produced (assumes model.pkl comes from that cell).
feature_order = list(getattr(rfc, 'feature_names_in_', sorted(features)))
df = df[feature_order]


# Prediction function
def predict_rfc_prob(x):
    """Return class probabilities as floats for rows given in `feature_order`."""
    # Re-attach the column names so the model receives the layout it was fitted on.
    return rfc.predict_proba(pd.DataFrame(x, columns=feature_order)).astype(float)


# feature_names must describe the columns of the array actually handed to LIME.
lime_explainer = lime.lime_tabular.LimeTabularExplainer(df.to_numpy(), mode = 'classification', feature_names = feature_order, class_names = ['Not Quality', 'Quality'])

def getSample(n_samples=30, seed=None):
    """Show LIME explanations for a random sample of distinct rows.

    For every sampled row position this prints the position and the true
    ``qual_student`` label taken from ``dfTRUE``, then renders the LIME
    explanation of the matching row of ``df`` in the notebook.

    Relies on the notebook globals ``df``, ``dfTRUE``, ``lime_explainer`` and
    ``predict_rfc_prob`` being defined before it is called.

    Parameters
    ----------
    n_samples : int, default 30
        Number of distinct rows to explain; capped at the number of rows in ``df``.
    seed : int or None, default None
        When given, seeds a private random generator so the same rows are
        drawn on every run; ``None`` uses the global ``random`` state.
    """
    n_rows = len(df)
    rng = random.Random(seed) if seed is not None else random
    # sample() draws without replacement, which guarantees distinct positions.
    # The earlier hand-rolled check reset its "seen" list on every iteration and
    # drew from a hard-coded range(20000) regardless of the frame's length.
    chosen_positions = rng.sample(range(n_rows), min(n_samples, n_rows))

    for row_pos in chosen_positions:
        print("Index: " + str(row_pos))
        # Positional lookup on both frames so they always refer to the same row.
        print("Quality student: " + str(dfTRUE["qual_student"].iloc[row_pos]))
        # explain_instance is documented to take a 1-D NumPy row.
        explanation = lime_explainer.explain_instance(df.iloc[row_pos].to_numpy(), predict_rfc_prob, num_features = 9)
        explanation.show_in_notebook()
        
# Run the sampling-and-explanation loop defined above.
getSample()