### Creating and Persisting an ML Model

In [45]:
import pandas as pd
import numpy as np
df = pd.read_csv('data/student-mat.csv', sep=';')

Create a subset of features as an example.

In [46]:
include = ['address', 'Pstatus','Medu','Fedu','studytime','activities', 'higher', 'internet', 'absences', 'failures']
df.drop(columns=df.columns.difference(include + ['G3']), inplace=True)  # only using 3 features
df.head(n=1)

Unnamed: 0,address,Pstatus,Medu,Fedu,studytime,failures,activities,higher,internet,absences,G3
0,U,A,4,4,2,0,no,yes,no,6,6


Defining a function to change the string data to numerical

In [47]:
def encode_data(df):
    df['address'] = np.where(df['address']=='U', 1, 0)
    df['Pstatus'] = np.where(df['Pstatus']=='T', 1, 0)
    df['activities'] = np.where(df['activities']=='yes', 1, 0)
    df['higher'] = np.where(df['higher']=='yes', 1, 0)
    df['internet'] = np.where(df['internet']=='yes', 1, 0)
    return df

The goal is to predict the quality of the student. We will build a predictor based on the final grade (G3).
Becasue we are trying to find quality students. In this model we define a quality student as one who achieves a final grade of 15 or higher. 

In [48]:
from sklearn.utils import resample

df['qual_student'] = np.where(df['G3']>=15, 1, 0)

df_majority = df[df.qual_student == 0]
df_minority = df[df.qual_student == 1]

# df_majority_downsampled = resample(df_majority, replace=False, n_samples=df_minority.shape[0])
# df = pd.concat([df_majority_downsampled, df_minority])

df_minority_upsampled = resample(df_minority, replace=True, n_samples=df_majority.shape[0])
df = pd.concat([df_minority_upsampled, df_majority])

df.describe()
# df.qual_student.value_counts()

Unnamed: 0,Medu,Fedu,studytime,failures,absences,G3,qual_student
count,644.0,644.0,644.0,644.0,644.0,644.0,644.0
mean,2.92236,2.597826,2.119565,0.214286,4.731366,12.664596,0.5
std,1.092971,1.082044,0.870989,0.611487,7.048656,4.654059,0.500389
min,0.0,0.0,1.0,0.0,0.0,0.0,0.0
25%,2.0,2.0,2.0,0.0,0.0,10.0,0.0
50%,3.0,3.0,2.0,0.0,2.0,14.5,0.5
75%,4.0,4.0,3.0,0.0,6.0,16.0,1.0
max,4.0,4.0,4.0,3.0,75.0,20.0,1.0


Drop the G3 score and remove string data

In [49]:
df.drop(columns=df.columns.difference(include + ['qual_student']), inplace=True) 
df = encode_data(df)
df.head(n=1)

Unnamed: 0,address,Pstatus,Medu,Fedu,studytime,failures,activities,higher,internet,absences,qual_student
27,1,1,4,2,1,0,0,1,1,4,1


Import scikit-learn and build a random forest classifer

In [50]:
from sklearn.ensemble import RandomForestClassifier as rf
from sklearn.neural_network import MLPClassifier
from sklearn.model_selection import train_test_split
import sklearn
dependent_variable = 'qual_student'
x = df[df.columns.difference([dependent_variable])]
y = df[[dependent_variable]]

#splitting the train and test sets
X_train, X_test, y_train,y_test= train_test_split(x, y, test_size=0.5)

# clf = MLPClassifier(solver='lbfgs', alpha=1e-5, hidden_layer_sizes=(5,2), max_iter=100)
clf = rf(n_estimators = 10)
clf.fit(X_train, y_train)

# Determine the training accuracy of the model
print("Training Accuracy {:.4%}".format(clf.score(X_train, y_train)))

Training Accuracy 95.9627%


  clf.fit(X_train, y_train)


In [51]:
# Determine the test accuracy of the model
print("Test Accuracy {:.4%}".format(clf.score(X_test, y_test)))

Test Accuracy 85.0932%


Let's export this model so we can use it in a microservice (flask api)

In [52]:
import joblib
# save the model
joblib.dump(clf, './dockerfile/apps/model.pkl')

['./dockerfile/apps/model.pkl']

In [53]:
query_df = pd.DataFrame({
    'address': pd.Series('U'),
    'Pstatus': pd.Series('T'),
    'Medu': pd.Series(1), 
    'Fedu': pd.Series(3), 
    'studytime': pd.Series(4), 
    'activities': pd.Series('no'),
    'higher': pd.Series('yes'),
    'internet': pd.Series('yes'),
    'failures': pd.Series(0), 
    'absences': pd.Series(1)
})

query_df = encode_data(query_df)

pred = clf.predict(query_df)
pred

Feature names must be in the same order as they were in fit.



array([0])

In [54]:
pred = clf.predict(np.array(X_test.values.tolist()[1]).reshape(1, 10))
pred



array([1])

In [55]:
np.array(X_test.values.tolist()[1]).reshape(1, 10)

array([[3, 3, 1, 3, 1, 1, 0, 1, 1, 1]])