### Creating and Persisting an ML Model

In [1]:
import warnings
warnings.filterwarnings('ignore')

import pandas as pd
import numpy as np
import joblib
from sklearn.ensemble import RandomForestClassifier as rf
import sklearn
df = pd.read_csv('data/student-mat.csv', sep=';')
df

Unnamed: 0,school,sex,age,address,famsize,Pstatus,Medu,Fedu,Mjob,Fjob,...,famrel,freetime,goout,Dalc,Walc,health,absences,G1,G2,G3
0,GP,F,18,U,GT3,A,4,4,at_home,teacher,...,4,3,4,1,1,3,6,5,6,6
1,GP,F,17,U,GT3,T,1,1,at_home,other,...,5,3,3,1,1,3,4,5,5,6
2,GP,F,15,U,LE3,T,1,1,at_home,other,...,4,3,2,2,3,3,10,7,8,10
3,GP,F,15,U,GT3,T,4,2,health,services,...,3,2,2,1,1,5,2,15,14,15
4,GP,F,16,U,GT3,T,3,3,other,other,...,4,3,2,1,2,5,4,6,10,10
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
390,MS,M,20,U,LE3,A,2,2,services,services,...,5,5,4,4,5,4,11,9,9,9
391,MS,M,17,U,LE3,T,3,1,services,services,...,2,4,5,3,4,2,3,14,16,16
392,MS,M,21,R,GT3,T,1,1,other,other,...,5,5,3,3,3,3,3,10,8,7
393,MS,M,18,R,LE3,T,3,2,services,other,...,4,4,1,3,4,5,0,11,12,10


In [2]:
training_indices = [229, 247, 346, 288, 143, 212, 258, 265, 189, 385, 106, 284, 
136, 327, 180, 174, 190, 99, 172, 306, 41, 391, 211, 226, 209, 168, 371, 154, 
85, 91, 310, 44, 72, 195, 287, 271, 255, 75, 241, 204, 133, 243, 7, 23, 140, 113, 
185, 5, 382, 268, 364, 210, 46, 305, 367, 389, 321, 119, 353, 236, 267, 39, 257, 393, 
63, 94, 50, 338, 330, 181, 203, 55, 13, 109, 196, 70, 264, 347, 62, 324, 331, 234, 238, 
313, 381, 383, 295, 96, 157, 8, 370, 123, 291, 298, 4, 227, 394, 349, 128, 150, 225, 175, 93, 
343, 376, 359, 147, 315, 111, 240, 122, 24, 299, 166, 297, 20, 149, 362, 187, 15, 355, 156, 
216, 374, 138, 2, 28, 31, 188, 89, 283, 239, 377, 1, 354, 169, 273, 201, 259, 218, 333, 167, 
318, 342, 358, 290, 345, 214, 88, 141, 253, 73, 270, 129, 351, 199, 260, 252, 282, 329, 289, 116, 
350, 121, 339, 115, 296, 84, 131, 279, 57, 155, 266, 390, 37, 231, 248, 9, 58, 117, 263, 308, 302, 
153, 285, 369, 40, 192, 222, 215, 388, 162, 124, 120, 34, 52, 183, 173, 365, 98, 200, 49, 159, 151, 
312, 228, 207, 348, 92, 366, 235, 320, 22, 130, 178, 202, 326, 378, 11, 294, 179, 10, 380, 205, 
194, 328, 392, 25, 386, 311, 303, 87, 269, 224, 127, 363, 135, 90, 64, 256, 337, 32, 146, 145, 184, 
56, 100, 206, 97, 356, 95, 74, 317, 344, 319, 45, 340, 307, 132, 3, 81, 108, 232, 112, 148, 242, 163, 
213, 361, 38, 223, 142, 27, 83, 276, 105, 0, 245, 59, 262, 29, 198, 373, 65, 101, 221, 332, 275, 43, 251, 
191, 125, 219, 71, 250, 19, 309, 53, 182, 21, 230, 322, 341, 186, 16, 26, 208, 244, 42, 48, 
86, 134, 171, 103, 237, 379]

training_data = df.iloc[training_indices]
testing_data = df.iloc[[i for i in range(len(df)) if i not in training_indices]]

Create a subset of features as an example.

In [3]:
def df_to_grade_features(df):
    result = df[['failures', 'G1', 'G2', 'G3']]
    result["sex"] = (df["sex"] == "male").astype(int)
    result["school"] = (df["school"] == "GP").astype(int)
    return result

df_to_grade_features(training_data)

Unnamed: 0,failures,G1,G2,G3,sex,school
229,0,12,10,12,0,1
247,3,6,8,8,0,1
346,0,16,15,16,0,1
288,0,15,14,14,0,1
143,0,14,14,13,0,1
...,...,...,...,...,...,...
134,0,9,0,0,0,1
171,0,13,15,16,0,1
103,0,7,6,6,0,1
237,0,13,12,12,0,1


In [4]:
def df_to_predict_features(df):
    result = df[['failures', 'G1', 'G2']]
    result["sex"] = (df["sex"] == "male").astype(int)
    result["school"] = (df["school"] == "GP").astype(int)
    result["accept"] = (df["G3"] >= 15).astype(int)
    return result

df_to_predict_features(training_data)

Unnamed: 0,failures,G1,G2,sex,school,accept
229,0,12,10,0,1,0
247,3,6,8,0,1,0
346,0,16,15,0,1,1
288,0,15,14,0,1,0
143,0,14,14,0,1,0
...,...,...,...,...,...,...
134,0,9,0,0,1,0
171,0,13,15,0,1,1
103,0,7,6,0,1,0
237,0,13,12,0,1,0


Import scikit-learn and build a random forest classifer

In [5]:
def make_grade_model(data):
    dependent_variable = 'G3'
    features = df_to_grade_features(data)
    x = features[features.columns.difference([dependent_variable])]
    y = features[dependent_variable]
    clf = rf(n_estimators = 1000)
    clf.fit(x, y)
    return clf
    
grade_model = make_grade_model(training_data)
joblib.dump(grade_model, 'app/handlers/grade_model.pkl')

['app/handlers/grade_model.pkl']

In [6]:
def make_predict_model(data):
    dependent_variable = 'accept'
    features = df_to_predict_features(data)
    x = features[features.columns.difference([dependent_variable])]
    y = features[dependent_variable]
    clf = rf(n_estimators = 1000)
    clf.fit(x, y)
    return clf
    
predict_model = make_predict_model(training_data)
joblib.dump(predict_model, 'app/handlers/predict_model.pkl')

['app/handlers/predict_model.pkl']

In [7]:
def test_grade_model(data):
    dependent_variable = 'G3'
    features = df_to_grade_features(testing_data)
    x = features[features.columns.difference([dependent_variable])]
    y = features[dependent_variable]
    pred = grade_model.predict(x)
    return pd.DataFrame({"Answer": pred, "Expected": y, "Error": abs(pred - y)})

grade_outcome = test_grade_model(testing_data)
grade_outcome

Unnamed: 0,Answer,Expected,Error
6,12,11,1
12,14,14,0
14,15,16,1
17,10,10,0
18,0,5,5
...,...,...,...
368,10,10,0
372,11,11,0
375,8,10,2
384,0,5,5


In [8]:
grade_outcome["Error"].mean()

1.4303797468354431

In [9]:
def test_predict_model(data):
    dependent_variable = 'accept'
    features = df_to_predict_features(testing_data)
    x = features[features.columns.difference([dependent_variable])]
    y = features[dependent_variable]
    pred = predict_model.predict(x)
    return pd.DataFrame({"Answer": pred, "Expected": y, "Error": abs(pred - y)})

predict_outcome = test_predict_model(testing_data)
predict_outcome

Unnamed: 0,Answer,Expected,Error
6,0,0,0
12,0,0,0
14,1,1,0
17,0,0,0
18,0,0,0
...,...,...,...
368,0,0,0
372,0,0,0
375,0,0,0
384,0,0,0


In [10]:
predict_outcome["Error"].mean()

0.012658227848101266