### Creating and Persisting an ML Model

In [1]:
import pandas as pd
import numpy as np
df = pd.read_csv('data/student-mat.csv', sep=';')

Summary of the data

In [2]:
df.describe()
df

Unnamed: 0,age,Medu,Fedu,traveltime,studytime,failures,famrel,freetime,goout,Dalc,Walc,health,absences,G1,G2,G3
count,395.0,395.0,395.0,395.0,395.0,395.0,395.0,395.0,395.0,395.0,395.0,395.0,395.0,395.0,395.0,395.0
mean,16.696203,2.749367,2.521519,1.448101,2.035443,0.334177,3.944304,3.235443,3.108861,1.481013,2.291139,3.55443,5.708861,10.908861,10.713924,10.41519
std,1.276043,1.094735,1.088201,0.697505,0.83924,0.743651,0.896659,0.998862,1.113278,0.890741,1.287897,1.390303,8.003096,3.319195,3.761505,4.581443
min,15.0,0.0,0.0,1.0,1.0,0.0,1.0,1.0,1.0,1.0,1.0,1.0,0.0,3.0,0.0,0.0
25%,16.0,2.0,2.0,1.0,1.0,0.0,4.0,3.0,2.0,1.0,1.0,3.0,0.0,8.0,9.0,8.0
50%,17.0,3.0,2.0,1.0,2.0,0.0,4.0,3.0,3.0,1.0,2.0,4.0,4.0,11.0,11.0,11.0
75%,18.0,4.0,3.0,2.0,2.0,0.0,5.0,4.0,4.0,2.0,3.0,5.0,8.0,13.0,13.0,14.0
max,22.0,4.0,4.0,4.0,4.0,3.0,5.0,5.0,5.0,5.0,5.0,5.0,75.0,19.0,19.0,20.0


In [3]:
df.info

<bound method DataFrame.info of     school sex  age address famsize Pstatus  Medu  Fedu      Mjob      Fjob  \
0       GP   F   18       U     GT3       A     4     4   at_home   teacher   
1       GP   F   17       U     GT3       T     1     1   at_home     other   
2       GP   F   15       U     LE3       T     1     1   at_home     other   
3       GP   F   15       U     GT3       T     4     2    health  services   
4       GP   F   16       U     GT3       T     3     3     other     other   
..     ...  ..  ...     ...     ...     ...   ...   ...       ...       ...   
390     MS   M   20       U     LE3       A     2     2  services  services   
391     MS   M   17       U     LE3       T     3     1  services  services   
392     MS   M   21       R     GT3       T     1     1     other     other   
393     MS   M   18       R     LE3       T     3     2  services     other   
394     MS   M   19       U     LE3       T     1     1     other   at_home   

     ... famrel fre

Create a subset of features as an example.

In [4]:
include = ['school', 'higher','Mjob', 'Fjob', 'studytime', 'paid', 'failures', 'G3']
df.drop(columns=df.columns.difference(include), inplace=True)  # only using 3 features

In [5]:
df.info

<bound method DataFrame.info of      age  health  absences  G3
0     18       3         6   6
1     17       3         4   6
2     15       3        10  10
3     15       5         2  15
4     16       5         4  10
..   ...     ...       ...  ..
390   20       4        11   9
391   17       2         3  16
392   21       3         3   7
393   18       5         0  10
394   19       5         5   9

[395 rows x 4 columns]>

The goal is to predict the quality of the student. We will build a predictor based on the final grade (G3).
Becasue we are trying to find quality students. In this model we define a quality student as one who achieves a final grade of 15 or higher. 

In [6]:
df['qual_student'] = np.where(df['G3']>=15, 1, 0)

In [7]:
df.describe()

Unnamed: 0,age,health,absences,G3,qual_student
count,395.0,395.0,395.0,395.0,395.0
mean,16.696203,3.55443,5.708861,10.41519,0.18481
std,1.276043,1.390303,8.003096,4.581443,0.388636
min,15.0,1.0,0.0,0.0,0.0
25%,16.0,3.0,0.0,8.0,0.0
50%,17.0,4.0,4.0,11.0,0.0
75%,18.0,5.0,8.0,14.0,0.0
max,22.0,5.0,75.0,20.0,1.0


Drop the G3 score

In [8]:
include = ['school', 'schoolsup', 'higher','Mjob', 'Fjob', 'studytime', 'paid', 'failures', 'qual_student']
df.drop(columns=df.columns.difference(include), inplace=True) 

Import scikit-learn and build a random forest classifer

In [9]:
import sklearn
from sklearn import preprocessing, neighbors, svm
from sklearn.ensemble import RandomForestClassifier as rf
from sklearn.model_selection import train_test_split

dependent_variable = 'qual_student'

df = pd.get_dummies(df, drop_first=True)
y = df[dependent_variable]
X = df[df.columns.difference([dependent_variable])]

#splitting the train and test sets
X_train, X_test, y_train,y_test= train_test_split(X, y, test_size=0.3)

In [10]:
rfc = rf(criterion='gini', 
        n_estimators=5750,
        max_depth=5,
        min_samples_split=6,
        min_samples_leaf=6,
        max_features='auto',
        oob_score=True,
        random_state=42,
        n_jobs=-1,
        verbose=1)

rfc.fit(X_train, y_train)
print('****Results****')
print("Accuracy: {:.4%}".format(rfc.score(X_test, y_test)))

0.5185185185185185

It's not very good! We didn't even cross validate. You'll need to do better :)
Let's export this model so we can use it in a microservice (flask api)

In [11]:
import joblib
# modify the file path to where you want to save the model
joblib.dump(rfc, 'app/handlers/model3.pkl')

['app/handlers/model.pkl']

In [12]:
query_df = pd.DataFrame({'Fjob_health' : pd.Series(0), 
                         'Fjob_other' : pd.Series(1), 
                         'Fjob_services': pd.Series(0),
                         'Fjob_teacher': pd.Series(0),
                         'Mjob_health': pd.Series(0),
                         'Mjob_other': pd.Series(0),
                         'Mjob_services': pd.Series(0),
                         'Mjob_teacher': pd.Series(1),
                         'failures': pd.Series(5),
                         'higher_yes': pd.Series(1),
                         'paid_yes': pd.Series(1),
                         'school_MS': pd.Series(0),
                         'studytime': pd.Series(20)})
query_df = pd.get_dummies(query_df, drop_first=True).astype(float)

In [13]:
pred = rfc.predict(query_df)

Feature names must be in the same order as they were in fit.



In [14]:
pred

Unnamed: 0,absences,age,health
0,6,18,3
1,4,17,3
2,10,15,3
3,2,15,5
4,4,16,5
...,...,...,...
390,11,20,4
391,3,17,2
392,3,21,3
393,0,18,5


In [15]:
type(x)

pandas.core.frame.DataFrame

In [None]:
type(pred)

In [None]:
from sklearn.feature_selection import SelectFromModel

rfc = rf(criterion='gini', 
        n_estimators=15750,
        max_depth=25,
        min_samples_split=5,
        min_samples_leaf=5,
        max_features='auto',
        oob_score=True,
        random_state=42,
        n_jobs=-1,
        verbose=1)


## Testing Features
df = pd.read_csv('data/student-mat.csv', sep=';')
df['qual_student'] = np.where(df['G3']>=15, 1, 0)
df.drop(columns=['G3', 'G2', 'G1'], inplace=True) 


# print(df.select_dtypes(exclude=["number","bool_"]).columns)
df = pd.get_dummies(df, columns = df.select_dtypes(exclude=["number","bool_"]).columns)
print(sum(df['qual_student']))

y = df['qual_student']
X = df[df.columns.difference(['qual_student'])]


X_train, X_test, y_train, y_test = train_test_split(X,y,test_size=0.3)

sel = SelectFromModel(rfc)
sel.fit(X_train, y_train)

selected_feat= X_train.columns[(sel.get_support())]
len(selected_feat)

print(selected_feat)


In [None]:

col = df.columns.difference(selected_feat)

X = df[df.columns.difference(col)]


X_train, X_test, y_train, y_test = train_test_split(X,y,test_size=0.3)

sel = SelectFromModel(rfc)
sel.fit(X_train, y_train)

selected_feat= X_train.columns[(sel.get_support())]
len(selected_feat)

print(selected_feat)

In [None]:
df = pd.read_csv('data/student-mat.csv', sep=';')
df['qual_student'] = np.where(df['G3']>=15, 1, 0)
df.drop(columns=['G3', 'G2', 'G1'], inplace=True) 

top_values = selected_feat

df = pd.get_dummies(df, columns = df.select_dtypes(exclude=["number","bool_"]).columns)

y = df['qual_student']
X = df[df.columns.difference(['qual_student'])]
col = df.columns.difference(top_values)

X = df[df.columns.difference(col)]

X_train, X_test, y_train, y_test = train_test_split(X,y,test_size=0.3)

rfc.fit(X_train, y_train)
print('****Results****')
print("Accuracy: {:.4%}".format(rfc.score(X_test, y_test)))

In [None]:
df = pd.read_csv('data/student-mat.csv', sep=';')
df['qual_student'] = np.where(df['G3']>=15, 1, 0)
df.drop(columns=['G3', 'G2', 'G1'], inplace=True) 

df = pd.get_dummies(df, columns = df.select_dtypes(exclude=["number","bool_"]).columns)

y = df['qual_student']
X = df[df.columns.difference(['qual_student'])]
print(X.columns)

rfc = rf(criterion='gini', 
        n_estimators=5750,
        max_depth=25,
        min_samples_split=5,
        min_samples_leaf=5,
        max_features='auto',
        oob_score=True,
        random_state=42,
        n_jobs=-1,
        verbose=1)
X_train, X_test, y_train, y_test = train_test_split(X,y,test_size=0.3)

rfc.fit(X_train, y_train)
print('****Results****')
print("Accuracy: {:.4%}".format(rfc.score(X_test, y_test)))

print(rfc.feature_importances_)
importances_df = pd.DataFrame({'variable':X_train.columns, 'importance': rfc.feature_importances_})
top_N = importances_df.sort_values(by=['importance'], ascending=False)
print(top_N)

In [None]:
most_important_features =  ['Medu', 'health', 'absences', 'age', 'Walc']

df = pd.read_csv('data/student-mat.csv', sep=';')
df['qual_student'] = np.where(df['G3']>=15, 1, 0)
df.drop(columns=['G3', 'G2', 'G1'], inplace=True) 

df = pd.get_dummies(df, columns = df.select_dtypes(exclude=["number","bool_"]).columns)

y = df['qual_student']
col = df.columns.difference(most_important_features)
X = df[df.columns.difference(col)]
print(X.columns)

rfc = rf(criterion='gini', 
        n_estimators=5750,
        max_depth=20,
        min_samples_split=5,
        min_samples_leaf=5,
        max_features='auto',
        oob_score=True,
        random_state=42,
        n_jobs=-1,
        verbose=1)
X_train, X_test, y_train, y_test = train_test_split(X,y,test_size=0.3)

rfc.fit(X_train, y_train)
print('****Results****')
print("Accuracy: {:.4%}".format(rfc.score(X_test, y_test)))

In [None]:
from sklearn.model_selection import GridSearchCV

rfc2 = rf(criterion='gini', 
        n_estimators=7000,
        oob_score=True,
        random_state=42,
        n_jobs=-1,
        verbose=0)

df = pd.read_csv('data/student-mat.csv', sep=';')
df['qual_student'] = np.where(df['G3']>=15, 1, 0)
df.drop(columns=['G3', 'G2', 'G1'], inplace=True) 


y = df['qual_student']
col = df.columns.difference(['Fedu', 'Medu', 'Walc', 'absences', 'age', 'failures', 'freetime',
       'goout', 'health'])
X = df[df.columns.difference(col)]
X = pd.get_dummies(X, columns = X.select_dtypes(exclude=["number","bool_"]).columns)
print(X.columns)

X_train, X_test, y_train, y_test = train_test_split(X,y,test_size=0.3)

param_grid = { 
    'max_features': ['sqrt'],
    'max_depth' : [5, 7, 9, 12],
    'min_samples_split': [4,6,8],
    'min_samples_leaf': [1, 2, 3]
}

CV_rfc = GridSearchCV(estimator=rfc2, param_grid=param_grid, cv= 5)
CV_rfc.fit(X_train, y_train)
print (CV_rfc.best_params_)

In [None]:
print("Accuracy: {:.4%}".format(CV_rfc.best_estimator_.score(X_test, y_test)))

In [None]:
CV_rfc.best_estimator_.feature_importances_
importances_df = pd.DataFrame({'variable':X_train.columns, 'importance': CV_rfc.best_estimator_.feature_importances_})
top_N = importances_df.sort_values(by=['importance'], ascending=False)
print(top_N)

In [None]:
df = pd.read_csv('data/student-mat.csv', sep=';')
df['qual_student'] = np.where(df['G3']>=15, 1, 0)
df.iloc[391]

In [None]:
df.iloc[391]['absences']
df.iloc[391]['goout']
df.iloc[391]['qual_student']

In [None]:
def add_query_val(row, column):
    return "&" + column + "=" + str(row[column])

def query_create(data):
    result = []
    for i in data.index:
        query = "predict?" + "Fedu=" + str(df.iloc[i]["Fedu"])
        query += add_query_val(df.iloc[i], "Medu")
        query += add_query_val(df.iloc[i], "Walc")
        query += add_query_val(df.iloc[i], "absences")
        query += add_query_val(df.iloc[i], "age")
        query += add_query_val(df.iloc[i], "failures")
        query += add_query_val(df.iloc[i], "freetime")
        query += add_query_val(df.iloc[i], "goout")
        query += add_query_val(df.iloc[i], "health")
        result.append(query)
    return result


In [None]:
query_create(df[df["qual_student"] == 1])