### Creating and Persisting an ML Model

In [18]:
import pandas as pd
import numpy as np

Summary of the data

In [19]:
# Load the student dataset; the CSV uses ';' as its field delimiter.
df = pd.read_csv('data/student-mat.csv', delimiter=';')
# Uncomment either line for a quick look at the data:
# df.describe()
# df.info()

The goal is to predict whether a student is a quality student. We will build a predictor based on the final grade (G3):
in this model, a quality student is defined as one who achieves a final grade of 15 or higher.

Import scikit-learn and build a logistic regression classifier

In [20]:
import sklearn
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, f1_score
from sklearn.preprocessing import OrdinalEncoder, StandardScaler, OneHotEncoder
from sklearn.feature_selection import SelectKBest, chi2
from sklearn.pipeline import make_pipeline
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.compose import make_column_selector, ColumnTransformer
from sklearn.svm import SVC

# Every usable predictor column in df. The raw final grade 'G3' and the derived
# 'qual_student' label are left out on purpose: they are the target.
all_features = [
    'school', 'sex', 'age', 'address', 'famsize', 'Pstatus', 'Medu', 'Fedu',
    'Mjob', 'Fjob', 'reason', 'guardian', 'traveltime', 'studytime',
    'failures', 'schoolsup', 'famsup', 'paid', 'activities', 'nursery',
    'higher', 'internet', 'romantic', 'famrel', 'freetime', 'goout', 'Dalc',
    'Walc', 'health', 'absences', 'G1', 'G2',
]
# The members of all_features that are treated as numerical.
all_numerical_features = [
    'age', 'Medu', 'Fedu', 'traveltime', 'studytime', 'failures', 'famrel',
    'freetime', 'goout', 'Dalc', 'Walc', 'health', 'absences', 'G1', 'G2',
]


# Binary label: 1 when the final grade G3 is 15 or higher, otherwise 0.
df['qual_student'] = (df['G3'] >= 15).astype(int)

# Target vector and feature matrix
y = df['qual_student']
X = df[all_features]

# Categorical vs Numerical Preprocessing
# To find out how to split and use both numerical and categorical features, see
# https://inria.github.io/scikit-learn-mooc/python_scripts/03_categorical_pipeline_column_transformer.html
numerical_selector   = make_column_selector(dtype_include='number')
categorical_selector = make_column_selector(dtype_exclude='number')
# Step names describe the transformer actually used: categorical columns are
# ordinal-encoded here (not one-hot encoded), so each input column stays a
# single output column and keeps its original name.
preprocessor = ColumnTransformer([
    ('standard_scaler', StandardScaler(), numerical_selector),   # numerical features
    ('ordinal_encoder', OrdinalEncoder(), categorical_selector)  # categorical features
    ], verbose_feature_names_out=False)                          # no feature prefixes

# Rank the preprocessed features and keep the 5 best-scoring ones.
# The result is kept in its own variable so it is not silently overwritten by
# the hand-picked list below and can be inspected when the data changes.
selector = make_pipeline(preprocessor, SelectKBest(k=5))
k_best_features = list(selector.fit(X, y).get_feature_names_out())
# Recorded result for k=5:  ['Medu', 'failures', 'absences', 'G1', 'G2']
# however, let's remove 'Medu' because it may be too personal

# Hand-picked final feature set used for training and for queries to the model.
include = ['failures', 'absences', 'G1', 'G2']
print(include)

# Reduce data to the hand-picked features
X_reduced = X[include]
X_train, X_test, y_train, y_test = train_test_split(X_reduced, y, random_state=42)

# Train the model using Logistic Regression. Use GridSearchCV to find params.
# The grid keys carry the step-name prefix that make_pipeline derives from the
# estimator class ('logisticregression'), because the search now tunes a
# pipeline rather than a bare estimator.
parameters = {
    'logisticregression__C'      : np.logspace(-3, 3, 7),
    'logisticregression__solver' : ['newton-cg', 'lbfgs', 'liblinear'],
}
preprocessor = StandardScaler() # we can do this b/c all features are numerical now

# The scaler lives INSIDE the estimator handed to GridSearchCV, so it is re-fit
# on the training part of every fold. Scaling before the search would let the
# validation folds influence the scaling statistics and bias the CV scores.
search = GridSearchCV(make_pipeline(preprocessor, LogisticRegression()),  # model
                      param_grid=parameters,    # hyperparameters
                      scoring='f1',             # metric for scoring
                      cv=5)                     # number of folds
search.fit(X_train, y_train)

# With the default refit=True the best pipeline (scaler + logistic regression)
# is re-trained on all of X_train; that fitted pipeline is the final model.
clf = search.best_estimator_

# Get prediction scores
def print_scores(X, y, model=None):
    """Print accuracy and binary F1 of a fitted model on the given data.

    Parameters
    ----------
    X : feature data passed to ``model.predict``.
    y : true labels that the predictions are compared against.
    model : optional fitted estimator exposing ``predict``. When omitted, the
        notebook-level ``clf`` is used, which keeps existing two-argument
        calls working unchanged.

    Returns
    -------
    None. The scores are written to stdout on a single line.
    """
    # Resolve the default lazily so the function uses whatever `clf` refers to
    # at call time rather than at definition time.
    if model is None:
        model = clf

    # Get predictions
    y_pred = model.predict(X)

    # Print scores
    acc = accuracy_score(y, y_pred)
    f1  = f1_score(y, y_pred, average='binary')
    print('accuracy = {},\tF1 = {}'.format(acc, f1))

# Report the scores for the training split first, then the held-out test split.
# Each label is printed without a newline so the scores follow on the same line.
for label, features, target in (('TRAIN:', X_train, y_train),
                                ('TEST: ', X_test, y_test)):
    print(label, end=' ')
    print_scores(features, target)

['failures', 'absences', 'G1', 'G2']
TRAIN: accuracy = 0.9864864864864865,	F1 = 0.9591836734693877
TEST:  accuracy = 0.9696969696969697,	F1 = 0.9302325581395349


Let's export this model so we can use it in a microservice (Flask API)

In [21]:
from pathlib import Path

import joblib

# TODO: Change these lines
# modify the file path to where you want to save the model
MODEL_PATH = 'app/handlers/model.pkl'

# joblib.dump does not create missing directories, so make sure the target
# folder exists before writing; this is a no-op when it is already there.
Path(MODEL_PATH).parent.mkdir(parents=True, exist_ok=True)
joblib.dump(clf, MODEL_PATH)

# Smoke test: a single-row query using the same columns, in the same order,
# as the features the model was trained on.
query_df = pd.DataFrame({
    'failures': [3],
    'absences': [3],
    'G1': [7],
    'G2': [11],
})
pred = clf.predict(query_df)
print(pred)

[0]
