# Monitoring ML Training Pipeline: Model Training


**Quick recap:**
- Goal: 
    - Build a classification model for loan eligibility that predicts whether a loan should be granted or refused
    - Introduce autonomous monitoring checkpoints orchestrated with Airflow DAGs
- Download raw data: `raw/12196ecaa65e4831987aee4bfced5f60_2015-01-01_2015-05-31.csv`
- Preprocessed the data into:
    - training dataset: `preprocessed/12196ecaa65e4831987aee4bfced5f60_training.csv`
    - test dataset: `preprocessed/12196ecaa65e4831987aee4bfced5f60_inference.csv`

**Next steps:**
- load training and test datasets
- check sanity
- train multiple models: randomForest, gradientBoosting
- select the best model
    - auc >= 0.7
    - abs(auc_train - auc_test) <=0.1
- deploy model: compare the best model to last deployed model

In [12]:
import datetime
import sys
import os
import json
import re
import pickle
import traceback
import pandas as pd
import numpy as np

from sklearn.metrics import roc_auc_score,  accuracy_score, f1_score, precision_score, recall_score, auc
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn.preprocessing import MinMaxScaler, StandardScaler
from pprint import pprint
from importlib import reload

# Make `<parent of the current working directory>/dags/src` importable so the
# project modules `helpers` and `config` can be loaded from this notebook.
sys.path.append(os.path.join(os.path.dirname(os.getcwd()), 'dags', 'src'))

import helpers
import config

# Re-import both modules; presumably so that edits made to them on disk are
# picked up without restarting the kernel.
reload(helpers)
reload(config)

[INFO] Connection to `database-4.cdnugfjpq15f.us-east-1.rds.amazonaws.com:loan_eligibility` initiated!


<module 'config' from 'd:\\Supriya projects\\ML Monitoring Camille\\main\\dags\\src\\config.py'>

In [13]:
#### helpers.py methods ####
def load_dataset(path:str) -> pd.DataFrame:
    """
    Read the CSV file located at `path` into a DataFrame.
    :param path: str, location of the CSV file
    :return: DataFrame, parsed content of the file
    """
    dataset = pd.read_csv(path)
    return dataset

def check_dataset_sanity(df:pd.DataFrame) -> bool:
    """
    Check that a dataset contains no null values.
    :param df: DataFrame, dataset to check (training or test)
    :return: bool, True when no column contains a null value
    :raises Exception: when at least one column contains null values; the message lists those columns
    """
    null_counts = df.isnull().sum()
    columns_with_nulls = null_counts[null_counts > 0].index.tolist()
    if columns_with_nulls:
        # The check is applied to any dataset, so the message does not name a specific one.
        raise Exception(f"There are null values in the dataset: {columns_with_nulls}")
    return True

def save_model_as_pickle(model, model_name, directory=None):
    """
    Serialize `model` with pickle into the file `<model_name>.pkl`.
    The file goes to `directory` when one is given, otherwise to `config.PATH_DIR_MODELS`.
    :param model: AnyType, object to serialize
    :param model_name: str, file name without the `.pkl` extension
    :param directory: str, optional target directory
    :return: None
    """
    target_dir = directory if directory else config.PATH_DIR_MODELS
    filename = os.path.join(target_dir, model_name + ".pkl")
    with open(filename, "wb") as handle:
        pickle.dump(model, handle)
    print("[INFO] Model saved as pickle file:", filename)

def save_model_as_json(model:dict, model_name:str, directory:str=None):
    """
    Write `model` as JSON into the file `<model_name>.json`.
    The file goes to `directory` when one is given, otherwise to `config.PATH_DIR_MODELS`.
    :param model: dict, JSON-serializable content to write
    :param model_name: str, file name without the `.json` extension
    :param directory: str, optional target directory
    :return: None
    """
    target_dir = directory if directory else config.PATH_DIR_MODELS
    filename = os.path.join(target_dir, model_name + ".json")
    with open(filename, "w") as handle:
        json.dump(model, handle)
    print("[INFO] Model saved as json file:", filename)

def persist_deploy_report(job_id:str, model_name:str):
    """
    Persist the deploy report of a job as `deploy_report.json` in `config.PATH_DIR_MODELS`.
    The report records the file names of the artifacts attached to the job; an existing
    report file is overwritten.
    :param job_id: str, job id used to build the artifact file names
    :param model_name: str, name of the prediction model (stored as `<model_name>.pkl`)
    :return: None
    """
    report = {
        "job_id": job_id,
        "purpose_to_int": f"{job_id}_purpose_to_int_model.json",
        "missing_values": f"{job_id}_missing_values_model.pkl",
        "prediction_model": f"{model_name}.pkl",
        "train_report": f"{job_id}_train_report.json",
    }
    report_path = os.path.join(config.PATH_DIR_MODELS, "deploy_report.json")
    # Context manager guarantees the report is flushed and the handle closed.
    with open(report_path, "w") as f:
        json.dump(report, f)
    print(f'[INFO] Deployment report saved as {report_path}')


In [14]:
##### train.py methods #####
def train(train_dataset_filename:str=None, test_dataset_filename:str=None, job_id="", rescale=False):
    """
    Train a random forest and a gradient boosting classifier on the train dataset loaded from
    `train_dataset_filename`, evaluate both on the train dataset and on the test dataset loaded
    from `test_dataset_filename`, select one with `select_model`, then save the selected model
    (pickle) and the performance report (json).
    :param train_dataset_filename: str, defaults to `<config.PATH_DIR_DATA>/preprocessed/<job_id>_training.csv`
    :param test_dataset_filename: str, defaults to `<config.PATH_DIR_DATA>/preprocessed/<job_id>_inference.csv`
    :param job_id: str, used for the default dataset paths and as prefix of the saved artifacts
    :param rescale: bool, if true, each predictor is replaced by its `<config.RESCALE_METHOD>_<predictor>` column when that column exists
    :return: dict, performance reports keyed by `<model>_train` / `<model>_test`, plus the selected model name under `final_model`
    :raises Exception: from `select_model` when no model meets the selection criteria
    """
    if train_dataset_filename is None:
        train_dataset_filename = os.path.join(config.PATH_DIR_DATA, "preprocessed", f"{job_id}_training.csv")
    if test_dataset_filename is None:
        test_dataset_filename = os.path.join(config.PATH_DIR_DATA, "preprocessed", f"{job_id}_inference.csv")
    tdf = helpers.load_dataset(train_dataset_filename)
    vdf = helpers.load_dataset(test_dataset_filename)
    # Presumably raises when a dataset contains null values (see helpers.py).
    helpers.check_dataset_sanity(tdf)
    helpers.check_dataset_sanity(vdf)

    predictors = config.PREDICTORS
    target = config.TARGET
    if rescale:
        for col in predictors:
            rescaled_col = f"{config.RESCALE_METHOD}_{col}"
            if rescaled_col in tdf.columns:
                tdf[col] = tdf[rescaled_col]
            if rescaled_col in vdf.columns:
                vdf[col] = vdf[rescaled_col]

    rf = RandomForestClassifier(n_estimators=100, max_depth=10, random_state=config.RANDOM_SEED)
    gb = GradientBoostingClassifier(n_estimators=100, max_depth=10, random_state=config.RANDOM_SEED)
    X, Y = tdf[predictors], tdf[target]
    X_test, Y_test = vdf[predictors], vdf[target]
    report = dict()
    models = dict()
    for cl, name in [(rf, "rf"), (gb, "gb")]:
        print("[INFO] Training model:", name)
        cl.fit(X, Y)
        t_pred = cl.predict(X)
        v_pred = cl.predict(X_test)
        # Column 1 of predict_proba is the probability of the positive class.
        t_prob = cl.predict_proba(X)[:, 1]
        v_prob = cl.predict_proba(X_test)[:, 1]
        report[f"{name}_train"] = performance_report(Y, t_pred, t_prob)
        report[f"{name}_test"] = performance_report(Y_test, v_pred, v_prob)
        models[name] = cl

    # The selection runs on the per-model reports only; `final_model` is added afterwards.
    model_name = select_model(pd.DataFrame(report), metric=config.MODEL_PERFORMANCE_METRIC, model_names=list(models.keys()))
    report["final_model"] = model_name
    helpers.save_model_as_pickle(models[model_name], f"{job_id}_{model_name}")
    helpers.save_model_as_json(report, f"{job_id}_train_report")
    return report

def performance_report(y_true, y_pred, y_prob):
    """
    Build the performance report of a model from its predictions.
    :param y_true: np.array, true values
    :param y_pred: np.array, predicted values
    :param y_prob: np.array, predicted probability of the positive class
    :return: dict with keys "dataset size", "positive rate", "accuracy", "f1", "precision", "recall" and "auc"
    """
    n_samples = y_true.shape[0]
    return {
        "dataset size": n_samples,
        "positive rate": y_true.sum()/n_samples,
        "accuracy": accuracy_score(y_true, y_pred),
        "f1": f1_score(y_true, y_pred),
        "precision": precision_score(y_true, y_pred),
        "recall": recall_score(y_true, y_pred),
        "auc": roc_auc_score(y_true, y_prob),
    }

def select_model(df:pd.DataFrame, metric:str=config.MODEL_PERFORMANCE_METRIC, model_names:list=["rf", "gb"], performance_thresh:float=config.MODEL_PERFORMANCE_THRESHOLD, degradation_thresh:float=config.MODEL_DEGRADATION_THRESHOLD)->str:
    """
    Select the best model based on their performance reports.
        - <metric>_train >= performance_thresh where metric can be auc, recall, precision, f1_score, ... and performance_thresh is any value between 0.0 and 1.0
        - abs(<metric>_train - <metric>_test) <= degradation_thresh
    Among the models meeting both criteria, the one with the smallest train/test gap is returned.
    :param df: pd.DataFrame, performance report with metrics as index and `<model>_train` / `<model>_test` as columns
    :param metric: str, metric to select the best model.
    :param model_names: list, model names to select from (never mutated here).
    :param performance_thresh: float, threshold for the performance.
    :param degradation_thresh: float, maximum accepted absolute gap between train and test performance.
    :return: str, model name.
    :raises Exception: when no model meets both criteria.
    """
    candidates = []
    for model in model_names:
        train_score = df.loc[metric, f"{model}_train"]
        # NOTE(review): the performance threshold is applied to the train score only;
        # confirm whether the test score was intended.
        if train_score < performance_thresh:
            continue
        # Absolute gap, as documented: a test score far above the train score is as
        # suspicious as one far below it and must not be favoured by `min` below.
        degradation = abs(train_score - df.loc[metric, f"{model}_test"])
        if degradation <= degradation_thresh:
            candidates.append((model, degradation))
    if not candidates:
        raise Exception("No model selected: all models have performance below the threshold. Possible overfitting.")
    return min(candidates, key=lambda x: x[1])[0]

def pick_model_and_deploy(job_id, models, df, metric="auc", predictors=config.PREDICTORS, target=config.TARGET)->str:
    """
    Among all `models`, select the model that performs best on df and mark it for deployment
    by persisting a deploy report for it.
    :param job_id: str, job id.
    :param models: list of dicts, each with the keys "model_name" (str) and "model" (fitted classifier exposing `predict` and `predict_proba`)
    :param df: pd.DataFrame, test dataset
    :param metric: str, metric used to select the best model.
    :param predictors: list, predictors to use.
    :param target: str, target to use.
    :return: str, name of the model marked for deployment
    :raises AssertionError: when `models` is empty, or when several models are given and a predictor is missing from `df`
    """
    assert len(models) > 0, "`models` cannot be empty"
    if len(models)==1:
        # A single candidate is deployed directly; `df` is not used to score it.
        model_name = models[0]["model_name"]
        helpers.persist_deploy_report(job_id, model_name)
        return model_name
    cols = set(predictors).difference(set(df.columns))
    assert cols == set(), f"{cols} not in {df.columns}"
    # The evaluation data is the same for every candidate, so extract it once.
    y_true = df[target]
    features = df[predictors]
    score = 0
    m_idx = 0
    for i, m in enumerate(models):
        y_pred = m["model"].predict(features)
        y_prob = m["model"].predict_proba(features)[:, 1]
        r = performance_report(y_true, y_pred, y_prob)
        # Strict comparison: on a tie the earliest model in `models` is kept.
        if r[metric] > score:
            score = r[metric]
            m_idx = i
    model_name = models[m_idx]["model_name"]
    helpers.persist_deploy_report(job_id, model_name)
    return model_name

In [18]:
# Train the candidate models for this job and display the resulting report.
job_id = "12196ecaa65e4831987aee4bfced5f60"
report = train(job_id=job_id)
pprint(report)

[INFO] Training model: rf
[INFO] Training model: gb
[INFO] Model saved as pickle file: ../dags/models\12196ecaa65e4831987aee4bfced5f60_rf.pkl
[INFO] Model saved as json file: ../dags/models\12196ecaa65e4831987aee4bfced5f60_train_report.json
{'final_model': 'rf',
 'gb_test': {'accuracy': 0.8914473684210527,
             'auc': 0.9324188014866938,
             'dataset size': 6688,
             'f1': 0.9321875583784792,
             'positive rate': 0.7764653110047847,
             'precision': 0.9051333212407038,
             'recall': 0.9609089158482572},
 'gb_train': {'accuracy': 0.9961346960167715,
              'auc': 0.9999797096398029,
              'dataset size': 15264,
              'f1': 0.9974556901979387,
              'positive rate': 0.7576650943396226,
              'precision': 0.9949242945629732,
              'recall': 1.0},
 'rf_test': {'accuracy': 0.9020633971291866,
             'auc': 0.9377489764649738,
             'dataset size': 6688,
             'f1': 0.93944

In [19]:
# Build the artifact name from `job_id` and the model selected during training so that
# the loaded pickle always matches the name recorded in the deploy report.
final_model_name = f"{job_id}_{report['final_model']}"
with open(f"../dags/models/{final_model_name}.pkl", "rb") as model_file:
    final_model = pickle.load(model_file)

model = pick_model_and_deploy(
    job_id=job_id,
    df = pd.read_csv(f"../dags/data/preprocessed/{job_id}_inference.csv"),
    models = [{
        "model_name": final_model_name,
        "model": final_model
    }]
)
print("Deployed model:", model)

[INFO] Deployment report saved as ../dags/models\deploy_report.json
Deployed model: 12196ecaa65e4831987aee4bfced5f60_rf


In [20]:
# Display the deploy report written by the previous step.
print("Deloyment Report Sample")
with open("../dags/models/deploy_report.json", "r") as report_file:
    pprint(json.load(report_file))

Deloyment Report Sample
{'job_id': '12196ecaa65e4831987aee4bfced5f60',
 'missing_values': '12196ecaa65e4831987aee4bfced5f60_missing_values_model.pkl',
 'prediction_model': '12196ecaa65e4831987aee4bfced5f60_rf.pkl',
 'purpose_to_int': '12196ecaa65e4831987aee4bfced5f60_purpose_to_int_model.json',
 'train_report': '12196ecaa65e4831987aee4bfced5f60_train_report.json'}
