In [1]:
import warnings
from contextlib import contextmanager
from itertools import product

import mlflow
import numpy as np
import pandas as pd
from sklearn.dummy import DummyClassifier
from sklearn.ensemble import BaggingClassifier
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, auc, confusion_matrix
from sklearn.metrics import make_scorer
from sklearn.model_selection import train_test_split
from sklearn.svm import SVC

warnings.simplefilter(action='ignore', category=FutureWarning)

In [2]:
# In this notebook, we use mlflow to track different models, trained with different parameters, that each produce
# a .predict_proba prediction. We then try different decision thresholds on the predicted probability.
# All parameters, including the thresholds, are saved locally in the mlruns folder so that we can choose which model
# is the best candidate to be used.

def apply_threshold(predict, threshold):
    """
    Turn class-1 probabilities into hard 0/1 labels.

    Parameters
    ----------
    -predict (array of shape (nb_individuals, 2)): per-class probabilities; column 1 is read as the
     probability of class 1
    -threshold (float in range [0,1]): probabilities greater than or equal to this value map to class 1

    Returns
    ----------
    -labels (array of shape (nb_individuals,)) : 1 where predict[:, 1] >= threshold, 0 elsewhere,
     stored with the same dtype as predict. The input array is left untouched.

    """
    positive_proba = predict[:, 1]
    is_positive = positive_proba >= threshold
    # Casting the boolean mask to the input dtype yields a new array of 0/1 values.
    return is_positive.astype(positive_proba.dtype)


def bank_scoring(y_test, predict):
    """
    Calculate the bank score from the real classes vs the predicted classes.
    BE CAREFUL TO GIVE THEM IN THIS ORDER : bank_scoring(REAL, PREDICT)

    The score is a cost: 10 * (number of false positives) + (number of false negatives),
    so lower is better.

    Parameters
    ----------
    -y_test (array of shape (nb_individuals,)): the true class (0 or 1) of each individual
    -predict (array of shape (nb_individuals,)): the predicted class (0 or 1) of each individual

    Returns
    ----------
    -loss_for_bank (int) : self made score to evaluate the models

    """
    # labels=[0, 1] pins the matrix to 2x2 even when only one class shows up in
    # both y_test and predict; without it the matrix collapses to 1x1 and the
    # false-positive / false-negative cells cannot be indexed.
    # ravel() on a 2x2 confusion matrix yields (TN, FP, FN, TP).
    _, false_pos, false_neg, _ = confusion_matrix(
        y_test, predict, labels=[0, 1]).ravel()
    loss_for_bank = 10 * false_pos + false_neg
    return loss_for_bank


# Scorer object wrapping bank_scoring, built with scikit-learn's make_scorer.
# greater_is_better=False declares bank_scoring as a loss (lower is better).
# NOTE(review): the name is spelled with three "s" (banklosss); it is a different
# object from the `bankloss` lists created in the model cells below.
banklosss = make_scorer(bank_scoring, greater_is_better=False)


def create_test_list(param_list):
    """
    Build every combination of the candidate values (cartesian product).

    Parameters
    ----------
    -param_list (list of lists): one inner list of candidate values per parameter

    Returns
    ----------
    -list_param (list of lists) : one list per combination, holding one value per
     parameter in the same order as param_list. The last parameter varies fastest.

    """
    return [list(combination) for combination in product(*param_list)]

In [None]:
# Load the data and split it into train/test sets. Reminder: data_final is already normalized.
# We also set up per-sample weights: when an algorithm computes its loss, an error on a sample of the rarer class
# counts more than an error on a sample of the more frequent class (the original note expects the positive class
# to be the one that gains importance).

# Fixed seed so the split, and every score computed from it, is identical after Restart & Run All.
RANDOM_STATE = 42

data_final = pd.read_csv('data_after_featuring.csv')
df_copy = data_final.copy()
# Features are every column except the target and the client identifier.
X = df_copy.loc[:, (df_copy.columns != 'TARGET') &
                (df_copy.columns != 'SK_ID_CURR')]
y = df_copy['TARGET']
# stratify=y keeps the proportion of each TARGET class the same in the train and test sets.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, stratify=y, random_state=RANDOM_STATE)

# Each class shares a total weight of len(y_train) / 2, spread evenly over its members.
# Samples whose label is neither 0 nor 1 keep a weight of 0.
sample_weights = np.zeros(len(y_train))
for class_label in (0, 1):
    in_class = (y_train == class_label).to_numpy()
    # int(...) keeps a plain Python division, so a class missing from y_train fails loudly.
    sample_weights[in_class] = (y_train.shape[0] / 2) / int(in_class.sum())

In [None]:
# The base model is a dummy classifier that always predicts the most frequent class of y_train.
# NOTE(review): the original note describes this as "always refuse the loan" - confirm against the TARGET encoding.
# It is also the baseline for the bank score we made: we need a model that gets close to that score, ideally
# below it, while accepting some clients.

acclist = []
bankloss = []

with mlflow.start_run():

    # Set the model parameters.
    strategy = 'most_frequent'

    # Create and train model. The classifier reads the same variable that is
    # logged below, so the logged parameter cannot drift from the one used.
    dm = DummyClassifier(strategy=strategy)
    dm.fit(X_train, y_train)

    # Use the model to make predictions on the test dataset.
    predictions = dm.predict(X_test)

    # Log the model parameters used for this run.
    mlflow.log_param("strategy", strategy)

    # Define a metric to use to evaluate the model.
    acc = accuracy_score(y_test, predictions)
    acclist.append(acc)
    loss_for_bank = bank_scoring(y_test, predictions)
    bankloss.append(loss_for_bank)

    # Log the value of the metric from this run.
    mlflow.log_metric("acc", acc)
    mlflow.log_metric("cost of bad predict", loss_for_bank)

    # Log the model created by this run.
    mlflow.sklearn.log_model(dm, "dummie-model")

In [23]:
# First real model: random forest.
# One mlflow run is recorded per (n_estimators, max_depth, max_features, threshold) combination.


param_list = [[100, 200],        # n_estimators
              [10, 20, 30],      # max_depth
              ['log2'],          # max_features
              [0.4, 0.5, 0.6]]   # decision threshold
test_list = create_test_list(param_list)

acclist = []
bankloss = []
for n_estimators, max_depth, max_features, threshold in test_list:
    with mlflow.start_run():

        # Fixed seed: runs that differ only by their threshold are scored on the
        # same fitted forest, and the whole cell is reproducible.
        random_state = 42

        # Create and train model.
        rf = RandomForestClassifier(n_estimators=n_estimators,
                                    max_depth=max_depth,
                                    max_features=max_features,
                                    random_state=random_state)
        rf.fit(X_train, y_train, sample_weight=sample_weights)

        # Use the model to make predictions on the test dataset.
        predictions = rf.predict_proba(X_test)
        predict_thresh = apply_threshold(predictions, threshold)

        # Log the model parameters used for this run.
        mlflow.log_param("num_trees", n_estimators)
        mlflow.log_param("maxdepth", max_depth)
        mlflow.log_param("max_feat", max_features)
        mlflow.log_param("threshold", threshold)
        mlflow.log_param("random_state", random_state)

        # Define a metric to use to evaluate the model.
        acc = accuracy_score(y_test, predict_thresh)
        acclist.append(acc)
        loss_for_bank = bank_scoring(y_test, predict_thresh)
        bankloss.append(loss_for_bank)

        # Log the value of the metric from this run.
        mlflow.log_metric("acc", acc)
        mlflow.log_metric("cost of bad predict", loss_for_bank)

        # Log the model created by this run.
        mlflow.sklearn.log_model(rf, "random-forest-model")

In [None]:
# Support vector classifier.
# One mlflow run is recorded per (C, kernel, gamma, threshold) combination.
param_list = [[0.01, 0.1, 0.5, 1],        # C
              ['linear', 'rbf'],          # kernel
              [0.01, 0.1, 0.5, 'auto'],   # gamma
              [0.4, 0.5, 0.6]]            # decision threshold

test_list = create_test_list(param_list)

acclist = []
bankloss = []
for C, kernel, gamma, threshold in test_list:
    with mlflow.start_run():

        # Set the model parameters.
        # probability=True is required for predict_proba; random_state fixes the
        # internal shuffling used to fit those probability estimates.
        probability = True
        random_state = 42

        # Create and train model.
        svc_kwargs = dict(C=C, kernel=kernel, probability=probability,
                          random_state=random_state)
        # gamma is only passed for non-linear kernels.
        # NOTE(review): the grid still holds one 'linear' run per gamma value, so
        # those runs train the same model several times - consider de-duplicating.
        if kernel != 'linear':
            svc_kwargs['gamma'] = gamma
        bc = SVC(**svc_kwargs)
        bc.fit(X_train, y_train, sample_weight=sample_weights)

        # Use the model to make predictions on the test dataset.
        predictions = bc.predict_proba(X_test)
        predict_thresh = apply_threshold(predictions, threshold)

        # Log the model parameters used for this run.
        mlflow.log_param("C", C)
        mlflow.log_param("kernel", kernel)
        mlflow.log_param("gamma", gamma)
        mlflow.log_param("threshold", threshold)
        mlflow.log_param("random_state", random_state)

        # Define a metric to use to evaluate the model.
        acc = accuracy_score(y_test, predict_thresh)
        acclist.append(acc)
        loss_for_bank = bank_scoring(y_test, predict_thresh)
        bankloss.append(loss_for_bank)

        # Log the value of the metric from this run.
        mlflow.log_metric("acc", acc)
        mlflow.log_metric("cost of bad predict", loss_for_bank)

        # Log the model created by this run: the SVC fitted above. The previous
        # code logged `rf`, the random forest from another cell, as "bagging_class".
        mlflow.sklearn.log_model(bc, "svc-model")

In [25]:
# Bagging classifier.
# One mlflow run is recorded per (n_estimators, max_samples, max_features, threshold) combination.
# max_samples / max_features must be floats to be read as fractions: scikit-learn
# interprets an int 1 as "draw 1 sample / 1 feature", whereas the float 1.0 means 100%.
param_list = [[10, 20],               # n_estimators
              [0.8, 1.0],             # max_samples (fraction of the training rows)
              [0.8, 1.0],             # max_features (fraction of the columns)
              [0.3, 0.4, 0.5, 0.6]    # decision threshold
              ]

test_list = create_test_list(param_list)

acclist = []
bankloss = []
for n_estimators, max_samples, max_features, threshold in test_list:
    with mlflow.start_run():

        # Fixed seed: runs that differ only by their threshold are scored on the
        # same fitted ensemble, and the whole cell is reproducible.
        random_state = 42

        # Create and train model.
        bg = BaggingClassifier(n_estimators=n_estimators,
                               max_samples=max_samples,
                               max_features=max_features,
                               random_state=random_state)
        bg.fit(X_train, y_train, sample_weight=sample_weights)

        # Use the model to make predictions on the test dataset.
        predictions = bg.predict_proba(X_test)
        predict_thresh = apply_threshold(predictions, threshold)

        # Log the model parameters used for this run.
        mlflow.log_param("num_try_bagg", n_estimators)
        mlflow.log_param("max_sample_bagg", max_samples)
        mlflow.log_param("max_feat_bagg", max_features)
        mlflow.log_param("threshold", threshold)
        mlflow.log_param("random_state", random_state)

        # Define a metric to use to evaluate the model.
        acc = accuracy_score(y_test, predict_thresh)
        acclist.append(acc)
        loss_for_bank = bank_scoring(y_test, predict_thresh)
        bankloss.append(loss_for_bank)

        # Log the value of the metric from this run.
        mlflow.log_metric("acc", acc)
        mlflow.log_metric("cost of bad predict", loss_for_bank)

        # Log the model created by this run.
        mlflow.sklearn.log_model(bg, "bagging-model")

In [6]:
# Gradient boosting classifier.
# One mlflow run is recorded per (learning_rate, n_estimators, max_depth, max_features, threshold) combination.
param_list = [[0.2, 0.5],      # learning_rate
              [50, 100],       # n_estimators
              [10, 20, 30],    # max_depth
              ['log2'],        # max_features
              [0.5]]           # decision threshold

test_list = create_test_list(param_list)

acclist = []
bankloss = []
for learning_rate, n_estimators, max_depth, max_features, threshold in test_list:
    with mlflow.start_run():

        # Fixed seed: max_features='log2' makes each split consider a random
        # subset of the features, so training is stochastic without a seed.
        random_state = 42

        # Create and train model.
        gbc = GradientBoostingClassifier(learning_rate=learning_rate,
                                         max_features=max_features,
                                         max_depth=max_depth,
                                         n_estimators=n_estimators,
                                         random_state=random_state)
        gbc.fit(X_train, y_train, sample_weight=sample_weights)

        # Use the model to make predictions on the test dataset.
        predictions = gbc.predict_proba(X_test)
        predict_thresh = apply_threshold(predictions, threshold)

        # Log the model parameters used for this run.
        mlflow.log_param("learning_rate", learning_rate)
        mlflow.log_param("n_estimators", n_estimators)
        mlflow.log_param("max_feat", max_features)
        mlflow.log_param("max_depth", max_depth)
        mlflow.log_param("threshold", threshold)
        mlflow.log_param("random_state", random_state)

        # Define a metric to use to evaluate the model.
        acc = accuracy_score(y_test, predict_thresh)
        acclist.append(acc)
        loss_for_bank = bank_scoring(y_test, predict_thresh)
        bankloss.append(loss_for_bank)

        # Log the value of the metric from this run.
        mlflow.log_metric("acc", acc)
        mlflow.log_metric("cost of bad predict", loss_for_bank)

        # Log the model created by this run.
        mlflow.sklearn.log_model(gbc, "GradientBoostingClassifier-model")
