# INFO

This notebook is used to build and test the model while designing it within the Kedro framework.

In [1]:
##############################################################################
# It is recommended to create new virtual environment for each Kedro project #
##############################################################################

# Uncomment and run the line below if your environment doesn't have
# Kedro or any of the other required dependencies.

#! pip install -r requirements.txt
%load_ext kedro.ipython


----

# Baseline model
TODO Remove
Because the dataset needs transformations such as imputation and normalization, all transformations are done inside the model pipeline and fitted only on the training data at the model-fitting stage, which avoids data leakage. So I'm going to split the initial typed dataset into train/test sets and balance the train set, ignoring all previously transformed datasets.

In [2]:
import pandas as pd
from sklearn.model_selection import train_test_split
from imblearn.over_sampling import SMOTE
from sklearn.compose import make_column_transformer
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import FunctionTransformer
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import OrdinalEncoder
from sklearn.impute import SimpleImputer
from sklearn.ensemble import RandomForestClassifier
from catboost import CatBoostClassifier
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, roc_auc_score, confusion_matrix
import pandas as pd
import logging
from lending_club.pipelines.analysis.nodes import features_eng
from lending_club.pipelines.encode.nodes import _default_status
from imblearn.pipeline import make_pipeline as imb_make_pipeline

logger = logging.getLogger(__name__)

def split_dataset(df: pd.DataFrame, params: dict):
    """Derive the target with ``_default_status`` and split into train/test sets.

    The whole of ``df`` is passed on as the feature frame; no columns are
    dropped here.

    Args:
        df: Dataset to split.
        params: Parameters; ``'test_size'`` and ``'random_state'`` are read
            here, and the full dict is forwarded to ``_default_status``.

    Returns:
        Tuple ``(X_train, X_test, y_train, y_test)``.
    """
    target = _default_status(df, params)
    split_options = {
        'test_size': params['test_size'],
        'random_state': params['random_state'],
    }
    features_train, features_test, target_train, target_test = train_test_split(
        df, target, **split_options
    )
    return features_train, features_test, target_train, target_test

def train_model(X_train, y_train, regressor, params: dict):
    """Apply optional ``fit_options`` to ``regressor`` and fit it exactly once.

    Setting the options is best-effort: if ``params`` has no ``'fit_options'``
    entry, or the estimator rejects them, the estimator is fitted with its
    current parameters. Errors raised by ``fit`` itself are propagated instead
    of silently triggering a second fit.

    Args:
        X_train: Training features, passed to ``regressor.fit``.
        y_train: Training target, passed to ``regressor.fit``.
        regressor: Estimator exposing ``set_params(**kwargs)`` and ``fit(X, y)``.
        params: Parameters; the optional ``'fit_options'`` mapping is passed
            to ``regressor.set_params``.

    Returns:
        The fitted ``regressor`` (same object that was passed in).
    """
    fit_options = params.get('fit_options') or {}
    if fit_options:
        try:
            regressor.set_params(**fit_options)
        except Exception as err:  # best-effort: fall back to current parameters
            logging.getLogger(__name__).warning(
                "Ignoring fit_options %r: %s", fit_options, err
            )
    # Fit outside the try block so a failing fit is neither hidden nor retried.
    regressor.fit(X_train, y_train)
    return regressor


def model_pipeline(model_options: dict, params: dict):
    """Build the unfitted preprocessing + SMOTE + classifier pipeline.

    Args:
        model_options: Model configuration. Reads ``'name'`` (``'rfc'`` or
            ``'catboost'``) and ``'regressor_options'`` (keyword arguments
            for the classifier constructor).
        params: Project parameters. Reads ``'category'``, ``'emp_len'``,
            ``'fill_zero'``, ``'fill_med'``, ``'model_features'`` and
            ``'random_state'``.

    Returns:
        An imbalanced-learn pipeline made of the column preprocessing,
        ``SMOTE`` and the selected classifier.

    Raises:
        ValueError: If ``model_options['name']`` is neither ``'rfc'`` nor
            ``'catboost'``.
    """
    # Fail fast on an unsupported model name, before any transformer is built.
    model_name = model_options['name']
    if model_name not in ('rfc', 'catboost'):
        raise ValueError("Pipeline accepts only RandomForestClassifier and CatBoostClassifier")

    # Keep only the configured features that are part of the model feature set,
    # grouped by the preprocessing they need.
    model_features = params['model_features']
    category_feat = [f for f in (params['category'] + [params['emp_len']]) if f in model_features]
    numeric_feat_zero = [f for f in params['fill_zero'] if f in model_features]
    numeric_feat_med = [f for f in params['fill_med'] if f in model_features]

    # Replace missing numeric values with 0, then standardize.
    numeric_feat_zero_transformer = make_pipeline(
        SimpleImputer(strategy='constant', fill_value=0),
        StandardScaler()
    )
    # Replace missing numeric values with the median, then standardize.
    numeric_feat_med_transformer = make_pipeline(
        SimpleImputer(strategy='median'),
        StandardScaler()
    )

    # Assemble the transformers into one preprocessing step that:
    #   - encodes all categorical features as numbers
    #   - fills missing values with 0 for the "fill_zero" features and standardizes them
    #   - fills missing values with the median for the "fill_med" features and standardizes them
    preprocessing = make_column_transformer(
        (OrdinalEncoder(), category_feat),
        (numeric_feat_zero_transformer, numeric_feat_zero),
        (numeric_feat_med_transformer, numeric_feat_med)
    )

    # Choose the classifier according to the (already validated) model name.
    if model_name == 'rfc':
        regressor = RandomForestClassifier(**model_options['regressor_options'])
    else:
        regressor = CatBoostClassifier(**model_options['regressor_options'])

    # Assemble preprocessing, SMOTE (imbalance handler) and the chosen
    # classifier into the model pipeline.
    model = imb_make_pipeline(
        preprocessing,
        SMOTE(random_state=params['random_state']),
        regressor
    )
    return model


In [3]:

# Load parameters
# NOTE(review): `catalog` is not defined in this notebook; presumably it is
# injected by the kedro.ipython extension loaded in the first cell — confirm.
params = catalog.load("parameters")
# Attach the feature list and the baseline model options under the keys that
# model_pipeline() and evaluate_metrics() read from `params`.
params['model_features'] = catalog.load("params:model_features")
params['model_options'] = catalog.load("params:baseline_model.model_options")


# catalog.datasets

In [4]:
# Load the "intermediate_lc_dataset" entry from the catalog.
df = catalog.load("intermediate_lc_dataset")
# NOTE(review): the return value is discarded, so this call only has an effect
# if features_eng modifies `df` in place — confirm against its implementation.
features_eng(df, params)
# Derive the target and split into train/test sets.
X_train, X_test, y_train, y_test = split_dataset(df, params)

In [5]:
# Build the model pipeline from the configured model options and fit it
# on the training split.
model = model_pipeline(params['model_options'], params)
model.fit(X_train, y_train)
# Last expression of the cell: display the fitted pipeline.
model

# Evaluation

To evaluate model performance I want to use a custom loss function, so I need two rates: the actual earning rate, which defines the potential loss when we refuse a loan that was mistakenly predicted to default, and the actual loss rate for charged-off loans, which is our loss when we issue a loan that was mistakenly predicted not to default.

I assume that the earning rate for non-defaulted loans (including loans that are not yet fully paid) is the total amount received minus the total principal received, divided by the total principal received.

For charged-off loans, I take the actual loss to be the loan amount minus the total payments received (which include collections after charge-off) plus the collection recovery fee (which I believe is what we pay collectors for their services). Dividing that by the total loan amount of this category gives the actual loss rate for defaulted loans.

In [6]:
# Function that returns actual profit/loss rates for non-defaulted/defaulted loans
def get_loss_values(df: pd.DataFrame) -> pd.DataFrame:
    """Compute the aggregate earning rate and loss rate per default status.

    Loans whose ``loan_status`` contains "Charged Off" are treated as
    defaulted; every other loan (including a missing status) is treated as
    non-defaulted. The monetary columns are summed per group and then:

    * non-defaulted: ``(total_pymnt - total_rec_prncp) / total_rec_prncp``
    * defaulted: ``(loan_amnt - total_pymnt + collection_recovery_fee) / loan_amnt``

    The rate is picked per group with ``Series.where`` rather than by
    multiplying with a boolean mask, so a NaN/inf in the formula that does not
    apply to a group (e.g. zero principal received for defaulted loans) cannot
    leak into the result.

    Args:
        df: Loan data with the columns ``loan_amnt``, ``loan_status``,
            ``total_pymnt``, ``total_rec_prncp``, ``total_rec_int``,
            ``total_rec_late_fee``, ``recoveries`` and
            ``collection_recovery_fee``. The input frame is not modified.

    Returns:
        DataFrame indexed by ``default_status`` (``False``/``True``) with a
        single ``'earning/loss'`` column.
    """
    # Select columns for profit/loss calculation (copy so the caller's frame is untouched)
    columns = ['loan_amnt', 'loan_status', 'total_pymnt', 'total_rec_prncp',
               'total_rec_int', 'total_rec_late_fee', 'recoveries',
               'collection_recovery_fee']
    loans = df.loc[:, columns].copy()

    # Add default status and sum the monetary columns per status
    loans['default_status'] = loans['loan_status'].str.contains("Charged Off", regex=False, na=False)
    totals = loans.drop(columns=['loan_status']).groupby(by='default_status').sum()

    # actual earnings rate for non-defaulters
    earning_rate = (totals.total_pymnt - totals.total_rec_prncp) / totals.total_rec_prncp
    # actual losses rate for defaulters
    loss_rate = (totals.loan_amnt - totals.total_pymnt + totals.collection_recovery_fee) / totals.loan_amnt

    # Loss rate for the defaulted group, earning rate for the other one
    is_default = totals.index.to_series().astype(bool)
    rates = loss_rate.where(is_default, earning_rate)
    return rates.to_frame(name='earning/loss')

# Compute the aggregate earning/loss rates from the 'intermediate_lc_clean'
# catalog entry and display them.
df_loss = get_loss_values(catalog.load('intermediate_lc_clean'))
df_loss

Unnamed: 0_level_0,earning/loss
default_status,Unnamed: 1_level_1
False,0.228696
True,0.460871


>These figures will be used in parameters...yml to feed FP_cost and FN_cost to model evaluator

In [7]:
# Misclassification costs that evaluate_metrics() multiplies by the fp/fn counts.
# False positive: cost is the earning rate of non-defaulted loans.
params['FP_cost'] = df_loss['earning/loss'].loc[False]
# False negative: cost is the loss rate of defaulted loans.
params['FN_cost'] = df_loss['earning/loss'].loc[True]

In [8]:
def make_rng(start, stop, step):
    """Return ``range(start, stop, step)``.

    Thin wrapper with named parameters so that a mapping holding the keys
    ``start``, ``stop`` and ``step`` can be unpacked into it with ``**``.
    """
    threshold_range = range(start, stop, step)
    return threshold_range


def evaluate_metrics(model: object, X_true, y_true,
                     params: dict) -> pd.DataFrame:
    """Score ``model`` at every configured probability threshold.

    For each threshold (given in percent) produced by
    ``make_rng(**params['model_options']['prob_threshold'])`` the second column
    of ``model.predict_proba(X_true)`` is compared against ``thresh / 100``,
    and the usual classification metrics plus a cost-weighted loss
    (``FP_cost * fp + FN_cost * fn``) are computed.

    The threshold with the minimum loss is logged once, after all thresholds
    have been evaluated.

    Args:
        model: Fitted estimator exposing ``predict_proba``.
        X_true: Features to score.
        y_true: True labels matching ``X_true``.
        params: Parameters. Reads ``params['model_options']['prob_threshold']``
            (``start``/``stop``/``step`` in percent),
            ``params['model_options']['name']``, ``params['FP_cost']`` and
            ``params['FN_cost']``.

    Returns:
        DataFrame with one row per threshold, indexed by the model name. An
        empty DataFrame is returned when the threshold range is empty.
    """
    # Second column of predict_proba, presumably the positive (default) class.
    positive_proba = model.predict_proba(X_true)[:, 1]
    model_name = params['model_options']['name']

    # Collect one record per threshold and build the frame once at the end,
    # instead of concatenating single-row frames inside the loop.
    records = []
    for thresh in make_rng(**params['model_options']['prob_threshold']):
        y_pred = positive_proba > (thresh / 100)
        tn, fp, fn, tp = confusion_matrix(y_true, y_pred).ravel()
        records.append({
            'prob_thresh_%': thresh,
            'accuracy'     : accuracy_score(y_true, y_pred),
            'precision'    : precision_score(y_true, y_pred),
            'recall'       : recall_score(y_true, y_pred),
            'f1'           : f1_score(y_true, y_pred),
            # NOTE(review): computed from the thresholded labels rather than
            # the probabilities, so it changes with the threshold — confirm
            # this is intended.
            'roc_auc'      : roc_auc_score(y_true, y_pred),
            'tn'           : tn,
            'fp'           : fp,
            'fn'           : fn,
            'tp'           : tp,
            'loss'         : params['FP_cost'] * fp + params['FN_cost'] *fn,
        })

    if not records:
        return pd.DataFrame()

    metrics = pd.DataFrame(records, index=[model_name] * len(records))

    # Log the best threshold once, when every threshold has been evaluated.
    best_thresh = metrics[metrics.loss == metrics.loss.min()]['prob_thresh_%'].iloc[0]
    logger.info(f"The best probability threshold for {model_name} model based on min loss: {best_thresh}")
    return metrics

# Evaluate the fitted model on the test split for every configured threshold.
eval_metr = evaluate_metrics(model, X_test, y_test, params)

eval_metr

Unnamed: 0,prob_thresh_%,accuracy,precision,recall,f1,roc_auc,tn,fp,fn,tp,loss
rfc,30,0.78045,0.244555,0.381342,0.298002,0.608676,14677,2879,1512,932,1355.251005
rfc,31,0.7892,0.250703,0.364566,0.297099,0.60644,14893,2663,1553,891,1324.748479
rfc,32,0.7964,0.253632,0.342881,0.29158,0.601208,15090,2466,1606,838,1304.121617
rfc,33,0.8032,0.257635,0.324468,0.287215,0.597157,15271,2285,1651,793,1283.466917
rfc,34,0.80825,0.259092,0.306056,0.280623,0.592108,15417,2139,1696,748,1270.816559
rfc,35,0.815,0.266889,0.29419,0.279875,0.590846,15581,1975,1725,719,1246.67575
rfc,36,0.8209,0.271486,0.276596,0.274017,0.586635,15742,1814,1768,676,1229.673218
rfc,37,0.8259,0.274935,0.259411,0.266947,0.582086,15884,1672,1810,634,1216.55503
rfc,38,0.83135,0.281822,0.245499,0.26241,0.579203,16027,1529,1844,600,1199.52118
rfc,39,0.83555,0.286939,0.232815,0.257059,0.576136,16142,1414,1875,569,1187.508192


In [9]:
# Report the threshold with the minimum loss (the first one if several tie).
print(f"The best probability threshold based on min loss: {eval_metr[eval_metr.loss==eval_metr.loss.min()]['prob_thresh_%'].iloc[0]}")
# Show the full metrics row(s) for the minimum-loss threshold.
eval_metr[eval_metr.loss==eval_metr.loss.min()]

The best probability threshold based on min loss: 61


Unnamed: 0,prob_thresh_%,accuracy,precision,recall,f1,roc_auc,tn,fp,fn,tp,loss
rfc,61,0.87455,0.396825,0.051146,0.090613,0.520162,17366,190,2319,125,1112.21153


In [24]:
%reload_kedro
# Switch to the candidate model options; the train/test split created earlier
# is reused.
params['model_options'] = catalog.load("params:candidate_model.model_options")

# Build the model pipeline and fit it
model = model_pipeline(params['model_options'], params)
model.fit(X_train, y_train)
# Last expression of the cell: display the fitted pipeline.
model


0:	learn: 0.7238578	test: 0.7277791	best: 0.7277791 (0)	total: 58.4ms	remaining: 29.2s
100:	learn: 0.9679889	test: 0.9695254	best: 0.9695254 (100)	total: 6.41s	remaining: 25.3s
200:	learn: 0.9872888	test: 0.9870054	best: 0.9870054 (200)	total: 10.4s	remaining: 15.5s
300:	learn: 0.9913214	test: 0.9906550	best: 0.9906550 (300)	total: 13.8s	remaining: 9.13s
400:	learn: 0.9931161	test: 0.9915071	best: 0.9916715 (397)	total: 17.7s	remaining: 4.37s
Stopped by overfitting detector  (20 iterations wait)

bestTest = 0.9916715249
bestIteration = 397

Shrink model to first 398 iterations.


In [25]:

# Evaluate the current `model` on the test split; this overwrites the
# `eval_metr` computed by the earlier evaluation cell.
eval_metr = evaluate_metrics(model, X_test, y_test, params)

eval_metr

Unnamed: 0,prob_thresh_%,accuracy,precision,recall,f1,roc_auc,tn,fp,fn,tp,loss
catboost,30,0.84245,0.320467,0.258183,0.285973,0.590985,16218,1338,1813,631,1141.553344
catboost,31,0.84615,0.32348,0.237316,0.273779,0.584111,16343,1213,1864,580,1136.470817
catboost,32,0.8498,0.328431,0.219313,0.263003,0.578442,16460,1096,1908,536,1129.991758
catboost,33,0.85225,0.329325,0.201718,0.25019,0.572265,16552,1004,1951,493,1128.769216
catboost,34,0.85465,0.330403,0.184534,0.236808,0.566236,16642,914,1993,451,1127.543194
catboost,35,0.8581,0.340872,0.172668,0.229223,0.563094,16740,816,2022,422,1118.496288
catboost,36,0.86065,0.346464,0.158347,0.217355,0.558383,16826,730,2057,387,1114.958953
catboost,37,0.86265,0.349553,0.144026,0.203999,0.553359,16901,655,2092,352,1113.937268
catboost,38,0.86395,0.351554,0.134206,0.194255,0.549873,16951,605,2116,328,1113.563392
catboost,39,0.8655,0.355634,0.123977,0.183859,0.546353,17007,549,2141,303,1112.278214


In [20]:
# Standalone CatBoost classifier built from the current model options, with the
# categorical feature names passed via `cat_features`.
# NOTE(review): `cb` is not used in the visible part of the notebook and this
# cell's execution count (In [20]) is out of order — confirm it is still needed.
cb = CatBoostClassifier(**params['model_options']['regressor_options'], cat_features=(params['category']+[params['emp_len']]))