# INFO

This notebook will be used for construction and testing purposes while designing model within Kedro framework. 

In [1]:
##############################################################################
# It is recommended to create new virtual environment for each Kedro project #
##############################################################################

# Uncomment and run the line below if your environment does't have
# Kedro or any other dependencies needed.

#! pip install -r requirements.txt
%load_ext kedro.ipython


----

# Baseline model
TODO Remove
As the dataset needs transformation like imputation and normalization, for avoiding data leakage, all transformations will be done within model pipeline and fitting only on training data on model fitting stage. So I'm going to split initial typed dataset to train/test sets and balance train set, ignoring all previously transformed datasets.

In [2]:
import pandas as pd
from sklearn.model_selection import train_test_split
from imblearn.over_sampling import SMOTE
from sklearn.compose import make_column_transformer
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import FunctionTransformer
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import OrdinalEncoder
from sklearn.impute import SimpleImputer
from sklearn.ensemble import RandomForestClassifier
from catboost import CatBoostClassifier
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, roc_auc_score, confusion_matrix
import pandas as pd
import logging
from lending_club.pipelines.analysis.nodes import features_eng
from lending_club.pipelines.encode.nodes import _default_status
from imblearn.pipeline import make_pipeline as imb_make_pipeline

logger = logging.getLogger(__name__)

def split_dataset(df: pd.DataFrame, params: dict):
    y = _default_status(df, params)
    X = df
    X_train, X_test, y_train, y_test = train_test_split(
        X, y,
        test_size=params['test_size'],
        random_state=params['random_state']
    )
    return X_train, X_test, y_train, y_test

def train_model(X_train, y_train, regressor, params: dict):
    try:
        regressor.set_params(**params['fit_options']).fit(X_train, y_train)
    except:
        regressor.fit(X_train, y_train)
    return regressor


def model_pipeline(model_options: dict, params: dict):

    # split important features to assign preprocessing steps
    category_feat = [f for f in (params['category'] + [params['emp_len']]) if f in params['model_features']]
    numeric_feat_zero = [f for f in params['fill_zero'] if f in params['model_features']]
    numeric_feat_med = [f for f in params['fill_med'] if f in params['model_features']]

    # transformer to replace missing numeric values by 0
    # and standartize all values
    numeric_feat_zero_transformer = make_pipeline(
        SimpleImputer(strategy='constant', fill_value=0),
        StandardScaler()
    )
    # transformer to replace missing numeric values by median
    numeric_feat_med_transformer = make_pipeline(
        SimpleImputer(strategy='median'),
        StandardScaler()
    )

    # assemble transformers in preprocessing pipe so it will perform
    # following transformations:
    #   - encode all categorical features to numbers
    #   - fill missing values in specific number features as "0" and standartize them
    #   - fill missing values in specific number features as median and standartize them
    preprocessing = make_column_transformer(
        (OrdinalEncoder(), category_feat),
        (numeric_feat_zero_transformer, numeric_feat_zero),
        (numeric_feat_med_transformer, numeric_feat_med)
    )

    # choose regressor depending on provided model_options
    if model_options['name'] == 'rfc':
        regressor = RandomForestClassifier(**model_options['regressor_options'])
    else:
        if model_options['name'] == 'catboost':
            regressor = CatBoostClassifier(**model_options['regressor_options'])
        else:
            raise Exception("Pipeline accepts only RandomForestClassifier and CatBoostClassifier")

    # assebmle preprocessing pipeline, SMOTE (ibmalance handler) and
    # choosen regressor as the model pipeline
    model = imb_make_pipeline(
        preprocessing,
        SMOTE(random_state=params['random_state']),
        regressor
    )
    return model


In [3]:

# Load parameters
params = catalog.load("parameters")
params['model_features'] = catalog.load("params:model_features")
params['model_options'] = catalog.load("params:baseline_model.model_options")


# catalog.datasets

In [4]:
df = catalog.load("intermediate_lc_dataset")
features_eng(df, params)
X_train, X_test, y_train, y_test = split_dataset(df, params)

In [5]:
# Make a model an fit it
model = model_pipeline(params['model_options'], params)
model.fit(X_train, y_train)
model

# Evaluation

To evaluate model performance I want to use custom loss function, so I need to calculate actual earning rate to define potential losses in case we refuse in loan, that was mistakenly predicted as default, as well as actual losses for charged off loans, that will be our loss in case if we issue a loan that was mistakenly predicted as non default. 

I assume, that earning rate for non defaulted loans, considering loans that is not fully paid at the moment, is total received amount less than total received principal divided by total received principal. 

For charged off loan, I believe, the actual losses are amount of loan less than total received payments (that includes collections after charges off) plus collection recovery fee (that I believe is our payment to collectors for collection services). Dividing that by this category loan amount we can get actual loss rate for defaulted loans

In [6]:
# Function that returns actual profit/loss rates for non-defartet/defaulted loans
def get_loss_values(df: pd.DataFrame) -> pd.DataFrame:

    # Select columns for profit/loss calculation
    df = df.loc[:, ['loan_amnt', 'loan_status', 'total_pymnt', 'total_rec_prncp', 'total_rec_int', 'total_rec_late_fee', 'recoveries', 'collection_recovery_fee']]

    # Add default status and summarize data
    df['default_status'] = df['loan_status'].str.contains("Charged Off", regex=False, na=False)
    df = df.drop(columns=['loan_status'])
    df=df.groupby(by='default_status').sum()
    df = df.reset_index()


    df['earning/loss'] = (
        # actual earnings rate for non-defaulters
        ((df.total_pymnt - df.total_rec_prncp) / df.total_rec_prncp) * ~df.default_status
        # actual losses rate for defaulters
        + (df.loan_amnt - df.total_pymnt + df.collection_recovery_fee) / df.loan_amnt * df.default_status
        )

    # Select columns: 'default_status', 'earning/loss'
    df = df.loc[:, ['default_status', 'earning/loss']]
    return df.set_index('default_status')

df_loss = get_loss_values(catalog.load('intermediate_lc_clean'))
df_loss

Unnamed: 0_level_0,earning/loss
default_status,Unnamed: 1_level_1
False,0.228696
True,0.460871


>These figures will be used in parameters...yml to feed FP_cost and FN_cost to model evaluator

In [7]:
params['FP_cost'] = df_loss['earning/loss'].loc[False]
params['FN_cost'] = df_loss['earning/loss'].loc[True]

In [8]:
def make_rng(start, stop, step):
    return range(start, stop, step)


def evaluate_metrics(model: object, X_true, y_true,
                     params: dict) -> pd.DataFrame:
    y_pred_proba = model.predict_proba(X_true)
    metrics = pd.DataFrame()
    for thresh in make_rng(**params['model_options']['prob_threshold']):
        y_pred = (y_pred_proba[:,1] > (thresh / 100))
        tn, fp, fn, tp = confusion_matrix(y_true, y_pred).ravel()
        cur_metrics = pd.DataFrame(
        data={
            'prob_thresh_%': thresh,
            'accuracy'     : accuracy_score(y_true, y_pred),
            'precision'    : precision_score(y_true, y_pred),
            'recall'       : recall_score(y_true, y_pred),
            'f1'           : f1_score(y_true, y_pred),
            'roc_auc'      : roc_auc_score(y_true, y_pred),
            'tn'           : tn,
            'fp'           : fp,
            'fn'           : fn,
            'tp'           : tp,
            'loss'         : params['FP_cost'] * fp + params['FN_cost'] *fn,
        },
        index = [params['model_options']['name']]
        )
        metrics = pd.concat([metrics, cur_metrics], axis=0)
        logger.info(f"The best probability threshold for {params['model_options']['name']} model based on min loss: {metrics[metrics.loss==metrics.loss.min()]['prob_thresh_%'].iloc[0]}")
    return metrics

eval_metr = evaluate_metrics(model, X_test, y_test, params)

eval_metr

Unnamed: 0,prob_thresh_%,accuracy,precision,recall,f1,roc_auc,tn,fp,fn,tp,loss
rfc,30,0.78045,0.244555,0.381342,0.298002,0.608676,14677,2879,1512,932,1355.251005
rfc,31,0.7892,0.250703,0.364566,0.297099,0.60644,14893,2663,1553,891,1324.748479
rfc,32,0.7964,0.253632,0.342881,0.29158,0.601208,15090,2466,1606,838,1304.121617
rfc,33,0.8032,0.257635,0.324468,0.287215,0.597157,15271,2285,1651,793,1283.466917
rfc,34,0.80825,0.259092,0.306056,0.280623,0.592108,15417,2139,1696,748,1270.816559
rfc,35,0.815,0.266889,0.29419,0.279875,0.590846,15581,1975,1725,719,1246.67575
rfc,36,0.8209,0.271486,0.276596,0.274017,0.586635,15742,1814,1768,676,1229.673218
rfc,37,0.8259,0.274935,0.259411,0.266947,0.582086,15884,1672,1810,634,1216.55503
rfc,38,0.83135,0.281822,0.245499,0.26241,0.579203,16027,1529,1844,600,1199.52118
rfc,39,0.83555,0.286939,0.232815,0.257059,0.576136,16142,1414,1875,569,1187.508192


In [9]:
print(f"The best probability threshold based on min loss: {eval_metr[eval_metr.loss==eval_metr.loss.min()]['prob_thresh_%'].iloc[0]}")
eval_metr[eval_metr.loss==eval_metr.loss.min()]

The best probability threshold based on min loss: 61


Unnamed: 0,prob_thresh_%,accuracy,precision,recall,f1,roc_auc,tn,fp,fn,tp,loss
rfc,61,0.87455,0.396825,0.051146,0.090613,0.520162,17366,190,2319,125,1112.21153


In [10]:
%reload_kedro
params['model_options'] = catalog.load("params:candidate_model.model_options")

# Make a model an fit it
model = model_pipeline(params['model_options'], params)
model.fit(X_train, y_train)
model


0:	learn: 0.8037460	test: 0.8047763	best: 0.8047763 (0)	total: 99.2ms	remaining: 1m 39s
100:	learn: 0.9645390	test: 0.9654261	best: 0.9654261 (100)	total: 4.18s	remaining: 37.2s
200:	learn: 0.9695223	test: 0.9694964	best: 0.9694964 (200)	total: 8.16s	remaining: 32.5s
300:	learn: 0.9711634	test: 0.9705056	best: 0.9705056 (300)	total: 12.1s	remaining: 28s
400:	learn: 0.9723882	test: 0.9709697	best: 0.9709723 (399)	total: 16s	remaining: 23.9s
500:	learn: 0.9734699	test: 0.9711312	best: 0.9711312 (500)	total: 19.8s	remaining: 19.7s
Stopped by overfitting detector  (20 iterations wait)

bestTest = 0.9712140256
bestIteration = 557

Shrink model to first 558 iterations.


In [11]:

eval_metr = evaluate_metrics(model, X_test, y_test, params)

eval_metr

Unnamed: 0,prob_thresh_%,accuracy,precision,recall,f1,roc_auc,tn,fp,fn,tp,loss
catboost,30,0.8427,0.315457,0.245499,0.276116,0.585668,16254,1302,1844,600,1147.607301
catboost,31,0.84685,0.321408,0.227905,0.266699,0.58046,16380,1176,1887,557,1138.609111
catboost,32,0.85035,0.326156,0.21072,0.256028,0.575057,16492,1064,1929,515,1132.351789
catboost,33,0.8541,0.333801,0.194763,0.245995,0.570325,16606,950,1968,476,1124.254462
catboost,34,0.8569,0.339724,0.18126,0.236393,0.566109,16695,861,2001,443,1119.109299
catboost,35,0.8585,0.336717,0.162848,0.219526,0.559095,16772,784,2046,398,1122.238931
catboost,36,0.8616,0.34717,0.150573,0.210046,0.555578,16864,692,2076,368,1115.025069
catboost,37,0.86385,0.356037,0.141162,0.202168,0.552809,16932,624,2099,345,1110.073803
catboost,38,0.8651,0.358575,0.131751,0.192699,0.549471,16980,576,2122,322,1109.696447
catboost,39,0.8662,0.358191,0.119885,0.179644,0.544991,17031,525,2151,293,1111.398229


In [12]:
print(f"The best probability threshold based on min loss: {eval_metr[eval_metr.loss==eval_metr.loss.min()]['prob_thresh_%'].iloc[0]}")
eval_metr[eval_metr.loss==eval_metr.loss.min()]

The best probability threshold based on min loss: 38


Unnamed: 0,prob_thresh_%,accuracy,precision,recall,f1,roc_auc,tn,fp,fn,tp,loss
catboost,38,0.8651,0.358575,0.131751,0.192699,0.549471,16980,576,2122,322,1109.696447


> Catboost model with probability threshold 38% is better than RandomForest as it has lower losses.

# Model impact on business

## Outcome without a model

In [37]:
W = df.loan_amnt.sum()
print(f"Total loan amount issued: ${W:,}")

X = df.default_status.value_counts().loc[True]/df.shape[0]*100
print(f"% of loan default: {X:.2f}%")

Y = df.loan_amnt.mean()
print(f"Average loan amount issued: ${Y:,.2f}")

Z = df.int_rate.mean()
print(f"Average interest rate: {Z:.2f}%")

Q = df.loan_amnt.sum() * df.int_rate.mean()/100
print(f"Interest income (assuming 100% repayment): ${Q:,.2f}")

R = X/100 * W * df_loss['earning/loss'].loc[True]
print(f"Losses from default: ${R:,.2f}")

print(f"Net income interest without model: ${Q - R:,.2f}")

Total loan amount issued: $1,533,237,300.0
% of loan default: 12.43%
Average loan amount issued: $15,332.37
Average interest rate: 13.06%
Interest income (assuming 100% repayment): $200,190,767.98
Losses from default: $87,840,466.78
Net income interest without model: $112,350,301.20


## Outcomes with the model

In [49]:
best_model = eval_metr[eval_metr.loss==eval_metr.loss.min()]
J = (best_model['tp'] / (best_model['tp'] + best_model['fn'])).sum()
print(f"% of correctly predicted defaults: {J*100:.2f}%")

R_ = X/100 * (1 - J) * W * df_loss['earning/loss'].loc[True]

print(f"Losses from default: ${R_:,.2f}")

print(f"Net income interest without model: ${Q - R_:,.2f}")


% of correctly predicted defaults: 13.18%
Losses from default: $76,267,377.45
Net income interest without model: $123,923,390.53


> **CONCLUSIONS**

In [51]:
print(f"Reduce losses from defaults by: ${R -R_:,.2f}")
print(f"Reduce Non Performing Ratio from {R/W:.5f} to {R_/W:.5f}")

Reduce losses from defaults by: $11,573,089.32
Reduce Non Performing Ratio from 0.05729 to 0.04974
