In [None]:
import pandas as pd
import plotly.express as px

from sklearn.metrics import accuracy_score, f1_score

from sklearn.feature_selection import SelectKBest, f_classif, chi2
from imblearn.over_sampling import SMOTE
from imblearn.under_sampling import RandomUnderSampler

import numpy as np

from sklearn.model_selection import train_test_split

from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from tqdm.notebook import tqdm

In [None]:
# Read the dataset from a path relative to the notebook's working directory.
# The file name suggests it was cleaned upstream — TODO confirm.
data = pd.read_csv('data/clean_data.csv')

In [None]:
# Preview the first rows as a quick sanity check of the load.
data.head()

## imbalanced data

In [None]:
# Count rows (non-null UNIQUEID values) per LOAN_DEFAULT class.
df = (
    data.loc[:, ['LOAN_DEFAULT', 'UNIQUEID']]
    .groupby('LOAN_DEFAULT')
    .count()
    .reset_index()
)
df

In [None]:
# Bar chart of the per-class counts held in df.
fig = px.bar(data_frame=df, x='LOAN_DEFAULT', y='UNIQUEID')
fig.show()

## feature selection

In [None]:
# Inspect column dtypes before separating the feature groups.
data.dtypes

In [None]:
# Columns treated as numeric features. They form X_num and are scored with
# the ANOVA F-test (f_classif) during feature selection.
num_cols = [
    'DISBURSAL_AGE',
    'APPLICANTS_AGE',
    'CREDIT_HISTORY_LENGTH',
    'AVERAGE_ACCT_AGE',
    'PRI_SANCTIONED_AMOUNT',
    'PRI_CURRENT_BALANCE',
    'PRI_NO_OF_ACCTS',
    'PERFORM_CNS_SCORE',
    'LTV',
    'ASSET_COST',
    'DISBURSED_AMOUNT',
]
len(num_cols)

In [None]:
# Numeric feature frame and the prediction target (LOAN_DEFAULT).
X_num = data[num_cols]
y = data['LOAN_DEFAULT']

In [None]:
# Columns left out of both feature groups. LOAN_DEFAULT is the target;
# UNIQUEID and DISBURSAL_DATE are presumably a row identifier and a raw
# date — TODO confirm.
free_cols = [
    'DISBURSAL_DATE',
    'LOAN_DEFAULT',
    'UNIQUEID'
]

In [None]:
# Everything that is neither numeric nor excluded is treated as categorical.
X_cat = data.drop(columns=num_cols + free_cols)

In [None]:
# Names of the categorical feature columns, in frame order.
cat_cols = X_cat.columns.tolist()

In [None]:
# Number of categorical feature columns; used as the exclusive upper bound
# for k_cat when the search grid is built.
len(cat_cols)

## model training

In [None]:
def cal_metrices(y_pred, y_test):
    """Return ``[accuracy, f1]`` for predictions against the true labels.

    Args:
        y_pred: Predicted labels.
        y_test: Ground-truth labels for the same rows.

    Returns:
        list: ``[accuracy_score, f1_score]``, in that order.
    """
    # scikit-learn metrics expect (y_true, y_pred). Accuracy and binary F1
    # are symmetric, but keeping the documented order means an asymmetric
    # metric (precision, recall) can be added here without silently
    # swapping its meaning.
    return [
        accuracy_score(y_test, y_pred),
        f1_score(y_test, y_pred),
    ]

In [None]:
# Accumulates one row per fitted model:
# [accuracy, f1, k_num, k_cat, model label, sampler key].
# model_training appends to it, so re-run this cell to reset it before
# re-running the training loop.
MODELS_PERFORMANCES = []

In [None]:
# Resampling strategies keyed by a short label. A falsy value means the
# training split is used as-is (no resampling).
RANDOM_SAMPLER = {
    'rus': RandomUnderSampler(random_state=1),
    'sm': SMOTE(random_state=1),
    'rs': False,
}

In [None]:
def model_training(k_num, k_cat, rs):
    """Fit three classifiers on the k best features and log their test scores.

    The rows are split into train/test first; the feature selectors are then
    fitted on the training rows only and merely applied to the test rows, so
    no information from the test split influences which features are kept.

    Args:
        k_num: Number of numeric features to keep (ANOVA F-test).
        k_cat: Number of categorical features to keep (chi-square test).
        rs: Key into ``RANDOM_SAMPLER`` selecting the resampling strategy
            applied to the training split.

    Returns:
        None. Appends three rows to ``MODELS_PERFORMANCES`` (logistic
        regression, random forest, gradient boosting), each shaped
        ``[accuracy, f1, k_num, k_cat, model_label, rs]``. ``rs`` is recorded
        as NaN when the selected sampler is falsy (no resampling).
    """
    # Split before any fitting. Passing both frames and the target in one
    # call with a single random_state keeps their rows aligned.
    (X_num_train, X_num_test,
     X_cat_train, X_cat_test,
     y_train, y_test) = train_test_split(X_num, X_cat, y, random_state=7)

    # feature selection, fitted on training rows only
    fs_num = SelectKBest(score_func=f_classif, k=k_num)  # ANOVA F-test
    fs_cat = SelectKBest(score_func=chi2, k=k_cat)  # chi-square test
    x_train = np.concatenate(
        (fs_num.fit_transform(X_num_train, y_train),
         fs_cat.fit_transform(X_cat_train, y_train)),
        axis=1,
    )
    x_test = np.concatenate(
        (fs_num.transform(X_num_test), fs_cat.transform(X_cat_test)),
        axis=1,
    )

    # Resample the training split only; the test split keeps its original
    # class balance.
    sampler = RANDOM_SAMPLER[rs]
    if sampler:
        x_train_res, y_train_res = sampler.fit_resample(x_train, y_train)
    else:
        rs = np.nan  # logged as "no resampling" in the result rows
        x_train_res, y_train_res = x_train, y_train

    # models, fitted and logged in a fixed order
    candidates = (
        ('lr', LogisticRegression(max_iter=10000, random_state=0)),
        ('rf', RandomForestClassifier(max_depth=2, random_state=0)),
        ('gb', GradientBoostingClassifier(n_estimators=100, learning_rate=1.0,
                                          max_depth=1, random_state=0)),
    )
    for label, estimator in candidates:
        estimator.fit(x_train_res, y_train_res)
        scores = cal_metrices(estimator.predict(x_test), y_test)
        MODELS_PERFORMANCES.append(scores + [k_num, k_cat, label, rs])

In [None]:
# Search grid: every (k_num, k_cat, sampler key) combination. Both k ranges
# start at 2 and stop one short of the respective column count.
col_num_rs = [
    (k_num, k_cat, rs)
    for k_num in range(2, len(num_cols))
    for k_cat in range(2, len(cat_cols))
    for rs in RANDOM_SAMPLER
]

In [None]:
# Run the whole grid; each call appends three rows to MODELS_PERFORMANCES.
for combo in tqdm(col_num_rs):
    model_training(*combo)

## hyperparameter tuning of gradient boosting machine

In [None]:
# data splitting
# split the rows into training and testing sets before fitting anything, so
# the feature selectors below are scored on training rows only
(X_num_train, X_num_test,
 X_cat_train, X_cat_test,
 y_train, y_test) = train_test_split(X_num, X_cat, y, random_state=3)

# feature selection
# ANOVA F-test for numeric input and categorical output
fs_num = SelectKBest(score_func=f_classif, k=4)
X_num_train_selected = fs_num.fit_transform(X_num_train, y_train)
# chi-square test for categorical input and categorical output
fs_cat = SelectKBest(score_func=chi2, k=13)
X_cat_train_selected = fs_cat.fit_transform(X_cat_train, y_train)

# assemble the model matrices; the test rows are only transformed
x_train = np.concatenate((X_num_train_selected, X_cat_train_selected), axis=1)
x_test = np.concatenate(
    (fs_num.transform(X_num_test), fs_cat.transform(X_cat_test)), axis=1
)

# solving the imbalanced problem
# resampling is currently disabled; to undersample the training split use
# RANDOM_SAMPLER['rus'].fit_resample(x_train, y_train)
x_train_res, y_train_res = x_train, y_train


In [None]:
# Candidate values for the gradient boosting grid search.
estimators = [27, 53, 101, 213, 511]
depths = [1, 3, 5, 7, 9]

# Every (n_estimators, max_depth) pair, estimators varying slowest.
items = [
    (n_est, depth)
    for n_est in estimators
    for depth in depths
]

In [None]:
# Collects one [accuracy, f1, n_estimators, max_depth] row per grid point.
# The tuning loop appends to it; re-run this cell to reset it.
f1_acc=[]

In [None]:
# Fit one gradient boosting model per (n_estimators, max_depth) pair and
# record its test accuracy and F1 next to the pair.
for e, d in tqdm(items):
    gb = GradientBoostingClassifier(
        n_estimators=e,
        max_depth=d,
        learning_rate=1.0,
        subsample=1,
        random_state=0,
    )
    gb.fit(x_train_res, y_train_res)
    scores = cal_metrices(gb.predict(x_test), y_test)
    f1_acc.append(scores + [e, d])


In [None]:
# Tabulate the grid-search scores, keep runs with accuracy of at least 0.7,
# and show them ranked by F1, best first.
df = pd.DataFrame(f1_acc, columns=['acc', 'f1', 'est', 'depth'])
df = df.loc[df['acc'].ge(0.7)]
df.sort_values('f1', ascending=False)

In [None]:
# Final model: refit with a fixed n_estimators / max_depth pair on the same
# training data used for the grid search.
final_params = dict(
    n_estimators=213,
    max_depth=7,
    learning_rate=1.0,
    subsample=1,
    random_state=0,
)
model = GradientBoostingClassifier(**final_params).fit(x_train_res, y_train_res)

In [None]:
## Model saving

In [None]:
from datetime import datetime

from joblib import dump, load

# Timestamp the artifact so successive runs do not overwrite each other.
# NOTE: str(datetime.now()) contains spaces and colons, which are not valid
# in Windows file names; the format is kept to match previously saved files.
dump(model, f'gradient_boosting_loan_model_{datetime.now()}.joblib')

In [None]:
# Reload a model saved by an earlier run. The timestamp in the file name is
# hard-coded, so this only works when that exact file exists in the working
# directory. joblib.load unpickles its input, so only load trusted files.
model = load('gradient_boosting_loan_model_2022-11-27 23:47:26.313148.joblib')