## **Import Libraries** 

In [1]:
import pandas as pd 
import numpy as np
import plotly.express as px
import matplotlib.pyplot as plt

from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split
from xgboost import XGBClassifier
from lightgbm import LGBMClassifier
from sklearn.metrics import roc_auc_score, roc_curve
from sklearn.model_selection import KFold, StratifiedKFold
from sklearn.impute import SimpleImputer

import os  
import gc

from google.colab import drive

## **Functions**

In [2]:
def reduce_memory_usage(df):
    '''
        This function reduces the memory usage of the dataframes '''
  
    start_mem = df.memory_usage().sum() / 1024**2
    print('Memory usage of dataframe is {:.2f} MB'.format(start_mem))
    
    for col in df.columns:
        col_type = df[col].dtype
        
        if col_type != object:
            c_min = df[col].min()
            c_max = df[col].max()
            if str(col_type)[:3] == 'int':
                if c_min > np.iinfo(np.int8).min and c_max < np.iinfo(np.int8).max:
                    df[col] = df[col].astype(np.int8)
                elif c_min > np.iinfo(np.int16).min and c_max < np.iinfo(np.int16).max:
                    df[col] = df[col].astype(np.int16)
                elif c_min > np.iinfo(np.int32).min and c_max < np.iinfo(np.int32).max:
                    df[col] = df[col].astype(np.int32)
                elif c_min > np.iinfo(np.int64).min and c_max < np.iinfo(np.int64).max:
                    df[col] = df[col].astype(np.int64)  
            else:
                if c_min > np.finfo(np.float16).min and c_max < np.finfo(np.float16).max:
                    df[col] = df[col].astype(np.float16)
                elif c_min > np.finfo(np.float32).min and c_max < np.finfo(np.float32).max:
                    df[col] = df[col].astype(np.float32)
                else:
                    df[col] = df[col].astype(np.float64)

    end_mem = df.memory_usage().sum() / 1024**2
    print('Memory usage after optimization is: {:.2f} MB'.format(end_mem))
    print('Decreased by {:.1f}%'.format(100 * (start_mem - end_mem) / start_mem))
    
    return df

## **Reading Data** 

In [3]:
drive.mount('/content/gdrive')

Mounted at /content/gdrive


In [4]:
df = reduce_memory_usage(pd.read_csv('/content/gdrive/MyDrive/Home_Credit_Default_Risk/df_cleaned.csv'))
df.head()

Memory usage of dataframe is 478.59 MB
Memory usage after optimization is: 147.21 MB
Decreased by 69.2%


Unnamed: 0,index,SK_ID_CURR,TARGET,NAME_CONTRACT_TYPE,CODE_GENDER,FLAG_OWN_CAR,AMT_INCOME_TOTAL,AMT_CREDIT,AMT_ANNUITY,AMT_GOODS_PRICE,...,CLOSED_AMT_CREDIT_SUM_DEBT_MEAN,CLOSED_AMT_CREDIT_SUM_DEBT_SUM,CLOSED_AMT_CREDIT_SUM_OVERDUE_MEAN,CLOSED_AMT_CREDIT_SUM_LIMIT_MEAN,CLOSED_AMT_CREDIT_SUM_LIMIT_SUM,CLOSED_CNT_CREDIT_PROLONG_SUM,CLOSED_MONTHS_BALANCE_MIN_MIN,CLOSED_MONTHS_BALANCE_MAX_MAX,CLOSED_MONTHS_BALANCE_SIZE_MEAN,CLOSED_MONTHS_BALANCE_SIZE_SUM
0,0.0,100002.0,1.0,0.0,0.0,0.0,202500.0,406597.5,24700.5,351000.0,...,0.0,0.0,0.0,0.0,0.0,0.0,-47.0,0.0,15.0,90.0
1,1.0,100003.0,0.0,0.0,1.0,0.0,270000.0,1293502.5,35698.5,1129500.0,...,0.0,0.0,0.0,0.0,0.0,0.0,-59.96875,-3.605469,33.46875,0.0
2,2.0,100004.0,0.0,1.0,0.0,1.0,67500.0,135000.0,6750.0,135000.0,...,0.0,0.0,0.0,0.0,0.0,0.0,-59.96875,-3.605469,33.46875,0.0
3,3.0,100006.0,0.0,0.0,1.0,0.0,135000.0,312682.5,29686.5,297000.0,...,2739.62793,9328.423828,3.859963,1089.433228,3677.14209,0.017029,-59.96875,-3.605469,33.46875,48.28125
4,4.0,100007.0,0.0,0.0,0.0,0.0,121500.0,513000.0,21865.5,513000.0,...,0.0,0.0,0.0,0.0,0.0,0.0,-59.96875,-3.605469,33.46875,0.0


## **Metric Selection** 

**ROC_AUC Score**: 

In this particular Kaggle Competition, note that the ROC_AUC Score is the performance metric that has been chosen to be optimised by the organisers. An ROC Curve is insensitive to class imbalance and the ROC Curve is the most commonly used method to visualise the performance of a Binary Classifier (plotting for the True Positive Rate and False Positive Rate Values for a particular threshold). The ROC_AUC Score is one of the best ways to summarise the model performance in a single number. The higher this score, better is the model performance. 

## **Model Selection** 

## **Model Training** 

### **1. LightGBM Model**

In [7]:
# Parameters from Tilii kernel: 
def kfold_lightgbm(df, num_folds, stratified = True):
    # Divide in training/validation and test data
    train_df = df.iloc[0:300000]
    test_df = df.iloc[300000:]
    print("Starting LightGBM. Train shape: {}, test shape: {}".format(train_df.shape, test_df.shape))
    del df
    gc.collect()
    # Cross validation model
    if stratified:
        folds = StratifiedKFold(n_splits= num_folds, shuffle=True, random_state=1001)
    else:
        folds = KFold(n_splits= num_folds, shuffle=True, random_state=1001)
    # Create arrays and dataframes to store results
    oof_preds = np.zeros(train_df.shape[0])
    sub_preds = np.zeros(test_df.shape[0])
    feature_importance_df = pd.DataFrame()
    feats = [f for f in train_df.columns if f not in ['TARGET','SK_ID_CURR','SK_ID_BUREAU','SK_ID_PREV','index']]
    
    for n_fold, (train_idx, valid_idx) in enumerate(folds.split(train_df[feats], train_df['TARGET'])):
        train_x, train_y = train_df[feats].iloc[train_idx], train_df['TARGET'].iloc[train_idx]
        valid_x, valid_y = train_df[feats].iloc[valid_idx], train_df['TARGET'].iloc[valid_idx]

        # LightGBM parameters found by Bayesian optimization
        clf = LGBMClassifier(
            nthread=4,
            n_estimators=10000,
            learning_rate=0.02,
            num_leaves=34,
            colsample_bytree=0.9497036,
            subsample=0.8715623,
            max_depth=8,
            reg_alpha=0.041545473,
            reg_lambda=0.0735294,
            min_split_gain=0.0222415,
            min_child_weight=39.3259775,
            silent=-1,
            verbose=-1, )

        clf.fit(train_x, train_y, eval_set=[(train_x, train_y), (valid_x, valid_y)], 
            eval_metric= 'auc', verbose= 200, early_stopping_rounds= 200)

        oof_preds[valid_idx] = clf.predict_proba(valid_x, num_iteration=clf.best_iteration_)[:, 1]
        sub_preds += clf.predict_proba(test_df[feats], num_iteration=clf.best_iteration_)[:, 1] / folds.n_splits

        fold_importance_df = pd.DataFrame()
        fold_importance_df["feature"] = feats
        fold_importance_df["importance"] = clf.feature_importances_
        fold_importance_df["fold"] = n_fold + 1
        feature_importance_df = pd.concat([feature_importance_df, fold_importance_df], axis=0)
        print('Fold %2d AUC : %.6f' % (n_fold + 1, roc_auc_score(valid_y, oof_preds[valid_idx])))
        del clf, train_x, train_y, valid_x, valid_y
        gc.collect()

    print('Full AUC score %.6f' % roc_auc_score(train_df['TARGET'], oof_preds))
    display_importances(feature_importance_df)
    return feature_importance_df

# Display/plot feature importance
def display_importances(feature_importance_df_):
    cols = feature_importance_df_[["feature", "importance"]].groupby("feature").mean().sort_values(by="importance", ascending=False)[:40].index
    best_features = feature_importance_df_.loc[feature_importance_df_.feature.isin(cols)]
    plt.figure(figsize=(8, 10))
    sns.barplot(x="importance", y="feature", data=best_features.sort_values(by="importance", ascending=False))
    plt.title('LightGBM Features (avg over folds)')
    plt.tight_layout()
    plt.savefig('lgbm_importances01.png')

In [8]:
feat_importance = kfold_lightgbm(df, num_folds= 5, stratified= True)

Starting LightGBM. Train shape: (300000, 204), test shape: (7501, 204)
Training until validation scores don't improve for 200 rounds.
[200]	training's binary_logloss: 0.239358	training's auc: 0.782134	valid_1's binary_logloss: 0.244746	valid_1's auc: 0.762404
[400]	training's binary_logloss: 0.231684	training's auc: 0.803516	valid_1's binary_logloss: 0.241833	valid_1's auc: 0.771227
[600]	training's binary_logloss: 0.226339	training's auc: 0.818476	valid_1's binary_logloss: 0.240621	valid_1's auc: 0.775026
[800]	training's binary_logloss: 0.222103	training's auc: 0.830138	valid_1's binary_logloss: 0.240184	valid_1's auc: 0.776288
[1000]	training's binary_logloss: 0.218126	training's auc: 0.840795	valid_1's binary_logloss: 0.239901	valid_1's auc: 0.777192
[1200]	training's binary_logloss: 0.214292	training's auc: 0.850449	valid_1's binary_logloss: 0.239727	valid_1's auc: 0.777933
[1400]	training's binary_logloss: 0.210791	training's auc: 0.859133	valid_1's binary_logloss: 0.239709	valid

NameError: ignored

In [None]:
feat_importance

## **Model Tuning** 

## **Model Saving** 

In [None]:
pickle.dump(model_1, open('/content/gdrive/MyDrive/Home_Credit_Default_Risk/clf.pkl', 'wb'))

## **Testing Data Preprocessing** 

## **Inference**

In [None]:
pickled_model_1 = pickle.load(open('/content/gdrive/MyDrive/Home_Credit_Default_Risk/model_1.pkl', 'rb'))

In [None]:
pickled_model_1.predict(X_test)

## **Insights and Final Thoughts** 

## **Future Work** 

## **References**

[1] M. Qamruzzaman and W. Jianguo, “Financial
innovation and economic growth in Bangladesh,”
Financ. Innov., 2017, doi: 10.1186/s408540-0070-017-.

[2] F. D. I. Corporation, “2017 FDIC national survey
of unbanked and underbanked households.” Federal
Deposit Insurance Corporation Washington, DC, 2018.

[3] M. Alam, “Risk prediction of loan default using
knowledge graph,” 2022.

[4] J. P. Simon, “Artificial intelligence: scope, players,
markets and geography,” Digit. Policy, Regul. Gov. ,2019, doi: 10.1108/DPRG-080039-2018-.

[5] C. Janiesch, P. Zschech, and K. Heinrich, “Machine
learning and deep learning,” Electron. Mark., 2021, doi:
10.1007/s125252-00475-021-.

[6] U. Kose, “Using artificial intelligence techniques
for economic time series prediction,” in Contemporary
Studies in Economic and Financial Analysis, 2019. doi:
10.1108/S1569375920190000101002-.

[7] K. Mungai and A. Bayat, “The impact of big data on
the South African banking industry,” 2018.

[8] V. Dhar, “Data science and prediction,” Commun.
ACM, 2013, doi: 10.11452500499/.

[9] A. Nabil, “Data Science in FinTech: credit risk
prediction using Deep Learning,” ETSI_Informatica,
2020.

[10] Home Credit Group, “Home Credit Default Risk
DataSet,” Kaggle, 2018.

[11] P. Beck, “Predicting Loan Default Likelihood Using
Machine Learning,” 2021.

[12] Y. E. Gundogmus, M. Nuhuz, and M. Tez, “Riskbased
Fraud Analysis for Bank Loans with Autonomous
Machine Learning,” in y-BIS 2019 Conference Book:
Recent Advances n Data Sc ence and Bus ness Analyt
cs, 2019, p. 143.

[13] X. Chen, X. Liu, Z. Liu, P. Song, and M. Zhong,
“A deep learning approach using DeepGBM for credit
assessment,” 2019. doi: 10.11453366194.3366333/.

[14] Dall›asta Rigo, Elif Yağmur. Evaluation of stacking
for predicting credit risk scores. Diss. Applied Data
Science, 2020.

[15] Y. Tounsi, H. Anoun, and L. Hassouni, “CSMAS:
Improving Multi-Agent Credit Scoring System
by Integrating Big Data and the new generation
of Gradient Boosting Algorithms,” 2020. doi:
10.11453386723.3387851/.

[16] G. Ke et al., “LightGBM: A highly efficient gradient
boosting decision tree,” 2017.

[17] C. Egan, “Improving Credit Default Prediction
Using Explainable AI,” Dublin, National College of
Ireland, 2021.

[18] Rodríguez P, Bautista MA, Gonzalez J, Escalera
S. Beyond one-hot encoding: Lower dimensional target
embedding. Image and Vision Computing. 2018 Jul
1;75:2131-.

[19] Boubiche S, Boubiche DE, Bilami A, Toral-Cruz H.
Big data challenges and data aggregation strategies
in wireless sensor networks. IEEE access. 2018 May
3;6:2055871-.

[20] Fushiki T. Estimation of prediction error by using
K-fold cross-validation. Statistics and Computing.
2011 Apr;21(2):13746-.

https://www.kaggle.com/competitions/home-credit-default-risk/code 

https://www.kaggle.com/competitions/home-credit-default-risk/overview

https://www.kaggle.com/competitions/home-credit-default-risk/data

https://www.kaggle.com/rinnqd/reduce-memory-usage

https://academic.oup.com/bioinformatics/article/26/10/1340/193348

https://www.kaggle.com/tilii7/olivier-lightgbm-parameters-by-bayesian-opt/code