In [65]:
## Loan neccessary libraries

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import warnings
import os
from sklearn.base import BaseEstimator, TransformerMixin
import category_encoders as ce
from imblearn.over_sampling import SMOTE
from sklearn.model_selection import StratifiedKFold
from imblearn.pipeline import make_pipeline
from sklearn.preprocessing import MinMaxScaler
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier, HistGradientBoostingClassifier, VotingClassifier, StackingClassifier
from xgboost import XGBClassifier # type: ignore
from lightgbm import LGBMClassifier # type: ignore
from sklearn.metrics import roc_auc_score
from catboost import CatBoostClassifier # type: ignore
import bayes_opt
from scipy.stats import rankdata

In [3]:
warnings.filterwarnings("ignore")

In [4]:
## Load the dataset
for file in os.listdir():
    if file[-3:] == "csv":
        globals()[f"{file[:-4]}"] = pd.read_csv(f"./{file}")

In [5]:
#### Explore the data set

def explore(df, train=True):
    
    ## check the dataset shape
    print(f"The dataset has {df.shape[0]:,} rows and {df.shape[1]:,} columns")
    
    ## check for dupicates
    print(f"\nThe dataset has {df.duplicated().sum():,} duplicate records\n")
    
    ## check for missing values
    print("Check for missing value")
    for col in df.columns:
        print(f"{col}: {df[col].isna().sum()} NAs")
        
    ## Display dataset statistics
    print("Dataset Statistics")
    print(f"{df.describe()}")
        
    ## Check the dataset info
    print("\nDataset information")
    print(f"{df.info()}")
    
    ## check class balance
    if train:
        plt.figure(figsize=(6,4))
        sns.countplot(df, x="loan_status")
        plt.title("Target Class Distribution")
        plt.xlabel("Target Class")
        plt.ylabel("Count")
        plt.tight_layout()
        plt.show()
        

    
        
        

In [6]:
explore(test, train=False)

The dataset has 39,098 rows and 12 columns

The dataset has 0 duplicate records

Check for missing value
id: 0 NAs
person_age: 0 NAs
person_income: 0 NAs
person_home_ownership: 0 NAs
person_emp_length: 0 NAs
loan_intent: 0 NAs
loan_grade: 0 NAs
loan_amnt: 0 NAs
loan_int_rate: 0 NAs
loan_percent_income: 0 NAs
cb_person_default_on_file: 0 NAs
cb_person_cred_hist_length: 0 NAs
Dataset Statistics
                 id    person_age  person_income  person_emp_length  \
count  39098.000000  39098.000000   3.909800e+04       39098.000000   
mean   78193.500000     27.566781   6.406046e+04           4.687068   
std    11286.764749      6.032761   3.795583e+04           3.868395   
min    58645.000000     20.000000   4.000000e+03           0.000000   
25%    68419.250000     23.000000   4.200000e+04           2.000000   
50%    78193.500000     26.000000   5.800000e+04           4.000000   
75%    87967.750000     30.000000   7.588500e+04           7.000000   
max    97742.000000     94.000000   

In [7]:
###mPreprocessing

train.head()

Unnamed: 0,id,person_age,person_income,person_home_ownership,person_emp_length,loan_intent,loan_grade,loan_amnt,loan_int_rate,loan_percent_income,cb_person_default_on_file,cb_person_cred_hist_length,loan_status
0,0,37,35000,RENT,0.0,EDUCATION,B,6000,11.49,0.17,N,14,0
1,1,22,56000,OWN,6.0,MEDICAL,C,4000,13.35,0.07,N,2,0
2,2,29,28800,OWN,8.0,PERSONAL,A,6000,8.9,0.21,N,10,0
3,3,30,70000,RENT,14.0,VENTURE,B,12000,11.11,0.17,N,5,0
4,4,22,60000,RENT,2.0,MEDICAL,A,6000,6.92,0.1,N,3,0


In [8]:
## Create custom class to drop column id
class drop_id(BaseEstimator, TransformerMixin):
    def __init__(self):
        self.col = "id"
    def fit(self, x,y=None):
        """No fit because there is no learning"""
        return self
    def transform(self, x):
        x_new = x.copy()
        x_new.drop(columns= self.col, inplace= True)
        return x_new
    

## creat custom class to encode categorical variables
class encoder(BaseEstimator, TransformerMixin):
    def __init__(self):
        self.encoder = None
        self.non_numeric_cols = None
        
    def fit(self, x, y):
        self.non_numeric_cols = x.select_dtypes(exclude = [np.number]).columns
        self.encoder = ce.TargetEncoder(cols=self.non_numeric_cols)
        self.encoder.fit(x[self.non_numeric_cols], y)
        return self
    
    def transform(self, x):
        x_new = x.copy()
        x_new[self.non_numeric_cols] = self.encoder.transform(x_new[self.non_numeric_cols])
        return x_new

In [9]:
## split target and features
target = train[["loan_status"]]
features = train.drop(columns=["loan_status"])

In [10]:
## create preprocessing pipeline
preprocessing_pipeline = make_pipeline(drop_id(), encoder(),MinMaxScaler(),SMOTE())

## preprocess the trainset
x_train, y_train = preprocessing_pipeline.fit_resample(features,target)

## clean the test set
test_clean = pd.DataFrame(preprocessing_pipeline[:-1].transform(test), columns=test.columns[1:])

## convert output to dataframe
x_train = pd.DataFrame(x_train, columns=features.columns[1:])
y_train = pd.DataFrame(y_train, columns=target.columns)

In [11]:
x_train.head()

Unnamed: 0,person_age,person_income,person_home_ownership,person_emp_length,loan_intent,loan_grade,loan_amnt,loan_int_rate,loan_percent_income,cb_person_default_on_file,cb_person_cred_hist_length
0,0.165049,0.016246,1.0,0.0,0.15475,0.085097,0.15942,0.341011,0.204819,0.0,0.428571
1,0.019417,0.027324,0.0,0.04878,0.885385,0.138073,0.101449,0.445506,0.084337,0.0,0.0
2,0.087379,0.012976,0.0,0.065041,0.414333,0.0,0.15942,0.195506,0.253012,0.0,0.285714
3,0.097087,0.034708,1.0,0.113821,0.0,0.085097,0.333333,0.319663,0.204819,0.0,0.107143
4,0.019417,0.029433,1.0,0.01626,0.885385,0.0,0.15942,0.08427,0.120482,0.0,0.035714


In [12]:

## initialize some pararameters in Randomforest to optimize
criteria = ["gini", "entropy", "log_loss"]
maximum_features = ["sqrt", "log2"]

## set fold value
fold = 10

### create the objective function for random forest
def random_forest_clf(criterion, max_depth,max_features, max_samples ):
    
    roc_auc_scores = []
    
    ## create the parameter dictionary    
    params = {"n_estimators": 100,
              "criterion": criteria[int(round(criterion))],
              "max_depth": int(round(max_depth)),
              "max_features": maximum_features[int(round(max_features))],
              "random_state" : 42,
              "max_samples": max_samples,
              "n_jobs" :-1
              }
    
    ## using stratified-10-fold to train model
    skfold = StratifiedKFold(n_splits=fold,shuffle=True, random_state=42)
    
    ## loop through folds
    for train_idx,val_idx in skfold.split(x_train,y_train):
        train_x = x_train.iloc[train_idx]
        train_y = y_train.iloc[train_idx]
        val_x = x_train.iloc[val_idx]
        val_y = y_train.iloc[val_idx]
        
        model = RandomForestClassifier(**params)
        model.fit(train_x, train_y)
        pred_prob = model.predict_proba(val_x)[:,1]
        roc_auc = roc_auc_score(val_y, pred_prob)
        roc_auc_scores.append(roc_auc)
    
    return np.mean(roc_auc_scores)
        

### set the parameter bound
pbound = {"criterion": (0,2),
          "max_depth": (5,25),
          "max_features": (0,1),
          "max_samples": (0.7, 1)}

## optimise the parameter
bayes_optm = bayes_opt.BayesianOptimization(f=random_forest_clf, pbounds=pbound,verbose=2,random_state=42 )
bayes_optm.maximize(init_points=20, n_iter=40)




        
        


|   iter    |  target   | criterion | max_depth | max_fe... | max_sa... |
-------------------------------------------------------------------------
| [39m1        [39m | [39m0.9929   [39m | [39m0.7491   [39m | [39m24.01    [39m | [39m0.732    [39m | [39m0.8796   [39m |
| [39m2        [39m | [39m0.9566   [39m | [39m0.312    [39m | [39m8.12     [39m | [39m0.05808  [39m | [39m0.9599   [39m |
| [39m3        [39m | [39m0.9911   [39m | [39m1.202    [39m | [39m19.16    [39m | [39m0.02058  [39m | [39m0.991    [39m |
| [39m4        [39m | [39m0.9638   [39m | [39m1.665    [39m | [39m9.247    [39m | [39m0.1818   [39m | [39m0.755    [39m |
| [39m5        [39m | [39m0.9861   [39m | [39m0.6085   [39m | [39m15.5     [39m | [39m0.4319   [39m | [39m0.7874   [39m |
| [39m6        [39m | [39m0.9575   [39m | [39m1.224    [39m | [39m7.79     [39m | [39m0.2921   [39m | [39m0.8099   [39m |
| [39m7        [39m | [39m0.992    [39m | [

In [13]:
## fetch the best parameters
best_params_rf = bayes_optm.max["params"]

params_rf = {"n_estimators": 100,
              "criterion": criteria[int(round(best_params_rf["criterion"]))],
              "max_depth": int(round(best_params_rf["max_depth"])),
              "max_features": maximum_features[int(round(best_params_rf["max_features"]))],
              "random_state" : 42,
              "max_samples": best_params_rf["max_samples"],
              "n_jobs" :-1
              }

oof_train_rf = np.zeros(len(x_train))
rf_prediction = np.zeros(len(test_clean))
train_roc_auc_score_rf = []
val_roc_auc_score_rf = []


### build the model using the best parameters
## using stratified-10-fold to train model
skfold = StratifiedKFold(n_splits=fold,shuffle=True, random_state=42)
    
## loop through folds
for train_idx,val_idx in skfold.split(x_train,y_train):
    train_x = x_train.iloc[train_idx]
    train_y = y_train.iloc[train_idx]
    val_x = x_train.iloc[val_idx]
    val_y = y_train.iloc[val_idx]
    
    # fit model    
    model = RandomForestClassifier(**params_rf)
    model.fit(train_x, train_y)
    
    ## make prediction on validation set
    pred_xval = model.predict_proba(val_x)[:,1]
    val_roc = roc_auc_score(val_y, pred_xval)
    val_roc_auc_score_rf.append(val_roc)
    oof_train_rf[val_idx] = pred_xval
    
    ## make prediction on train set
    pred_xtrain = model.predict_proba(train_x)[:,1]
    train_roc = roc_auc_score(train_y, pred_xtrain)
    train_roc_auc_score_rf.append(train_roc)
    
    
    ## make prediction on test set    
    rf_prediction += model.predict_proba(test_clean)[:,1]
    
    




In [14]:
##Update the submission file target variable with the predictions
sample_submission.loan_status = rf_prediction/fold

##save the submission to file
sample_submission.to_csv("submission_rf.csv", index=False)

In [15]:
### Solution 2 using Xgboost

## initialize some pararameters in XGBM to optimize

## set fold value
fold = 10

### create the objective function for random forest
def xgbm_clf(learning_rate, max_depth, gamma, subsample, colsample_bytree,reg_alpha,reg_lambda  ):
    
    roc_auc_scores = []
    
    ## create the parameter dictionary    
    params = {"n_estimators": 100,
              "learning_rate": learning_rate,
              "max_depth": int(round(max_depth)),
              "gamma": gamma,
              "subsample" : subsample,
              "colsample_bytree": colsample_bytree,
              "reg_alpha": reg_alpha,
              "reg_lambda" : reg_lambda,
              "eval_metric" : "auc",
              "n_jobs" : -1,
              "early_stopping_rounds" : 50
              }
    
    ## using stratified-10-fold to train model
    skfold = StratifiedKFold(n_splits=fold,shuffle=True, random_state=42)
    
    ## loop through folds
    for train_idx,val_idx in skfold.split(x_train,y_train):
        train_x = x_train.iloc[train_idx]
        train_y = y_train.iloc[train_idx]
        val_x = x_train.iloc[val_idx]
        val_y = y_train.iloc[val_idx]
        
        model = XGBClassifier(**params)
        model.fit(train_x, train_y, eval_set = [(val_x,val_y)])
        pred_prob = model.predict_proba(val_x)[:,1]
        roc_auc = roc_auc_score(val_y, pred_prob)
        roc_auc_scores.append(roc_auc)
    
    return np.mean(roc_auc_scores)
        

### set the parameter bound
pbound = {"learning_rate": (0.01,0.1),
          "max_depth": (5,25),
          "gamma" : (0,1),
          "subsample": (0.7,1),
          "colsample_bytree": (0.7,1),
          "reg_lambda": (0,1),
          "reg_alpha": (0, 1)}

## optimise the parameter
bayes_optm_xgbm = bayes_opt.BayesianOptimization(f=xgbm_clf, pbounds=pbound,verbose=2,random_state=42 )
bayes_optm_xgbm.maximize(init_points=20, n_iter=40)




        
        


|   iter    |  target   | colsam... |   gamma   | learni... | max_depth | reg_alpha | reg_la... | subsample |
-------------------------------------------------------------------------------------------------------------
[0]	validation_0-auc:0.95798
[1]	validation_0-auc:0.97633
[2]	validation_0-auc:0.98017
[3]	validation_0-auc:0.98376
[4]	validation_0-auc:0.98423
[5]	validation_0-auc:0.98508
[6]	validation_0-auc:0.98592
[7]	validation_0-auc:0.98654
[8]	validation_0-auc:0.98688
[9]	validation_0-auc:0.98720
[10]	validation_0-auc:0.98769
[11]	validation_0-auc:0.98779
[12]	validation_0-auc:0.98856
[13]	validation_0-auc:0.98872
[14]	validation_0-auc:0.98873
[15]	validation_0-auc:0.98906
[16]	validation_0-auc:0.98950
[17]	validation_0-auc:0.98987
[18]	validation_0-auc:0.99001
[19]	validation_0-auc:0.99013
[20]	validation_0-auc:0.99019
[21]	validation_0-auc:0.99047
[22]	validation_0-auc:0.99070
[23]	validation_0-auc:0.99075
[24]	validation_0-auc:0.99091
[25]	validation_0-auc:0.99116
[26]	valid

In [16]:
## fetch the best parameters for xgbm
best_params_xgbm = bayes_optm_xgbm.max["params"]

params_xgbm = {"n_estimators": 1000,
              "learning_rate": best_params_xgbm["learning_rate"],
              "max_depth": int(round(best_params_xgbm["max_depth"])),
              "gamma": best_params_xgbm["gamma"],
              "subsample" : best_params_xgbm["subsample"],
              "colsample_bytree": best_params_xgbm["colsample_bytree"],
              "random_state" : 42,
              "reg_alpha": best_params_xgbm["reg_alpha"],
              "reg_lambda" : best_params_xgbm["reg_lambda"],
              "eval_metric" : "auc",
              "n_jobs" :-1,
              "early_stopping_rounds" : 50
              }



oof_train_xgbm = np.zeros(len(x_train))
xgbm_prediction = np.zeros(len(test_clean))
train_roc_auc_score_xgbm = []
val_roc_auc_score_xgbm = []


### build the model using the best parameters
## using stratified-10-fold to train model
skfold = StratifiedKFold(n_splits=fold,shuffle=True, random_state=42)
    
## loop through folds
for train_idx,val_idx in skfold.split(x_train,y_train):
    train_x = x_train.iloc[train_idx]
    train_y = y_train.iloc[train_idx]
    val_x = x_train.iloc[val_idx]
    val_y = y_train.iloc[val_idx]
    
    # fit model    
    model = XGBClassifier(**params_xgbm)
    model.fit(train_x, train_y, eval_set = [(val_x,val_y)])
    
    ## make prediction on validation set
    pred_xval = model.predict_proba(val_x)[:,1]
    val_roc = roc_auc_score(val_y, pred_xval)
    val_roc_auc_score_xgbm.append(val_roc)
    oof_train_xgbm[val_idx] = pred_xval
    
    ## make prediction on train set
    pred_xtrain = model.predict_proba(train_x)[:,1]
    train_roc = roc_auc_score(train_y, pred_xtrain)
    train_roc_auc_score_xgbm.append(train_roc)
    
    
    ## make prediction on test set    
    xgbm_prediction += model.predict_proba(test_clean)[:,1]
    
    




[0]	validation_0-auc:0.96503
[1]	validation_0-auc:0.97515
[2]	validation_0-auc:0.97887
[3]	validation_0-auc:0.97971
[4]	validation_0-auc:0.98220
[5]	validation_0-auc:0.98253
[6]	validation_0-auc:0.98452
[7]	validation_0-auc:0.98527
[8]	validation_0-auc:0.98645
[9]	validation_0-auc:0.98741
[10]	validation_0-auc:0.98727
[11]	validation_0-auc:0.98765
[12]	validation_0-auc:0.98811
[13]	validation_0-auc:0.98856
[14]	validation_0-auc:0.98872
[15]	validation_0-auc:0.98946
[16]	validation_0-auc:0.98970
[17]	validation_0-auc:0.99005
[18]	validation_0-auc:0.99023
[19]	validation_0-auc:0.99052
[20]	validation_0-auc:0.99075
[21]	validation_0-auc:0.99070
[22]	validation_0-auc:0.99112
[23]	validation_0-auc:0.99126
[24]	validation_0-auc:0.99149
[25]	validation_0-auc:0.99156
[26]	validation_0-auc:0.99161
[27]	validation_0-auc:0.99159
[28]	validation_0-auc:0.99159
[29]	validation_0-auc:0.99171
[30]	validation_0-auc:0.99185
[31]	validation_0-auc:0.99199
[32]	validation_0-auc:0.99210
[33]	validation_0-au

In [17]:
##Update the submission file target variable with the predictions
sample_submission.loan_status = xgbm_prediction/fold

##save the submission to file
sample_submission.to_csv("submission_xgbm.csv", index=False)

In [18]:
### Solution 3 using Lightboost

## initialize some pararameters in XGBM to optimize

## set fold value
fold = 10

### create the objective function for random forest
def lightgbm_clf(learning_rate, max_depth, num_leaves, subsample, colsample_bytree,reg_alpha,reg_lambda):
    
    roc_auc_scores = []
    
    ## create the parameter dictionary    
    params = {"n_estimators": 100,
              "learning_rate": learning_rate,
              "max_depth": int(round(max_depth)),
              "num_leaves": int(round(num_leaves)),
              "subsample" : subsample,
              "colsample_bytree": colsample_bytree,
              "random_state" : 42,
              "reg_alpha": reg_alpha,
              "reg_lambda" : reg_lambda,
              "n_jobs" :-1,
              "objective": "binary",
              "early_stopping_rounds" :50
              }
    
    ## using stratified-10-fold to train model
    skfold = StratifiedKFold(n_splits=fold,shuffle=True, random_state=42)
    
    ## loop through folds
    for train_idx,val_idx in skfold.split(x_train,y_train):
        train_x = x_train.iloc[train_idx]
        train_y = y_train.iloc[train_idx]
        val_x = x_train.iloc[val_idx]
        val_y = y_train.iloc[val_idx]
        
        model = LGBMClassifier(**params)
        model.fit(train_x, train_y, eval_set = [(val_x,val_y)], eval_metric="auc")
        pred_prob = model.predict_proba(val_x)[:,1]
        roc_auc = roc_auc_score(val_y, pred_prob)
        roc_auc_scores.append(roc_auc)
    
    return np.mean(roc_auc_scores)
        

### set the parameter bound
pbound = {"learning_rate": (0.01,0.1),
          "max_depth": (5,25),
          "num_leaves" : (31,200),
          "subsample": (0.7,1),
          "colsample_bytree": (0.7,1),
          "reg_lambda": (0,1),
          "reg_alpha": (0, 1)}

## optimise the parameter
bayes_optm_lightgbm = bayes_opt.BayesianOptimization(f=lightgbm_clf, pbounds=pbound,verbose=2,random_state=42 )
bayes_optm_lightgbm.maximize(init_points=20, n_iter=40)


|   iter    |  target   | colsam... | learni... | max_depth | num_le... | reg_alpha | reg_la... | subsample |
-------------------------------------------------------------------------------------------------------------
[LightGBM] [Info] Number of positive: 45266, number of negative: 45265
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.001722 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 2372
[LightGBM] [Info] Number of data points in the train set: 90531, number of used features: 11
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.500006 -> initscore=0.000022
[LightGBM] [Info] Start training from score 0.000022
Training until validation scores don't improve for 50 rounds
Did not meet early stopping. Best iteration is:
[100]	valid_0's auc: 0.993831	valid_0's binary_logloss: 0.0938303
[LightGBM] [Info] Number of positive: 45266, numb

In [19]:
## fetch the best parameters for lightgbm
best_params_lightgbm = bayes_optm_lightgbm.max["params"]

params_lightgbm = {"n_estimators": 100,
              "learning_rate": best_params_lightgbm["learning_rate"],
              "max_depth": int(round(best_params_lightgbm["max_depth"])),
              "num_leaves": int(round(best_params_lightgbm["num_leaves"])),
              "subsample" : best_params_lightgbm["subsample"],
              "colsample_bytree": best_params_lightgbm["colsample_bytree"],
              "random_state" : 42,
              "reg_alpha": best_params_lightgbm["reg_alpha"],
              "reg_lambda" : best_params_lightgbm["reg_lambda"],
              "n_jobs" :-1,
              "objective": "binary",
              "verbose": -1,
              "early_stopping_rounds" :50
              }



oof_train_lightgbm = np.zeros(len(x_train))
lightgbm_prediction = np.zeros(len(test_clean))
train_roc_auc_score_lightgbm = []
val_roc_auc_score_lightgbm = []


### build the model using the best parameters
## using stratified-10-fold to train model
skfold = StratifiedKFold(n_splits=fold,shuffle=True, random_state=42)
    
## loop through folds
for train_idx,val_idx in skfold.split(x_train,y_train):
    train_x = x_train.iloc[train_idx]
    train_y = y_train.iloc[train_idx]
    val_x = x_train.iloc[val_idx]
    val_y = y_train.iloc[val_idx]
    
    # fit model    
    model = LGBMClassifier(**params_xgbm)
    model.fit(train_x, train_y, eval_set = [(val_x,val_y)])
    
    ## make prediction on validation set
    pred_xval = model.predict_proba(val_x)[:,1]
    val_roc = roc_auc_score(val_y, pred_xval)
    val_roc_auc_score_lightgbm.append(val_roc)
    oof_train_lightgbm[val_idx] = pred_xval
    
    ## make prediction on train set
    pred_xtrain = model.predict_proba(train_x)[:,1]
    train_roc = roc_auc_score(train_y, pred_xtrain)
    train_roc_auc_score_lightgbm.append(train_roc)
    
    
    ## make prediction on test set    
    lightgbm_prediction += model.predict_proba(test_clean)[:,1]
    
    




[LightGBM] [Info] Number of positive: 45266, number of negative: 45265
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.000807 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 2372
[LightGBM] [Info] Number of data points in the train set: 90531, number of used features: 11
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.500006 -> initscore=0.000022
[LightGBM] [Info] Start training from score 0.000022
Training until validation scores don't improve for 50 rounds
Early stopping, best iteration is:
[626]	valid_0's binary_logloss: 0.0868742
[LightGBM] [Info] Number of positive: 45266, number of negative: 45265
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.001011 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM

In [20]:
##Update the submission file target variable with the predictions
sample_submission.loan_status = lightgbm_prediction/fold

##save the submission to file
sample_submission.to_csv("submission_lightgbm.csv", index=False)

In [36]:
### Catboost, GradientBoost, HistGradientBoost, Stacking, Voting

## create the objective function for catboost

def catboost_clf(learning_rate,depth,l2_leaf_reg,bagging_temperature):
    
    
    ## create parameter disctionary
    param = {"learning_rate" : learning_rate,
             "depth" : int(round(depth)), ## 5,16
             "eval_metric" : "AUC",
             "l2_leaf_reg": l2_leaf_reg, ##1-10
             "n_estimators" : 100,
             "bagging_temperature" : bagging_temperature,##0,1
             "loss_function" : "Logloss",
             "thread_count" : 4,
             "silent": True,
             "early_stopping_rounds" : 50,
             "random_state" : 42} 
    roc_auc = []
    fold =10
    stkfold = StratifiedKFold(n_splits=fold, shuffle=True, random_state=42)
    for train_idx, val_idx in stkfold.split(x_train,y_train):
        train_x, train_y = x_train.iloc[train_idx], y_train.iloc[train_idx]
        val_x, val_y = x_train.iloc[val_idx], y_train.iloc[val_idx]
        
        model = CatBoostClassifier(**param)
        model.fit(train_x,train_y, eval_set=[(val_x,val_y)], verbose=False)
        pred_catboost = model.predict_proba(val_x)[:,1]
        roc_catboost = roc_auc_score(val_y,pred_catboost)
        roc_auc.append(roc_catboost)
        
    return np.mean(roc_auc)
    

## Define the parameter bounds
pbound = {"learning_rate" : (0.01, 0.1),
          "depth" : (5,16),
          "l2_leaf_reg" : (1,10),
          "bagging_temperature" : (0,1)}

optimize_catboost = bayes_opt.BayesianOptimization(f=catboost_clf, pbounds=pbound, verbose=2)
optimize_catboost.maximize(init_points=20, n_iter=40)

        
        
        
        

|   iter    |  target   | baggin... |   depth   | l2_lea... | learni... |
-------------------------------------------------------------------------
| [39m1        [39m | [39m0.9613   [39m | [39m0.07719  [39m | [39m5.195    [39m | [39m2.991    [39m | [39m0.04685  [39m |
| [35m2        [39m | [35m0.974    [39m | [35m0.1392   [39m | [35m5.184    [39m | [35m3.116    [39m | [35m0.08049  [39m |
| [39m3        [39m | [39m0.9579   [39m | [39m0.4014   [39m | [39m11.05    [39m | [39m5.599    [39m | [39m0.01287  [39m |
| [39m4        [39m | [39m0.9653   [39m | [39m0.9352   [39m | [39m7.15     [39m | [39m6.94     [39m | [39m0.04057  [39m |
| [35m5        [39m | [35m0.9772   [39m | [35m0.5647   [39m | [35m5.771    [39m | [35m5.19     [39m | [35m0.08056  [39m |
| [39m6        [39m | [39m0.9675   [39m | [39m0.8955   [39m | [39m13.11    [39m | [39m2.61     [39m | [39m0.01547  [39m |
| [39m7        [39m | [39m0.976    [39m | [

In [40]:
## fetch the best params
best_params_catboost = optimize_catboost.max["params"]

param_catboost = {"learning_rate" : best_params_catboost["learning_rate"],
             "depth" : int(round(best_params_catboost["depth"])), ## 5,16
             "eval_metric" : "AUC",
             "l2_leaf_reg": best_params_catboost["l2_leaf_reg"], ##1-10
             "n_estimators" : 100,
             "bagging_temperature" : best_params_catboost["bagging_temperature"],##0,1
             "loss_function" : "Logloss",
             "thread_count" : -1,
             "silent": True,
             "early_stopping_rounds" : 50,
             "random_state" : 42} 

oof_train_catboost = np.zeros(len(x_train))
test_prediction_catboost = np.zeros(len(test_clean))
train_roc_auc_score_catboost = []
val_roc_auc_score_catboost = []
fold = 10

stkfold = StratifiedKFold(n_splits=fold, shuffle=True, random_state=42)

for train_idx, val_idx in stkfold.split(x_train, y_train):
    train_x, train_y = x_train.iloc[train_idx], y_train.iloc[train_idx]
    val_x, val_y = x_train.iloc[val_idx], y_train.iloc[val_idx]
    model = CatBoostClassifier(**param_catboost)
    model.fit(train_x,train_y, eval_set=[(val_x,val_y)])
    
    ## make prediction on trainset
    train_predict = model.predict_proba(train_x)[:,1]
    train_roc = roc_auc_score(train_y,train_predict)
    train_roc_auc_score_catboost.append(train_roc)
    
    ## make prediction on the validation set
    val_predict = model.predict_proba(val_x)[:,1]
    val_roc = roc_auc_score(val_y,val_predict)
    val_roc_auc_score_catboost.append(val_roc)
    
    oof_train_catboost[val_idx] = val_predict
    test_prediction_catboost += model.predict_proba(test_clean)[:,1]
    
    
    
    

In [47]:
##Update the submission file target variable with the predictions
sample_submission.loan_status = test_prediction_catboost/fold

##save the submission to file
sample_submission.to_csv("submission_catboost.csv", index=False)

In [51]:
## Train a stacking ensemble

oof_train_predictions = np.column_stack([oof_train_rf,oof_train_xgbm,oof_train_lightgbm,oof_train_catboost])
test_predictions = np.column_stack([rf_prediction/fold, xgbm_prediction/fold,lightgbm_prediction/fold,test_prediction_catboost/fold])

weights = {"catboost":np.mean(val_roc_auc_score_catboost),
           "randomforest": np.mean(val_roc_auc_score_rf),
           "xgbm" : np.mean(val_roc_auc_score_xgbm),
           "lightgbm": np.mean(val_roc_auc_score_lightgbm)}

### Optimize the meta-learner model using LightGBM

## initialize some pararameters in XGBM to optimize

## set fold value
fold = 10

### create the objective function for random forest
def stck_lightgbm_clf(learning_rate, max_depth, num_leaves, subsample, colsample_bytree,reg_alpha,reg_lambda):
    
    roc_auc_scores = []
    
    ## create the parameter dictionary    
    params = {"n_estimators": 100,
              "learning_rate": learning_rate,
              "max_depth": int(round(max_depth)),
              "num_leaves": int(round(num_leaves)),
              "subsample" : subsample,
              "colsample_bytree": colsample_bytree,
              "random_state" : 42,
              "reg_alpha": reg_alpha,
              "reg_lambda" : reg_lambda,
              "n_jobs" :-1,
              "objective": "binary",
              "early_stopping_rounds" :50
              }
    
    ## using stratified-10-fold to train model
    skfold = StratifiedKFold(n_splits=fold,shuffle=True, random_state=42)
    
    ## loop through folds
    for train_idx,val_idx in skfold.split(oof_train_predictions,y_train):
        train_x = oof_train_predictions.iloc[train_idx]
        train_y = y_train.iloc[train_idx]
        val_x = oof_train_predictions.iloc[val_idx]
        val_y = y_train.iloc[val_idx]
        
        model = LGBMClassifier(**params)
        model.fit(train_x, train_y, eval_set = [(val_x,val_y)], eval_metric="auc")
        pred_prob = model.predict_proba(val_x)[:,1]
        roc_auc = roc_auc_score(val_y, pred_prob)
        roc_auc_scores.append(roc_auc)
    
    return np.mean(roc_auc_scores)
        

### set the parameter bound
pbound = {"learning_rate": (0.01,0.1),
          "max_depth": (5,25),
          "num_leaves" : (31,200),
          "subsample": (0.7,1),
          "colsample_bytree": (0.7,1),
          "reg_lambda": (0,1),
          "reg_alpha": (0, 1)}

## optimise the parameter
stck_bayes_optm_lightgbm = bayes_opt.BayesianOptimization(f=lightgbm_clf, pbounds=pbound,verbose=2,random_state=42 )
stck_bayes_optm_lightgbm.maximize(init_points=20, n_iter=40)


|   iter    |  target   | colsam... | learni... | max_depth | num_le... | reg_alpha | reg_la... | subsample |
-------------------------------------------------------------------------------------------------------------
[LightGBM] [Info] Number of positive: 45266, number of negative: 45265
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.000660 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 2372
[LightGBM] [Info] Number of data points in the train set: 90531, number of used features: 11
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.500006 -> initscore=0.000022
[LightGBM] [Info] Start training from score 0.000022
Training until validation scores don't improve for 50 rounds
Did not meet early stopping. Best iteration is:
[100]	valid_0's auc: 0.993831	valid_0's binary_logloss: 0.0938303
[LightGBM] [Info] Number of positive: 45266, numb

In [57]:
## fetch the best parameters for lightgbm
stck_best_params_lightgbm = stck_bayes_optm_lightgbm.max["params"]

params_lightgbm = {"n_estimators": 100,
              "learning_rate": stck_best_params_lightgbm["learning_rate"],
              "max_depth": int(round(stck_best_params_lightgbm["max_depth"])),
              "num_leaves": int(round(stck_best_params_lightgbm["num_leaves"])),
              "subsample" : stck_best_params_lightgbm["subsample"],
              "colsample_bytree": stck_best_params_lightgbm["colsample_bytree"],
              "random_state" : 42,
              "reg_alpha": stck_best_params_lightgbm["reg_alpha"],
              "reg_lambda" : stck_best_params_lightgbm["reg_lambda"],
              "n_jobs" :-1,
              "objective": "binary",
              "verbose": -1,
              "early_stopping_rounds" :50
              }




stck_lightgbm_prediction = np.zeros(len(test_clean))
stck_val_roc_auc_score_lightgbm = []

## concert to dataframe to use iloc
oof_train_predictions_df = pd.DataFrame(oof_train_predictions)



### build the model using the best parameters
## using stratified-10-fold to train model
skfold = StratifiedKFold(n_splits=fold,shuffle=True, random_state=42)
    
## loop through folds
for train_idx,val_idx in skfold.split(oof_train_predictions_df,y_train):
    train_x = oof_train_predictions_df.iloc[train_idx]
    train_y = y_train.iloc[train_idx]
    val_x = oof_train_predictions_df.iloc[val_idx]
    val_y = y_train.iloc[val_idx]
    
    # fit model    
    model = LGBMClassifier(**params_xgbm)
    model.fit(train_x, train_y, eval_set = [(val_x,val_y)])
    
    ## make prediction on validation set
    pred_xval = model.predict_proba(val_x)[:,1]
    val_roc = roc_auc_score(val_y, pred_xval)
    stck_val_roc_auc_score_lightgbm.append(val_roc)
  
    
    ## make prediction on test set    
    stck_lightgbm_prediction += model.predict_proba(test_predictions)[:,1]
    
    




[LightGBM] [Info] Number of positive: 45266, number of negative: 45265
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.001417 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 1020
[LightGBM] [Info] Number of data points in the train set: 90531, number of used features: 4
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.500006 -> initscore=0.000022
[LightGBM] [Info] Start training from score 0.000022
Training until validation scores don't improve for 50 rounds
Early stopping, best iteration is:
[95]	valid_0's binary_logloss: 0.0715991
[LightGBM] [Info] Number of positive: 45266, number of negative: 45265
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.000756 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 1020
[LightGBM] [Info] Number of data points in the train set: 90531, number of used features: 4
[LightGBM] [I

In [58]:
##Update the submission file target variable with the predictions
sample_submission.loan_status = stck_lightgbm_prediction/fold

##save the submission to file
sample_submission.to_csv("submission_stack.csv", index=False)

In [62]:
## Implement soft voting
test_predictions = np.column_stack([rf_prediction/fold, xgbm_prediction/fold,lightgbm_prediction/fold,test_prediction_catboost/fold])

weights = [np.mean(val_roc_auc_score_rf), np.mean(val_roc_auc_score_xgbm),np.mean(val_roc_auc_score_lightgbm),np.mean(val_roc_auc_score_catboost)]

weights = np.array(weights)/sum(weights)

test_predictions = (test_predictions * weights).sum(axis=1)

##Update the submission file target variable with the predictions
sample_submission.loan_status = test_predictions

##save the submission to file
sample_submission.to_csv("submission_voting.csv", index=False)






In [70]:
### implement best public leader board prediction

## load the predictions
best_lbp = pd.read_csv("./submission_plb.csv")
xgbm_pred = pd.read_csv("./submission_xgbm.csv")


## implement weights to prediction
best_lbp.loan_status = (2* best_lbp.loan_status) + (-1 * xgbm_pred.loan_status)
best_lbp.loan_status = rankdata(best_lbp.loan_status)/len(best_lbp)

## save final prediction to file
best_lbp.to_csv("./submission_final.csv", index=False)






In [69]:
best_lbp

Unnamed: 0,id,loan_status
0,58645,0.920098
1,58646,0.462428
2,58647,0.862013
3,58648,0.198859
4,58649,0.044631
...,...,...
39093,97738,0.793314
39094,97739,0.680214
39095,97740,0.378306
39096,97741,0.836820
