In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split, StratifiedKFold, RandomizedSearchCV
from sklearn.preprocessing import MinMaxScaler
from imblearn.over_sampling import SMOTE
from imblearn.under_sampling import RandomUnderSampler
from sklearn.metrics import f1_score
import lightgbm
import optuna
import category_encoders as ce


import warnings
warnings.filterwarnings('ignore')

In [2]:
train_loan = pd.read_csv('credit risk train.csv')

In [3]:
test_loan = pd.read_csv('credit risk test.csv')

In [4]:
X = train_loan.drop(columns='bad_loans')
y = train_loan.bad_loans

In [5]:
# pymnt_plan has only one value
X.drop(columns='pymnt_plan', inplace=True)

In [6]:
# drop columns with collinearity
X.drop(columns=['funded_amnt', 'delinq_2yrs_zero', 'pub_rec'], inplace=True)

In [7]:
# change 'object' types to 'category'
cat_features=['grade', 'home_ownership', 'purpose']
X[cat_features] = X[cat_features].astype('category')

<h1>Light GBoost</h1>

<h3 dir='rtl'>
    برای هر مدل، کد زیر در گوگل کولب اجرا و هایپرپارامترهای بهینه به این کد منتقل شده است.
</h3>

<h3>Model For Data Without Preprocessing</h3>

In [8]:
Xtrain, Xtest, ytrain, ytest = train_test_split(X, y, stratify=y, test_size=0.3)
Xtest, Xvalid, ytest, yvalid =  train_test_split(Xtest, ytest, stratify=ytest, test_size=0.5)

In [9]:
params = {'n_estimators': 5000, 'objective': 'binary', 'metric': 'binary_logloss', 'verbosity': -1,
          'boosting_type': 'gbdt', 'early_stopping_rounds': 10,'class_weight': 'balanced',
          'max_depth': 3, 'num_leaves': 812, 'colsample_bytree': 0.9, 'learning_rate': 0.005837357729089558,
          'reg_alpha': 4.5, 'reg_lambda': 5.0, 'subsample': 0.9086891990485108, 'min_child_samples': 61}

lgbwp = lightgbm.LGBMClassifier(**params)
lgbwp.fit(Xtrain, ytrain, 
        eval_set=[(Xtrain, ytrain), (Xvalid, yvalid)])
pred = lgbwp.predict(Xtest)
print('Xtest result: ', f1_score(y_true=ytest, y_pred=pred))

Xtest result:  0.40767451353925704


<h3>Encoding</h3>

In [10]:
cat_features = ['grade', 'home_ownership', 'purpose']
encoder = ce.TargetEncoder(cols=cat_features)
encoder.fit(X[cat_features], y)
X_enc = pd.concat([X, encoder.transform(X[cat_features]).add_suffix('_enc')], axis=1)
X_enc.drop(columns=['grade', 'home_ownership', 'purpose'], inplace=True)

<h3>Handling Missing Values</h3>

In [11]:
X_enc_imp = X_enc.copy()
X_enc_imp.payment_inc_ratio.fillna(0, inplace=True)
X_enc_imp.fillna(-1, inplace=True)

<h3>Train Test Split</h3>

In [12]:
Xtrain, Xtest, ytrain, ytest = train_test_split(X_enc_imp, y, stratify=y, test_size=0.3)
Xtest, Xvalid, ytest, yvalid =  train_test_split(Xtest, ytest, stratify=ytest, test_size=0.5)

<h3>Handling Outliers</h3>

In [13]:
# Log Transform
########### train #############
Xtrain_log = Xtrain.copy()
cols = ['payment_inc_ratio', 'open_acc']
for c in cols:
    Xtrain_log[c] = np.log(Xtrain_log[c] + 1.1)

########### valid #############
Xvalid_log = Xvalid.copy()
cols = ['payment_inc_ratio', 'open_acc']
for c in cols:
    Xvalid_log[c] = np.log(Xvalid_log[c] + 1.1)

########### test #############
Xtest_log = Xtest.copy()
cols = ['payment_inc_ratio', 'open_acc']
for c in cols:
    Xtest_log[c] = np.log(Xtest_log[c] + 1.1)

In [14]:
# z-score outlier detection
########### train #############
Xtrain_clip = Xtrain.copy()
columns = ['loan_amnt', 'sub_grade_num', 'emp_length_num', 'dti', 'delinq_2yrs',
           'inq_last_6mths', 'revol_util', 'payment_inc_ratio', 'open_acc']# ,'grade',] 

for c in columns:
    if (Xtrain_clip[c].dtypes == int) | (Xtrain_clip[c].dtypes == float):
        L = Xtrain[c].mean() - 3*Xtrain[c].std()
        U = Xtrain[c].mean() + 3*Xtrain[c].std()
        Xtrain_clip[c] = Xtrain_clip[c].clip(L,U)

########### valid #############
Xvalid_clip = Xvalid.copy()
columns = ['loan_amnt', 'sub_grade_num', 'emp_length_num', 'dti', 'delinq_2yrs',
           'inq_last_6mths', 'revol_util', 'payment_inc_ratio', 'open_acc']# ,'grade',]

for c in columns:
    if (Xvalid_clip[c].dtypes == int) | (Xvalid_clip[c].dtypes == float):
        L = Xtrain[c].mean() - 3*Xtrain[c].std()
        U = Xtrain[c].mean() + 3*Xtrain[c].std()
        Xvalid_clip[c] =Xvalid_clip[c].clip(L,U)

########### test #############
Xtest_clip = Xtest.copy()
columns = ['loan_amnt', 'sub_grade_num', 'emp_length_num', 'dti', 'delinq_2yrs',
           'inq_last_6mths', 'revol_util', 'payment_inc_ratio', 'open_acc']# ,'grade',]

for c in columns:
    if (Xtest_clip[c].dtypes == int) | (Xtest_clip[c].dtypes == float):
        L = Xtrain[c].mean() - 3*Xtrain[c].std()
        U = Xtrain[c].mean() + 3*Xtrain[c].std()
        Xtest_clip[c] =Xtest_clip[c].clip(L,U)

In [15]:
# Log and z-score
########### train #############
Xtrain_log_clip = Xtrain_clip.drop(columns=['payment_inc_ratio', 'open_acc'])
Xtrain_log_clip = pd.concat([Xtrain_log_clip, Xtrain_log.loc[:,['payment_inc_ratio', 'open_acc']]], axis=1)

########### valid #############
Xvalid_log_clip = Xvalid_clip.drop(columns=['payment_inc_ratio', 'open_acc'])
Xvalid_log_clip = pd.concat([Xvalid_log_clip, Xvalid_log.loc[:,['payment_inc_ratio', 'open_acc']]], axis=1)

########### test #############
Xtest_log_clip = Xtest_clip.drop(columns=['payment_inc_ratio', 'open_acc'])
Xtest_log_clip = pd.concat([Xtest_log_clip, Xtest_log.loc[:,['payment_inc_ratio', 'open_acc']]], axis=1)

<h3>Normalization: MinMaxScaler</h3>

In [16]:
# Normalization
mmscaler = MinMaxScaler()

Xtrain_normal = mmscaler.fit_transform(Xtrain)
Xtrain_normal = pd.DataFrame(Xtrain_normal, columns=Xtrain.columns, index=Xtrain.index)

Xvalid_normal = mmscaler.transform(Xvalid)
Xvalid_normal = pd.DataFrame(Xvalid_normal, columns=Xvalid.columns, index=Xvalid.index)

Xtest_normal = mmscaler.transform(Xtest)
Xtest_normal = pd.DataFrame(Xtest_normal, columns=Xtest.columns, index=Xtest.index)

#############################
Xtrain_log_normal = mmscaler.fit_transform(Xtrain_log)
Xtrain_log_normal = pd.DataFrame(Xtrain_log_normal, columns=Xtrain_log.columns, index=Xtrain_log.index)

Xvalid_log_normal = mmscaler.transform(Xvalid_log)
Xvalid_log_normal = pd.DataFrame(Xvalid_log_normal, columns=Xvalid_log.columns, index=Xvalid_log.index)

Xtest_log_normal = mmscaler.transform(Xtest_log)
Xtest_log_normal = pd.DataFrame(Xtest_log_normal, columns=Xtest_log.columns, index=Xtest_log.index)

#############################
Xtrain_clip_normal = mmscaler.fit_transform(Xtrain_clip)
Xtrain_clip_normal = pd.DataFrame(Xtrain_clip_normal, columns=Xtrain_clip.columns, index=Xtrain_clip.index)

Xvalid_clip_normal = mmscaler.transform(Xvalid_clip)
Xvalid_clip_normal = pd.DataFrame(Xvalid_clip_normal, columns=Xvalid_clip.columns, index=Xvalid_clip.index)

Xtest_clip_normal = mmscaler.transform(Xtest_clip)
Xtest_clip_normal = pd.DataFrame(Xtest_clip_normal, columns=Xtest_clip.columns, index=Xtest_clip.index)

#############################
Xtrain_log_clip_normal = mmscaler.fit_transform(Xtrain_log_clip)
Xtrain_log_clip_normal = pd.DataFrame(Xtrain_log_clip_normal, columns=Xtrain_log_clip.columns, index=Xtrain_log_clip.index)

Xvalid_log_clip_normal = mmscaler.transform(Xvalid_log_clip)
Xvalid_log_clip_normal = pd.DataFrame(Xvalid_log_clip_normal, columns=Xvalid_log_clip.columns, index=Xvalid_log_clip.index)

Xtest_log_clip_normal = mmscaler.transform(Xtest_log_clip)
Xtest_log_clip_normal = pd.DataFrame(Xtest_log_clip_normal, columns=Xtest_log_clip.columns, index=Xtest_log_clip.index)

<h3>Over Sampling</h3>

In [17]:
# Over-Sampling
over_sampler = SMOTE()

########### train #############
Xtrain_over, ytrain_over = over_sampler.fit_resample(Xtrain, ytrain)
Xtrain_normal_over, ytrain_normal_over = over_sampler.fit_resample(Xtrain_normal, ytrain)
Xtrain_log_normal_over, ytrain_log_normal_over = over_sampler.fit_resample(Xtrain_log_normal, ytrain)
Xtrain_clip_normal_over, ytrain_clip_normal_over = over_sampler.fit_resample(Xtrain_clip_normal, ytrain)
Xtrain_log_clip_normal_over, ytrain_log_clip_normal_over = over_sampler.fit_resample(Xtrain_log_clip_normal, ytrain)

<h3>Under Sampling</h3>

In [18]:
# Under-Sampling
under_sampler = RandomUnderSampler()

########### train #############
Xtrain_under, ytrain_under = under_sampler.fit_resample(Xtrain, ytrain)
Xtrain_normal_under, ytrain_normal_under = under_sampler.fit_resample(Xtrain_normal, ytrain)
Xtrain_log_normal_under, ytrain_log_normal_under = under_sampler.fit_resample(Xtrain_log_normal, ytrain)
Xtrain_clip_normal_under, ytrain_clip_normal_under = under_sampler.fit_resample(Xtrain_clip_normal, ytrain)
Xtrain_log_clip_normal_under, ytrain_log_clip_normal_under = under_sampler.fit_resample(Xtrain_log_clip_normal, ytrain)

<h3>PCA</h3>

In [19]:
# PCA with Under Sampled Data
from sklearn.decomposition import PCA
pca = PCA(n_components=0.99)

Xtrain_pca = pd.DataFrame(pca.fit_transform(Xtrain_under), index=Xtrain_under.index)
Xtrain_under_pca = pd.concat([Xtrain_under, Xtrain_pca], axis=1)
Xtrain_under_pca.columns=Xtrain_under_pca.columns.astype(str)

Xvalid_pca = pd.DataFrame(pca.transform(Xvalid), index=Xvalid.index)
Xvalid_pca = pd.concat([Xvalid, Xvalid_pca], axis=1)
Xvalid_pca.columns = Xvalid_pca.columns.astype(str)

Xtest_pca = pd.DataFrame(pca.transform(Xtest), index=Xtest.index)
Xtest_pca = pd.concat([Xtest, Xtest_pca], axis=1)
Xtest_pca.columns = Xtest_pca.columns.astype(str)

###########################################
Xtrain_normal_pca = pd.DataFrame(pca.fit_transform(Xtrain_normal_under), index=Xtrain_normal_under.index)
Xtrain_normal_under_pca = pd.concat([Xtrain_normal_under, Xtrain_normal_pca], axis=1)
Xtrain_normal_under_pca.columns=Xtrain_normal_under_pca.columns.astype(str)

Xvalid_normal_pca = pd.DataFrame(pca.transform(Xvalid_normal), index=Xvalid_normal.index)
Xvalid_normal_pca = pd.concat([Xvalid_normal, Xvalid_normal_pca], axis=1)
Xvalid_normal_pca.columns = Xvalid_normal_pca.columns.astype(str)

Xtest_normal_pca = pd.DataFrame(pca.transform(Xtest_normal), index=Xtest_normal.index)
Xtest_normal_pca = pd.concat([Xtest_normal, Xtest_normal_pca], axis=1)
Xtest_normal_pca.columns = Xtest_normal_pca.columns.astype(str)

###########################################
Xtrain_log_normal_pca = pd.DataFrame(pca.fit_transform(Xtrain_log_normal_under), index=Xtrain_log_normal_under.index)
Xtrain_log_normal_under_pca = pd.concat([Xtrain_log_normal_under, Xtrain_log_normal_pca], axis=1)
Xtrain_log_normal_under_pca.columns=Xtrain_log_normal_under_pca.columns.astype(str)

Xvalid_log_normal_pca = pd.DataFrame(pca.transform(Xvalid_log_normal), index=Xvalid_log_normal.index)
Xvalid_log_normal_pca = pd.concat([Xvalid_log_normal, Xvalid_log_normal_pca], axis=1)
Xvalid_log_normal_pca.columns = Xvalid_log_normal_pca.columns.astype(str)

Xtest_log_normal_pca = pd.DataFrame(pca.transform(Xtest_log_normal), index=Xtest_log_normal.index)
Xtest_log_normal_pca = pd.concat([Xtest_log_normal, Xtest_log_normal_pca], axis=1)
Xtest_log_normal_pca.columns = Xtest_log_normal_pca.columns.astype(str)

###########################################
Xtrain_clip_normal_pca = pd.DataFrame(pca.fit_transform(Xtrain_clip_normal_under), index=Xtrain_clip_normal_under.index)
Xtrain_clip_normal_under_pca = pd.concat([Xtrain_clip_normal_under, Xtrain_clip_normal_pca], axis=1)
Xtrain_clip_normal_under_pca.columns=Xtrain_clip_normal_under_pca.columns.astype(str)

Xvalid_clip_normal_pca = pd.DataFrame(pca.transform(Xvalid_clip_normal), index=Xvalid_clip_normal.index)
Xvalid_clip_normal_pca = pd.concat([Xvalid_clip_normal, Xvalid_clip_normal_pca], axis=1)
Xvalid_clip_normal_pca.columns=Xvalid_clip_normal_pca.columns.astype(str)

Xtest_clip_normal_pca = pd.DataFrame(pca.transform(Xtest_clip_normal), index=Xtest_clip_normal.index)
Xtest_clip_normal_pca = pd.concat([Xtest_clip_normal, Xtest_clip_normal_pca], axis=1)
Xtest_clip_normal_pca.columns=Xtest_clip_normal_pca.columns.astype(str)

###########################################
Xtrain_log_clip_normal_pca = pd.DataFrame(pca.fit_transform(Xtrain_log_clip_normal_under), index=Xtrain_log_clip_normal_under.index)
Xtrain_log_clip_normal_under_pca = pd.concat([Xtrain_log_clip_normal_under, Xtrain_log_clip_normal_pca], axis=1)
Xtrain_log_clip_normal_under_pca.columns=Xtrain_log_clip_normal_under_pca.columns.astype(str)

Xvalid_log_clip_normal_pca = pd.DataFrame(pca.transform(Xvalid_log_clip_normal), index=Xvalid_log_clip_normal.index)
Xvalid_log_clip_normal_pca = pd.concat([Xvalid_log_clip_normal, Xvalid_log_clip_normal_pca], axis=1)
Xvalid_log_clip_normal_pca.columns=Xvalid_log_clip_normal_pca.columns.astype(str)

Xtest_log_clip_normal_pca = pd.DataFrame(pca.transform(Xtest_log_clip_normal), index=Xtest_log_clip_normal.index)
Xtest_log_clip_normal_pca = pd.concat([Xtest_log_clip_normal, Xtest_log_clip_normal_pca], axis=1)
Xtest_log_clip_normal_pca.columns=Xtest_log_clip_normal_pca.columns.astype(str)

<h3>Train Data</h3>

In [20]:
# Encoded Imputed Data
params = {'n_estimators': 5000, 'objective': 'binary', 'metric': 'binary_logloss', 'verbosity': -1,
          'boosting_type': 'gbdt', 'early_stopping_rounds': 10,'class_weight': 'balanced',
          'max_depth': 3, 'num_leaves': 782, 'colsample_bytree': 0.5, 'learning_rate': 0.0022003317562072694, 
          'reg_alpha': 1.0, 'reg_lambda': 3.0, 'subsample': 0.3940411021541431, 'min_child_samples': 93}

lgb = lightgbm.LGBMClassifier(**params)
lgb.fit(Xtrain, ytrain, 
        eval_set=[(Xtrain, ytrain), (Xvalid, yvalid)])
pred = lgb.predict(Xtest)
f1 = f1_score(y_true=ytest, y_pred=pred)
print('Xtest result: ', f1_score(y_true=ytest, y_pred=pred))

Xtest result:  0.4025956826910343


In [21]:
# Encoded Imputed Over-Sampled Data
params = {'n_estimators': 5000, 'objective': 'binary', 'metric': 'binary_logloss', 'verbosity': -1,
          'boosting_type': 'gbdt', 'early_stopping_rounds': 10,'class_weight': 'balanced',
          'max_depth': 3, 'num_leaves': 782, 'colsample_bytree': 0.5, 'learning_rate': 0.0022003317562072694, 
          'reg_alpha': 1.0, 'reg_lambda': 3.0, 'subsample': 0.3940411021541431, 'min_child_samples': 93}

lgbo = lightgbm.LGBMClassifier(**params)
lgbo.fit(Xtrain_over, ytrain_over, 
        eval_set=[(Xtrain_over, ytrain_over), (Xvalid, yvalid)])
pred = lgbo.predict(Xtest)
f1 = f1_score(y_true=ytest, y_pred=pred)
print('Xtest result: ', f1_score(y_true=ytest, y_pred=pred))

Xtest result:  0.06976744186046512


In [22]:
# Encoded Imputed Under-Sampled Data
params = {'n_estimators': 5000, 'objective': 'binary', 'metric': 'binary_logloss', 'verbosity': -1,
          'boosting_type': 'gbdt', 'early_stopping_rounds': 10,'class_weight': 'balanced',
          'max_depth': 4, 'num_leaves': 942, 'colsample_bytree': 0.6, 'learning_rate': 0.0031884393171321894, 
          'reg_alpha': 3.5, 'reg_lambda': 3.5, 'subsample': 0.7619278105516095, 'min_child_samples': 89}

lgbu = lightgbm.LGBMClassifier(**params)
lgbu.fit(Xtrain_under, ytrain_under, 
        eval_set=[(Xtrain_under, ytrain_under), (Xvalid, yvalid)])
pred = lgbu.predict(Xtest)
f1 = f1_score(y_true=ytest, y_pred=pred)
print('Xtest result: ', f1_score(y_true=ytest, y_pred=pred))

Xtest result:  0.40377163523637305


In [23]:
# Encoded Imputed Under-Sampled Data With PCA
params = {'n_estimators': 5000, 'objective': 'binary', 'metric': 'binary_logloss', 'verbosity': -1,
          'boosting_type': 'gbdt', 'early_stopping_rounds': 10,'class_weight': 'balanced',
          'max_depth': 4, 'num_leaves': 942, 'colsample_bytree': 0.9, 'learning_rate': 0.007483883068924781, 
          'reg_alpha': 3.0, 'reg_lambda': 2.5, 'subsample': 0.5541645334105965, 'min_child_samples': 16}

lgbup = lightgbm.LGBMClassifier(**params)
lgbup.fit(Xtrain_under_pca, ytrain_under, 
        eval_set=[(Xtrain_under_pca, ytrain_under), (Xvalid_pca, yvalid)])
pred = lgbup.predict(Xtest_pca)
f1 = f1_score(y_true=ytest, y_pred=pred)
print('Xtest result: ', f1_score(y_true=ytest, y_pred=pred))

Xtest result:  0.4063310845874416


<h3>Normal Data</h3>

In [24]:
# Encoded Imputed Normal Data
params = {'n_estimators': 5000, 'objective': 'binary', 'metric': 'binary_logloss', 'verbosity': -1,
          'boosting_type': 'gbdt', 'early_stopping_rounds': 10,'class_weight': 'balanced',
          'max_depth': 3, 'num_leaves': 572, 'colsample_bytree': 0.65, 'learning_rate': 0.16120536745090905, 
          'reg_alpha': 4.5, 'reg_lambda': 0.5, 'subsample': 0.9392936984622304, 'min_child_samples': 17}

lgbn = lightgbm.LGBMClassifier(**params)
lgbn.fit(Xtrain_normal, ytrain, 
        eval_set=[(Xtrain_normal, ytrain), (Xvalid_normal, yvalid)])
pred = lgbn.predict(Xtest_normal)
f1 = f1_score(y_true=ytest, y_pred=pred)
print('Xtest result: ', f1_score(y_true=ytest, y_pred=pred))

Xtest result:  0.3987025273685633


In [25]:
# Encoded Imputed Normal Over-Sampled Data
params = {'n_estimators': 5000, 'objective': 'binary', 'metric': 'binary_logloss', 'verbosity': -1,
          'boosting_type': 'gbdt', 'early_stopping_rounds': 10,'class_weight': 'balanced',
          'max_depth': 3, 'num_leaves': 572, 'colsample_bytree': 0.65, 'learning_rate': 0.16120536745090905, 
          'reg_alpha': 4.5, 'reg_lambda': 0.5, 'subsample': 0.9392936984622304, 'min_child_samples': 17}

lgbno = lightgbm.LGBMClassifier(**params)
lgbno.fit(Xtrain_normal_over, ytrain_over,
        eval_set=[(Xtrain_normal_over, ytrain_over), (Xvalid_normal, yvalid)])
pred = lgbno.predict(Xtest_normal)
f1 = f1_score(y_true=ytest, y_pred=pred)
print('Xtest result: ', f1_score(y_true=ytest, y_pred=pred))

Xtest result:  0.09041309431021044


In [26]:
# Encoded Imputed Normal Under-Sampled Data
params = {'n_estimators': 5000, 'objective': 'binary', 'metric': 'binary_logloss', 'verbosity': -1,
          'boosting_type': 'gbdt', 'early_stopping_rounds': 10,'class_weight': 'balanced',
          'max_depth': 8, 'num_leaves': 82, 'colsample_bytree': 0.5, 'learning_rate': 0.015473868125686798, 
          'reg_alpha': 5.0, 'reg_lambda': 2.0, 'subsample': 0.17531548152782445, 'min_child_samples': 75}

lgbnu = lightgbm.LGBMClassifier(**params)
lgbnu.fit(Xtrain_normal_under, ytrain_under, 
        eval_set=[(Xtrain_normal_under, ytrain_under), (Xvalid_normal, yvalid)])
pred = lgbnu.predict(Xtest_normal)
f1 = f1_score(y_true=ytest, y_pred=pred)
print('Xtest result: ', f1_score(y_true=ytest, y_pred=pred))

Xtest result:  0.3989985505336672


In [27]:
# Encoded Imputed Normal Under-Sampled Data With PCA
params = {'n_estimators': 5000, 'objective': 'binary', 'metric': 'binary_logloss', 'verbosity': -1,
          'boosting_type': 'gbdt', 'early_stopping_rounds': 10,'class_weight': 'balanced',
          'max_depth': 3, 'num_leaves': 832, 'colsample_bytree': 0.6, 'learning_rate': 0.04672714075153401, 
          'reg_alpha': 5.0, 'reg_lambda': 2.5, 'subsample': 0.11921562689004872, 'min_child_samples': 9}

lgbnup = lightgbm.LGBMClassifier(**params)
lgbnup.fit(Xtrain_normal_under_pca, ytrain_under, 
        eval_set=[(Xtrain_normal_under_pca, ytrain_under), (Xvalid_normal_pca, yvalid)])
pred = lgbnup.predict(Xtest_normal_pca)
f1 = f1_score(y_true=ytest, y_pred=pred)
print('Xtest result: ', f1_score(y_true=ytest, y_pred=pred))

Xtest result:  0.40047675804529204


<h3>Log Normal Data</h3>

In [28]:
# Encoded Imputed Log Normal Data
params = {'n_estimators': 5000, 'objective': 'binary', 'metric': 'binary_logloss', 'verbosity': -1,
          'boosting_type': 'gbdt', 'early_stopping_rounds': 10,'class_weight': 'balanced',
          'max_depth': 3, 'num_leaves': 422, 'colsample_bytree': 0.5, 'learning_rate': 0.002149846086169651,
          'reg_alpha': 0.5, 'reg_lambda': 3.5, 'subsample': 0.06210490031638616, 'min_child_samples': 71}

lgbln = lightgbm.LGBMClassifier(**params)
lgbln.fit(Xtrain_log_normal, ytrain, 
        eval_set=[(Xtrain_log_normal, ytrain), (Xvalid_log_normal, yvalid)])
pred = lgbln.predict(Xtest_log_normal)
f1 = f1_score(y_true=ytest, y_pred=pred)
print('Xtest result: ', f1_score(y_true=ytest, y_pred=pred))

Xtest result:  0.4027593526134253


In [29]:
# Encoded Imputed Log Normal Over-Sampled Data
params = {'n_estimators': 5000, 'objective': 'binary', 'metric': 'binary_logloss', 'verbosity': -1,
          'boosting_type': 'gbdt', 'early_stopping_rounds': 10,'class_weight': 'balanced',
          'max_depth': 14, 'num_leaves': 832, 'colsample_bytree': 0.95, 'learning_rate': 0.9763555559130269,
          'reg_alpha': 0.0, 'reg_lambda': 4.0, 'subsample': 0.4897835467974713, 'min_child_samples': 21}

lgblno = lightgbm.LGBMClassifier(**params)
lgblno.fit(Xtrain_log_normal_over, ytrain_over, 
        eval_set=[(Xtrain_log_normal_over, ytrain_over), (Xvalid_log_normal, yvalid)])
pred = lgblno.predict(Xtest_log_normal)
f1 = f1_score(y_true=ytest, y_pred=pred)
print('Xtest result: ', f1_score(y_true=ytest, y_pred=pred))

Xtest result:  0.24115755627009647


In [30]:
# Encoded Imputed Log Normal Under-Sampled Data
params = {'n_estimators': 5000, 'objective': 'binary', 'metric': 'binary_logloss', 'verbosity': -1,
          'boosting_type': 'gbdt', 'early_stopping_rounds': 10,'class_weight': 'balanced',
          'max_depth': 12, 'num_leaves': 792, 'colsample_bytree': 0.9, 'learning_rate': 0.024169195486965687, 
          'reg_alpha': 5.0, 'reg_lambda': 2.5, 'subsample': 0.06445086450108806, 'min_child_samples': 83}

lgblnu = lightgbm.LGBMClassifier(**params)
lgblnu.fit(Xtrain_log_normal_under, ytrain_under, 
        eval_set=[(Xtrain_log_normal_under, ytrain_under), (Xvalid_log_normal, yvalid)])
pred = lgblnu.predict(Xtest_log_normal)
f1 = f1_score(y_true=ytest, y_pred=pred)
print('Xtest result: ', f1_score(y_true=ytest, y_pred=pred))

Xtest result:  0.40272858454676635


In [31]:
# Encoded Imputed Log Normal Under-Sampled Data With PCA
params = {'n_estimators': 5000, 'objective': 'binary', 'metric': 'binary_logloss', 'verbosity': -1,
          'boosting_type': 'gbdt', 'early_stopping_rounds': 10,'class_weight': 'balanced',
          'max_depth': 7, 'num_leaves': 202, 'colsample_bytree': 0.6, 'learning_rate': 0.10376214122630514,
          'reg_alpha': 5.0, 'reg_lambda': 3.5, 'subsample': 0.36525360253163935, 'min_child_samples': 50}

lgblnup = lightgbm.LGBMClassifier(**params)
lgblnup.fit(Xtrain_log_normal_under_pca, ytrain_under, 
        eval_set=[(Xtrain_log_normal_under_pca, ytrain_under), (Xvalid_log_normal_pca, yvalid)])
pred = lgblnup.predict(Xtest_log_normal_pca)
f1 = f1_score(y_true=ytest, y_pred=pred)
print('Xtest result: ', f1_score(y_true=ytest, y_pred=pred))

Xtest result:  0.4022536687631027


<h3>Clip Normal Data</h3>

In [32]:
# Encoded Imputed Clip Normal Data
params = {'n_estimators': 5000, 'objective': 'binary', 'metric': 'binary_logloss', 'verbosity': -1,
          'boosting_type': 'gbdt', 'early_stopping_rounds': 10,'class_weight': 'balanced',
          'max_depth': 3, 'num_leaves': 472, 'colsample_bytree': 0.5, 'learning_rate': 0.12232101644285472,
          'reg_alpha': 0.0, 'reg_lambda': 4.0, 'subsample': 0.7490870473609526, 'min_child_samples': 39}

lgbcn = lightgbm.LGBMClassifier(**params)
lgbcn.fit(Xtrain_clip_normal, ytrain, 
        eval_set=[(Xtrain_clip_normal, ytrain), (Xvalid_clip_normal, yvalid)])
pred = lgbcn.predict(Xtest_clip_normal)
f1 = f1_score(y_true=ytest, y_pred=pred)
print('Xtest result: ', f1_score(y_true=ytest, y_pred=pred))

Xtest result:  0.40102412073844496


In [33]:
# Encoded Imputed Clip Normal Over-Sampled Data
params = {'n_estimators': 5000, 'objective': 'binary', 'metric': 'binary_logloss', 'verbosity': -1,
          'boosting_type': 'gbdt', 'early_stopping_rounds': 10,'class_weight': 'balanced',
          'max_depth': 12, 'num_leaves': 822, 'colsample_bytree': 0.55, 'learning_rate': 0.9792590489384606,
          'reg_alpha': 0.0, 'reg_lambda': 1.5, 'subsample': 0.49264441710461976, 'min_child_samples': 12}

lgbcno = lightgbm.LGBMClassifier(**params)
lgbcno.fit(Xtrain_clip_normal_over, ytrain_over, 
        eval_set=[(Xtrain_clip_normal_over, ytrain_over), (Xvalid_clip_normal, yvalid)])
pred = lgbcno.predict(Xtest_clip_normal)
f1 = f1_score(y_true=ytest, y_pred=pred)
print('Xtest result: ', f1_score(y_true=ytest, y_pred=pred))

Xtest result:  0.25408737603859555


In [34]:
# Encoded Imputed Clip Normal Under-Sampled Data
params = {'n_estimators': 5000, 'objective': 'binary', 'metric': 'binary_logloss', 'verbosity': -1,
          'boosting_type': 'gbdt', 'early_stopping_rounds': 10,'class_weight': 'balanced',
          'max_depth': 9, 'num_leaves': 12, 'colsample_bytree': 0.9, 'learning_rate': 0.06591518154396729,
          'reg_alpha': 4.5, 'reg_lambda': 2.5, 'subsample': 0.24200519959280795, 'min_child_samples': 92}

lgbcnu = lightgbm.LGBMClassifier(**params)
lgbcnu.fit(Xtrain_clip_normal_under, ytrain_under, 
        eval_set=[(Xtrain_clip_normal_under, ytrain_under), (Xvalid_clip_normal, yvalid)])
pred = lgbcnu.predict(Xtest_clip_normal)
f1 = f1_score(y_true=ytest, y_pred=pred)
print('Xtest result: ', f1_score(y_true=ytest, y_pred=pred))

Xtest result:  0.40325287207951466


In [35]:
# Encoded Imputed Clip Normal Under-Sampled Data With PCA
params = {'n_estimators': 5000, 'objective': 'binary', 'metric': 'binary_logloss', 'verbosity': -1,
          'boosting_type': 'gbdt', 'early_stopping_rounds': 10,'class_weight': 'balanced',
          'max_depth': 12, 'num_leaves': 622, 'colsample_bytree': 0.6, 'learning_rate': 0.006348480916855087, 
          'reg_alpha': 2.0, 'reg_lambda': 4.5, 'subsample': 0.3264718443152631, 'min_child_samples': 71}

lgbcnup = lightgbm.LGBMClassifier(**params)
lgbcnup.fit(Xtrain_clip_normal_under_pca, ytrain_under, 
        eval_set=[(Xtrain_clip_normal_under_pca, ytrain_under), (Xvalid_clip_normal_pca, yvalid)])
pred = lgbcnup.predict(Xtest_clip_normal_pca)
f1 = f1_score(y_true=ytest, y_pred=pred)
print('Xtest result: ', f1_score(y_true=ytest, y_pred=pred))

Xtest result:  0.4062540379894043


<h3>Log Clip Normal Data</h3>

In [36]:
# Encoded Imputed Log Clip Normal Data
params = {'n_estimators': 5000, 'objective': 'binary', 'metric': 'binary_logloss', 'verbosity': -1,
          'boosting_type': 'gbdt', 'early_stopping_rounds': 10,'class_weight': 'balanced',
          'max_depth': 3, 'num_leaves': 92, 'colsample_bytree': 0.8500000000000001, 'learning_rate': 0.01860618800332124,
          'reg_alpha': 0.5, 'reg_lambda': 1.5, 'subsample': 0.6690051462577666, 'min_child_samples': 100}

lgblcn = lightgbm.LGBMClassifier(**params)
lgblcn.fit(Xtrain_log_clip_normal, ytrain, 
        eval_set=[(Xtrain_log_clip_normal, ytrain), (Xvalid_log_clip_normal, yvalid)])
pred = lgblcn.predict(Xtest_log_clip_normal)
f1 = f1_score(y_true=ytest, y_pred=pred)
print('Xtest result: ', f1_score(y_true=ytest, y_pred=pred))

Xtest result:  0.4030438918331295


In [37]:
# Encoded Imputed Log Clip Normal Over-Sampled Data
params = {'n_estimators': 5000, 'objective': 'binary', 'metric': 'binary_logloss', 'verbosity': -1,
          'boosting_type': 'gbdt', 'early_stopping_rounds': 10,'class_weight': 'balanced',
          'max_depth': 13, 'num_leaves': 652, 'colsample_bytree': 0.7, 'learning_rate': 0.9623660426493325, 
          'reg_alpha': 0.0, 'reg_lambda': 0.0, 'subsample': 0.38048882735387823, 'min_child_samples': 60}

lgblcno = lightgbm.LGBMClassifier(**params)
lgblcno.fit(Xtrain_log_clip_normal_over, ytrain_over,
        eval_set=[(Xtrain_log_clip_normal_over, ytrain_over), (Xvalid_log_clip_normal, yvalid)])
pred = lgblcno.predict(Xtest_log_clip_normal)
f1 = f1_score(y_true=ytest, y_pred=pred)
print('Xtest result: ', f1_score(y_true=ytest, y_pred=pred))

Xtest result:  0.18685767673621925


In [38]:
# Encoded Imputed Log Clip Normal Under-Sampled Data
params = {'n_estimators': 5000, 'objective': 'binary', 'metric': 'binary_logloss', 'verbosity': -1,
          'boosting_type': 'gbdt', 'early_stopping_rounds': 10,'class_weight': 'balanced',
          'max_depth': 4, 'num_leaves': 202, 'colsample_bytree': 0.7, 'learning_rate': 0.05098137646159855, 
          'reg_alpha': 5.0, 'reg_lambda': 4.5, 'subsample': 0.45387139781445063, 'min_child_samples': 80}

lgblcnu = lightgbm.LGBMClassifier(**params)
lgblcnu.fit(Xtrain_log_clip_normal_under, ytrain_under,
        eval_set=[(Xtrain_log_clip_normal_under, ytrain_under), (Xvalid_log_clip_normal, yvalid)])
pred = lgblcnu.predict(Xtest_log_clip_normal)
f1 = f1_score(y_true=ytest, y_pred=pred)
print('Xtest result: ', f1_score(y_true=ytest, y_pred=pred))

Xtest result:  0.4031028487361241


In [39]:
# Encoded Imputed Log Clip Normal Under-Sampled Data With PCA
params = {'n_estimators': 5000, 'objective': 'binary', 'metric': 'binary_logloss', 'verbosity': -1,
          'boosting_type': 'gbdt', 'early_stopping_rounds': 10,'class_weight': 'balanced',
          'max_depth': 7, 'num_leaves': 392, 'colsample_bytree': 0.75, 'learning_rate': 0.0023716324949762135,
          'reg_alpha': 5.0, 'reg_lambda': 5.0, 'subsample': 0.9869268174164925, 'min_child_samples': 64}

lgblcnup = lightgbm.LGBMClassifier(**params)
lgblcnup.fit(Xtrain_log_clip_normal_under_pca, ytrain_under,
        eval_set=[(Xtrain_log_clip_normal_under_pca, ytrain_under), (Xvalid_log_clip_normal_pca, yvalid)])
pred = lgblcnup.predict(Xtest_log_clip_normal_pca)
f1 = f1_score(y_true=ytest, y_pred=pred)
print('Xtest result: ', f1_score(y_true=ytest, y_pred=pred))

Xtest result:  0.40503616394320924
