In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split, StratifiedKFold, RandomizedSearchCV
from sklearn.preprocessing import MinMaxScaler
from imblearn.over_sampling import SMOTE
from imblearn.under_sampling import RandomUnderSampler
from sklearn.metrics import f1_score
import xgboost
import optuna
import category_encoders as ce


import warnings
warnings.filterwarnings('ignore')

In [2]:
train_loan = pd.read_csv('credit risk train.csv')

In [3]:
test_loan = pd.read_csv('credit risk test.csv')

In [4]:
X = train_loan.drop(columns='bad_loans')
y = train_loan.bad_loans

In [5]:
# pymnt_plan has only one value
X.drop(columns='pymnt_plan', inplace=True)

In [6]:
# drop columns with collinearity
X.drop(columns=['funded_amnt', 'delinq_2yrs_zero', 'pub_rec'], inplace=True)

<h1>XGBoost</h1>

<h3 dir='rtl'>
    برای هر مدل، کد زیر در گوگل کولب اجرا و هایپرپارامترهای بهینه به این کد منتقل شده است.
</h3>

<h3>Encoding</h3>

In [7]:
cat_features = ['grade', 'home_ownership', 'purpose']
encoder = ce.TargetEncoder(cols=cat_features)
encoder.fit(X[cat_features], y)
X_enc = pd.concat([X, encoder.transform(X[cat_features]).add_suffix('_enc')], axis=1)
X_enc.drop(columns=['grade', 'home_ownership', 'purpose'], inplace=True)

<h3>Model For Only Encoded Data</h3>

In [8]:
Xtrain, Xtest, ytrain, ytest = train_test_split(X_enc, y, stratify=y, test_size=0.3)
Xtest, Xvalid, ytest, yvalid =  train_test_split(Xtest, ytest, stratify=ytest, test_size=0.5)

In [9]:
# Only Encoded Data
params = {'n_estimators': 3000, 'missing': np.nan, 'verbosity': 0, 'scale_pos_weight': 4.3,
          'max_depth': 11, 'colsample_bytree': 0.5, 'learning_rate': 0.00346524895110463,
          'reg_alpha': 4.0, 'reg_lambda': 0.0, 'subsample': 0.24208584032741026, 'min_child_weight': 20}

xgb_enc = xgboost.XGBClassifier(**params)
xgb_enc.fit(Xtrain, ytrain, eval_set=[(Xtrain, ytrain), (Xvalid, yvalid)], early_stopping_rounds=10, verbose=0)
pred = xgb_enc.predict(Xtest)
print('Xtest result: ', f1_score(y_true=ytest, y_pred=pred))

Xtest result:  0.3958453548759377


<h3>Handling Missing Values</h3>

In [10]:
X_enc_imp = X_enc.copy()
X_enc_imp.payment_inc_ratio.fillna(0, inplace=True)
X_enc_imp.fillna(-1, inplace=True)

<h3>Train Test Split</h3>

In [11]:
Xtrain, Xtest, ytrain, ytest = train_test_split(X_enc_imp, y, stratify=y, test_size=0.3)
Xtest, Xvalid, ytest, yvalid =  train_test_split(Xtest, ytest, stratify=ytest, test_size=0.5)

<h3>Handling Outliers</h3>

In [12]:
# Log Transform
########### train #############
Xtrain_log = Xtrain.copy()
cols = ['payment_inc_ratio', 'open_acc']
for c in cols:
    Xtrain_log[c] = np.log(Xtrain_log[c] + 1.1)

########### valid #############
Xvalid_log = Xvalid.copy()
cols = ['payment_inc_ratio', 'open_acc']
for c in cols:
    Xvalid_log[c] = np.log(Xvalid_log[c] + 1.1)

########### test #############
Xtest_log = Xtest.copy()
cols = ['payment_inc_ratio', 'open_acc']
for c in cols:
    Xtest_log[c] = np.log(Xtest_log[c] + 1.1)

In [13]:
# z-score outlier detection
########### train #############
Xtrain_clip = Xtrain.copy()
columns = ['loan_amnt', 'sub_grade_num', 'emp_length_num', 'dti', 'delinq_2yrs',
           'inq_last_6mths', 'revol_util', 'payment_inc_ratio', 'open_acc']# ,'grade',] 

for c in columns:
    if (Xtrain_clip[c].dtypes == int) | (Xtrain_clip[c].dtypes == float):
        L = Xtrain[c].mean() - 3*Xtrain[c].std()
        U = Xtrain[c].mean() + 3*Xtrain[c].std()
        Xtrain_clip[c] = Xtrain_clip[c].clip(L,U)

########### valid #############
Xvalid_clip = Xvalid.copy()
columns = ['loan_amnt', 'sub_grade_num', 'emp_length_num', 'dti', 'delinq_2yrs',
           'inq_last_6mths', 'revol_util', 'payment_inc_ratio', 'open_acc']# ,'grade',]

for c in columns:
    if (Xvalid_clip[c].dtypes == int) | (Xvalid_clip[c].dtypes == float):
        L = Xtrain[c].mean() - 3*Xtrain[c].std()
        U = Xtrain[c].mean() + 3*Xtrain[c].std()
        Xvalid_clip[c] =Xvalid_clip[c].clip(L,U)

########### test #############
Xtest_clip = Xtest.copy()
columns = ['loan_amnt', 'sub_grade_num', 'emp_length_num', 'dti', 'delinq_2yrs',
           'inq_last_6mths', 'revol_util', 'payment_inc_ratio', 'open_acc']# ,'grade',]

for c in columns:
    if (Xtest_clip[c].dtypes == int) | (Xtest_clip[c].dtypes == float):
        L = Xtrain[c].mean() - 3*Xtrain[c].std()
        U = Xtrain[c].mean() + 3*Xtrain[c].std()
        Xtest_clip[c] =Xtest_clip[c].clip(L,U)

In [14]:
# Log and z-score
########### train #############
Xtrain_log_clip = Xtrain_clip.drop(columns=['payment_inc_ratio', 'open_acc'])
Xtrain_log_clip = pd.concat([Xtrain_log_clip, Xtrain_log.loc[:,['payment_inc_ratio', 'open_acc']]], axis=1)

########### valid #############
Xvalid_log_clip = Xvalid_clip.drop(columns=['payment_inc_ratio', 'open_acc'])
Xvalid_log_clip = pd.concat([Xvalid_log_clip, Xvalid_log.loc[:,['payment_inc_ratio', 'open_acc']]], axis=1)

########### test #############
Xtest_log_clip = Xtest_clip.drop(columns=['payment_inc_ratio', 'open_acc'])
Xtest_log_clip = pd.concat([Xtest_log_clip, Xtest_log.loc[:,['payment_inc_ratio', 'open_acc']]], axis=1)

<h3>Normalization: MinMaxScaler</h3>

In [15]:
# Normalization
mmscaler = MinMaxScaler()

Xtrain_normal = mmscaler.fit_transform(Xtrain)
Xtrain_normal = pd.DataFrame(Xtrain_normal, columns=Xtrain.columns, index=Xtrain.index)

Xvalid_normal = mmscaler.transform(Xvalid)
Xvalid_normal = pd.DataFrame(Xvalid_normal, columns=Xvalid.columns, index=Xvalid.index)

Xtest_normal = mmscaler.transform(Xtest)
Xtest_normal = pd.DataFrame(Xtest_normal, columns=Xtest.columns, index=Xtest.index)

#############################
Xtrain_log_normal = mmscaler.fit_transform(Xtrain_log)
Xtrain_log_normal = pd.DataFrame(Xtrain_log_normal, columns=Xtrain_log.columns, index=Xtrain_log.index)

Xvalid_log_normal = mmscaler.transform(Xvalid_log)
Xvalid_log_normal = pd.DataFrame(Xvalid_log_normal, columns=Xvalid_log.columns, index=Xvalid_log.index)

Xtest_log_normal = mmscaler.transform(Xtest_log)
Xtest_log_normal = pd.DataFrame(Xtest_log_normal, columns=Xtest_log.columns, index=Xtest_log.index)

#############################
Xtrain_clip_normal = mmscaler.fit_transform(Xtrain_clip)
Xtrain_clip_normal = pd.DataFrame(Xtrain_clip_normal, columns=Xtrain_clip.columns, index=Xtrain_clip.index)

Xvalid_clip_normal = mmscaler.transform(Xvalid_clip)
Xvalid_clip_normal = pd.DataFrame(Xvalid_clip_normal, columns=Xvalid_clip.columns, index=Xvalid_clip.index)

Xtest_clip_normal = mmscaler.transform(Xtest_clip)
Xtest_clip_normal = pd.DataFrame(Xtest_clip_normal, columns=Xtest_clip.columns, index=Xtest_clip.index)

#############################
Xtrain_log_clip_normal = mmscaler.fit_transform(Xtrain_log_clip)
Xtrain_log_clip_normal = pd.DataFrame(Xtrain_log_clip_normal, columns=Xtrain_log_clip.columns, index=Xtrain_log_clip.index)

Xvalid_log_clip_normal = mmscaler.transform(Xvalid_log_clip)
Xvalid_log_clip_normal = pd.DataFrame(Xvalid_log_clip_normal, columns=Xvalid_log_clip.columns, index=Xvalid_log_clip.index)

Xtest_log_clip_normal = mmscaler.transform(Xtest_log_clip)
Xtest_log_clip_normal = pd.DataFrame(Xtest_log_clip_normal, columns=Xtest_log_clip.columns, index=Xtest_log_clip.index)

<h3>Over Sampling</h3>

In [16]:
# Over-Sampling
over_sampler = SMOTE()

########### train #############
Xtrain_over, ytrain_over = over_sampler.fit_resample(Xtrain, ytrain)
Xtrain_normal_over, ytrain_normal_over = over_sampler.fit_resample(Xtrain_normal, ytrain)
Xtrain_log_normal_over, ytrain_log_normal_over = over_sampler.fit_resample(Xtrain_log_normal, ytrain)
Xtrain_clip_normal_over, ytrain_clip_normal_over = over_sampler.fit_resample(Xtrain_clip_normal, ytrain)
Xtrain_log_clip_normal_over, ytrain_log_clip_normal_over = over_sampler.fit_resample(Xtrain_log_clip_normal, ytrain)

<h3>Under Sampling</h3>

In [17]:
# Under-Sampling
under_sampler = RandomUnderSampler()

########### train #############
Xtrain_under, ytrain_under = under_sampler.fit_resample(Xtrain, ytrain)
Xtrain_normal_under, ytrain_normal_under = under_sampler.fit_resample(Xtrain_normal, ytrain)
Xtrain_log_normal_under, ytrain_log_normal_under = under_sampler.fit_resample(Xtrain_log_normal, ytrain)
Xtrain_clip_normal_under, ytrain_clip_normal_under = under_sampler.fit_resample(Xtrain_clip_normal, ytrain)
Xtrain_log_clip_normal_under, ytrain_log_clip_normal_under = under_sampler.fit_resample(Xtrain_log_clip_normal, ytrain)

<h3>PCA</h3>

In [18]:
# PCA with Over Sampled Data
from sklearn.decomposition import PCA
pca = PCA(n_components=0.99)

Xtrain_pca = pd.DataFrame(pca.fit_transform(Xtrain_over), index=Xtrain_over.index)
Xtrain_over_pca = pd.concat([Xtrain_over, Xtrain_pca], axis=1)
Xtrain_over_pca.columns=Xtrain_over_pca.columns.astype(str)

Xvalid_pca = pd.DataFrame(pca.transform(Xvalid), index=Xvalid.index)
Xvalid_pca = pd.concat([Xvalid, Xvalid_pca], axis=1)
Xvalid_pca.columns = Xvalid_pca.columns.astype(str)

Xtest_pca = pd.DataFrame(pca.transform(Xtest), index=Xtest.index)
Xtest_pca = pd.concat([Xtest, Xtest_pca], axis=1)
Xtest_pca.columns = Xtest_pca.columns.astype(str)

###########################################
Xtrain_normal_pca = pd.DataFrame(pca.fit_transform(Xtrain_normal_over), index=Xtrain_normal_over.index)
Xtrain_normal_over_pca = pd.concat([Xtrain_normal_over, Xtrain_normal_pca], axis=1)
Xtrain_normal_over_pca.columns=Xtrain_normal_over_pca.columns.astype(str)

Xvalid_normal_pca = pd.DataFrame(pca.transform(Xvalid_normal), index=Xvalid_normal.index)
Xvalid_normal_pca = pd.concat([Xvalid_normal, Xvalid_normal_pca], axis=1)
Xvalid_normal_pca.columns = Xvalid_normal_pca.columns.astype(str)

Xtest_normal_pca = pd.DataFrame(pca.transform(Xtest_normal), index=Xtest_normal.index)
Xtest_normal_pca = pd.concat([Xtest_normal, Xtest_normal_pca], axis=1)
Xtest_normal_pca.columns = Xtest_normal_pca.columns.astype(str)

###########################################
Xtrain_log_normal_pca = pd.DataFrame(pca.fit_transform(Xtrain_log_normal_over), index=Xtrain_log_normal_over.index)
Xtrain_log_normal_over_pca = pd.concat([Xtrain_log_normal_over, Xtrain_log_normal_pca], axis=1)
Xtrain_log_normal_over_pca.columns=Xtrain_log_normal_over_pca.columns.astype(str)

Xvalid_log_normal_pca = pd.DataFrame(pca.transform(Xvalid_log_normal), index=Xvalid_log_normal.index)
Xvalid_log_normal_pca = pd.concat([Xvalid_log_normal, Xvalid_log_normal_pca], axis=1)
Xvalid_log_normal_pca.columns = Xvalid_log_normal_pca.columns.astype(str)

Xtest_log_normal_pca = pd.DataFrame(pca.transform(Xtest_log_normal), index=Xtest_log_normal.index)
Xtest_log_normal_pca = pd.concat([Xtest_log_normal, Xtest_log_normal_pca], axis=1)
Xtest_log_normal_pca.columns = Xtest_log_normal_pca.columns.astype(str)

###########################################
Xtrain_clip_normal_pca = pd.DataFrame(pca.fit_transform(Xtrain_clip_normal_over), index=Xtrain_clip_normal_over.index)
Xtrain_clip_normal_over_pca = pd.concat([Xtrain_clip_normal_over, Xtrain_clip_normal_pca], axis=1)
Xtrain_clip_normal_over_pca.columns=Xtrain_clip_normal_over_pca.columns.astype(str)

Xvalid_clip_normal_pca = pd.DataFrame(pca.transform(Xvalid_clip_normal), index=Xvalid_clip_normal.index)
Xvalid_clip_normal_pca = pd.concat([Xvalid_clip_normal, Xvalid_clip_normal_pca], axis=1)
Xvalid_clip_normal_pca.columns=Xvalid_clip_normal_pca.columns.astype(str)

Xtest_clip_normal_pca = pd.DataFrame(pca.transform(Xtest_clip_normal), index=Xtest_clip_normal.index)
Xtest_clip_normal_pca = pd.concat([Xtest_clip_normal, Xtest_clip_normal_pca], axis=1)
Xtest_clip_normal_pca.columns=Xtest_clip_normal_pca.columns.astype(str)

###########################################
Xtrain_log_clip_normal_pca = pd.DataFrame(pca.fit_transform(Xtrain_log_clip_normal_over), index=Xtrain_log_clip_normal_over.index)
Xtrain_log_clip_normal_over_pca = pd.concat([Xtrain_log_clip_normal_over, Xtrain_log_clip_normal_pca], axis=1)
Xtrain_log_clip_normal_over_pca.columns=Xtrain_log_clip_normal_over_pca.columns.astype(str)

Xvalid_log_clip_normal_pca = pd.DataFrame(pca.transform(Xvalid_log_clip_normal), index=Xvalid_log_clip_normal.index)
Xvalid_log_clip_normal_pca = pd.concat([Xvalid_log_clip_normal, Xvalid_log_clip_normal_pca], axis=1)
Xvalid_log_clip_normal_pca.columns=Xvalid_log_clip_normal_pca.columns.astype(str)

Xtest_log_clip_normal_pca = pd.DataFrame(pca.transform(Xtest_log_clip_normal), index=Xtest_log_clip_normal.index)
Xtest_log_clip_normal_pca = pd.concat([Xtest_log_clip_normal, Xtest_log_clip_normal_pca], axis=1)
Xtest_log_clip_normal_pca.columns=Xtest_log_clip_normal_pca.columns.astype(str)


<h3>Train Data</h3>

In [19]:
# Encoded Imputed Data
params = {'n_estimators': 3000, 'missing': np.nan, 'verbosity': 0, 'scale_pos_weight': 4.3,
          'max_depth': 95, 'colsample_bytree': 0.75, 'learning_rate': 0.001855037241855926,
          'reg_alpha': 1.0, 'reg_lambda': 4.0, 'subsample': 0.18408267179093574, 'min_child_weight': 19}

xgb = xgboost.XGBClassifier(**params)
xgb.fit(Xtrain, ytrain, 
        eval_set=[(Xtrain, ytrain), (Xvalid, yvalid)], 
        early_stopping_rounds=10, verbose=0)
pred = xgb.predict(Xtest)
print('Xtest result: ', f1_score(y_true=ytest, y_pred=pred))

Xtest result:  0.4046965538273864


In [20]:
# Encoded Imputed Over-Sampled Data
params = {'n_estimators': 3000, 'missing': np.nan, 'verbosity': 0, 'scale_pos_weight': 4.3,
          'max_depth': 95, 'colsample_bytree': 0.75, 'learning_rate': 0.001855037241855926,
          'reg_alpha': 1.0, 'reg_lambda': 4.0, 'subsample': 0.18408267179093574, 'min_child_weight': 19}

xgbo = xgboost.XGBClassifier(**params)
xgbo.fit(Xtrain_over, ytrain_over, 
         eval_set=[(Xtrain_over, ytrain_over), (Xvalid, yvalid)], 
         early_stopping_rounds=10, verbose=0)
pred = xgbo.predict(Xtest)
print('Xtest result: ', f1_score(y_true=ytest, y_pred=pred))

Xtest result:  0.4007680702235633


In [21]:
# Encoded Imputed Under-Sampled Data
params = {'n_estimators': 3000, 'missing': np.nan, 'verbosity': 0, 'scale_pos_weight': 4.3,
          'max_depth': 96, 'colsample_bytree': 0.95, 'learning_rate': 0.024306139084256037,
          'reg_alpha': 1.5, 'reg_lambda': 2.0, 'subsample': 0.9374810434415121, 'min_child_weight': 8}

xgbu = xgboost.XGBClassifier(**params)
xgbu.fit(Xtrain_under, ytrain_under, 
         eval_set=[(Xtrain_under, ytrain_under), (Xvalid, yvalid)],
         early_stopping_rounds=10, verbose=0)
pred = xgbu.predict(Xtest)
print('Xtest result: ', f1_score(y_true=ytest, y_pred=pred))

Xtest result:  0.36980556614563476


In [22]:
# Encoded Imputed Over-Sampled Data With PCA
params = {'n_estimators': 3000, 'missing': np.nan, 'verbosity': 0, 'scale_pos_weight': 4.3,
          'max_depth': 56, 'colsample_bytree': 0.55, 'learning_rate': 0.006779480252541074,
          'reg_alpha': 4.0, 'reg_lambda': 0.5, 'subsample': 0.0599803312372334, 'min_child_weight': 20}

xgbop = xgboost.XGBClassifier(**params)
xgbop.fit(Xtrain_over_pca, ytrain_over, 
          eval_set=[(Xtrain_over_pca, ytrain_over), (Xvalid_pca, yvalid)],
          early_stopping_rounds=10, verbose=0)
pred = xgbop.predict(Xtest_pca)
print('Xtest result: ', f1_score(y_true=ytest, y_pred=pred))

Xtest result:  0.4063617309826162


<h3>Normal Data</h3>

In [23]:
# Encoded Imputed Normal Data
params = {'n_estimators': 3000, 'missing': np.nan, 'verbosity': 0, 'scale_pos_weight': 4.3,
          'max_depth': 60, 'colsample_bytree': 0.65, 'learning_rate': 0.0010772332826208826, 
          'reg_alpha': 4.0, 'reg_lambda': 2.0, 'subsample': 0.415940817687507, 'min_child_weight': 12}

xgbn = xgboost.XGBClassifier(**params)
xgbn.fit(Xtrain_normal, ytrain, 
        eval_set=[(Xtrain_normal, ytrain), (Xvalid_normal, yvalid)], 
        early_stopping_rounds=10, verbose=0)
pred = xgbn.predict(Xtest_normal)
print('Xtest result: ', f1_score(y_true=ytest, y_pred=pred))

Xtest result:  0.39910170035290343


In [24]:
# Encoded Imputed Normal Over-Sampled Data
params = {'n_estimators': 3000, 'missing': np.nan, 'verbosity': 0, 'scale_pos_weight': 4.3,
          'max_depth': 8, 'colsample_bytree': 0.6, 'learning_rate': 0.004983150455160385, 
          'reg_alpha': 4.0, 'reg_lambda': 5.0, 'subsample': 0.6147893306916671, 'min_child_weight': 7}

xgbno = xgboost.XGBClassifier(**params)
xgbno.fit(Xtrain_normal_over, 
         ytrain_over, eval_set=[(Xtrain_normal_over, ytrain_over), (Xvalid_normal, yvalid)], 
         early_stopping_rounds=10, verbose=0)
pred = xgbno.predict(Xtest_normal)
print('Xtest result: ', f1_score(y_true=ytest, y_pred=pred))

Xtest result:  0.40663470260463913


In [25]:
# Encoded Imputed Normal Under-Sampled Data
params = {'n_estimators': 3000, 'missing': np.nan, 'verbosity': 0, 'scale_pos_weight': 4.3,
          'max_depth': 56, 'colsample_bytree': 0.8, 'learning_rate': 0.007471892498790771, 
          'reg_alpha': 1.0, 'reg_lambda': 2.0, 'subsample': 0.849772056255833, 'min_child_weight': 5}

xgbnu = xgboost.XGBClassifier(**params)
xgbnu.fit(Xtrain_normal_under, ytrain_under, 
         eval_set=[(Xtrain_normal_under, ytrain_under), (Xvalid_normal, yvalid)], 
         early_stopping_rounds=10, verbose=0)
pred = xgbnu.predict(Xtest_normal)
print('Xtest result: ', f1_score(y_true=ytest, y_pred=pred))

Xtest result:  0.3705665374066017


In [26]:
# Encoded Imputed Normal Over-Sampled Data With PCA
params = {'n_estimators': 3000, 'missing': np.nan, 'verbosity': 0, 'scale_pos_weight': 4.3,
          'max_depth': 25, 'colsample_bytree': 0.55, 'learning_rate': 0.002849293382536758, 
          'reg_alpha': 4.0, 'reg_lambda': 2.5, 'subsample': 0.30396641193745594, 'min_child_weight': 7}

xgbnop = xgboost.XGBClassifier(**params)
xgbnop.fit(Xtrain_normal_over_pca, ytrain_over, 
           eval_set=[(Xtrain_normal_over_pca, ytrain_over), (Xvalid_normal_pca, yvalid)], 
           early_stopping_rounds=10, verbose=0)
pred = xgbnop.predict(Xtest_normal_pca)
print('Xtest result: ', f1_score(y_true=ytest, y_pred=pred))

Xtest result:  0.39048239895697523


<h3>Log Normal Data</h3>

In [27]:
# Encoded Imputed Log Normal Data
params = {'n_estimators': 3000, 'missing': np.nan, 'verbosity': 0, 'scale_pos_weight': 4.3,
          'max_depth': 3, 'colsample_bytree': 0.75, 'learning_rate': 0.08057545858414107, 
          'reg_alpha': 5.0, 'reg_lambda': 0.5, 'subsample': 0.5318325637625425, 'min_child_weight': 16}

xgbln = xgboost.XGBClassifier(**params)
xgbln.fit(Xtrain_log_normal, ytrain, 
        eval_set=[(Xtrain_log_normal, ytrain), (Xvalid_log_normal, yvalid)], 
        early_stopping_rounds=10, verbose=0)
pred = xgbln.predict(Xtest_log_normal)
print('Xtest result: ', f1_score(y_true=ytest, y_pred=pred))

Xtest result:  0.40963855421686746


In [28]:
# Encoded Imputed Log Normal Over-Sampled Data
params = {'n_estimators': 3000, 'missing': np.nan, 'verbosity': 0, 'scale_pos_weight': 4.3,
          'max_depth': 4, 'colsample_bytree': 1.0, 'learning_rate': 0.041772911731862296,
          'reg_alpha': 3.5, 'reg_lambda': 3.5, 'subsample': 0.39814802506020747, 'min_child_weight': 11}

xgblno = xgboost.XGBClassifier(**params)
xgblno.fit(Xtrain_log_normal_over, ytrain_over, 
         eval_set=[(Xtrain_log_normal_over, ytrain_over), (Xvalid_log_normal, yvalid)], 
         early_stopping_rounds=10, verbose=0)
pred = xgblno.predict(Xtest_log_normal)
print('Xtest result: ', f1_score(y_true=ytest, y_pred=pred))

Xtest result:  0.4036222509702458


In [29]:
# Encoded Imputed Log Normal Under-Sampled Data
params = {'n_estimators': 3000, 'missing': np.nan, 'verbosity': 0, 'scale_pos_weight': 4.3,
          'max_depth': 86, 'colsample_bytree': 0.5, 'learning_rate': 0.17202407790279978, 
          'reg_alpha': 2.0, 'reg_lambda': 2.5, 'subsample': 0.880394498507279, 'min_child_weight': 5}

xgblnu = xgboost.XGBClassifier(**params)
xgblnu.fit(Xtrain_log_normal_under, ytrain_under, 
         eval_set=[(Xtrain_log_normal_under, ytrain_under), (Xvalid_log_normal, yvalid)],
         early_stopping_rounds=10, verbose=0)
pred = xgblnu.predict(Xtest_log_normal)
print('Xtest result: ', f1_score(y_true=ytest, y_pred=pred))

Xtest result:  0.3650537123300694


In [30]:
# Encoded Imputed Log Normal Over-Sampled Data With PCA
params = {'n_estimators': 3000, 'missing': np.nan, 'verbosity': 0, 'scale_pos_weight': 4.3,
          'max_depth': 68, 'colsample_bytree': 1.0, 'learning_rate': 0.006596469437086368,
          'reg_alpha': 0.0, 'reg_lambda': 1.5, 'subsample': 0.07925734506967524, 'min_child_weight': 20}

xgblnop = xgboost.XGBClassifier(**params)
xgblnop.fit(Xtrain_log_normal_over_pca, ytrain_over, 
          eval_set=[(Xtrain_log_normal_over_pca, ytrain_over), (Xvalid_log_normal_pca, yvalid)],
          early_stopping_rounds=10, verbose=0)
pred = xgblnop.predict(Xtest_log_normal_pca)
print('Xtest result: ', f1_score(y_true=ytest, y_pred=pred))

Xtest result:  0.39854709418837675


<h3>Clip Normal Data</h3>

In [31]:
# Encoded Imputed Clip Normal Data
params = {'n_estimators': 3000, 'missing': np.nan, 'verbosity': 0, 'scale_pos_weight': 4.3,
          'max_depth': 68, 'colsample_bytree': 0.75, 'learning_rate': 0.0010612071162546235,
          'reg_alpha': 5.0, 'reg_lambda': 2.0, 'subsample': 0.5071775160024332, 'min_child_weight': 7}

xgbcn = xgboost.XGBClassifier(**params)
xgbcn.fit(Xtrain_clip_normal, ytrain, 
        eval_set=[(Xtrain_clip_normal, ytrain), (Xvalid_clip_normal, yvalid)], 
        early_stopping_rounds=10, verbose=0)
pred = xgbcn.predict(Xtest_clip_normal)
print('Xtest result: ', f1_score(y_true=ytest, y_pred=pred))

Xtest result:  0.39531329597554765


In [32]:
# Encoded Imputed Clip Normal Over-Sampled Data
params = {'n_estimators': 3000, 'missing': np.nan, 'verbosity': 0, 'scale_pos_weight': 4.3,
          'max_depth': 59, 'colsample_bytree': 0.55, 'learning_rate': 0.0026688184026826007,
          'reg_alpha': 4.0, 'reg_lambda': 0.5, 'subsample': 0.27937443663195477, 'min_child_weight': 13}

xgbcno = xgboost.XGBClassifier(**params)
xgbcno.fit(Xtrain_clip_normal_over, ytrain_over, 
         eval_set=[(Xtrain_clip_normal_over, ytrain_over), (Xvalid_clip_normal, yvalid)], 
         early_stopping_rounds=10, verbose=0)
pred = xgbcno.predict(Xtest_clip_normal)
print('Xtest result: ', f1_score(y_true=ytest, y_pred=pred))

Xtest result:  0.4023619884647075


In [33]:
# Encoded Imputed Clip Normal Under-Sampled Data
params = {'n_estimators': 3000, 'missing': np.nan, 'verbosity': 0, 'scale_pos_weight': 4.3,
          'max_depth': 49, 'colsample_bytree': 0.9, 'learning_rate': 0.03907852874823776,
          'reg_alpha': 0.0, 'reg_lambda': 3.5, 'subsample': 0.840091848974702, 'min_child_weight': 13}

xgbcnu = xgboost.XGBClassifier(**params)
xgbcnu.fit(Xtrain_clip_normal_under, ytrain_under, 
         eval_set=[(Xtrain_clip_normal_under, ytrain_under), (Xvalid_clip_normal, yvalid)],
         early_stopping_rounds=10, verbose=0)
pred = xgbcnu.predict(Xtest_clip_normal)
print('Xtest result: ', f1_score(y_true=ytest, y_pred=pred))

Xtest result:  0.3656412157153447


In [34]:
# Encoded Imputed Clip Normal Over-Sampled Data With PCA
params = {'n_estimators': 3000, 'missing': np.nan, 'verbosity': 0, 'scale_pos_weight': 4.3,
          'max_depth': 19, 'colsample_bytree': 0.8, 'learning_rate': 0.0021296467556728782, 
          'reg_alpha': 3.0, 'reg_lambda': 1.0, 'subsample': 0.3197178302282616, 'min_child_weight': 19}

xgbcnop = xgboost.XGBClassifier(**params)
xgbcnop.fit(Xtrain_clip_normal_over_pca, ytrain_over, 
          eval_set=[(Xtrain_clip_normal_over_pca, ytrain_over), (Xvalid_clip_normal_pca, yvalid)],
          early_stopping_rounds=10, verbose=0)
pred = xgbcnop.predict(Xtest_clip_normal_pca)
print('Xtest result: ', f1_score(y_true=ytest, y_pred=pred))

Xtest result:  0.3958592132505176


<h3>Log Clip Normal Data</h3>

In [35]:
# Encoded Imputed Log Clip Normal Data
params = {'n_estimators': 3000, 'missing': np.nan, 'verbosity': 0, 'scale_pos_weight': 4.3,
          'max_depth': 80, 'colsample_bytree': 0.55, 'learning_rate': 0.001046516488615697, 
          'reg_alpha': 1.5, 'reg_lambda': 1.5, 'subsample': 0.3236465629431227, 'min_child_weight': 20}

xgblcn = xgboost.XGBClassifier(**params)
xgblcn.fit(Xtrain_log_clip_normal, ytrain, 
        eval_set=[(Xtrain_log_clip_normal, ytrain), (Xvalid_log_clip_normal, yvalid)], 
        early_stopping_rounds=10, verbose=0)
pred = xgblcn.predict(Xtest_log_clip_normal)
print('Xtest result: ', f1_score(y_true=ytest, y_pred=pred))

Xtest result:  0.4004292503449333


In [36]:
# Encoded Imputed Log Clip Normal Over-Sampled Data
params = {'n_estimators': 3000, 'missing': np.nan, 'verbosity': 0, 'scale_pos_weight': 4.3,
          'max_depth': 98, 'colsample_bytree': 0.55, 'learning_rate': 0.012449515349836843,
          'reg_alpha': 4.0, 'reg_lambda': 3.0, 'subsample': 0.158065545096134, 'min_child_weight': 17}

xgblcno = xgboost.XGBClassifier(**params)
xgblcno.fit(Xtrain_log_clip_normal_over, ytrain_over, 
         eval_set=[(Xtrain_log_clip_normal_over, ytrain_over), (Xvalid_log_clip_normal, yvalid)], 
         early_stopping_rounds=10, verbose=0)
pred = xgblcno.predict(Xtest_log_clip_normal)
print('Xtest result: ', f1_score(y_true=ytest, y_pred=pred))

Xtest result:  0.39631336405529954


In [37]:
# Encoded Imputed Log Clip Normal Under-Sampled Data
params = {'n_estimators': 3000, 'missing': np.nan, 'verbosity': 0, 'scale_pos_weight': 4.3,
          'max_depth': 59, 'colsample_bytree': 0.8500000000000001, 'learning_rate': 0.11780385796506398, 
          'reg_alpha': 0.5, 'reg_lambda': 4.5, 'subsample': 0.7744236557697782, 'min_child_weight': 1}

xgblcnu = xgboost.XGBClassifier(**params)
xgblcnu.fit(Xtrain_log_clip_normal_under, ytrain_under, 
         eval_set=[(Xtrain_log_clip_normal_under, ytrain_under), (Xvalid_log_clip_normal, yvalid)],
         early_stopping_rounds=10, verbose=0)
pred = xgblcnu.predict(Xtest_log_clip_normal)
print('Xtest result: ', f1_score(y_true=ytest, y_pred=pred))

Xtest result:  0.3720707795313247


In [38]:
# Encoded Imputed Log Clip Normal Over-Sampled Data With PCA
params = {'n_estimators': 3000, 'missing': np.nan, 'verbosity': 0, 'scale_pos_weight': 4.3,
          'max_depth': 68, 'colsample_bytree': 0.65, 'learning_rate': 0.0011690060316762662,
          'reg_alpha': 0.5, 'reg_lambda': 3.0, 'subsample': 0.08141001704468165, 'min_child_weight': 1}

xgblcnop = xgboost.XGBClassifier(**params)
xgblcnop.fit(Xtrain_log_clip_normal_over_pca, ytrain_over, 
          eval_set=[(Xtrain_log_clip_normal_over_pca, ytrain_over), (Xvalid_log_clip_normal_pca, yvalid)],
          early_stopping_rounds=10, verbose=0)
pred = xgblcnop.predict(Xtest_log_clip_normal_pca)
print('Xtest result: ', f1_score(y_true=ytest, y_pred=pred))

Xtest result:  0.3980532786885246
