In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split, StratifiedKFold, RandomizedSearchCV
from sklearn.preprocessing import MinMaxScaler
from imblearn.over_sampling import SMOTE
from imblearn.under_sampling import RandomUnderSampler
from sklearn.metrics import f1_score
import catboost
import optuna
import category_encoders as ce


import warnings
warnings.filterwarnings('ignore')

In [2]:
# Labelled training data; the `bad_loans` column is split off as the target in the next cell.
train_loan = pd.read_csv(filepath_or_buffer='credit risk train.csv')

In [3]:
# Companion test file. It is loaded here but not referenced by the cells shown in this part of the notebook.
test_loan = pd.read_csv(filepath_or_buffer='credit risk test.csv')

In [4]:
# Separate the target (`bad_loans`) from the feature matrix.
y = train_loan['bad_loans']
X = train_loan.drop(columns=['bad_loans'])

In [5]:
# pymnt_plan holds a single distinct value, so it is removed from the features.
X = X.drop(columns=['pymnt_plan'])

In [6]:
# Remove the features that are collinear with other columns.
collinear_columns = ['funded_amnt', 'delinq_2yrs_zero', 'pub_rec']
X = X.drop(columns=collinear_columns)

In [7]:
# Cast the 'object' (string) columns to pandas 'category' dtype.
cat_features = ['grade', 'home_ownership', 'purpose']
X = X.astype({name: 'category' for name in cat_features})

<h1>CatBoost</h1>

<h3 dir='rtl'>
    برای هر مدل، کد زیر در گوگل کولب اجرا و هایپرپارامترهای بهینه به این کد منتقل شده است.
</h3>

<h3>Model For Data Without Preprocessing</h3>

In [8]:
# 70% train, then the remaining 30% is halved into test and validation (15% each).
# Both splits are stratified on the target. A fixed random_state makes the
# partition, and therefore every score reported below, reproducible after a
# kernel restart.
Xtrain, Xtest, ytrain, ytest = train_test_split(
    X, y, stratify=y, test_size=0.3, random_state=42)
Xtest, Xvalid, ytest, yvalid = train_test_split(
    Xtest, ytest, stratify=ytest, test_size=0.5, random_state=42)

In [9]:
# Baseline: CatBoost on the un-encoded, un-imputed features. The categorical
# columns are handed to CatBoost directly through `cat_features`.
params = dict(
    iterations=100, verbose=0, od_type='IncToDec', od_wait=10, boosting_type='Plain',
    cat_features=cat_features, leaf_estimation_method='Newton', class_weights={0: 0.2, 1: 0.8},
    depth=7, learning_rate=0.15316093992253535, l2_leaf_reg=2.5, subsample=0.7426861110728418,
)

ctbwp = catboost.CatBoostClassifier(**params)
# eval_set carries the training data followed by the validation split.
ctbwp.fit(Xtrain, ytrain, eval_set=[(Xtrain, ytrain), (Xvalid, yvalid)])
pred = ctbwp.predict(Xtest)
print('Xtest result: ', f1_score(ytest, pred))

Xtest result:  0.40005678591709254


<h3>Encoding</h3>

In [10]:
# Target-encode the categorical columns and replace them with "<name>_enc" columns.
# NOTE(review): the encoder is fitted on the full X / y, i.e. before the
# train/valid/test split further down, so held-out rows contribute to the
# encoding statistics. Consider fitting on the training rows only.
cat_features = ['grade', 'home_ownership', 'purpose']
encoder = ce.TargetEncoder(cols=cat_features)
encoder.fit(X[cat_features], y)
encoded_cats = encoder.transform(X[cat_features]).add_suffix('_enc')
X_enc = pd.concat([X, encoded_cats], axis=1).drop(columns=cat_features)

<h3>Handling Missing Values</h3>

In [11]:
# Impute missing values on a copy so X_enc stays untouched:
#   * payment_inc_ratio -> 0
#   * every other column -> -1 (sentinel)
# The column is assigned back explicitly. A chained
# `frame.col.fillna(..., inplace=True)` acts on an intermediate object and is a
# no-op under pandas Copy-on-Write, which would let the -1 sentinel leak into
# payment_inc_ratio.
X_enc_imp = X_enc.copy()
X_enc_imp['payment_inc_ratio'] = X_enc_imp['payment_inc_ratio'].fillna(0)
X_enc_imp = X_enc_imp.fillna(-1)

<h3>Train Test Split</h3>

In [12]:
# Re-split the encoded + imputed frame: 70% train, 15% test, 15% validation,
# stratified on the target. A fixed random_state keeps the partition identical
# across kernel restarts so the scores below can be reproduced.
Xtrain, Xtest, ytrain, ytest = train_test_split(
    X_enc_imp, y, stratify=y, test_size=0.3, random_state=42)
Xtest, Xvalid, ytest, yvalid = train_test_split(
    Xtest, ytest, stratify=ytest, test_size=0.5, random_state=42)

<h3>Handling Outliers</h3>

In [13]:
# Log Transform
# Apply log(x + 1.1) to the two columns below on a copy of every split. The
# transform is element-wise, so nothing is fitted and the same expression is
# used for train, valid and test. The +1.1 offset keeps the argument positive
# for the imputation fill values (0 and -1).
cols = ['payment_inc_ratio', 'open_acc']
Xtrain_log, Xvalid_log, Xtest_log = (
    split.assign(**{name: np.log(split[name] + 1.1) for name in cols})
    for split in (Xtrain, Xvalid, Xtest)
)

In [14]:
# z-score outlier handling
# Clip each listed numeric column to mean ± 3·std. The bounds are computed once,
# from the training split only, and reused for valid/test so that no statistics
# are taken from held-out rows.
columns = ['loan_amnt', 'sub_grade_num', 'emp_length_num', 'dti', 'delinq_2yrs',
           'inq_last_6mths', 'revol_util', 'payment_inc_ratio', 'open_acc']

clip_bounds = {}
for c in columns:
    # `dtype == int` depends on the platform's default integer width (it can fail
    # to match int64 columns, e.g. on Windows with NumPy < 2), which would skip
    # the column silently. The pandas dtype predicates are width-independent.
    if pd.api.types.is_integer_dtype(Xtrain[c]) or pd.api.types.is_float_dtype(Xtrain[c]):
        center, spread = Xtrain[c].mean(), Xtrain[c].std()
        clip_bounds[c] = (center - 3 * spread, center + 3 * spread)

# Each *_clip frame is a copy of its split with the bounded columns replaced.
Xtrain_clip, Xvalid_clip, Xtest_clip = (
    split.assign(**{name: split[name].clip(lo, hi) for name, (lo, hi) in clip_bounds.items()})
    for split in (Xtrain, Xvalid, Xtest)
)

In [15]:
# Log and z-score
# Combine the two treatments: start from the clipped frame, drop the two
# log-transformed columns, and append them back from the log frame. Note that
# the appended columns come from the *unclipped* log frames and end up as the
# last two columns.
log_cols = ['payment_inc_ratio', 'open_acc']
Xtrain_log_clip, Xvalid_log_clip, Xtest_log_clip = (
    pd.concat([clipped.drop(columns=log_cols), logged[log_cols]], axis=1)
    for clipped, logged in (
        (Xtrain_clip, Xtrain_log),
        (Xvalid_clip, Xvalid_log),
        (Xtest_clip, Xtest_log),
    )
)

<h3>Normalization: MinMaxScaler</h3>

In [16]:
# Normalization
# One MinMaxScaler instance is re-fitted on each training variant and then
# applied to the matching valid/test frames. After this cell it remains fitted
# on Xtrain_log_clip, the last variant processed.
mmscaler = MinMaxScaler()


def _minmax_split(train, valid, test):
    """Fit `mmscaler` on `train` and return scaled DataFrames for all three splits.

    Each returned frame keeps the column labels and row index of its input.
    """
    scaled = [mmscaler.fit_transform(train), mmscaler.transform(valid), mmscaler.transform(test)]
    return tuple(
        pd.DataFrame(values, columns=frame.columns, index=frame.index)
        for values, frame in zip(scaled, (train, valid, test))
    )


# Plain encoded + imputed features
Xtrain_normal, Xvalid_normal, Xtest_normal = _minmax_split(Xtrain, Xvalid, Xtest)

# Log-transformed features
Xtrain_log_normal, Xvalid_log_normal, Xtest_log_normal = _minmax_split(
    Xtrain_log, Xvalid_log, Xtest_log)

# z-score clipped features
Xtrain_clip_normal, Xvalid_clip_normal, Xtest_clip_normal = _minmax_split(
    Xtrain_clip, Xvalid_clip, Xtest_clip)

# Log + clipped features
Xtrain_log_clip_normal, Xvalid_log_clip_normal, Xtest_log_clip_normal = _minmax_split(
    Xtrain_log_clip, Xvalid_log_clip, Xtest_log_clip)

<h3>Over Sampling</h3>

In [17]:
# Over-Sampling
# SMOTE is seeded so the synthetic rows are identical on every run; without a
# random_state the over-sampled training sets (and the scores of every model
# fitted on them) change after each kernel restart.
over_sampler = SMOTE(random_state=42)

########### train #############
# Only the training variants are resampled; valid/test are left as they are.
Xtrain_over, ytrain_over = over_sampler.fit_resample(Xtrain, ytrain)
Xtrain_normal_over, ytrain_normal_over = over_sampler.fit_resample(Xtrain_normal, ytrain)
Xtrain_log_normal_over, ytrain_log_normal_over = over_sampler.fit_resample(Xtrain_log_normal, ytrain)
Xtrain_clip_normal_over, ytrain_clip_normal_over = over_sampler.fit_resample(Xtrain_clip_normal, ytrain)
Xtrain_log_clip_normal_over, ytrain_log_clip_normal_over = over_sampler.fit_resample(Xtrain_log_clip_normal, ytrain)

<h3>Under Sampling</h3>

In [18]:
# Under-Sampling
# The sampler is seeded so the retained rows are the same on every run; without
# a random_state each call (and each kernel restart) keeps a different random
# subset, which makes the under-sampled scores irreproducible.
under_sampler = RandomUnderSampler(random_state=42)

########### train #############
# Only the training variants are resampled; valid/test are left as they are.
Xtrain_under, ytrain_under = under_sampler.fit_resample(Xtrain, ytrain)
Xtrain_normal_under, ytrain_normal_under = under_sampler.fit_resample(Xtrain_normal, ytrain)
Xtrain_log_normal_under, ytrain_log_normal_under = under_sampler.fit_resample(Xtrain_log_normal, ytrain)
Xtrain_clip_normal_under, ytrain_clip_normal_under = under_sampler.fit_resample(Xtrain_clip_normal, ytrain)
Xtrain_log_clip_normal_under, ytrain_log_clip_normal_under = under_sampler.fit_resample(Xtrain_log_clip_normal, ytrain)

<h3>PCA</h3>

In [19]:
# PCA with Over Sampled Data
# For every feature variant: fit PCA (components explaining 99% of the variance)
# on the over-sampled training frame, then append the component columns to the
# original features of train, valid and test. The single `pca` object is
# re-fitted per variant, so after this cell it holds the fit of the last one.
from sklearn.decomposition import PCA
pca = PCA(n_components=0.99)


def _augment_with_pca(train_over, valid, test):
    """Fit `pca` on `train_over` and append its components to all three frames.

    Returns (train_components, train_augmented, valid_augmented, test_augmented).
    `train_components` holds only the component columns; the augmented frames
    hold the original columns followed by the components, with all column
    labels converted to strings.
    """
    train_components = pd.DataFrame(pca.fit_transform(train_over), index=train_over.index)
    augmented = [pd.concat([train_over, train_components], axis=1)]
    for frame in (valid, test):
        components = pd.DataFrame(pca.transform(frame), index=frame.index)
        augmented.append(pd.concat([frame, components], axis=1))
    for frame in augmented:
        # Component columns are labelled 0..k-1 (ints); make every label a str.
        frame.columns = frame.columns.astype(str)
    return (train_components, *augmented)


# Plain encoded + imputed features
(Xtrain_pca, Xtrain_over_pca,
 Xvalid_pca, Xtest_pca) = _augment_with_pca(Xtrain_over, Xvalid, Xtest)

# Min-max scaled features
(Xtrain_normal_pca, Xtrain_normal_over_pca,
 Xvalid_normal_pca, Xtest_normal_pca) = _augment_with_pca(
    Xtrain_normal_over, Xvalid_normal, Xtest_normal)

# Log + scaled features
(Xtrain_log_normal_pca, Xtrain_log_normal_over_pca,
 Xvalid_log_normal_pca, Xtest_log_normal_pca) = _augment_with_pca(
    Xtrain_log_normal_over, Xvalid_log_normal, Xtest_log_normal)

# Clipped + scaled features
(Xtrain_clip_normal_pca, Xtrain_clip_normal_over_pca,
 Xvalid_clip_normal_pca, Xtest_clip_normal_pca) = _augment_with_pca(
    Xtrain_clip_normal_over, Xvalid_clip_normal, Xtest_clip_normal)

# Log + clipped + scaled features
(Xtrain_log_clip_normal_pca, Xtrain_log_clip_normal_over_pca,
 Xvalid_log_clip_normal_pca, Xtest_log_clip_normal_pca) = _augment_with_pca(
    Xtrain_log_clip_normal_over, Xvalid_log_clip_normal, Xtest_log_clip_normal)


<h3>Train Data</h3>

In [20]:
# Encoded + imputed features (no scaling, no resampling).
# Hyperparameters are pasted in as constants from a separate tuning run.
params = dict(
    iterations=100, verbose=0, od_type='IncToDec', od_wait=10, boosting_type='Plain',
    leaf_estimation_method='Newton', class_weights={0: 0.2, 1: 0.8},
    depth=7, learning_rate=0.10532310426567097, l2_leaf_reg=3.0, subsample=0.5940099292935849,
)

ctb = catboost.CatBoostClassifier(**params)
# eval_set carries the training data followed by the validation split.
ctb.fit(Xtrain, ytrain, eval_set=[(Xtrain, ytrain), (Xvalid, yvalid)])
pred = ctb.predict(Xtest)

print('Xtest result: ', f1_score(ytest, pred))

Xtest result:  0.40599571734475376


In [21]:
# Encoded + imputed features, SMOTE over-sampled training set.
# Hyperparameters are pasted in as constants from a separate tuning run.
params = dict(
    iterations=100, verbose=0, od_type='IncToDec', od_wait=10, boosting_type='Plain',
    leaf_estimation_method='Newton', class_weights={0: 0.2, 1: 0.8},
    depth=6, learning_rate=0.4775873289833, l2_leaf_reg=4.0, subsample=0.3038167762741147,
)

ctbo = catboost.CatBoostClassifier(**params)
# Validation and test are evaluated on the original (non-resampled) rows.
ctbo.fit(Xtrain_over, ytrain_over, eval_set=[(Xtrain_over, ytrain_over), (Xvalid, yvalid)])
pred = ctbo.predict(Xtest)

print('Xtest result: ', f1_score(ytest, pred))

Xtest result:  0.40118679050567596


In [22]:
# Encoded + imputed features, randomly under-sampled training set.
# Hyperparameters are pasted in as constants from a separate tuning run.
params = dict(
    iterations=100, verbose=0, od_type='IncToDec', od_wait=10, boosting_type='Plain',
    leaf_estimation_method='Newton', class_weights={0: 0.2, 1: 0.8},
    depth=16, learning_rate=0.015340105258056928, l2_leaf_reg=0.0, subsample=0.10129304208676515,
)

ctbu = catboost.CatBoostClassifier(**params)
# Validation and test are evaluated on the original (non-resampled) rows.
ctbu.fit(Xtrain_under, ytrain_under, eval_set=[(Xtrain_under, ytrain_under), (Xvalid, yvalid)])
pred = ctbu.predict(Xtest)

print('Xtest result: ', f1_score(ytest, pred))

Xtest result:  0.3493621197252208


In [23]:
# Encoded + imputed features, over-sampled training set, PCA components appended.
# Hyperparameters are pasted in as constants from a separate tuning run.
params = dict(
    iterations=100, verbose=0, od_type='IncToDec', od_wait=10, boosting_type='Plain',
    leaf_estimation_method='Newton', class_weights={0: 0.2, 1: 0.8},
    depth=8, learning_rate=0.18412509379837513, l2_leaf_reg=1.0, subsample=0.9234587963151217,
)

ctbup = catboost.CatBoostClassifier(**params)
# Xtrain_over_pca was built from Xtrain_over, so ytrain_over is its label vector.
ctbup.fit(Xtrain_over_pca, ytrain_over, eval_set=[(Xtrain_over_pca, ytrain_over), (Xvalid_pca, yvalid)])
pred = ctbup.predict(Xtest_pca)

print('Xtest result: ', f1_score(ytest, pred))

Xtest result:  0.40309477756286266


<h3>Normal Data</h3>

In [24]:
# Encoded + imputed features, min-max scaled (no resampling).
# Hyperparameters are pasted in as constants from a separate tuning run.
params = dict(
    iterations=100, verbose=0, od_type='IncToDec', od_wait=10, boosting_type='Plain',
    leaf_estimation_method='Newton', class_weights={0: 0.2, 1: 0.8},
    depth=8, learning_rate=0.20616024887081802, l2_leaf_reg=4.5, subsample=0.1371963740854683,
)

ctbn = catboost.CatBoostClassifier(**params)
# eval_set carries the training data followed by the validation split.
ctbn.fit(Xtrain_normal, ytrain, eval_set=[(Xtrain_normal, ytrain), (Xvalid_normal, yvalid)])
pred = ctbn.predict(Xtest_normal)

print('Xtest result: ', f1_score(ytest, pred))

Xtest result:  0.40351129658943735


In [25]:
# Encoded + imputed features, min-max scaled, SMOTE over-sampled training set.
# Hyperparameters are pasted in as constants from a separate tuning run.
params = dict(
    iterations=100, verbose=0, od_type='IncToDec', od_wait=10, boosting_type='Plain',
    leaf_estimation_method='Newton', class_weights={0: 0.2, 1: 0.8},
    depth=10, learning_rate=0.15858019852030955, l2_leaf_reg=4.5, subsample=0.8756246117352564,
)

ctbno = catboost.CatBoostClassifier(**params)
# Use the labels returned by the same fit_resample call that produced
# Xtrain_normal_over, rather than labels from a different resampling call.
ctbno.fit(Xtrain_normal_over, ytrain_normal_over,
          eval_set=[(Xtrain_normal_over, ytrain_normal_over), (Xvalid_normal, yvalid)])
pred = ctbno.predict(Xtest_normal)

print('Xtest result: ', f1_score(y_true=ytest, y_pred=pred))

Xtest result:  0.3999535477877134


In [26]:
# Encoded + imputed features, min-max scaled, randomly under-sampled training set.
# Hyperparameters are pasted in as constants from a separate tuning run.
params = dict(
    iterations=100, verbose=0, od_type='IncToDec', od_wait=10, boosting_type='Plain',
    leaf_estimation_method='Newton', class_weights={0: 0.2, 1: 0.8},
    depth=13, learning_rate=0.412115286105057, l2_leaf_reg=3.0, subsample=0.12605726739167156,
)

ctbnu = catboost.CatBoostClassifier(**params)
# Use the labels returned by the same fit_resample call that produced
# Xtrain_normal_under, rather than labels from a different resampling call.
ctbnu.fit(Xtrain_normal_under, ytrain_normal_under,
          eval_set=[(Xtrain_normal_under, ytrain_normal_under), (Xvalid_normal, yvalid)])
pred = ctbnu.predict(Xtest_normal)

print('Xtest result: ', f1_score(y_true=ytest, y_pred=pred))

Xtest result:  0.3375359080753272


In [27]:
# Encoded + imputed features, min-max scaled, over-sampled training set, PCA components appended.
# Hyperparameters are pasted in as constants from a separate tuning run.
params = dict(
    iterations=100, verbose=0, od_type='IncToDec', od_wait=10, boosting_type='Plain',
    leaf_estimation_method='Newton', class_weights={0: 0.2, 1: 0.8},
    depth=3, learning_rate=0.3857831011216575, l2_leaf_reg=4.5, subsample=0.7983547956732875,
)

ctbnup = catboost.CatBoostClassifier(**params)
# Xtrain_normal_over_pca is built from Xtrain_normal_over, so pair it with the
# labels returned by that same resampling call (ytrain_normal_over).
ctbnup.fit(Xtrain_normal_over_pca, ytrain_normal_over,
           eval_set=[(Xtrain_normal_over_pca, ytrain_normal_over), (Xvalid_normal_pca, yvalid)])
pred = ctbnup.predict(Xtest_normal_pca)

print('Xtest result: ', f1_score(y_true=ytest, y_pred=pred))

Xtest result:  0.39650655021834064


<h3>Log Normal Data</h3>

In [28]:
# Encoded + imputed features, log-transformed and min-max scaled (no resampling).
# Hyperparameters are pasted in as constants from a separate tuning run.
params = dict(
    iterations=100, verbose=0, od_type='IncToDec', od_wait=10, boosting_type='Plain',
    leaf_estimation_method='Newton', class_weights={0: 0.2, 1: 0.8},
    depth=10, learning_rate=0.029089933034664905, l2_leaf_reg=4.0, subsample=0.39770498756871575,
)

ctbln = catboost.CatBoostClassifier(**params)
# eval_set carries the training data followed by the validation split.
ctbln.fit(Xtrain_log_normal, ytrain, eval_set=[(Xtrain_log_normal, ytrain), (Xvalid_log_normal, yvalid)])
pred = ctbln.predict(Xtest_log_normal)

print('Xtest result: ', f1_score(ytest, pred))

Xtest result:  0.40257879656160456


In [29]:
# Encoded + imputed features, log-transformed, min-max scaled, SMOTE over-sampled training set.
# Hyperparameters are pasted in as constants from a separate tuning run.
params = dict(
    iterations=100, verbose=0, od_type='IncToDec', od_wait=10, boosting_type='Plain',
    leaf_estimation_method='Newton', class_weights={0: 0.2, 1: 0.8},
    depth=6, learning_rate=0.3059884739946264, l2_leaf_reg=2.5, subsample=0.6082234258834167,
)

ctblno = catboost.CatBoostClassifier(**params)
# Use the labels returned by the same fit_resample call that produced
# Xtrain_log_normal_over, rather than labels from a different resampling call.
ctblno.fit(Xtrain_log_normal_over, ytrain_log_normal_over,
           eval_set=[(Xtrain_log_normal_over, ytrain_log_normal_over), (Xvalid_log_normal, yvalid)])
pred = ctblno.predict(Xtest_log_normal)

print('Xtest result: ', f1_score(y_true=ytest, y_pred=pred))

Xtest result:  0.3986663491307454


In [30]:
# Encoded + imputed features, log-transformed, min-max scaled, randomly under-sampled training set.
# Hyperparameters are pasted in as constants from a separate tuning run.
params = dict(
    iterations=100, verbose=0, od_type='IncToDec', od_wait=10, boosting_type='Plain',
    leaf_estimation_method='Newton', class_weights={0: 0.2, 1: 0.8},
    depth=16, learning_rate=0.06236318730089297, l2_leaf_reg=0.0, subsample=0.40129061052243625,
)

ctblnu = catboost.CatBoostClassifier(**params)
# Use the labels returned by the same fit_resample call that produced
# Xtrain_log_normal_under, rather than labels from a different resampling call.
ctblnu.fit(Xtrain_log_normal_under, ytrain_log_normal_under,
           eval_set=[(Xtrain_log_normal_under, ytrain_log_normal_under), (Xvalid_log_normal, yvalid)])
pred = ctblnu.predict(Xtest_log_normal)

print('Xtest result: ', f1_score(y_true=ytest, y_pred=pred))

Xtest result:  0.34486764583851964


In [31]:
# Encoded + imputed features, log-transformed, min-max scaled, over-sampled training set,
# PCA components appended.
# Hyperparameters are pasted in as constants from a separate tuning run.
params = dict(
    iterations=100, verbose=0, od_type='IncToDec', od_wait=10, boosting_type='Plain',
    leaf_estimation_method='Newton', class_weights={0: 0.2, 1: 0.8},
    depth=3, learning_rate=0.3619557233731303, l2_leaf_reg=1.5, subsample=0.8188688172749383,
)

ctblnup = catboost.CatBoostClassifier(**params)
# Xtrain_log_normal_over_pca is built from Xtrain_log_normal_over, so pair it
# with the labels returned by that same resampling call.
ctblnup.fit(Xtrain_log_normal_over_pca, ytrain_log_normal_over,
            eval_set=[(Xtrain_log_normal_over_pca, ytrain_log_normal_over), (Xvalid_log_normal_pca, yvalid)])
pred = ctblnup.predict(Xtest_log_normal_pca)

print('Xtest result: ', f1_score(y_true=ytest, y_pred=pred))

Xtest result:  0.3992405063291139


<h3>Clip Normal Data</h3>

In [32]:
# Encoded + imputed features, z-score clipped and min-max scaled (no resampling).
# Hyperparameters are pasted in as constants from a separate tuning run.
params = dict(
    iterations=100, verbose=0, od_type='IncToDec', od_wait=10, boosting_type='Plain',
    leaf_estimation_method='Newton', class_weights={0: 0.2, 1: 0.8},
    depth=9, learning_rate=0.12096442887168164, l2_leaf_reg=5.0, subsample=0.4213844514480115,
)

ctbcn = catboost.CatBoostClassifier(**params)
# eval_set carries the training data followed by the validation split.
ctbcn.fit(Xtrain_clip_normal, ytrain, eval_set=[(Xtrain_clip_normal, ytrain), (Xvalid_clip_normal, yvalid)])
pred = ctbcn.predict(Xtest_clip_normal)

print('Xtest result: ', f1_score(ytest, pred))

Xtest result:  0.4023788801856687


In [33]:
# Encoded + imputed features, z-score clipped, min-max scaled, SMOTE over-sampled training set.
# Hyperparameters are pasted in as constants from a separate tuning run.
params = dict(
    iterations=100, verbose=0, od_type='IncToDec', od_wait=10, boosting_type='Plain',
    leaf_estimation_method='Newton', class_weights={0: 0.2, 1: 0.8},
    depth=11, learning_rate=0.314035696592347, l2_leaf_reg=3.0, subsample=0.8327384087309043,
)

ctbcno = catboost.CatBoostClassifier(**params)
# Use the labels returned by the same fit_resample call that produced
# Xtrain_clip_normal_over, rather than labels from a different resampling call.
ctbcno.fit(Xtrain_clip_normal_over, ytrain_clip_normal_over,
           eval_set=[(Xtrain_clip_normal_over, ytrain_clip_normal_over), (Xvalid_clip_normal, yvalid)])
pred = ctbcno.predict(Xtest_clip_normal)

print('Xtest result: ', f1_score(y_true=ytest, y_pred=pred))

Xtest result:  0.39462783983933725


In [34]:
# Encoded + imputed features, z-score clipped, min-max scaled, randomly under-sampled training set.
# Hyperparameters are pasted in as constants from a separate tuning run.
params = dict(
    iterations=100, verbose=0, od_type='IncToDec', od_wait=10, boosting_type='Plain',
    leaf_estimation_method='Newton', class_weights={0: 0.2, 1: 0.8},
    depth=16, learning_rate=0.03599976137131276, l2_leaf_reg=0.0, subsample=0.17932129647949957,
)

ctbcnu = catboost.CatBoostClassifier(**params)
# Use the labels returned by the same fit_resample call that produced
# Xtrain_clip_normal_under, rather than labels from a different resampling call.
ctbcnu.fit(Xtrain_clip_normal_under, ytrain_clip_normal_under,
           eval_set=[(Xtrain_clip_normal_under, ytrain_clip_normal_under), (Xvalid_clip_normal, yvalid)])
pred = ctbcnu.predict(Xtest_clip_normal)

print('Xtest result: ', f1_score(y_true=ytest, y_pred=pred))

Xtest result:  0.34441489361702127


In [35]:
# Encoded + imputed features, z-score clipped, min-max scaled, over-sampled training set,
# PCA components appended.
# Hyperparameters are pasted in as constants from a separate tuning run.
params = dict(
    iterations=100, verbose=0, od_type='IncToDec', od_wait=10, boosting_type='Plain',
    leaf_estimation_method='Newton', class_weights={0: 0.2, 1: 0.8},
    depth=11, learning_rate=0.136990569882088, l2_leaf_reg=2.0, subsample=0.4608627723920744,
)

ctbcnup = catboost.CatBoostClassifier(**params)
# Xtrain_clip_normal_over_pca is built from Xtrain_clip_normal_over, so pair it
# with the labels returned by that same resampling call.
ctbcnup.fit(Xtrain_clip_normal_over_pca, ytrain_clip_normal_over,
            eval_set=[(Xtrain_clip_normal_over_pca, ytrain_clip_normal_over), (Xvalid_clip_normal_pca, yvalid)])
pred = ctbcnup.predict(Xtest_clip_normal_pca)

print('Xtest result: ', f1_score(y_true=ytest, y_pred=pred))

Xtest result:  0.38942023851694413


<h3>Log Clip Normal Data</h3>

In [36]:
# Encoded + imputed features, log + z-score clipped and min-max scaled (no resampling).
# Hyperparameters are pasted in as constants from a separate tuning run.
params = dict(
    iterations=100, verbose=0, od_type='IncToDec', od_wait=10, boosting_type='Plain',
    leaf_estimation_method='Newton', class_weights={0: 0.2, 1: 0.8},
    depth=10, learning_rate=0.05436657304205132, l2_leaf_reg=5.0, subsample=0.309170412379573,
)

ctblcn = catboost.CatBoostClassifier(**params)
# eval_set carries the training data followed by the validation split.
ctblcn.fit(Xtrain_log_clip_normal, ytrain,
           eval_set=[(Xtrain_log_clip_normal, ytrain), (Xvalid_log_clip_normal, yvalid)])
pred = ctblcn.predict(Xtest_log_clip_normal)

print('Xtest result: ', f1_score(ytest, pred))

Xtest result:  0.3998838391171773


In [37]:
# Encoded + imputed features, log + z-score clipped, min-max scaled, SMOTE over-sampled training set.
# Hyperparameters are pasted in as constants from a separate tuning run.
params = dict(
    iterations=100, verbose=0, od_type='IncToDec', od_wait=10, boosting_type='Plain',
    leaf_estimation_method='Newton', class_weights={0: 0.2, 1: 0.8},
    depth=5, learning_rate=0.4976213484770711, l2_leaf_reg=2.5, subsample=0.8997800192288538,
)

ctblcno = catboost.CatBoostClassifier(**params)
# Use the labels returned by the same fit_resample call that produced
# Xtrain_log_clip_normal_over, rather than labels from a different resampling call.
ctblcno.fit(Xtrain_log_clip_normal_over, ytrain_log_clip_normal_over,
            eval_set=[(Xtrain_log_clip_normal_over, ytrain_log_clip_normal_over), (Xvalid_log_clip_normal, yvalid)])
pred = ctblcno.predict(Xtest_log_clip_normal)

print('Xtest result: ', f1_score(y_true=ytest, y_pred=pred))

Xtest result:  0.39481337857489096


In [38]:
# Encoded + imputed features, log + z-score clipped, min-max scaled, randomly under-sampled training set.
# Hyperparameters are pasted in as constants from a separate tuning run.
params = dict(
    iterations=100, verbose=0, od_type='IncToDec', od_wait=10, boosting_type='Plain',
    leaf_estimation_method='Newton', class_weights={0: 0.2, 1: 0.8},
    depth=15, learning_rate=0.022006525526220063, l2_leaf_reg=0.0, subsample=0.08605210137366717,
)

ctblcnu = catboost.CatBoostClassifier(**params)
# Use the labels returned by the same fit_resample call that produced
# Xtrain_log_clip_normal_under, rather than labels from a different resampling call.
ctblcnu.fit(Xtrain_log_clip_normal_under, ytrain_log_clip_normal_under,
            eval_set=[(Xtrain_log_clip_normal_under, ytrain_log_clip_normal_under), (Xvalid_log_clip_normal, yvalid)])
pred = ctblcnu.predict(Xtest_log_clip_normal)

print('Xtest result: ', f1_score(y_true=ytest, y_pred=pred))

Xtest result:  0.3457168184382303


In [39]:
# Encoded + imputed features, log + z-score clipped, min-max scaled, over-sampled training set,
# PCA components appended.
# Hyperparameters are pasted in as constants from a separate tuning run.
params = dict(
    iterations=100, verbose=0, od_type='IncToDec', od_wait=10, boosting_type='Plain',
    leaf_estimation_method='Newton', class_weights={0: 0.2, 1: 0.8},
    depth=5, learning_rate=0.3203655334634296, l2_leaf_reg=4.0, subsample=0.998959960527965,
)

ctblcnup = catboost.CatBoostClassifier(**params)
# Xtrain_log_clip_normal_over_pca is built from Xtrain_log_clip_normal_over, so
# pair it with the labels returned by that same resampling call.
ctblcnup.fit(Xtrain_log_clip_normal_over_pca, ytrain_log_clip_normal_over,
             eval_set=[(Xtrain_log_clip_normal_over_pca, ytrain_log_clip_normal_over), (Xvalid_log_clip_normal_pca, yvalid)])
pred = ctblcnup.predict(Xtest_log_clip_normal_pca)

print('Xtest result: ', f1_score(y_true=ytest, y_pred=pred))

Xtest result:  0.40101707228478023
