In [31]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split, StratifiedKFold, RandomizedSearchCV
from sklearn.preprocessing import MinMaxScaler
from imblearn.over_sampling import SMOTE, RandomOverSampler
from imblearn.under_sampling import RandomUnderSampler, NearMiss, TomekLinks
from sklearn.metrics import f1_score
from sklearn.naive_bayes import GaussianNB
from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.decomposition import PCA
from sklearn.preprocessing import LabelEncoder


import warnings
warnings.filterwarnings('ignore')

In [32]:
train_loan = pd.read_csv('credit risk train.csv')

In [38]:
X = train_loan.drop(columns='bad_loans')
y = train_loan.bad_loans

In [40]:
# pymnt_plan has only one value
X.drop(columns='pymnt_plan', inplace=True)

In [41]:
# drop columns with collinearity
X.drop(columns=['funded_amnt', 'delinq_2yrs_zero', 'pub_rec'], inplace=True)

In [42]:
le_grade = LabelEncoder()
X.grade = le_grade.fit_transform(X.grade)
# X.head()

In [43]:
X = pd.get_dummies(X, columns=['home_ownership', 'purpose'], drop_first=True, dtype=int)
# X.head()

In [44]:
Xtrain, Xtest, ytrain, ytest = train_test_split(X, y, stratify=y, test_size=0.20)

<h1>Handling Missing Value</h1>

In [45]:
Xtrain.payment_inc_ratio.fillna(0, inplace=True)
Xtrain.fillna(-1, inplace=True)

In [46]:
Xtest.payment_inc_ratio.fillna(0, inplace=True)
Xtest.fillna(-1, inplace=True)

<h1>Handling Outliers</h1>

In [47]:
# Log Transform
Xtrain_log = Xtrain.copy()
cols = ['payment_inc_ratio', 'open_acc']
for c in cols:
    Xtrain_log[c] = np.log(Xtrain_log[c] + 1.1)

Xtest_log = Xtest.copy()
cols = ['payment_inc_ratio', 'open_acc']
for c in cols:
    Xtest_log[c] = np.log(Xtest_log[c] + 1.1)

In [48]:
# z-score outlier detection
Xtrain_clip = Xtrain.copy()
columns = ['loan_amnt', 'grade', 'sub_grade_num', 'emp_length_num', 'dti', 'delinq_2yrs',
           'inq_last_6mths', 'revol_util', 'payment_inc_ratio', 'open_acc']

for c in columns:
    if (Xtrain_clip[c].dtypes == int) | (Xtrain_clip[c].dtypes == float):
        L = Xtrain[c].mean() - 3*Xtrain[c].std()
        U = Xtrain[c].mean() + 3*Xtrain[c].std()
        Xtrain_clip[c] = Xtrain_clip[c].clip(L,U)

Xtest_clip = Xtest.copy()
columns = ['loan_amnt', 'grade', 'sub_grade_num', 'emp_length_num', 'dti', 'delinq_2yrs',
           'inq_last_6mths', 'revol_util', 'payment_inc_ratio', 'open_acc']

for c in columns:
    if (Xtest_clip[c].dtypes == int) | (Xtest_clip[c].dtypes == float):
        L = Xtrain[c].mean() - 3*Xtrain[c].std()
        U = Xtrain[c].mean() + 3*Xtrain[c].std()
        Xtest_clip[c] =Xtest_clip[c].clip(L,U)

In [49]:
# Log and z-score
Xtrain_log_clip = Xtrain_clip.drop(columns=['payment_inc_ratio', 'open_acc'])
Xtrain_log_clip = pd.concat([Xtrain_log_clip, Xtrain_log.loc[:,['payment_inc_ratio', 'open_acc']]], axis=1)

Xtest_log_clip = Xtest_clip.drop(columns=['payment_inc_ratio', 'open_acc'])
Xtest_log_clip = pd.concat([Xtest_log_clip, Xtest_log.loc[:,['payment_inc_ratio', 'open_acc']]], axis=1)

<h1>Normalization: MinMaxScaler</h1>

In [50]:
# Normalization
mmscaler = MinMaxScaler()

Xtrain_normal = mmscaler.fit_transform(Xtrain)
Xtrain_normal = pd.DataFrame(Xtrain_normal, columns=Xtrain.columns, index=Xtrain.index)

Xtest_normal = mmscaler.transform(Xtest)
Xtest_normal = pd.DataFrame(Xtest_normal, columns=Xtest.columns, index=Xtest.index)

#############################
Xtrain_log_normal = mmscaler.fit_transform(Xtrain_log)
Xtrain_log_normal = pd.DataFrame(Xtrain_log_normal, columns=Xtrain_log.columns, index=Xtrain_log.index)

Xtest_log_normal = mmscaler.transform(Xtest_log)
Xtest_log_normal = pd.DataFrame(Xtest_log_normal, columns=Xtest_log.columns, index=Xtest_log.index)

#############################
Xtrain_clip_normal = mmscaler.fit_transform(Xtrain_clip)
Xtrain_clip_normal = pd.DataFrame(Xtrain_clip_normal, columns=Xtrain_clip.columns, index=Xtrain_clip.index)

Xtest_clip_normal = mmscaler.transform(Xtest_clip)
Xtest_clip_normal = pd.DataFrame(Xtest_clip_normal, columns=Xtest_clip.columns, index=Xtest_clip.index)

#############################
Xtrain_log_clip_normal = mmscaler.fit_transform(Xtrain_log_clip)
Xtrain_log_clip_normal = pd.DataFrame(Xtrain_log_clip_normal, columns=Xtrain_log_clip.columns, index=Xtrain_log_clip.index)

Xtest_log_clip_normal = mmscaler.transform(Xtest_log_clip)
Xtest_log_clip_normal = pd.DataFrame(Xtest_log_clip_normal, columns=Xtest_log_clip.columns, index=Xtest_log_clip.index)


<h1>Over Sampling</h1>

In [51]:
# Over-Sampling
over_sampler = SMOTE()

Xtrain_over, ytrain_over = over_sampler.fit_resample(Xtrain, ytrain)
Xtrain_normal_over, ytrain_normal_over = over_sampler.fit_resample(Xtrain_normal, ytrain)
Xtrain_log_normal_over, ytrain_log_normal_over = over_sampler.fit_resample(Xtrain_log_normal, ytrain)
Xtrain_clip_normal_over, ytrain_clip_normal_over = over_sampler.fit_resample(Xtrain_clip_normal, ytrain)
Xtrain_log_clip_normal_over, ytrain_log_clip_normal_over = over_sampler.fit_resample(Xtrain_log_clip_normal, ytrain)


<h1>Under Sampling</h1>

In [52]:
# Under-Sampling
under_sampler = RandomUnderSampler()

Xtrain_under, ytrain_under = under_sampler.fit_resample(Xtrain, ytrain)
Xtrain_normal_under, ytrain_normal_under = under_sampler.fit_resample(Xtrain_normal, ytrain)
Xtrain_log_normal_under, ytrain_log_normal_under = under_sampler.fit_resample(Xtrain_log_normal, ytrain)
Xtrain_clip_normal_under, ytrain_clip_normal_under = under_sampler.fit_resample(Xtrain_clip_normal, ytrain)
Xtrain_log_clip_normal_under, ytrain_log_clip_normal_under = under_sampler.fit_resample(Xtrain_log_clip_normal, ytrain)


<h1>PCA</h1>

In [54]:
# PCA with Over Sampled Data
from sklearn.decomposition import PCA
pca = PCA(n_components=0.99)

Xtrain_pca = pd.DataFrame(pca.fit_transform(Xtrain_over), index=Xtrain_over.index)
Xtrain_over_pca = pd.concat([Xtrain_over, Xtrain_pca], axis=1)
Xtrain_over_pca.columns=Xtrain_over_pca.columns.astype(str)

Xtest_pca = pd.DataFrame(pca.transform(Xtest), index=Xtest.index)
Xtest_pca = pd.concat([Xtest, Xtest_pca], axis=1)
Xtest_pca.columns = Xtest_pca.columns.astype(str)

###########################################
Xtrain_normal_pca = pd.DataFrame(pca.fit_transform(Xtrain_normal_over), index=Xtrain_normal_over.index)
Xtrain_normal_over_pca = pd.concat([Xtrain_normal_over, Xtrain_normal_pca], axis=1)
Xtrain_normal_over_pca.columns=Xtrain_normal_over_pca.columns.astype(str)

Xtest_normal_pca = pd.DataFrame(pca.transform(Xtest_normal), index=Xtest_normal.index)
Xtest_normal_pca = pd.concat([Xtest_normal, Xtest_normal_pca], axis=1)
Xtest_normal_pca.columns = Xtest_normal_pca.columns.astype(str)

###########################################
Xtrain_log_normal_pca = pd.DataFrame(pca.fit_transform(Xtrain_log_normal_over), index=Xtrain_log_normal_over.index)
Xtrain_log_normal_over_pca = pd.concat([Xtrain_log_normal_over, Xtrain_log_normal_pca], axis=1)
Xtrain_log_normal_over_pca.columns=Xtrain_log_normal_over_pca.columns.astype(str)

Xtest_log_normal_pca = pd.DataFrame(pca.transform(Xtest_log_normal), index=Xtest_log_normal.index)
Xtest_log_normal_pca = pd.concat([Xtest_log_normal, Xtest_log_normal_pca], axis=1)
Xtest_log_normal_pca.columns = Xtest_log_normal_pca.columns.astype(str)

###########################################
Xtrain_clip_normal_pca = pd.DataFrame(pca.fit_transform(Xtrain_clip_normal_over), index=Xtrain_clip_normal_over.index)
Xtrain_clip_normal_over_pca = pd.concat([Xtrain_clip_normal_over, Xtrain_clip_normal_pca], axis=1)
Xtrain_clip_normal_over_pca.columns=Xtrain_clip_normal_over_pca.columns.astype(str)

Xtest_clip_normal_pca = pd.DataFrame(pca.transform(Xtest_clip_normal), index=Xtest_clip_normal.index)
Xtest_clip_normal_pca = pd.concat([Xtest_clip_normal, Xtest_clip_normal_pca], axis=1)
Xtest_clip_normal_pca.columns=Xtest_clip_normal_pca.columns.astype(str)

###########################################
Xtrain_log_clip_normal_pca = pd.DataFrame(pca.fit_transform(Xtrain_log_clip_normal_over), index=Xtrain_log_clip_normal_over.index)
Xtrain_log_clip_normal_over_pca = pd.concat([Xtrain_log_clip_normal_over, Xtrain_log_clip_normal_pca], axis=1)
Xtrain_log_clip_normal_over_pca.columns=Xtrain_log_clip_normal_over_pca.columns.astype(str)

Xtest_log_clip_normal_pca = pd.DataFrame(pca.transform(Xtest_log_clip_normal), index=Xtest_log_clip_normal.index)
Xtest_log_clip_normal_pca = pd.concat([Xtest_log_clip_normal, Xtest_log_clip_normal_pca], axis=1)
Xtest_log_clip_normal_pca.columns=Xtest_log_clip_normal_pca.columns.astype(str)


In [53]:
# PCA with Under Sampled Data
from sklearn.decomposition import PCA
pca = PCA(n_components=0.99)

Xtrain_pca = pd.DataFrame(pca.fit_transform(Xtrain_under), index=Xtrain_under.index)
Xtrain_under_pca = pd.concat([Xtrain_under, Xtrain_pca], axis=1)
Xtrain_under_pca.columns=Xtrain_under_pca.columns.astype(str)

Xtest_pca = pd.DataFrame(pca.transform(Xtest), index=Xtest.index)
Xtest_pca = pd.concat([Xtest, Xtest_pca], axis=1)
Xtest_pca.columns = Xtest_pca.columns.astype(str)

###########################################
Xtrain_normal_pca = pd.DataFrame(pca.fit_transform(Xtrain_normal_under), index=Xtrain_normal_under.index)
Xtrain_normal_under_pca = pd.concat([Xtrain_normal_under, Xtrain_normal_pca], axis=1)
Xtrain_normal_under_pca.columns=Xtrain_normal_under_pca.columns.astype(str)

Xtest_normal_pca = pd.DataFrame(pca.transform(Xtest_normal), index=Xtest_normal.index)
Xtest_normal_pca = pd.concat([Xtest_normal, Xtest_normal_pca], axis=1)
Xtest_normal_pca.columns = Xtest_normal_pca.columns.astype(str)

###########################################
Xtrain_log_normal_pca = pd.DataFrame(pca.fit_transform(Xtrain_log_normal_under), index=Xtrain_log_normal_under.index)
Xtrain_log_normal_under_pca = pd.concat([Xtrain_log_normal_under, Xtrain_log_normal_pca], axis=1)
Xtrain_log_normal_under_pca.columns=Xtrain_log_normal_under_pca.columns.astype(str)

Xtest_log_normal_pca = pd.DataFrame(pca.transform(Xtest_log_normal), index=Xtest_log_normal.index)
Xtest_log_normal_pca = pd.concat([Xtest_log_normal, Xtest_log_normal_pca], axis=1)
Xtest_log_normal_pca.columns = Xtest_log_normal_pca.columns.astype(str)

###########################################
Xtrain_clip_normal_pca = pd.DataFrame(pca.fit_transform(Xtrain_clip_normal_under), index=Xtrain_clip_normal_under.index)
Xtrain_clip_normal_under_pca = pd.concat([Xtrain_clip_normal_under, Xtrain_clip_normal_pca], axis=1)
Xtrain_clip_normal_under_pca.columns=Xtrain_clip_normal_under_pca.columns.astype(str)

Xtest_clip_normal_pca = pd.DataFrame(pca.transform(Xtest_clip_normal), index=Xtest_clip_normal.index)
Xtest_clip_normal_pca = pd.concat([Xtest_clip_normal, Xtest_clip_normal_pca], axis=1)
Xtest_clip_normal_pca.columns=Xtest_clip_normal_pca.columns.astype(str)

###########################################
Xtrain_log_clip_normal_pca = pd.DataFrame(pca.fit_transform(Xtrain_log_clip_normal_under), index=Xtrain_log_clip_normal_under.index)
Xtrain_log_clip_normal_under_pca = pd.concat([Xtrain_log_clip_normal_under, Xtrain_log_clip_normal_pca], axis=1)
Xtrain_log_clip_normal_under_pca.columns=Xtrain_log_clip_normal_under_pca.columns.astype(str)

Xtest_log_clip_normal_pca = pd.DataFrame(pca.transform(Xtest_log_clip_normal), index=Xtest_log_clip_normal.index)
Xtest_log_clip_normal_pca = pd.concat([Xtest_log_clip_normal, Xtest_log_clip_normal_pca], axis=1)
Xtest_log_clip_normal_pca.columns=Xtest_log_clip_normal_pca.columns.astype(str)

<h1>Stratified K-Fold</h1>

In [55]:
skf = StratifiedKFold(n_splits=5, shuffle=True)

<h1>Gaussian Naive Bayes</h1>

<h4 dir='rtl'>بدلیل نداشتن هایپرپارامتر در این الگوریتم، از RandomizedSearchCV برای اعمال Cross Validation استفاده شده است.</h4>

<h3>Train Data</h3>

In [25]:
gnb = GaussianNB()

In [27]:
# Xtrain
space = {}
gnbrs = RandomizedSearchCV(estimator=gnb, param_distributions=space, n_iter=100, scoring='f1', cv=skf, n_jobs=5)
gnbrs.fit(Xtrain, ytrain)

gnb_pred = gnbrs.best_estimator_.predict(Xtest)
f1_score(y_true=ytest, y_pred=gnb_pred)

0.2500548365869708

In [28]:
# Over-Samlping
space = {}
gnbrso = RandomizedSearchCV(estimator=gnb, param_distributions=space, n_iter=100, scoring='f1', cv=skf, n_jobs=5)
gnbrso.fit(Xtrain_over, ytrain_over)

gnbo_pred = gnbrso.best_estimator_.predict(Xtest)
f1_score(y_true=ytest, y_pred=gnbo_pred)

0.33981430045234506

In [29]:
# Under-Sampling
space = {}
gnbrsu = RandomizedSearchCV(estimator=gnb, param_distributions=space, n_iter=100, scoring='f1', cv=skf, n_jobs=5)
gnbrsu.fit(Xtrain_under, ytrain_under)

gnbu_pred = gnbrsu.best_estimator_.predict(Xtest)
f1_score(y_true=ytest, y_pred=gnbu_pred)

0.3917061077304485

In [107]:
# Over PCA
space = {}
gnbrsop = RandomizedSearchCV(estimator=gnb, param_distributions=space, n_iter=200, scoring='f1', cv=skf, n_jobs=5)
gnbrsop.fit(Xtrain_over_pca, ytrain_over)

gnbop_pred = gnbrsop.best_estimator_.predict(Xtest_pca)
f1_score(y_true=ytest, y_pred=gnbop_pred)

0.34031582339671285

In [31]:
# Under PCA
space = {}
gnbrsup = RandomizedSearchCV(estimator=gnb, param_distributions=space, n_iter=200, scoring='f1', cv=skf, n_jobs=5)
gnbrsup.fit(Xtrain_under_pca, ytrain_under)

gnbup_pred = gnbrsup.best_estimator_.predict(Xtest_pca)
f1_score(y_true=ytest, y_pred=gnbup_pred)

0.3857810185712658

<h3>Train Normal Data</h3>

In [34]:
gnb = GaussianNB()

In [32]:
# Xtrain Normal
space = {}
gnbrsn = RandomizedSearchCV(estimator=gnb, param_distributions=space, n_iter=100, scoring='f1', cv=skf, n_jobs=5)
gnbrsn.fit(Xtrain_normal, ytrain)

gnbn_pred = gnbrsn.best_estimator_.predict(Xtest_normal)
f1_score(y_true=ytest, y_pred=gnbn_pred)

0.2571025589361273

In [33]:
# Normal Over Samlping
space = {}
gnbrsno = RandomizedSearchCV(estimator=gnb, param_distributions=space, n_iter=100, scoring='f1', cv=skf, n_jobs=5)
gnbrsno.fit(Xtrain_normal_over, ytrain_normal_over)

gnbno_pred = gnbrsno.best_estimator_.predict(Xtest_normal)
f1_score(y_true=ytest, y_pred=gnbno_pred)

0.3820335636722606

In [34]:
# Normal Under Sampling
space = {}
gnbrsnu = RandomizedSearchCV(estimator=gnb, param_distributions=space, n_iter=100, scoring='f1', cv=skf, n_jobs=5)
gnbrsnu.fit(Xtrain_normal_under, ytrain_normal_under)

gnbnu_pred = gnbrsnu.best_estimator_.predict(Xtest_normal)
f1_score(y_true=ytest, y_pred=gnbnu_pred)

0.3690137359237718

In [108]:
# Over PCA
space = {}
gnbrsnop = RandomizedSearchCV(estimator=gnb, param_distributions=space, n_iter=200, scoring='f1', cv=skf, n_jobs=5)
gnbrsnop.fit(Xtrain_normal_over_pca, ytrain_normal_over)

gnbnop_pred = gnbrsnop.best_estimator_.predict(Xtest_normal_pca)
f1_score(y_true=ytest, y_pred=gnbnop_pred)

0.38572884811416924

In [36]:
# Under PCA
space = {}
gnbrsnup = RandomizedSearchCV(estimator=gnb, param_distributions=space, n_iter=200, scoring='f1', cv=skf, n_jobs=5)
gnbrsnup.fit(Xtrain_normal_under_pca, ytrain_normal_under)

gnbnup_pred = gnbrsnup.best_estimator_.predict(Xtest_normal_pca)
f1_score(y_true=ytest, y_pred=gnbnup_pred)

0.3810923340680872

<h3>Train Log Normal Data</h3>

In [37]:
gnb = GaussianNB()

In [38]:
# Xtrain Log Normal
space = {}
gnbrsln = RandomizedSearchCV(estimator=gnb, param_distributions=space, n_iter=100, scoring='f1', cv=skf, n_jobs=5)
gnbrsln.fit(Xtrain_log_normal, ytrain)

gnbln_pred = gnbrsln.best_estimator_.predict(Xtest_log_normal)
f1_score(y_true=ytest, y_pred=gnbln_pred)

0.24959016393442623

In [39]:
# Log Normal Over Samlping
space = {}
gnbrslno = RandomizedSearchCV(estimator=gnb, param_distributions=space, n_iter=100, scoring='f1', cv=skf, n_jobs=5)
gnbrslno.fit(Xtrain_log_normal_over, ytrain_log_normal_over)

gnblno_pred = gnbrslno.best_estimator_.predict(Xtest_log_normal)
f1_score(y_true=ytest, y_pred=gnblno_pred)

0.37832588852204313

In [40]:
# Log Normal Under Sampling
space = {}
gnbrslnu = RandomizedSearchCV(estimator=gnb, param_distributions=space, n_iter=100, scoring='f1', cv=skf, n_jobs=5)
gnbrslnu.fit(Xtrain_log_normal_under, ytrain_log_normal_under)

gnblnu_pred = gnbrslnu.best_estimator_.predict(Xtest_log_normal)
f1_score(y_true=ytest, y_pred=gnblnu_pred)

0.37482646922720964

In [109]:
# Over PCA
space = {}
gnbrslnop = RandomizedSearchCV(estimator=gnb, param_distributions=space, n_iter=100, scoring='f1', cv=skf, n_jobs=5)
gnbrslnop.fit(Xtrain_log_normal_over_pca, ytrain_log_normal_over)

gnblnop_pred = gnbrslnop.best_estimator_.predict(Xtest_log_normal_pca)
f1_score(y_true=ytest, y_pred=gnblnop_pred)

0.3826086956521739

In [41]:
# Under PCA
space = {}
gnbrslnup = RandomizedSearchCV(estimator=gnb, param_distributions=space, n_iter=100, scoring='f1', cv=skf, n_jobs=5)
gnbrslnup.fit(Xtrain_log_normal_under_pca, ytrain_log_normal_under)

gnblnup_pred = gnbrslnup.best_estimator_.predict(Xtest_log_normal_pca)
f1_score(y_true=ytest, y_pred=gnblnup_pred)

0.3851381966470322

<h3>Train Clip Normal Data</h3>

In [42]:
gnb = GaussianNB()

In [43]:
# Xtrain Clip Normal
space = {}
gnbrscn = RandomizedSearchCV(estimator=gnb, param_distributions=space, n_iter=100, scoring='f1', cv=skf, n_jobs=5)
gnbrscn.fit(Xtrain_clip_normal, ytrain)

gnbcn_pred = gnbrscn.best_estimator_.predict(Xtest_clip_normal)
f1_score(y_true=ytest, y_pred=gnbcn_pred)

0.2621105527638191

In [44]:
# Clip Normal Over Samlping
space = {}
gnbrscno = RandomizedSearchCV(estimator=gnb, param_distributions=space, n_iter=100, scoring='f1', cv=skf, n_jobs=5)
gnbrscno.fit(Xtrain_clip_normal_over, ytrain_clip_normal_over)

gnbcno_pred = gnbrscno.best_estimator_.predict(Xtest_clip_normal)
f1_score(y_true=ytest, y_pred=gnbcno_pred)

0.3828197945845005

In [45]:
# Clip Normal Under Sampling
space = {}
gnbrscnu = RandomizedSearchCV(estimator=gnb, param_distributions=space, n_iter=100, scoring='f1', cv=skf, n_jobs=5)
gnbrscnu.fit(Xtrain_clip_normal_under, ytrain_clip_normal_under)

gnbcnu_pred = gnbrscnu.best_estimator_.predict(Xtest_clip_normal)
f1_score(y_true=ytest, y_pred=gnbcnu_pred)

0.36278342455043

In [110]:
# Over PCA
space = {}
gnbrscnop = RandomizedSearchCV(estimator=gnb, param_distributions=space, n_iter=100, scoring='f1', cv=skf, n_jobs=5)
gnbrscnop.fit(Xtrain_clip_normal_over_pca, ytrain_clip_normal_over)

gnbcnop_pred = gnbrscnop.best_estimator_.predict(Xtest_clip_normal_pca)
f1_score(y_true=ytest, y_pred=gnbcnop_pred)

0.3840744570837642

In [46]:
# Under PCA
space = {}
gnbrscnup = RandomizedSearchCV(estimator=gnb, param_distributions=space, n_iter=100, scoring='f1', cv=skf, n_jobs=5)
gnbrscnup.fit(Xtrain_clip_normal_under_pca, ytrain_clip_normal_under)

gnbcnup_pred = gnbrscnup.best_estimator_.predict(Xtest_clip_normal_pca)
f1_score(y_true=ytest, y_pred=gnbcnup_pred)

0.37731653076352856

<h3>Train Log Clip Normal Data</h3>

In [47]:
gnb = GaussianNB()

In [48]:
# Xtrain Clip Normal
space = {}
gnbrslcn = RandomizedSearchCV(estimator=gnb, param_distributions=space, n_iter=100, scoring='f1', cv=skf, n_jobs=5)
gnbrslcn.fit(Xtrain_log_clip_normal, ytrain)

gnblcn_pred = gnbrslcn.best_estimator_.predict(Xtest_log_clip_normal)
f1_score(y_true=ytest, y_pred=gnblcn_pred)

0.25467241733415485

In [49]:
# Clip Normal Over Samlping
space = {}
gnbrslcno = RandomizedSearchCV(estimator=gnb, param_distributions=space, n_iter=100, scoring='f1', cv=skf, n_jobs=5)
gnbrslcno.fit(Xtrain_log_clip_normal_over, ytrain_log_clip_normal_over)

gnblcno_pred = gnbrslcno.best_estimator_.predict(Xtest_log_clip_normal)
f1_score(y_true=ytest, y_pred=gnblcno_pred)

0.3787200933670492

In [50]:
# Clip Normal Under Sampling
space = {}
gnbrslcnu = RandomizedSearchCV(estimator=gnb, param_distributions=space, n_iter=100, scoring='f1', cv=skf, n_jobs=5)
gnbrslcnu.fit(Xtrain_log_clip_normal_under, ytrain_log_clip_normal_under)

gnblcnu_pred = gnbrslcnu.best_estimator_.predict(Xtest_log_clip_normal)
f1_score(y_true=ytest, y_pred=gnblcnu_pred)

0.37777529844917995

In [111]:
# Over PCA
space = {}
gnbrslcnop = RandomizedSearchCV(estimator=gnb, param_distributions=space, n_iter=100, scoring='f1', cv=skf, n_jobs=5)
gnbrslcnop.fit(Xtrain_log_clip_normal_over_pca, ytrain_log_clip_normal_over)

gnblcnop_pred = gnbrslcnop.best_estimator_.predict(Xtest_log_clip_normal_pca)
f1_score(y_true=ytest, y_pred=gnblcnop_pred)

0.38261552045521435

In [51]:
# Under PCA
space = {}
gnbrslcnup = RandomizedSearchCV(estimator=gnb, param_distributions=space, n_iter=100, scoring='f1', cv=skf, n_jobs=5)
gnbrslcnup.fit(Xtrain_log_clip_normal_under_pca, ytrain_log_clip_normal_under)

gnblcnup_pred = gnbrslcnup.best_estimator_.predict(Xtest_log_clip_normal_pca)
f1_score(y_true=ytest, y_pred=gnblcnup_pred)

0.38564329099944783

<h1>KNN Classifier</h1>

<h4 dir='rtl'>پارامترهای بهینه تمام مدل ها با استفاده از RandomizedSearchCV بدست آمده و با همان پارامترهای بهینه مدلسازی نهایی انجام شده است.</h4>

<h3>Train Data</h3>

In [52]:
knn = KNeighborsClassifier()

In [53]:
# Xtrain
# space = {'n_neighbors' : range(1, 100), 'weights' : ['uniform','distance']}
space = {'n_neighbors' : [1], 'weights' : ['distance']}
knnrs = RandomizedSearchCV(estimator=knn, param_distributions=space, n_iter=100, scoring='f1', cv=skf, n_jobs=5)
knnrs.fit(Xtrain, ytrain)
# print(knnrs.best_score_)
# print(knnrs.best_params_)

knn_pred = knnrs.best_estimator_.predict(Xtest)
f1_score(y_true=ytest, y_pred=knn_pred)

0.22076171216717222

In [54]:
# Over Sampling
# space = {'n_neighbors' : range(1, 100), 'weights' : ['uniform','distance']}
space = {'n_neighbors' : [1], 'weights' : ['uniform']}
knnrso = RandomizedSearchCV(estimator=knn, param_distributions=space, n_iter=100, scoring='f1', cv=skf, n_jobs=5)
knnrso.fit(Xtrain_over, ytrain_over)
# print(knnrso.best_score_)
# print(knnrso.best_params_)

knno_pred = knnrso.best_estimator_.predict(Xtest)
f1_score(y_true=ytest, y_pred=knno_pred)

0.2580560305142707

In [55]:
# Under Sampling
# space = {'n_neighbors' : range(1, 100), 'weights' : ['uniform','distance']}
space = {'n_neighbors' : [27], 'weights' : ['uniform']}
knnrsu = RandomizedSearchCV(estimator=knn, param_distributions=space, n_iter=100, scoring='f1', cv=skf, n_jobs=5)
knnrsu.fit(Xtrain_under, ytrain_under)
# print(knnrsu.best_score_)
# print(knnrsu.best_params_)

knnu_pred = knnrsu.best_estimator_.predict(Xtest)
f1_score(y_true=ytest, y_pred=knnu_pred)

0.3267550319096711

In [112]:
# Over PCA
# space = {'n_neighbors' : range(1, 100), 'weights' : ['uniform','distance']}
space = {'n_neighbors' : [1], 'weights' : ['uniform']}
knnrsop = RandomizedSearchCV(estimator=knn, param_distributions=space, n_iter=100, scoring='f1', cv=skf, n_jobs=5)
knnrsop.fit(Xtrain_over_pca, ytrain_over)
# print(knnrsop.best_score_)
# print(knnrsop.best_params_)

knnop_pred = knnrsop.best_estimator_.predict(Xtest_pca)
f1_score(y_true=ytest, y_pred=knnop_pred)

0.25643055005935894

In [58]:
# Under PCA
# space = {'n_neighbors' : range(1, 100), 'weights' : ['uniform','distance']}
space = {'n_neighbors' : [1], 'weights' : ['uniform']}
knnrsup = RandomizedSearchCV(estimator=knn, param_distributions=space, n_iter=100, scoring='f1', cv=skf, n_jobs=5)
knnrsup.fit(Xtrain_under_pca, ytrain_under)
# print(knnrsup.best_score_)
# print(knnrsup.best_params_)

knnup_pred = knnrsup.best_estimator_.predict(Xtest_pca)
f1_score(y_true=ytest, y_pred=knnup_pred)

0.2979394839428253

<h3>Train Normal Data</h3>

In [59]:
knn = KNeighborsClassifier()

In [60]:
# Xtrain Normal
# space = {'n_neighbors' : range(1, 100), 'weights' : ['uniform','distance']}
space = {'n_neighbors' : [1], 'weights' : ['uniform']}
knnrsn = RandomizedSearchCV(estimator=knn, param_distributions=space, n_iter=100, scoring='f1', cv=skf, n_jobs=5)
knnrsn.fit(Xtrain_normal, ytrain)
# print(knnrsn.best_score_)
# print(knnrsn.best_params_)

knnn_pred = knnrsn.best_estimator_.predict(Xtest_normal)
f1_score(y_true=ytest, y_pred=knnn_pred)

0.2491716368455931

In [61]:
# Normal Over Sampling
# space = {'n_neighbors' : range(1, 100), 'weights' : ['uniform','distance']}
space = {'n_neighbors' : [1], 'weights' : ['uniform']}
knnrsno = RandomizedSearchCV(estimator=knn, param_distributions=space, n_iter=100, scoring='f1', cv=skf, n_jobs=5)
knnrsno.fit(Xtrain_normal_over, ytrain_normal_over)
# print(knnrsno.best_score_)
# print(knnrsno.best_params_)

knnno_pred = knnrsno.best_estimator_.predict(Xtest_normal)
f1_score(y_true=ytest, y_pred=knnno_pred)

0.2831688596491228

In [62]:
# Normal Under Sampling
# space = {'n_neighbors' : range(1, 100), 'weights' : ['uniform','distance']}
space = {'n_neighbors' : [60], 'weights' : ['distance']}
knnrsnu = RandomizedSearchCV(estimator=knn, param_distributions=space, n_iter=100, scoring='f1', cv=skf, n_jobs=5)
knnrsnu.fit(Xtrain_normal_under, ytrain_normal_under)
# print(knnrsnu.best_score_)
# print(knnrsnu.best_params_)

knnnu_pred = knnrsnu.best_estimator_.predict(Xtest_normal)
f1_score(y_true=ytest, y_pred=knnnu_pred)

0.37330892076085853

In [113]:
# Over PCA
# space = {'n_neighbors' : range(1, 100), 'weights' : ['uniform','distance']}
space = {'n_neighbors' : [1], 'weights' : ['uniform']}
knnrsnop = RandomizedSearchCV(estimator=knn, param_distributions=space, n_iter=100, scoring='f1', cv=skf, n_jobs=5)
knnrsnop.fit(Xtrain_normal_over_pca, ytrain_normal_over)
# print(knnrsnop.best_score_)
# print(knnrsnop.best_params_)

knnnop_pred = knnrsnop.best_estimator_.predict(Xtest_normal_pca)
f1_score(y_true=ytest, y_pred=knnnop_pred)

0.2854387656702025

In [64]:
# Under PCA
# space = {'n_neighbors' : range(1, 100), 'weights' : ['uniform','distance']}
space = {'n_neighbors' : [1], 'weights' : ['uniform']}
knnrsnup = RandomizedSearchCV(estimator=knn, param_distributions=space, n_iter=100, scoring='f1', cv=skf, n_jobs=5)
knnrsnup.fit(Xtrain_normal_under_pca, ytrain_normal_under)
# print(knnrsnup.best_score_)
# print(knnrsnup.best_params_)

knnnup_pred = knnrsnup.best_estimator_.predict(Xtest_normal_pca)
f1_score(y_true=ytest, y_pred=knnnup_pred)

0.3183873398643557

<h3>Train Log Normal Data</h3>

In [28]:
knn = KNeighborsClassifier()

In [66]:
# Xtrain Log Normal
# space = {'n_neighbors' : range(1, 100), 'weights' : ['uniform','distance']}
space = {'n_neighbors' : [1], 'weights' : ['uniform']}
knnrsln = RandomizedSearchCV(estimator=knn, param_distributions=space, n_iter=100, scoring='f1', cv=skf, n_jobs=5)
knnrsln.fit(Xtrain_log_normal, ytrain)
# print(knnrsln.best_score_)
# print(knnrsln.best_params_)

knnln_pred = knnrsln.best_estimator_.predict(Xtest_log_normal)
f1_score(y_true=ytest, y_pred=knnln_pred)

0.25164473684210525

In [67]:
# Log Normal Over Sampling
# space = {'n_neighbors' : range(1, 100), 'weights' : ['uniform','distance']}
space = {'n_neighbors' : [1], 'weights' : ['uniform']}
knnrslno = RandomizedSearchCV(estimator=knn, param_distributions=space, n_iter=100, scoring='f1', cv=skf, n_jobs=5)
knnrslno.fit(Xtrain_log_normal_over, ytrain_log_normal_over)
# print(knnrslno.best_score_)
# print(knnrslno.best_params_)

knnlno_pred = knnrslno.best_estimator_.predict(Xtest_log_normal)
f1_score(y_true=ytest, y_pred=knnlno_pred)

0.2815639424382297

In [56]:
# Log Normal Under Sampling
# space = {'n_neighbors' : range(1, 100), 'weights' : ['uniform','distance']}
space = {'n_neighbors' : [46], 'weights' : ['distance']}
knnrslnu = RandomizedSearchCV(estimator=knn, param_distributions=space, n_iter=100, scoring='f1', cv=skf, n_jobs=5)
knnrslnu.fit(Xtrain_log_normal_under, ytrain_log_normal_under)
# print(knnrslnu.best_score_)
# print(knnrslnu.best_params_)

knnlnu_pred = knnrslnu.best_estimator_.predict(Xtest_log_normal)
f1_score(y_true=ytest, y_pred=knnlnu_pred)

0.37111479486116866

In [114]:
# Over PCA
# space = {'n_neighbors' : range(1, 100), 'weights' : ['uniform','distance']}
space = {'n_neighbors' : [1], 'weights' : ['uniform']}
knnrslnop = RandomizedSearchCV(estimator=knn, param_distributions=space, n_iter=100, scoring='f1', cv=skf, n_jobs=5)
knnrslnop.fit(Xtrain_log_normal_over_pca, ytrain_log_normal_over)
# print(knnrslnop.best_score_)
# print(knnrslnop.best_params_)

knnlnop_pred = knnrslnop.best_estimator_.predict(Xtest_log_normal_pca)
f1_score(y_true=ytest, y_pred=knnlnop_pred)

0.2813522355507088

In [69]:
# Under PCA
# space = {'n_neighbors' : range(1, 100), 'weights' : ['uniform','distance']}
space = {'n_neighbors' : [1], 'weights' : ['uniform']}
knnrslnup = RandomizedSearchCV(estimator=knn, param_distributions=space, n_iter=100, scoring='f1', cv=skf, n_jobs=5)
knnrslnup.fit(Xtrain_log_normal_under_pca, ytrain_log_normal_under)
# print(knnrslnup.best_score_)
# print(knnrslnup.best_params_)

knnlnup_pred = knnrslnup.best_estimator_.predict(Xtest_log_normal_pca)
f1_score(y_true=ytest, y_pred=knnlnup_pred)

0.3252185355766519

<h3>Train Clip Normal Data</h3>

In [70]:
knn = KNeighborsClassifier()

In [71]:
# Xtrain Clip Normal
# space = {'n_neighbors' : range(1, 100), 'weights' : ['uniform','distance']}
space = {'n_neighbors' : [1], 'weights' : ['uniform']}
knnrscn = RandomizedSearchCV(estimator=knn, param_distributions=space, n_iter=100, scoring='f1', cv=skf, n_jobs=5)
knnrscn.fit(Xtrain_clip_normal, ytrain)
# print(knnrscn.best_score_)
# print(knnrscn.best_params_)

knncn_pred = knnrscn.best_estimator_.predict(Xtest_clip_normal)
f1_score(y_true=ytest, y_pred=knncn_pred)

0.2555427820660207

In [72]:
# Normal Clip Over Sampling
# space = {'n_neighbors' : range(1, 100), 'weights' : ['uniform','distance']}
space = {'n_neighbors' : [1], 'weights' : ['uniform']}
knnrscno = RandomizedSearchCV(estimator=knn, param_distributions=space, n_iter=100, scoring='f1', cv=skf, n_jobs=5)
knnrscno.fit(Xtrain_clip_normal_over, ytrain_clip_normal_over)
# print(knnrscno.best_score_)
# print(knnrscno.best_params_)

knncno_pred = knnrscno.best_estimator_.predict(Xtest_clip_normal)
f1_score(y_true=ytest, y_pred=knncno_pred)

0.28957580623578216

In [73]:
# Normal Clip Under Sampling
# space = {'n_neighbors' : range(1, 100), 'weights' : ['uniform','distance']}
space = {'n_neighbors' : [53], 'weights' : ['distance']}
knnrscnu = RandomizedSearchCV(estimator=knn, param_distributions=space, n_iter=100, scoring='f1', cv=skf, n_jobs=5)
knnrscnu.fit(Xtrain_clip_normal_under, ytrain_clip_normal_under)
# print(knnrscnu.best_score_)
# print(knnrscnu.best_params_)

knncnu_pred = knnrscnu.best_estimator_.predict(Xtest_clip_normal)
f1_score(y_true=ytest, y_pred=knncnu_pred)

0.3797311271975181

In [115]:
# Over PCA
# space = {'n_neighbors' : range(1, 100), 'weights' : ['uniform','distance']}
space = {'n_neighbors' : [1], 'weights' : ['uniform']}
knnrscnop = RandomizedSearchCV(estimator=knn, param_distributions=space, n_iter=100, scoring='f1', cv=skf, n_jobs=5)
knnrscnop.fit(Xtrain_clip_normal_over_pca, ytrain_clip_normal_over)
# print(knnrscnop.best_score_)
# print(knnrscnop.best_params_)

knncnop_pred = knnrscnop.best_estimator_.predict(Xtest_clip_normal_pca)
f1_score(y_true=ytest, y_pred=knncnop_pred)

0.2898046561412898

In [74]:
# under PCA
# space = {'n_neighbors' : range(1, 100), 'weights' : ['uniform','distance']}
space = {'n_neighbors' : [1], 'weights' : ['uniform']}
knnrscnup = RandomizedSearchCV(estimator=knn, param_distributions=space, n_iter=100, scoring='f1', cv=skf, n_jobs=5)
knnrscnup.fit(Xtrain_clip_normal_under_pca, ytrain_clip_normal_under)
# print(knnrscnup.best_score_)
# print(knnrscnup.best_params_)

knncnup_pred = knnrscnup.best_estimator_.predict(Xtest_clip_normal_pca)
f1_score(y_true=ytest, y_pred=knncnup_pred)

0.3174663379480372

<h3>Train Log Clip Normal Data</h3>

In [75]:
knn = KNeighborsClassifier()

In [76]:
# Xtrain Log Clip Normal
# space = {'n_neighbors' : range(1, 100), 'weights' : ['uniform','distance']}
space = {'n_neighbors' : [1], 'weights' : ['uniform']}
knnrslcn = RandomizedSearchCV(estimator=knn, param_distributions=space, n_iter=100, scoring='f1', cv=skf, n_jobs=5)
knnrslcn.fit(Xtrain_log_clip_normal, ytrain)
# print(knnrslcn.best_score_)
# print(knnrslcn.best_params_)

knnlcn_pred = knnrslcn.best_estimator_.predict(Xtest_log_clip_normal)
f1_score(y_true=ytest, y_pred=knnlcn_pred)

0.2470762642068852

In [77]:
# Normal Log Clip Over Sampling
# space = {'n_neighbors' : range(1, 100), 'weights' : ['uniform','distance']}
space = {'n_neighbors' : [1], 'weights' : ['uniform']}
knnrslcno = RandomizedSearchCV(estimator=knn, param_distributions=space, n_iter=100, scoring='f1', cv=skf, n_jobs=5)
knnrslcno.fit(Xtrain_log_clip_normal_over, ytrain_log_clip_normal_over)
# print(knnrslcno.best_score_)
# print(knnrslcno.best_params_)

knnlcno_pred = knnrslcno.best_estimator_.predict(Xtest_log_clip_normal)
f1_score(y_true=ytest, y_pred=knnlcno_pred)

0.27422652310208534

In [78]:
# Normal Log Clip Under Sampling
# space = {'n_neighbors' : range(1, 100), 'weights' : ['uniform','distance']}
space = {'n_neighbors' : [60], 'weights' : ['distance']}
knnrslcnu = RandomizedSearchCV(estimator=knn, param_distributions=space, n_iter=100, scoring='f1', cv=skf, n_jobs=5)
knnrslcnu.fit(Xtrain_log_clip_normal_under, ytrain_log_clip_normal_under)
# print(knnrslcnu.best_score_)
# print(knnrslcnu.best_params_)

knnlcnu_pred = knnrslcnu.best_estimator_.predict(Xtest_log_clip_normal)
f1_score(y_true=ytest, y_pred=knnlcnu_pred)

0.37826131793338147

In [116]:
# Over PCA
# space = {'n_neighbors' : range(1, 100), 'weights' : ['uniform','distance']}
space = {'n_neighbors' : [1], 'weights' : ['uniform']}
knnrslcnop = RandomizedSearchCV(estimator=knn, param_distributions=space, n_iter=100, scoring='f1', cv=skf, n_jobs=5)
knnrslcnop.fit(Xtrain_log_clip_normal_over_pca, ytrain_log_clip_normal_over)
# print(knnrslcnop.best_score_)
# print(knnrslcnop.best_params_)

knnlcnop_pred = knnrslcnop.best_estimator_.predict(Xtest_log_clip_normal_pca)
f1_score(y_true=ytest, y_pred=knnlcnop_pred)

0.27247807017543857

In [80]:
# Under PCA
# space = {'n_neighbors' : range(1, 100), 'weights' : ['uniform','distance']}
space = {'n_neighbors' : [1], 'weights' : ['uniform']}
knnrslcnup = RandomizedSearchCV(estimator=knn, param_distributions=space, n_iter=100, scoring='f1', cv=skf, n_jobs=5)
knnrslcnup.fit(Xtrain_log_clip_normal_under_pca, ytrain_log_clip_normal_under)
# print(knnrslcnup.best_score_)
# print(knnrslcnup.best_params_)

knnlcnup_pred = knnrslcnup.best_estimator_.predict(Xtest_log_clip_normal_pca)
f1_score(y_true=ytest, y_pred=knnlcnup_pred)

0.3131237055168518

<h1>Decision Tree Classifier</h1>

<h4 dir='rtl'>پارامترهای بهینه تمام مدل ها با استفاده از RandomizedSearchCV بدست آمده و با همان پارامترهای بهینه مدلسازی نهایی انجام شده است.</h4>

<h3>Train Data</h3>

In [81]:
dtc = DecisionTreeClassifier()

In [82]:
# Xtrain
# space = {'max_depth': [100, 120, 150],'min_samples_leaf': [2, 5, 10],'criterion': ['gini', 'entropy']}
space = {'max_depth': [120],'min_samples_leaf': [5],'criterion': ['entropy']}
dtcrs = RandomizedSearchCV(estimator=dtc, param_distributions=space, n_iter=100, scoring='f1', cv=skf, n_jobs=5)
dtcrs.fit(Xtrain, ytrain)
# print(dtcrs.best_score_)
# print(dtcrs.best_params_)

dtc_pred = dtcrs.best_estimator_.predict(Xtest)
f1_score(y_true=ytest, y_pred=dtc_pred)

0.242974928487296

In [83]:
# Over Sampling
# space = {'max_depth': range(1, 101, 10),'min_samples_leaf': range(1, 101, 10),'criterion': ['gini', 'entropy']}
space = {'max_depth': [81],'min_samples_leaf': [21],'criterion': ['gini']}
dtcrso = RandomizedSearchCV(estimator=dtc, param_distributions=space, n_iter=200, scoring='f1', cv=skf, n_jobs=5)
dtcrso.fit(Xtrain_over, ytrain_over)
# print(dtcrso.best_score_)
# print(dtcrso.best_params_)

dtco_pred = dtcrso.best_estimator_.predict(Xtest)
f1_score(y_true=ytest, y_pred=dtco_pred)

0.21614173228346456

In [84]:
# Under Sampling
# space = {'max_depth': range(1, 20),'min_samples_leaf': range(1, 20),'criterion': ['gini', 'entropy']}
space = {'max_depth': [3],'min_samples_leaf': [1],'criterion': ['gini']}
dtcrsu = RandomizedSearchCV(estimator=dtc, param_distributions=space, n_iter=200, scoring='f1', cv=skf, n_jobs=5)
dtcrsu.fit(Xtrain_under, ytrain_under)
# print(dtcrsu.best_score_)
# print(dtcrsu.best_params_)

dtcu_pred = dtcrsu.best_estimator_.predict(Xtest)
f1_score(y_true=ytest, y_pred=dtcu_pred)

0.37647618292779583

In [117]:
# Over PCA
# space = {'max_depth': range(1, 10),'min_samples_leaf': range(1, 10),'criterion': ['gini', 'entropy']}
space = {'max_depth': [3],'min_samples_leaf': [4],'criterion': ['entropy']}
dtcrsop = RandomizedSearchCV(estimator=dtc, param_distributions=space, n_iter=200, scoring='f1', cv=skf, n_jobs=5)
dtcrsop.fit(Xtrain_over_pca, ytrain_over)
# print(dtcrsop.best_score_)
# print(dtcrsop.best_params_)

dtcop_pred = dtcrsop.best_estimator_.predict(Xtest_pca)
f1_score(y_true=ytest, y_pred=dtcop_pred)

0.2634599617381798

In [85]:
# Under PCA
# space = {'max_depth': range(1, 10),'min_samples_leaf': range(1, 10),'criterion': ['gini', 'entropy']}
space = {'max_depth': [7],'min_samples_leaf': [6],'criterion': ['entropy']}
dtcrsup = RandomizedSearchCV(estimator=dtc, param_distributions=space, n_iter=200, scoring='f1', cv=skf, n_jobs=5)
dtcrsup.fit(Xtrain_under_pca, ytrain_under)
# print(dtcrsup.best_score_)
# print(dtcrsup.best_params_)

dtcup_pred = dtcrsup.best_estimator_.predict(Xtest_pca)
f1_score(y_true=ytest, y_pred=dtcup_pred)

0.3859416445623342

<h3>Train Normal Data</h3>

In [86]:
dtc = DecisionTreeClassifier()

In [87]:
# Xtrain Normal
# space = {'max_depth': [100, 120, 150],'min_samples_leaf': [2, 5, 10],'criterion': ['gini', 'entropy']}
space = {'max_depth': [120],'min_samples_leaf': [5],'criterion': ['entropy']}
dtcrsn = RandomizedSearchCV(estimator=dtc, param_distributions=space, n_iter=100, scoring='f1', cv=skf, n_jobs=5)
dtcrsn.fit(Xtrain_normal, ytrain)
# print(dtcrsn.best_score_)
# print(dtcrsn.best_params_)

dtcn_pred = dtcrsn.best_estimator_.predict(Xtest_normal)
f1_score(y_true=ytest, y_pred=dtcn_pred)

0.24907438572871088

In [88]:
# Normal Over Sampling
# space = {'max_depth': range(1, 101, 10),'min_samples_leaf': range(1, 101, 10),'criterion': ['gini', 'entropy']}
space = {'max_depth': [81],'min_samples_leaf': [1],'criterion': ['entropy']}
dtcrsno = RandomizedSearchCV(estimator=dtc, param_distributions=space, n_iter=200, scoring='f1', cv=skf, n_jobs=5)
dtcrsno.fit(Xtrain_normal_over, ytrain_normal_over)
# print(dtcrsno.best_score_)
# print(dtcrsno.best_params_)

dtcno_pred = dtcrsno.best_estimator_.predict(Xtest_normal)
f1_score(y_true=ytest, y_pred=dtcno_pred)

0.26593045814057603

In [89]:
# Normal Under Sampling
# space = {'max_depth': range(1, 20),'min_samples_leaf': range(1, 20),'criterion': ['gini', 'entropy']}
space = {'max_depth': [3],'min_samples_leaf': [1],'criterion': ['gini']}
dtcrsnu = RandomizedSearchCV(estimator=dtc, param_distributions=space, n_iter=200, scoring='f1', cv=skf, n_jobs=5)
dtcrsnu.fit(Xtrain_normal_under, ytrain_normal_under)
# print(dtcrsnu.best_score_)
# print(dtcrsnu.best_params_)

dtcnu_pred = dtcrsnu.best_estimator_.predict(Xtest_normal)
f1_score(y_true=ytest, y_pred=dtcnu_pred)

0.37647618292779583

In [118]:
# Over PCA
# space = {'max_depth': range(1, 11),'min_samples_leaf': range(1, 10),'criterion': ['gini', 'entropy']}
space = {'max_depth': [6],'min_samples_leaf': [3],'criterion': ['gini']}
dtcrsnop = RandomizedSearchCV(estimator=dtc, param_distributions=space, n_iter=200, scoring='f1', cv=skf, n_jobs=5)
dtcrsnop.fit(Xtrain_normal_over_pca, ytrain_normal_over)
# print(dtcrsnop.best_score_)
# print(dtcrsnop.best_params_)

dtcnop_pred = dtcrsnop.best_estimator_.predict(Xtest_normal_pca)
f1_score(y_true=ytest, y_pred=dtcnop_pred)

0.333180638265384

In [90]:
# Under PCA
# space = {'max_depth': range(1, 11),'min_samples_leaf': range(1, 10),'criterion': ['gini', 'entropy']}
space = {'max_depth': [6],'min_samples_leaf': [3],'criterion': ['gini']}
dtcrsnup = RandomizedSearchCV(estimator=dtc, param_distributions=space, n_iter=200, scoring='f1', cv=skf, n_jobs=5)
dtcrsnup.fit(Xtrain_normal_under_pca, ytrain_normal_under)
# print(dtcrsnup.best_score_)
# print(dtcrsnup.best_params_)

dtcnup_pred = dtcrsnup.best_estimator_.predict(Xtest_normal_pca)
f1_score(y_true=ytest, y_pred=dtcnup_pred)

0.3780403057678944

<h3>Train Log Normal Data</h3>

In [91]:
dtc = DecisionTreeClassifier()

In [92]:
# Xtrain Log Normal
# space = {'max_depth': [40, 20, 60],'min_samples_leaf': [2, 5, 10],'criterion': ['gini', 'entropy']}
space = {'max_depth': [40],'min_samples_leaf': [5],'criterion': ['entropy']}
dtcrsln = RandomizedSearchCV(estimator=dtc, param_distributions=space, n_iter=100, scoring='f1', cv=skf, n_jobs=5)
dtcrsln.fit(Xtrain_log_normal, ytrain)
# print(dtcrsln.best_score_)
# print(dtcrsln.best_params_)

dtcln_pred = dtcrsln.best_estimator_.predict(Xtest_log_normal)
f1_score(y_true=ytest, y_pred=dtcln_pred)

0.2444032991078943

In [93]:
# Log Normal Over Sampling
# space = {'max_depth': range(1, 101, 10),'min_samples_leaf': range(1, 101, 10),'criterion': ['gini', 'entropy']}
space = {'max_depth': [41],'min_samples_leaf': [1],'criterion': ['gini']}
dtcrslno = RandomizedSearchCV(estimator=dtc, param_distributions=space, n_iter=100, scoring='f1', cv=skf, n_jobs=5)
dtcrslno.fit(Xtrain_log_normal_over, ytrain_log_normal_over)
# print(dtcrslno.best_score_)
# print(dtcrslno.best_params_)

dtclno_pred = dtcrslno.best_estimator_.predict(Xtest_log_normal)
f1_score(y_true=ytest, y_pred=dtclno_pred)

0.2647895761569567

In [94]:
# Log Normal Under Sampling
# space = {'max_depth': range(1, 20),'min_samples_leaf': range(1, 20),'criterion': ['gini', 'entropy']}
space = {'max_depth': [1],'min_samples_leaf': [1],'criterion': ['gini']}
dtcrslnu = RandomizedSearchCV(estimator=dtc, param_distributions=space, n_iter=100, scoring='f1', cv=skf, n_jobs=5)
dtcrslnu.fit(Xtrain_log_normal_under, ytrain_log_normal_under)
# print(dtcrslnu.best_score_)
# print(dtcrslnu.best_params_)

dtclnu_pred = dtcrslnu.best_estimator_.predict(Xtest_log_normal)
f1_score(y_true=ytest, y_pred=dtclnu_pred)

0.37491277041172366

In [119]:
# Log Normal Over PCA
# space = {'max_depth': range(1, 11),'min_samples_leaf': range(1, 10),'criterion': ['gini', 'entropy']}
space = {'max_depth': [7],'min_samples_leaf': [6],'criterion': ['gini']}
dtcrslnop = RandomizedSearchCV(estimator=dtc, param_distributions=space, n_iter=200, scoring='f1', cv=skf, n_jobs=5)
dtcrslnop.fit(Xtrain_log_normal_over_pca, ytrain_log_normal_over)
# print(dtcrslnop.best_score_)
# print(dtcrslnop.best_params_)

dtclnop_pred = dtcrslnop.best_estimator_.predict(Xtest_log_normal_pca)
f1_score(y_true=ytest, y_pred=dtclnop_pred)

0.2644661776691117

In [95]:
# Under PCA
# space = {'max_depth': range(1, 11),'min_samples_leaf': range(1, 10),'criterion': ['gini', 'entropy']}
space = {'max_depth': [7],'min_samples_leaf': [6],'criterion': ['gini']}
dtcrslnup = RandomizedSearchCV(estimator=dtc, param_distributions=space, n_iter=200, scoring='f1', cv=skf, n_jobs=5)
dtcrslnup.fit(Xtrain_log_normal_under_pca, ytrain_log_normal_under)
# print(dtcrslnup.best_score_)
# print(dtcrslnup.best_params_)

dtclnup_pred = dtcrslnup.best_estimator_.predict(Xtest_log_normal_pca)
f1_score(y_true=ytest, y_pred=dtclnup_pred)

0.37215477996965096

<h3>Train Clip Normal Data</h3>

In [96]:
dtc = DecisionTreeClassifier()

In [97]:
# Xtrain Clip Normal
# space = {'max_depth': [20,40,60,80,100],'min_samples_leaf': [1, 2, 5, 10],'criterion': ['gini', 'entropy']}
space = {'max_depth': [60],'min_samples_leaf': [1],'criterion': ['gini']}
dtcrscn = RandomizedSearchCV(estimator=dtc, param_distributions=space, n_iter=100, scoring='f1', cv=skf, n_jobs=5)
dtcrscn.fit(Xtrain_clip_normal, ytrain)
# print(dtcrscn.best_score_)
# print(dtcrscn.best_params_)

dtccn_pred = dtcrscn.best_estimator_.predict(Xtest_clip_normal)
f1_score(y_true=ytest, y_pred=dtccn_pred)

0.26433761014067086

In [98]:
# Clip Normal Over Sampling
# space = {'max_depth': range(80, 101, 1),'min_samples_leaf': range(1, 101, 10),'criterion': ['gini', 'entropy']}
space = {'max_depth': [89],'min_samples_leaf': [1],'criterion': ['gini']}
dtcrscno = RandomizedSearchCV(estimator=dtc, param_distributions=space, n_iter=1000, scoring='f1', cv=skf, n_jobs=5)
dtcrscno.fit(Xtrain_clip_normal_over, ytrain_clip_normal_over)
# print(dtcrscno.best_score_)
# print(dtcrscno.best_params_)

dtccno_pred = dtcrscno.best_estimator_.predict(Xtest_clip_normal)
f1_score(y_true=ytest, y_pred=dtccno_pred)

0.2816778327993009

In [99]:
# Clip Normal Under Sampling
# space = {'max_depth': range(1, 20),'min_samples_leaf': range(1, 20),'criterion': ['gini', 'entropy']}
space = {'max_depth': [3],'min_samples_leaf': [1],'criterion': ['gini']}
dtcrscnu = RandomizedSearchCV(estimator=dtc, param_distributions=space, n_iter=100, scoring='f1', cv=skf, n_jobs=5)
dtcrscnu.fit(Xtrain_clip_normal_under, ytrain_clip_normal_under)
# print(dtcrscnu.best_score_)
# print(dtcrscnu.best_params_)

dtccnu_pred = dtcrscnu.best_estimator_.predict(Xtest_clip_normal)
f1_score(y_true=ytest, y_pred=dtccnu_pred)

0.37647618292779583

In [120]:
# Clip Normal Over PCA
# space = {'max_depth': range(1, 11),'min_samples_leaf': range(1, 10),'criterion': ['gini', 'entropy']}
space = {'max_depth': [7],'min_samples_leaf': [6],'criterion': ['entropy']}
dtcrscnop = RandomizedSearchCV(estimator=dtc, param_distributions=space, n_iter=200, scoring='f1', cv=skf, n_jobs=5)
dtcrscnop.fit(Xtrain_clip_normal_over_pca, ytrain_clip_normal_over)
# print(dtcrscnop.best_score_)
# print(dtcrscnop.best_params_)

dtccnop_pred = dtcrscnop.best_estimator_.predict(Xtest_clip_normal_pca)
f1_score(y_true=ytest, y_pred=dtccnop_pred)

0.3338704479535933

In [100]:
# Under PCA
# space = {'max_depth': range(1, 11),'min_samples_leaf': range(1, 10),'criterion': ['gini', 'entropy']}
space = {'max_depth': [7],'min_samples_leaf': [6],'criterion': ['entropy']}
dtcrscnup = RandomizedSearchCV(estimator=dtc, param_distributions=space, n_iter=200, scoring='f1', cv=skf, n_jobs=5)
dtcrscnup.fit(Xtrain_clip_normal_under_pca, ytrain_clip_normal_under)
# print(dtcrscnup.best_score_)
# print(dtcrscnup.best_params_)

dtccnup_pred = dtcrscnup.best_estimator_.predict(Xtest_clip_normal_pca)
f1_score(y_true=ytest, y_pred=dtccnup_pred)

0.3816037268312023

<h3>Train Log Clip Normal Data</h3>

In [101]:
dtc = DecisionTreeClassifier()

In [102]:
# Xtrain Log Clip Normal
# space = {'max_depth': [20, 40, 60, 80, 100, 120],'min_samples_leaf': [2, 5, 10],'criterion': ['gini', 'entropy']}
space = {'max_depth': [100],'min_samples_leaf': [5],'criterion': ['entropy']}
dtcrslcn = RandomizedSearchCV(estimator=dtc, param_distributions=space, n_iter=100, scoring='f1', cv=skf, n_jobs=5)
dtcrslcn.fit(Xtrain_log_clip_normal, ytrain)
# print(dtcrslcn.best_score_)
# print(dtcrslcn.best_params_)

dtclcn_pred = dtcrslcn.best_estimator_.predict(Xtest_log_clip_normal)
f1_score(y_true=ytest, y_pred=dtclcn_pred)

0.2390719569603228

In [103]:
# Normal Log Clip Over Sampling
# space = {'max_depth': range(1, 101, 10),'min_samples_leaf': range(1, 101, 10),'criterion': ['gini', 'entropy']}
space = {'max_depth': [89],'min_samples_leaf': [1],'criterion': ['gini']}
dtcrslcno = RandomizedSearchCV(estimator=dtc, param_distributions=space, n_iter=100, scoring='f1', cv=skf, n_jobs=5)
dtcrslcno.fit(Xtrain_log_clip_normal_over, ytrain_log_clip_normal_over)
# print(dtcrslcno.best_score_)
# print(dtcrslcno.best_params_)

dtclcno_pred = dtcrslcno.best_estimator_.predict(Xtest_log_clip_normal)
f1_score(y_true=ytest, y_pred=dtclcno_pred)

0.2706854079118968

In [104]:
# Normal Log Clip Under Sampling
# space = {'max_depth': range(1, 20),'min_samples_leaf': range(1, 20),'criterion': ['gini', 'entropy']}
space = {'max_depth': [3],'min_samples_leaf': [1],'criterion': ['entropy']}
dtcrslcnu = RandomizedSearchCV(estimator=dtc, param_distributions=space, n_iter=100, scoring='f1', cv=skf, n_jobs=5)
dtcrslcnu.fit(Xtrain_log_clip_normal_under, ytrain_log_clip_normal_under)
# print(dtcrslcnu.best_score_)
# print(dtcrslcnu.best_params_)

dtclcnu_pred = dtcrslcnu.best_estimator_.predict(Xtest_log_clip_normal)
f1_score(y_true=ytest, y_pred=dtclcnu_pred)

0.37647618292779583

In [121]:
# Over PCA
# space = {'max_depth': range(1, 11),'min_samples_leaf': range(1, 10),'criterion': ['gini', 'entropy']}
space = {'max_depth': [89],'min_samples_leaf': [1],'criterion': ['gini']}
dtcrslcnop = RandomizedSearchCV(estimator=dtc, param_distributions=space, n_iter=200, scoring='f1', cv=skf, n_jobs=5)
dtcrslcnop.fit(Xtrain_log_clip_normal_over_pca, ytrain_log_clip_normal_over)
# print(dtcrslcnop.best_score_)
# print(dtcrslcnop.best_params_)

dtclcnop_pred = dtcrslcnop.best_estimator_.predict(Xtest_log_clip_normal_pca)
f1_score(y_true=ytest, y_pred=dtclcnop_pred)

0.27637401855817273

In [105]:
# Under PCA
# space = {'max_depth': range(1, 11),'min_samples_leaf': range(1, 10),'criterion': ['gini', 'entropy']}
space = {'max_depth': [89],'min_samples_leaf': [1],'criterion': ['gini']}
dtcrslcnup = RandomizedSearchCV(estimator=dtc, param_distributions=space, n_iter=200, scoring='f1', cv=skf, n_jobs=5)
dtcrslcnup.fit(Xtrain_log_clip_normal_under_pca, ytrain_log_clip_normal_under)
# print(dtcrslcnup.best_score_)
# print(dtcrslcnup.best_params_)

dtclcnup_pred = dtcrslcnup.best_estimator_.predict(Xtest_log_clip_normal_pca)
f1_score(y_true=ytest, y_pred=dtclcnup_pred)

0.3216175359032502