## Prepare train_data and test_data

In [1]:
import pandas as pd
from sklearn.model_selection import train_test_split
from hypergbm import make_experiment
from sklearn.metrics import recall_score

In [2]:
# Load the pre-split Bank dataset (gzip-compressed CSV files).
train_data = pd.read_csv('datasets/Bank/train.csv.gz')
test_data = pd.read_csv('datasets/Bank/test.csv.gz')

# Split each frame into features (X) and the 'deposit' target (y).
# train_data / test_data themselves are left untouched, because the
# experiments below are given the full training frame including the target.
X_train = train_data.drop(columns=['deposit'])
y_train = train_data['deposit'].copy()
X_test = test_data.drop(columns=['deposit'])
y_test = test_data['deposit'].copy()

# Distribution of the target y (`deposit`)

In [3]:
# Count how many rows fall into each target class, per split.
labels_train = y_train.value_counts()
labels_test = y_test.value_counts()

# Print both tables so the class imbalance ('no' vs 'yes') is visible.
print('The distribution of y_train\n{}\n\n'.format(labels_train))
print('The distribution of y_test\n{}\n\n'.format(labels_test))

The distribution of y_train
no     31918
yes     4250
Name: deposit, dtype: int64


The distribution of y_test
no     8004
yes    1039
Name: deposit, dtype: int64




# Without class_balancing

In [4]:
# Baseline run: class balancing explicitly switched off.
# A copy of the training frame is handed over so train_data stays untouched.
experiment = make_experiment(
    train_data.copy(),
    target='deposit',
    random_state=8888,      # fixed seed, shared by every experiment in this notebook
    max_trials=10,          # same trial budget for every experiment
    class_balancing=None,
)
estimator = experiment.run()

In [5]:
# Score the baseline estimator on the held-out test features.
y_pred = estimator.predict(X_test)

# Recall of the minority class ('yes') is the metric compared across all runs.
score = recall_score(y_true=y_test, y_pred=y_pred, pos_label='yes')
score

0.45813282001924927

# Set class_balancing='ClassWeight'

In [6]:
# Same data, seed and trial budget as the baseline run;
# only the class_balancing strategy differs.
experiment = make_experiment(
    train_data.copy(),
    target='deposit',
    random_state=8888,
    max_trials=10,
    class_balancing='ClassWeight',
)
estimator = experiment.run()

# Recall of the minority class ('yes') on the test split.
y_pred = estimator.predict(X_test)
score = recall_score(y_true=y_test, y_pred=y_pred, pos_label='yes')
score

0.7189605389797883

# Set class_balancing='RandomOverSampler'

In [7]:
# Same data, seed and trial budget as the baseline run;
# only the class_balancing strategy differs.
experiment = make_experiment(
    train_data.copy(),
    target='deposit',
    random_state=8888,
    max_trials=10,
    class_balancing='RandomOverSampler',
)
estimator = experiment.run()

# Recall of the minority class ('yes') on the test split.
y_pred = estimator.predict(X_test)
score = recall_score(y_true=y_test, y_pred=y_pred, pos_label='yes')
score

0.6862367661212705

# Set class_balancing='RandomUnderSampler'

In [8]:
# Same data, seed and trial budget as the baseline run;
# only the class_balancing strategy differs.
experiment = make_experiment(
    train_data.copy(),
    target='deposit',
    random_state=8888,
    max_trials=10,
    class_balancing='RandomUnderSampler',
)
estimator = experiment.run()

# Recall of the minority class ('yes') on the test split.
y_pred = estimator.predict(X_test)
score = recall_score(y_true=y_test, y_pred=y_pred, pos_label='yes')
score

0.8931665062560153

# More choices for class_balancing

In [9]:
# Other possible values of class_balancing:
#   'SMOTE' ↑, 'ADASYN' ↑, 'NearMiss' ↓, 'TomeksLinks' ↓
# NOTE(review): the arrows presumably mark over-sampling (↑) and
# under-sampling (↓) strategies; confirm the meaning and the exact
# option spellings against the HyperGBM documentation.
experiment = make_experiment(
    train_data.copy(),
    target='deposit',
    random_state=8888,
    max_trials=10,
    class_balancing='SMOTE',
)
estimator = experiment.run()

# Recall of the minority class ('yes') on the test split.
y_pred = estimator.predict(X_test)
score = recall_score(y_true=y_test, y_pred=y_pred, pos_label='yes')
score

0.7314725697786333