## Data level approaches

In [1]:
import pandas as pd
from imblearn.over_sampling import SMOTE, ADASYN
from imblearn.combine import SMOTETomek
from sklearn.metrics import roc_auc_score as roc_auc, accuracy_score as acc
from setup import rfc, wrangle_data, split_data

# Shared random seed; passed as `random_state` to every resampler in this
# notebook so the synthetic samples are repeatable between runs.
seed = 42

**Data wrangling**

In [2]:
# Read the raw training CSV and hand it straight to the project's wrangling
# helper (wrangle_data comes from the local `setup` module).
train = wrangle_data(pd.read_csv('./train_ZoGVYWq.csv'))

# Class imbalance: number of `renewal == 1` rows per single `renewal == 0` row.
n_class1 = len(train[train.renewal == 1])
n_class0 = len(train[train.renewal == 0])
print('Unbalanced ratio: {}:1'.format(int(round(n_class1 / n_class0))))

Unbalanced ratio: 15:1


**Train and validation split**

In [3]:
# Split the wrangled frame into train/validation features and labels.
# `segmented=True` is forwarded to the project helper; its exact meaning is
# defined in setup.split_data.
(train_X, validation_X,
 train_Y, validation_Y) = split_data(train, segmented=True)

**Baseline**

In [4]:
# Baseline: run the project's `rfc` helper on the original (un-resampled)
# training split; it reports ROC-AUC against the validation split.
rfc(
    train_X,
    train_Y,
    validation_X,
    validation_Y,
)

ROC-AUC: 0.56


**Over and undersampling techniques**

<br>
SMOTE (oversampling)

In [5]:
# SMOTE oversampling, fitted on the training split only so the validation
# split is left untouched.
# `sampling_strategy=` and `fit_resample()` are the current imbalanced-learn
# names; the older `ratio=` keyword and `fit_sample()` alias were deprecated
# and later removed, so the original calls fail on recent releases.
sm = SMOTE(random_state=seed, sampling_strategy='minority')
train_X_res, train_Y_res = sm.fit_resample(train_X, train_Y)

print("SMOTE + RandomForestClassifier")
# Class balance after resampling: count of label 1 per single label 0.
n_class1 = len(train_Y_res[train_Y_res == 1])
n_class0 = len(train_Y_res[train_Y_res == 0])
print('Unbalanced ratio: {}:1'.format(int(round(n_class1 / n_class0))))
# Evaluate on the resampled training data against the original validation split.
rfc(train_X_res, train_Y_res, validation_X, validation_Y)

SMOTE + RandomForestClassifier
Unbalanced ratio: 1:1
ROC-AUC: 0.61


<br>
SMOTE (oversampling) + Tomek Links (undersampling)

In [6]:
# Combined resampling (SMOTETomek), fitted on the training split only.
# `fit_resample()` is the current imbalanced-learn method name; the older
# `fit_sample()` alias was deprecated and later removed, so the original call
# fails on recent releases.
sm = SMOTETomek(random_state=seed)
train_X_res, train_Y_res = sm.fit_resample(train_X, train_Y)

print("SMOTETomek + RandomForestClassifier")
# Class balance after resampling: count of label 1 per single label 0.
n_class1 = len(train_Y_res[train_Y_res == 1])
n_class0 = len(train_Y_res[train_Y_res == 0])
print('Unbalanced ratio: {}:1'.format(int(round(n_class1 / n_class0))))
# Evaluate on the resampled training data against the original validation split.
rfc(train_X_res, train_Y_res, validation_X, validation_Y)

SMOTETomek + RandomForestClassifier
Unbalanced ratio: 1:1
ROC-AUC: 0.61


<br>
ADASYN (oversampling)

In [7]:
# ADASYN oversampling, fitted on the training split only.
# `fit_resample()` is the current imbalanced-learn method name; the older
# `fit_sample()` alias was deprecated and later removed, so the original call
# fails on recent releases.
ad = ADASYN(random_state=seed)
train_X_res, train_Y_res = ad.fit_resample(train_X, train_Y)

print("ADASYN + RandomForestClassifier")
# Class balance after resampling: count of label 1 per single label 0.
n_class1 = len(train_Y_res[train_Y_res == 1])
n_class0 = len(train_Y_res[train_Y_res == 0])
print('Unbalanced ratio: {}:1'.format(int(round(n_class1 / n_class0))))
# Evaluate on the resampled training data against the original validation split.
rfc(train_X_res, train_Y_res, validation_X, validation_Y)

ADASYN + RandomForestClassifier
Unbalanced ratio: 1:1
ROC-AUC: 0.61


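PS: It is very important to create synthetic samples only in the training set; in other words, resample *after* the train/validation split. Otherwise, synthetic points derived from validation rows leak into training and the validation score becomes overly optimistic.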