In [1]:
import pandas as pd
from collections import Counter

data = './Resources/cc_default.csv'
df = pd.read_csv(data)
df.head()

Unnamed: 0,ID,ln_balance_limit,sex,education,marriage,age,default_next_month
0,1,9.903488,1,2,0,24,1
1,2,11.695247,1,2,1,26,1
2,3,11.407565,1,2,1,34,0
3,4,10.819778,1,2,0,37,0
4,5,10.819778,0,2,0,57,0


In [2]:
x_cols = [i for i in df.columns if i not in ('ID', 'default_next_month')]
X = df[x_cols]
y = df['default_next_month']

In [3]:
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=1)

In [5]:
from imblearn.under_sampling import RandomUnderSampler
ros = RandomUnderSampler(random_state=1)
X_resampled, y_resampled = ros.fit_resample(X_train, y_train)
Counter(y_resampled)

Counter({0: 4968, 1: 4968})

In [7]:
from sklearn.linear_model import LogisticRegression
model = LogisticRegression(solver='lbfgs', random_state=1)
model.fit(X_resampled, y_resampled)

from sklearn.metrics import confusion_matrix

pred = model.predict(X_test)
confusion_matrix(y_test, pred)

array([[3732, 2100],
       [ 740,  928]], dtype=int64)

In [14]:
from sklearn.metrics import balanced_accuracy_score
from imblearn.metrics import classification_report_imbalanced

print(balanced_accuracy_score(y_test, pred))
print()
print(classification_report_imbalanced(y_test, pred))

0.5981363057701987

                   pre       rec       spe        f1       geo       iba       sup

          0       0.83      0.64      0.56      0.72      0.60      0.36      5832
          1       0.31      0.56      0.64      0.40      0.60      0.35      1668

avg / total       0.72      0.62      0.57      0.65      0.60      0.36      7500



In [15]:
from imblearn.under_sampling import ClusterCentroids
cc = ClusterCentroids(random_state=1)

X_resampled, y_resampled = cc.fit_resample(X_train, y_train)

In [17]:
from sklearn.linear_model import LogisticRegression

model = LogisticRegression(solver='lbfgs', random_state=1)
model.fit(X_resampled, y_resampled)

LogisticRegression(random_state=1)

In [18]:
y_pred = model.predict(X_test)

print(confusion_matrix(y_test, y_pred))

print(balanced_accuracy_score(y_test, y_pred))
print(classification_report_imbalanced(y_test, y_pred))

[[2814 3018]
 [ 601 1067]]
0.5610992687331616
                   pre       rec       spe        f1       geo       iba       sup

          0       0.82      0.48      0.64      0.61      0.56      0.30      5832
          1       0.26      0.64      0.48      0.37      0.56      0.31      1668

avg / total       0.70      0.52      0.60      0.56      0.56      0.31      7500



In [20]:
from imblearn.over_sampling import RandomOverSampler

ros = RandomOverSampler(random_state=1)
XO_resampled, yo_resampled = ros.fit_resample(X_train, y_train)
Counter(yo_resampled)

Counter({0: 17532, 1: 17532})

In [21]:
modelo = LogisticRegression(solver='lbfgs', random_state=1)
modelo.fit(XO_resampled, yo_resampled)

yo_pred = model.predict(X_test)

print(confusion_matrix(y_test, yo_pred))

print(balanced_accuracy_score(y_test, yo_pred))
print(classification_report_imbalanced(y_test, yo_pred))

[[2814 3018]
 [ 601 1067]]
0.5610992687331616
                   pre       rec       spe        f1       geo       iba       sup

          0       0.82      0.48      0.64      0.61      0.56      0.30      5832
          1       0.26      0.64      0.48      0.37      0.56      0.31      1668

avg / total       0.70      0.52      0.60      0.56      0.56      0.31      7500



## SMOTEENN

In [22]:
from imblearn.combine import SMOTEENN

smote_enn = SMOTEENN(random_state=0)
XS_resampled, ys_resampled = smote_enn.fit_resample(X, y)

In [23]:
models = LogisticRegression(solver='lbfgs', random_state=1)
models.fit(XS_resampled, ys_resampled)

ys_pred = models.predict(X_test)
confusion_matrix(y_test, ys_pred)

array([[4824, 1008],
       [1159,  509]], dtype=int64)

In [24]:
print(balanced_accuracy_score(y_test, ys_pred))
print(classification_report_imbalanced(y_test, ys_pred))

0.5661581845634603
                   pre       rec       spe        f1       geo       iba       sup

          0       0.81      0.83      0.31      0.82      0.50      0.27      5832
          1       0.34      0.31      0.83      0.32      0.50      0.24      1668

avg / total       0.70      0.71      0.42      0.71      0.50      0.26      7500

