In [None]:
from collections import Counter

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
from imblearn.combine import SMOTEENN
from imblearn.combine import SMOTETomek
from imblearn.metrics import geometric_mean_score
from imblearn.over_sampling import ADASYN
from imblearn.over_sampling import RandomOverSampler
from imblearn.over_sampling import SMOTE
from imblearn.under_sampling import EditedNearestNeighbours
from imblearn.under_sampling import RandomUnderSampler
from imblearn.under_sampling import TomekLinks
from sklearn import metrics
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score
from sklearn.metrics import auc
from sklearn.metrics import confusion_matrix
from sklearn.metrics import precision_score,recall_score,f1_score
from sklearn.metrics import roc_auc_score
from sklearn.metrics import roc_curve
from sklearn.model_selection import train_test_split

In [None]:
def lrs(x, y, test_size=0.3, random_state=None):
    """Fit a logistic regression on a random split and report test metrics.

    The data are split with ``train_test_split``, a default
    ``LogisticRegression`` is fitted on the training part, and the following
    are printed for the held-out part: accuracy, confusion matrix, precision,
    recall, geometric mean and F1. A ROC curve, with its AUC in the legend,
    is then plotted from column 1 of ``predict_proba``.

    Parameters
    ----------
    x : array-like or DataFrame
        Feature matrix.
    y : array-like or Series
        Labels. The ROC step reads ``predict_proba(...)[:, 1]``, so a binary
        target is assumed -- TODO confirm against the data.
    test_size : float, default 0.3
        Fraction of the data held out for evaluation (forwarded to
        ``train_test_split``).
    random_state : int or None, default None
        Seed forwarded to ``train_test_split``. ``None`` keeps the original
        unseeded behaviour; pass an int to make the split repeatable.

    Returns
    -------
    None
        Results are printed and plotted, not returned.

    Notes
    -----
    The split happens inside this function. If ``x`` and ``y`` were resampled
    before the call, the held-out part is drawn from the resampled data.
    """
    x_train, x_test, y_train, y_test = train_test_split(
        x, y, test_size=test_size, random_state=random_state
    )

    model = LogisticRegression()
    model.fit(x_train, y_train)
    y_pred = model.predict(x_test)

    # Every metric takes (y_true, y_pred). Passing them the other way round
    # swaps precision with recall and changes the G-mean, so use keywords.
    print("Accuracy Score: " + str(accuracy_score(y_true=y_test, y_pred=y_pred)))
    print(confusion_matrix(y_true=y_test, y_pred=y_pred))
    print("Precision score:", precision_score(y_true=y_test, y_pred=y_pred))
    print('Recall score:', recall_score(y_true=y_test, y_pred=y_pred))
    print('G-mean score:', geometric_mean_score(y_true=y_test, y_pred=y_pred))
    print('F1 score:', f1_score(y_true=y_test, y_pred=y_pred))

    # The ROC curve needs scores, not hard labels: use the probability that
    # predict_proba reports in column 1.
    y_score = model.predict_proba(x_test)[:, 1]
    fpr, tpr, _ = roc_curve(y_test, y_score)

    plt.figure()
    plt.plot(fpr, tpr, label="AUC= %0.2f" % auc(fpr, tpr))
    plt.plot([0, 1], [0, 1], 'r--')  # chance-level diagonal for reference
    plt.xlim([0.0, 1.0])
    plt.ylim([0.0, 1.05])
    plt.xlabel("False Positive Rate")
    plt.ylabel("True Positive Rate")
    plt.title("Receiver Operating Characteristic (ROC)")
    plt.legend(loc="lower right")
    plt.show()

In [None]:
# Read the training set and drop every row that contains a missing value.
train = pd.read_csv("train.csv").dropna(axis=0)

# Features: everything except the label column, keeping non-object dtypes only.
x = (
    train
    .drop(columns=['target'])
    .select_dtypes(exclude=['object'])
)

# Labels.
y = train['target']

Without Sampling

In [None]:
# Baseline: evaluate on the data as loaded, with no resampling.
lrs(x=x, y=y)

Random Under Sampling

In [None]:
# Randomly under-sample with a fixed seed so the selection is repeatable.
rus = RandomUnderSampler(random_state=0)
x_resampled_rus, y_resampled_rus = rus.fit_resample(x, y)

# Class counts after resampling, ordered by label.
print(sorted(Counter(y_resampled_rus).items()))

In [None]:
# Evaluate on the randomly under-sampled data.
# NOTE(review): lrs splits its inputs itself, so the held-out part comes from
# the resampled data.
lrs(x=x_resampled_rus, y=y_resampled_rus)

Tomek Links

In [None]:
# Tomek-links under-sampling, restricted to the majority class.
# NOTE: `x_resanpled_tl` (sic) keeps its original spelling because the
# evaluation cell reads that exact name.
tl = TomekLinks(sampling_strategy="majority")
x_resanpled_tl, y_resampled_tl = tl.fit_resample(x, y)

# Class counts after resampling, ordered by label.
print(sorted(Counter(y_resampled_tl).items()))

In [None]:
# Evaluate on the Tomek-links output (variable name spelled as assigned).
lrs(x=x_resanpled_tl, y=y_resampled_tl)

Edited Nearest Neighbors (ENN)

In [None]:
# Edited Nearest Neighbours under-sampling, restricted to the majority class.
enn = EditedNearestNeighbours(sampling_strategy='majority')
x_resampled_enn, y_resampled_enn = enn.fit_resample(x, y)

# Class counts after resampling, ordered by label.
print(sorted(Counter(y_resampled_enn).items()))

In [None]:
# Evaluate on the ENN-cleaned data.
lrs(x=x_resampled_enn, y=y_resampled_enn)

Random Oversampling

In [None]:
# Randomly over-sample with a fixed seed so the duplication is repeatable.
ros = RandomOverSampler(random_state=0)
X_resampled_ros, y_resampled_ros = ros.fit_resample(x, y)

# Class counts after resampling, ordered by label (as in the other sampler cells).
print(sorted(Counter(y_resampled_ros).items()))

# Total number of rows after over-sampling (shown as the cell output).
len(X_resampled_ros)

In [None]:
# Evaluate on the randomly over-sampled data.
lrs(x=X_resampled_ros, y=y_resampled_ros)

Synthetic Minority Oversampling Technique (SMOTE)

In [None]:
# SMOTE over-sampling using 5 nearest neighbours, seeded for repeatability.
sm = SMOTE(random_state=42, k_neighbors=5)
X_resampled_sm, y_resampled_sm = sm.fit_resample(x, y)

# A bare `len(...)` in the middle of a cell is discarded, so print it.
print(len(X_resampled_sm))

# Class counts after resampling, ordered by label.
print(sorted(Counter(y_resampled_sm).items()))

In [None]:
# Evaluate on the SMOTE over-sampled data.
lrs(x=X_resampled_sm, y=y_resampled_sm)

Adaptive Synthetic Sampling (ADASYN)

In [None]:
# ADASYN over-sampling using 5 nearest neighbours, seeded for repeatability.
ad = ADASYN(random_state=42, n_neighbors=5)
X_resampled_ad, y_resampled_ad = ad.fit_resample(x, y)

# A bare `len(...)` in the middle of a cell is discarded, so print it.
print(len(X_resampled_ad))

# Class counts after resampling, ordered by label.
print(sorted(Counter(y_resampled_ad).items()))

In [None]:
# Evaluate on the ADASYN over-sampled data.
lrs(x=X_resampled_ad, y=y_resampled_ad)

Hybrid 1: SMOTE + Edited Nearest Neighbours (SMOTEENN)

In [None]:
# Combined resampling with imbalanced-learn's SMOTEENN, seeded for repeatability.
smote_enn = SMOTEENN(random_state=0)
x_resampled_smen, y_resampled_smen = smote_enn.fit_resample(x, y)

# Class counts after resampling, ordered by label.
print(sorted(Counter(y_resampled_smen).items()))

In [None]:
# Evaluate on the SMOTEENN output.
lrs(x=x_resampled_smen, y=y_resampled_smen)

Hybrid 2: SMOTE + Tomek Links (SMOTETomek)

In [None]:
# Combined resampling with imbalanced-learn's SMOTETomek, seeded for repeatability.
smote_tomek = SMOTETomek(random_state=0)
x_resampled_smtm, y_resampled_smtm = smote_tomek.fit_resample(x, y)

# Class counts after resampling, ordered by label.
print(sorted(Counter(y_resampled_smtm).items()))

In [None]:
# Evaluate on the SMOTETomek output.
lrs(x=x_resampled_smtm, y=y_resampled_smtm)