# Working_with_Imbalanced_Data

In [None]:
from sklearn.datasets import fetch_openml
import numpy as np

data = fetch_openml('mammography')
x,y = data.data,data.target
y = (y.astype(np.int)+1)//2
x.shape

In [35]:
np.bincount(y)

array([10923,   260])

In [36]:
from sklearn.model_selection import train_test_split
x_train,x_test,y_train,y_test = train_test_split(x,y,stratify=y,random_state=0)

# baseline

In [48]:
import warnings
warnings.filterwarnings("ignore")
from sklearn.model_selection import cross_validate
from sklearn.linear_model import LogisticRegression

scores = cross_validate(LogisticRegression(),
                       x_train,y_train,cv=10,
                       scoring=('roc_auc','average_precision'))
round(scores['test_roc_auc'].mean(),3),round(scores['test_average_precision'].mean(),3)

(0.92, 0.629)

In [49]:
from sklearn.ensemble import RandomForestClassifier
scores = cross_validate(RandomForestClassifier(n_estimators=100),
                       x_train,y_train,cv=10,
                       scoring=('roc_auc','average_precision'))
round(scores['test_roc_auc'].mean(),3),round(scores['test_average_precision'].mean(),3)

(0.948, 0.726)

# Random Undersampling

In [39]:
from imblearn.under_sampling import RandomUnderSampler
rus = RandomUnderSampler(replacement=False)
x_train_subsample,y_train_subsample = rus.fit_sample(x_train,y_train)
print(x_train.shape)
print(x_train_subsample.shape)
print(np.bincount(y_train_subsample))

(8387, 6)
(390, 6)
[195 195]


In [59]:
scores = cross_validate(LogisticRegression(),
                       x_train_subsample,y_train_subsample,cv=10,
                       scoring=('roc_auc','average_precision'))
round(scores['test_roc_auc'].mean(),3),round(scores['test_average_precision'].mean(),3)
# baseline was 0.92, 0.629

(0.921, 0.933)

In [58]:
scores = cross_validate(RandomForestClassifier(n_estimators=100),
                       x_train_subsample,y_train_subsample,cv=10,
                       scoring=('roc_auc','average_precision'))
round(scores['test_roc_auc'].mean(),3),round(scores['test_average_precision'].mean(),3)
# baseline was 0.948, 0.726

(0.955, 0.965)

# Random Oversampling

In [21]:
from imblearn.over_sampling import RandomOverSampler
ros = RandomOverSampler()
x_train_oversample,y_train_oversample = ros.fit_sample(x_train,y_train)
print(x_train.shape)
print(x_train_oversample.shape)
print(np.bincount(y_train_oversample))

(8387, 6)
(16384, 6)
[8192 8192]


In [61]:
scores = cross_validate(LogisticRegression(),
                       x_train_oversample,y_train_oversample,cv=10,
                       scoring=('roc_auc','average_precision'))
round(scores['test_roc_auc'].mean(),3),round(scores['test_average_precision'].mean(),3)
# baseline was 0.92, 0.629

(0.92, 0.937)

In [63]:
scores = cross_validate(RandomForestClassifier(n_estimators=100),
                       x_train_oversample,y_train_oversample,cv=10,
                       scoring=('roc_auc','average_precision'))
round(scores['test_roc_auc'].mean(),3),round(scores['test_average_precision'].mean(),3)
# baseline was 0.948, 0.726

(0.994, 0.992)

# Class-Weights Method

In [67]:
scores = cross_validate(LogisticRegression(class_weight='balanced'),
                       x_train,y_train,cv=10,
                       scoring=('roc_auc','average_precision'))
round(scores['test_roc_auc'].mean(),3),round(scores['test_average_precision'].mean(),3)

(0.918, 0.587)

In [70]:
scores = cross_validate(RandomForestClassifier(n_estimators=100,
                                              class_weight='balanced'),
                       x_train,y_train,cv=10,
                       scoring=('roc_auc','average_precision'))
round(scores['test_roc_auc'].mean(),3),round(scores['test_average_precision'].mean(),3)

(0.921, 0.704)

# Easy Ensemble with imblearn

In [71]:
from sklearn.tree import DecisionTreeClassifier
from imblearn.ensemble import BalancedBaggingClassifier

#from imblearn.ensemble import BalancedRandomForestClassifier
#resampled_rf = BalancedRandomForestClassifier()

tree = DecisionTreeClassifier(max_features='auto')
resample_rf = BalancedBaggingClassifier(base_estimator=tree,
                                       n_estimators=100,random_state=0)

scores = cross_validate(resample_rf,
                       x_train,y_train,cv=10,
                       scoring=('roc_auc','average_precision'))
round(scores['test_roc_auc'].mean(),3),round(scores['test_average_precision'].mean(),3)

(0.955, 0.645)

# Edited Nearest Neighbours

In [73]:
from imblearn.under_sampling import EditedNearestNeighbours
enn = EditedNearestNeighbours(n_neighbors=5)
x_train_enn,y_train_enn = enn.fit_sample(x_train,y_train)

In [74]:
print(x_train.shape)
print(x_train_enn.shape)
print(np.bincount(y_train_enn))

(8387, 6)
(8146, 6)
[7951  195]


In [77]:
from imblearn.under_sampling import EditedNearestNeighbours
enn_mode = EditedNearestNeighbours(kind_sel='mode',n_neighbors=5)
x_train_enn_mode,y_train_enn_mode = enn_mode.fit_sample(x_train,y_train)

In [80]:
print(x_train.shape)
print(x_train_enn_mode.shape)
print(np.bincount(y_train_enn_mode))

(8387, 6)
(8356, 6)
[8161  195]


In [82]:
enn_pipe = make_imb_pipeline(EditedNearestNeighbours(n_neighbors=5),
                             LogisticRegression())
scores = cross_validate(enn_pipe, x_train, y_train, cv=10,
                        scoring=('roc_auc', 'average_precision'))
round(scores['test_roc_auc'].mean(),3),round(scores['test_average_precision'].mean(),3)

(0.92, 0.627)

In [83]:
enn_pipe_rf = make_imb_pipeline(EditedNearestNeighbours(n_neighbors= 5),
                                RandomForestClassifier(n_estimators=100))
scores = cross_validate(enn_pipe_rf, x_train, y_train, cv=10,
                        scoring=('roc_auc', 'average_precision'))
round(scores['test_roc_auc'].mean(),3),round(scores['test_average_precision'].mean(),3)

(0.943, 0.696)

# SMOTE

In [88]:
from imblearn.over_sampling import SMOTE
smote_pipe = make_imb_pipeline(SMOTE(), LogisticRegression())
scores = cross_validate(smote_pipe, x_train, y_train, cv=10,
                        scoring=('roc_auc', 'average_precision'))
round(scores['test_roc_auc'].mean(),3),round(scores['test_average_precision'].mean(),3)

(0.919, 0.59)

In [92]:
from imblearn.over_sampling import SMOTE
smote_pipe_rf = make_imb_pipeline(SMOTE(), RandomForestClassifier(n_estimators=100))
scores = cross_validate(smote_pipe, x_train, y_train, cv=10,
                        scoring=('roc_auc', 'average_precision'))
round(scores['test_roc_auc'].mean(),3),round(scores['test_average_precision'].mean(),3)

(0.949, 0.692)

# Summary

### 1.Always check roc_auc an AP, look at curves

### 2.Undersampling is very fast and can help!

### 3.Undersampling+Ensembles worth a try.

### 4.Many smart sampling strategies,mixed outcomes

### 5.SMOTE allows adding new interpolated samples

### 6.Mixed outcomes with SMOTE,also definition a bit unclear