In [1]:
#Importing libraries
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
from imblearn.over_sampling import SMOTE, SMOTENC, BorderlineSMOTE, SVMSMOTE, ADASYN, KMeansSMOTE
from sklearn.datasets import load_breast_cancer

In [3]:
#Loading data
data = load_breast_cancer()

In [12]:
#Splitting the data into X & y
X = pd.DataFrame(data.data, columns=data.feature_names)
y = pd.DataFrame(data.target, columns=['target'])
X.shape, y.shape

((569, 30), (569, 1))

In [16]:
#Checking count
y.value_counts()

target
1         357
0         212
dtype: int64

In [17]:
#Checking event rate
y.value_counts()/y.shape[0]*100

target
1         62.741652
0         37.258348
dtype: float64

In [23]:
#Creating a dataframe which will capture the count of X & y with different techniques
data_table = pd.DataFrame()
data_table['technique'] = ['Original Data']
data_table['X_shape'] = [X.shape[0]]
data_table['y_shape'] = [y.shape[0]]
data_table['target_0']=[y.value_counts()[0]]
data_table['target_1']=[y.value_counts()[1]]
data_table

Unnamed: 0,technique,X_shape,y_shape,target_0,target_1
0,Original Data,569,569,target 0 212 dtype: int64,target 1 357 dtype: int64


# SMOTE

In [20]:
#Performing SMOTE
smote = SMOTE(sampling_strategy='minority')
X_sm, y_sm = smote.fit_sample(X,y)

In [21]:
#Checking the shape of the data
X_sm.shape, y_sm.shape
#573518*2

((714, 30), (714, 1))

In [24]:
#Updating data table
new_row2 = {'technique':'SMOTE',
 'X_shape':X_sm.shape[0],
 'y_shape':y_sm.shape[0],
 'target_0': y_sm.value_counts()[0],
 'target_1': y_sm.value_counts()[1]}
data_table=data_table.append(new_row2, ignore_index=True)
data_table

Unnamed: 0,technique,X_shape,y_shape,target_0,target_1
0,Original Data,569,569,target 0 212 dtype: int64,target 1 357 dtype: int64
1,SMOTE,714,714,target 0 357 dtype: int64,target 1 357 dtype: int64


In [27]:
#Creating Over Sampling with different SMOTE techniques
smote = SMOTE()
#smotetenc = SMOTENC()
b_smote =BorderlineSMOTE()
svm_smote =SVMSMOTE()
adasyn = ADASYN()
kmean_smote = KMeansSMOTE()

for model in [smote,b_smote,svm_smote,adasyn,kmean_smote]:
    x_s,y_s=model.fit_sample(X,y)
    print(model, x_s.shape, y_s.shape)

SMOTE() (714, 30) (714, 1)
BorderlineSMOTE() (714, 30) (714, 1)
SVMSMOTE() (714, 30) (714, 1)
ADASYN() (715, 30) (715, 1)
KMeansSMOTE() (717, 30) (717, 1)
