# Machine Learning - Handling Imbalanced Data

There are two common ways to handle a dataset whose target classes are imbalanced:

1. Up Sampling - we increase the number of data points in the minority class.
2. Down Sampling - we decrease the number of data points in the majority class.

In [1]:
import pandas as pd
import numpy as np

In [2]:
# Size of the synthetic dataset and the share of rows that belong to class 0.
n_sample = 1000
n_class_0_ratio = 0.9

# round() rather than int(): int() truncates, so a float product that lands
# just below a whole number (e.g. 100 * 0.29 == 28.999999999999996) would
# silently lose a row.
n_class_0 = round(n_sample * n_class_0_ratio)
# Class 1 takes whatever is left, so the two sizes always sum to n_sample.
n_class_1 = n_sample - n_class_0

In [3]:
# Report how many rows each class will receive.
print(f"n_class_0: {n_class_0}")
print(f"n_class_1: {n_class_1}")

n_class_0: 900
n_class_1: 100


In [4]:
# Seeded generator so the synthetic data is identical on every
# Restart Kernel -> Run All.
rng = np.random.default_rng(42)

# NOTE: both classes draw their features from the same normal distribution
# (mean 0, std 1), so the features carry no class signal; the two frames
# differ only in their row count and target label.
class_0 = pd.DataFrame({
    "feature_01": rng.normal(loc=0.0, scale=1.0, size=n_class_0),
    "feature_02": rng.normal(loc=0.0, scale=1.0, size=n_class_0),
    "target": [0] * n_class_0,
})

class_1 = pd.DataFrame({
    "feature_01": rng.normal(loc=0.0, scale=1.0, size=n_class_1),
    "feature_02": rng.normal(loc=0.0, scale=1.0, size=n_class_1),
    "target": [1] * n_class_1,
})

In [5]:
# Stack class 0 on top of class 1 and renumber the rows 0..n-1.
df = pd.concat([class_0, class_1], ignore_index=True)

In [6]:
# First five rows; class_0 was concatenated first, so these are class-0 rows.
df.head(n=5)

Unnamed: 0,feature_01,feature_02,target
0,0.154499,-1.030767,0
1,-0.54117,0.070549,0
2,0.496094,0.480522,0
3,0.80908,0.857764,0
4,-1.041021,0.331844,0


In [7]:
# Last five rows; class_1 was concatenated last, so these are class-1 rows.
df.tail(n=5)

Unnamed: 0,feature_01,feature_02,target
995,-1.421015,-0.228773,1
996,2.382683,-0.269191,1
997,1.796944,-0.880829,1
998,0.690412,-0.134229,1
999,-0.124257,-2.286632,1


In [8]:
# Rows per class in the combined frame.
df.loc[:, "target"].value_counts()

0    900
1    100
Name: target, dtype: int64

### Sklearn Library

To perform up sampling or down sampling, we use the `resample` function from scikit-learn's `sklearn.utils` module.

In [9]:
from sklearn.utils import resample

In [10]:
# Split the frame by class label; each part gets a fresh 0-based index.
df_minority = df.loc[df["target"].eq(1)].reset_index(drop=True)
df_majority = df.loc[df["target"].eq(0)].reset_index(drop=True)

In [11]:
# Peek at the first five minority-class (target == 1) rows.
df_minority.head(n=5)

Unnamed: 0,feature_01,feature_02,target
0,-1.015608,0.777877,1
1,-1.333984,0.098652,1
2,-0.116476,0.118672,1
3,1.070049,-0.003885,1
4,0.769682,-0.038234,1


In [12]:
# Peek at the first five majority-class (target == 0) rows.
df_majority.head(n=5)

Unnamed: 0,feature_01,feature_02,target
0,0.154499,-1.030767,0
1,-0.54117,0.070549,0
2,0.496094,0.480522,0
3,0.80908,0.857764,0
4,-1.041021,0.331844,0


### Up Sampling

First, let's upsample the minority class: we draw rows from it with replacement until it has as many rows as the majority class.

In [13]:
# Draw minority rows with replacement until there are as many as in the
# majority class. A fixed random_state makes the draw reproducible across
# kernel restarts.
df_minority_upsample = resample(
    df_minority,
    replace=True,
    n_samples=len(df_majority),
    random_state=42,
)

In [14]:
# (rows, columns) of the upsampled minority frame.
df_minority_upsample.shape

(900, 3)

In [15]:
# Rows per class in the upsampled minority frame.
df_minority_upsample.loc[:, "target"].value_counts()

1    900
Name: target, dtype: int64

In [16]:
# Renumber the rows from 0, discarding the index labels carried over from
# resampling. Reassigning (rather than inplace=True) keeps the cell an
# explicit "new value from old value" step.
df_minority_upsample = df_minority_upsample.reset_index(drop=True)

In [17]:
# First five rows of the upsampled minority frame.
df_minority_upsample.head(n=5)

Unnamed: 0,feature_01,feature_02,target
0,-0.095452,-0.428876,1
1,-0.926339,-1.188365,1
2,0.055439,-0.783842,1
3,1.220709,-0.55083,1
4,0.327246,0.839476,1


In [18]:
# Combine the untouched majority rows with the upsampled minority rows and
# renumber the result 0..n-1.
df_upsample = pd.concat([df_majority, df_minority_upsample], ignore_index=True)

In [19]:
# Display the combined frame; the rendered output is abbreviated to the
# first and last rows.
df_upsample

Unnamed: 0,feature_01,feature_02,target
0,0.154499,-1.030767,0
1,-0.541170,0.070549,0
2,0.496094,0.480522,0
3,0.809080,0.857764,0
4,-1.041021,0.331844,0
...,...,...,...
1795,-0.654049,-1.381190,1
1796,0.339965,1.275953,1
1797,-1.178998,1.942015,1
1798,-0.775594,1.192061,1


In [20]:
# Rows per class after upsampling.
df_upsample.loc[:, "target"].value_counts()

0    900
1    900
Name: target, dtype: int64

### Down Sampling

Now let's downsample the majority class: we draw rows from it without replacement until it has as many rows as the minority class.

In [21]:
# Draw majority rows without replacement until there are as many as in the
# minority class. A fixed random_state makes the draw reproducible across
# kernel restarts.
df_majority_downsample = resample(
    df_majority,
    replace=False,
    n_samples=len(df_minority),
    random_state=42,
)

In [22]:
# (rows, columns) of the downsampled majority frame.
df_majority_downsample.shape

(100, 3)

In [23]:
# Rows per class in the downsampled majority frame.
df_majority_downsample.loc[:, "target"].value_counts()

0    100
Name: target, dtype: int64

In [24]:
# Renumber the rows from 0, discarding the index labels carried over from
# resampling. Reassigning (rather than inplace=True) keeps the cell an
# explicit "new value from old value" step.
df_majority_downsample = df_majority_downsample.reset_index(drop=True)

In [25]:
# First five rows of the downsampled majority frame.
df_majority_downsample.head(n=5)

Unnamed: 0,feature_01,feature_02,target
0,-1.179466,-0.327552,0
1,0.535986,-1.039961,0
2,-0.302967,0.125612,0
3,-0.271478,-0.307408,0
4,-0.209452,-0.465656,0


In [26]:
# Combine the downsampled majority rows with the untouched minority rows and
# renumber the result 0..n-1.
df_downsample = pd.concat([df_majority_downsample, df_minority], ignore_index=True)

In [27]:
# Display the combined frame; the rendered output is abbreviated to the
# first and last rows.
df_downsample

Unnamed: 0,feature_01,feature_02,target
0,-1.179466,-0.327552,0
1,0.535986,-1.039961,0
2,-0.302967,0.125612,0
3,-0.271478,-0.307408,0
4,-0.209452,-0.465656,0
...,...,...,...
195,-1.421015,-0.228773,1
196,2.382683,-0.269191,1
197,1.796944,-0.880829,1
198,0.690412,-0.134229,1


In [28]:
# Rows per class after downsampling.
df_downsample.loc[:, "target"].value_counts()

0    100
1    100
Name: target, dtype: int64