## Handling an Imbalanced Dataset

In [3]:
import numpy as np
import pandas as pd

# Seed NumPy's global RNG so the synthetic data is identical on every run
np.random.seed(123)

# Total number of rows, and the fraction of them assigned to class 0
n_samples = 1000
class_0_ratio = 0.9

# Per-class row counts: class 0 takes its share, class 1 gets the remainder
n_class0 = int(class_0_ratio * n_samples)
n_class1 = n_samples - n_class0

In [4]:
# Display the per-class row counts as a (class 0, class 1) tuple
n_class0, n_class1

(900, 100)

In [5]:
def _make_class_frame(center, n_rows, label):
    """Build one class: two normally distributed features centred at `center`
    (standard deviation 1) plus a constant `target` column holding `label`."""
    return pd.DataFrame({
        'feature_1': np.random.normal(loc=center, scale=1, size=n_rows),
        'feature_2': np.random.normal(loc=center, scale=1, size=n_rows),
        'target': [label] * n_rows,
    })


# Class 0 is centred at 0 on both features, class 1 at 2 on both features.
# Class 0 is generated first so the random draws happen in a fixed order.
class_0 = _make_class_frame(center=0, n_rows=n_class0, label=0)
class_1 = _make_class_frame(center=2, n_rows=n_class1, label=1)

In [6]:
# Stack both classes into a single frame and renumber the rows 0..n-1
df = pd.concat([class_0, class_1], ignore_index=True)

In [9]:
# Count rows per label to show the class imbalance
df['target'].value_counts()

target
0    900
1    100
Name: count, dtype: int64

 <b>1. UpSampling</b>

In [10]:
# Split the rows by label: class 1 goes to df_min, class 0 to df_maj
df_min = df.loc[df['target'].eq(1)]
df_maj = df.loc[df['target'].eq(0)]

In [15]:
from sklearn.utils import resample

In [17]:
# Sample df_min with replacement until it has as many rows as df_maj;
# random_state fixes which rows get drawn.
df_min_upsampled = resample(
    df_min,
    replace=True,
    n_samples=df_maj.shape[0],
    random_state=42,
)

In [18]:
# Combine the untouched df_maj rows with the upsampled minority rows.
# ignore_index=True renumbers the result 0..n-1: sampling with replacement
# repeats the original index labels, and a non-unique index makes label-based
# lookups (e.g. .loc) return several rows for one label.
df_upsampled = pd.concat([df_maj, df_min_upsampled], ignore_index=True)

In [19]:
# Count rows per label in the upsampled frame
df_upsampled['target'].value_counts()

target
0    900
1    900
Name: count, dtype: int64

We can see that both classes now have the same number of samples (900 each).

 <b>2. DownSampling</b>

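It is the opposite of UpSampling: instead of duplicating minority-class rows, we randomly drop majority-class rows until both classes are the same size.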

The major drawback of DownSampling is that we lose a large number of data points from the majority class.