### **Handling Imbalanced Dataset**

In [11]:
import numpy as np
import pandas as pd
from sklearn.utils import resample

In [2]:
# Seed NumPy's global RNG so the synthetic data generated later is reproducible.
np.random.seed(123)

# Sizes for a two-class dataset: class 0 takes `class_0_ratio` of the rows,
# class 1 takes whatever is left.
n_samples = 1000
class_0_ratio = 0.9
n_class_0 = int(class_0_ratio * n_samples)
n_class_1 = n_samples - n_class_0

In [3]:
# Show the (majority, minority) row counts as a tuple.
(n_class_0, n_class_1)

(900, 100)

In [4]:
# Build one frame per class for the imbalanced dataset.
# Both classes draw their two features from a standard normal; only the
# `target` label and the row count differ. The draws happen in the same
# order as the columns are listed, which keeps the seeded values stable.
class_0 = pd.DataFrame(
    dict(
        feature_1=np.random.normal(0, 1, n_class_0),
        feature_2=np.random.normal(0, 1, n_class_0),
        target=n_class_0 * [0],
    )
)

class_1 = pd.DataFrame(
    dict(
        feature_1=np.random.normal(0, 1, n_class_1),
        feature_2=np.random.normal(0, 1, n_class_1),
        target=n_class_1 * [1],
    )
)

In [5]:
# Preview the first rows of each class, separated by a blank line.
print(class_0.head(), class_1.head(), sep="\n\n")


   feature_1  feature_2  target
0  -1.085631   0.551302       0
1   0.997345   0.419589       0
2   0.282978   1.815652       0
3  -1.506295  -0.252750       0
4  -0.578600  -0.292004       0

   feature_1  feature_2  target
0  -0.300232   0.139033       1
1  -0.632261   0.025577       1
2  -0.204317  -0.196443       1
3   0.213696   1.312255       1
4   1.033878   1.187417       1


In [6]:
# Stack both classes into one frame with a fresh 0..n-1 index.
data = pd.concat([class_0, class_1], ignore_index=True)

In [8]:
# Only the last expression of a cell is displayed, so a bare `data.head()`
# placed before this line was evaluated and thrown away. Keep just the shape.
data.shape

(1000, 3)

In [9]:
# Class balance of the combined dataset (rows per target label).
data.loc[:, "target"].value_counts()

target
0    900
1    100
Name: count, dtype: int64

#### **Upsampling**

In [10]:
# Split the dataset by label: target 0 is the majority class, target 1 the minority.
data_major = data.loc[data["target"].eq(0)]
data_minor = data.loc[data["target"].eq(1)]

In [12]:
# Upsample the minority class: draw rows with replacement until it has as
# many rows as the majority class. The fixed random_state keeps the draw repeatable.
data_minor_upsampled = resample(
    data_minor,
    n_samples=data_major.shape[0],
    replace=True,
    random_state=42,
)

In [15]:
# Only the last expression of a cell is displayed, so a bare
# `data_minor_upsampled.head()` placed before this line was evaluated and
# thrown away. Keep just the shape.
data_minor_upsampled.shape


(900, 3)

In [18]:
# Combine the untouched majority rows with the upsampled minority rows.
# ignore_index=True gives the result a fresh 0..n-1 index: sampling with
# replacement repeats minority rows, so keeping the old labels would leave
# duplicate index values in the combined frame.
data_upsampled = pd.concat([data_major, data_minor_upsampled], ignore_index=True)

data_upsampled.shape

(1800, 3)

In [19]:
# Class balance after upsampling (rows per target label).
data_upsampled.loc[:, "target"].value_counts()

target
0    900
1    900
Name: count, dtype: int64

#### **Downsampling**

In [20]:
# Re-seed NumPy's global RNG so this section generates the same synthetic data.
np.random.seed(123)

# Two-class sizes: the minority class gets whatever the majority share leaves over.
n_samples = 1000
class_0_ratio = 0.9
n_class_0 = int(class_0_ratio * n_samples)
n_class_1 = n_samples - n_class_0

In [21]:
# Build one frame per class for the imbalanced dataset.
# Features for both classes come from a standard normal; the frames differ
# only in row count and `target` label. Columns are drawn in the order
# listed, so the seeded values stay the same.
class_0 = pd.DataFrame(
    {
        "feature_1": np.random.normal(size=n_class_0, loc=0, scale=1),
        "feature_2": np.random.normal(size=n_class_0, loc=0, scale=1),
        "target": n_class_0 * [0],
    }
)

class_1 = pd.DataFrame(
    {
        "feature_1": np.random.normal(size=n_class_1, loc=0, scale=1),
        "feature_2": np.random.normal(size=n_class_1, loc=0, scale=1),
        "target": n_class_1 * [1],
    }
)

In [22]:
# Rebuild the combined frame from the current `class_0` / `class_1` before
# splitting. Without this the split would read whatever `data` was built
# earlier in the notebook, and freshly generated class frames would be ignored.
data = pd.concat([class_0, class_1]).reset_index(drop=True)

# Target 0 is the majority class, target 1 the minority.
data_major = data[data["target"] == 0]
data_minor = data[data["target"] == 1]

In [23]:
# Downsample the majority class to the size of the minority class.
# replace=False draws distinct rows: sampling with replacement here could
# pick the same majority row more than once, leaving duplicates in a sample
# that is meant to be a plain subset. The fixed random_state keeps the draw repeatable.
data_major_downsampled = resample(data_major,
                                  replace=False,
                                  n_samples=len(data_minor),
                                  random_state=42,
                                  )

In [26]:
# Only the last expression of a cell is displayed, so a bare
# `data_major_downsampled.head()` placed before this line was evaluated and
# thrown away. Keep just the shape.
data_major_downsampled.shape

(100, 3)

In [27]:
# Combine the downsampled majority rows with all minority rows.
# ignore_index=True gives the result a fresh 0..n-1 index instead of the
# scattered (and possibly repeated) labels inherited from the sampled rows.
data_downsampled = pd.concat([data_major_downsampled, data_minor], ignore_index=True)

data_downsampled.shape

(200, 3)

In [28]:
# Class balance after downsampling (rows per target label).
data_downsampled.loc[:, "target"].value_counts()

target
0    100
1    100
Name: count, dtype: int64