# Handling imbalanced data sets by
## Upsampling (resampling the minority class with replacement)
## Downsampling (resampling the majority class without replacement)

In [40]:
import numpy as np
import pandas as pd

# Seed NumPy's legacy global RNG so every np.random.* draw below is reproducible.
np.random.seed(123)

# Total size of the synthetic data set and the share taken by the majority class.
n_samples = 1000
class_0_ratio = 0.9

# round() rather than int(): n_samples * class_0_ratio is a float product, and
# int() truncates toward zero, so a product that lands just under a whole
# number (e.g. 100 * 0.29 == 28.999999999999996) would silently lose a sample.
n_class_0 = round(n_samples * class_0_ratio)
# The minority class takes the remainder, so the two counts always sum to n_samples.
n_class_1 = n_samples - n_class_0

In [41]:
# Sanity check: display the majority and minority sample counts as a tuple.
n_class_0, n_class_1 # (900, 100)

(900, 100)

In [42]:
# Create one DataFrame per class; together they form the imbalanced data set.
def _gaussian_class(center, n_rows, label):
    """Return n_rows samples with two normal features and a constant target.

    Both features are drawn from N(center, 1) using NumPy's global RNG,
    feature_1 first and feature_2 second, so the draw order matches building
    the frame from an inline dict literal.
    """
    return pd.DataFrame({
        'feature_1': np.random.normal(loc=center, scale=1, size=n_rows),
        'feature_2': np.random.normal(loc=center, scale=1, size=n_rows),
        'target': [label] * n_rows,
    })


# Order matters: class 0 must be generated before class 1 because both consume
# the same seeded global RNG stream.
class_0 = _gaussian_class(center=0, n_rows=n_class_0, label=0)
class_1 = _gaussian_class(center=1, n_rows=n_class_1, label=1)

In [43]:
# Stack the class-0 rows on top of the class-1 rows and renumber the index from 0.
# The rows are NOT shuffled: every class-0 row precedes every class-1 row.
df = pd.concat([class_0, class_1], axis=0, ignore_index=True)

In [44]:
# Peek at the last rows; class_1 was concatenated last, so these are minority samples.
df.tail()

Unnamed: 0,feature_1,feature_2,target
995,0.376371,1.845701,1
996,1.23981,-0.119923,1
997,0.13176,0.640703,1
998,1.902006,-0.609695,1
999,1.69749,1.01357,1


In [45]:
# Confirm the class imbalance: count the rows for each target label.
df['target'].value_counts()

target
0    900
1    100
Name: count, dtype: int64

In [46]:
# Up sampling, step 1: separate the rows by class so the minority class can be
# resampled on its own.
df_minority = df.loc[df['target'].eq(1)]
df_majority = df.loc[df['target'].eq(0)]

In [47]:
from sklearn.utils import resample

In [48]:
# Draw rows from the minority class WITH replacement until there are as many as
# in the majority class; random_state pins which rows get picked.
df_minority_upsampled = resample(
    df_minority,
    replace=True,
    n_samples=df_majority.shape[0],
    random_state=42,
)

In [49]:
# The row count should now equal len(df_majority), the n_samples passed to resample.
df_minority_upsampled.shape

(900, 3)

In [50]:
# Combine the untouched majority rows with the upsampled minority rows. The
# balanced DataFrame itself is kept under df_upsampled (not just its class
# counts) so the name matches its contents and the frame can be used downstream.
df_upsampled = pd.concat([df_majority, df_minority_upsampled])

# Last expression of the cell, so the notebook displays it: both classes should
# now have the same number of rows.
df_upsampled['target'].value_counts()

In [51]:
#Down sampling 
import numpy as np
import pandas as pd

# Rebuild the synthetic, imbalanced data set for the down-sampling example.

# Re-seed NumPy's legacy global RNG so the draws below are reproducible.
np.random.seed(123)

# Total size of the data set and the share taken by the majority class.
n_samples = 1000
class_0_ratio = 0.9

# round() rather than int(): n_samples * class_0_ratio is a float product, and
# int() truncates toward zero, so a product that lands just under a whole
# number (e.g. 100 * 0.29 == 28.999999999999996) would silently lose a sample.
n_class_0 = round(n_samples * class_0_ratio)
# The minority class takes the remainder, so the two counts always sum to n_samples.
n_class_1 = n_samples - n_class_0

# Create one DataFrame per class. The four np.random.normal calls share the
# seeded global RNG stream, so their order determines the generated values.
class_0 = pd.DataFrame({
    'feature_1': np.random.normal(loc=0, scale=1, size=n_class_0),
    'feature_2': np.random.normal(loc=0, scale=1, size=n_class_0),
    'target': [0] * n_class_0
})

class_1 = pd.DataFrame({
    'feature_1': np.random.normal(loc=1, scale=1, size=n_class_1),
    'feature_2': np.random.normal(loc=1, scale=1, size=n_class_1),
    'target': [1] * n_class_1
})

# Stack class 0 on top of class 1 and renumber the index from 0.
# The rows are NOT shuffled: every class-0 row precedes every class-1 row.
df = pd.concat([class_0, class_1], axis=0).reset_index(drop=True)

In [52]:
from sklearn.utils import resample

# Down sampling, step 1: separate the rows by class so the majority class can
# be resampled on its own.
df_minority = df.loc[df['target'].eq(1)]
df_majority = df.loc[df['target'].eq(0)]

# Step 2: draw, WITHOUT replacement, as many majority rows as there are
# minority rows; random_state pins which rows are kept.
df_majority_downsampled = resample(
    df_majority,
    replace=False,
    n_samples=df_minority.shape[0],
    random_state=42,
)

In [53]:
# The row count should now equal len(df_minority), the n_samples passed to resample.
df_majority_downsampled.shape

(100, 3)

In [54]:
# Combine the downsampled majority rows with the untouched minority rows. The
# balanced DataFrame itself is kept under df_downsampled (not just its class
# counts) so the name matches its contents and the frame can be used downstream.
df_downsampled = pd.concat([df_majority_downsampled, df_minority])

# Last expression of the cell, so the notebook displays it: both classes should
# now have the same number of rows.
df_downsampled['target'].value_counts()