#### Handling Imbalanced Data
* Up Sampling 
* Down Sampling

In [1]:
import numpy as np
import pandas as pd
# Seed NumPy's legacy global RNG so every np.random draw below is reproducible.
np.random.seed(123)

# Total number of rows and the fraction of them that belong to class 0.
n_samples=1000
class_0_ratio=0.9
# round() instead of int(): int() truncates, so a float product that lands a
# hair below a whole number (e.g. 0.29 * 100 == 28.999999999999996) would
# silently drop one class-0 sample. For the values above both give 900.
n_class_0=round(class_0_ratio*n_samples)
# Class 1 takes the remainder, so the two counts always sum to n_samples.
n_class_1=n_samples- n_class_0

In [2]:
# Display the two class sizes as a (class 0, class 1) tuple.
(n_class_0, n_class_1)

(900, 100)

#### A dataset is imbalanced when the distribution of target classes is skewed 

In [3]:
import pandas as pd
def _make_class_frame(n_rows, label):
    """Build an ``n_rows``-row frame with two N(0, 1) features and a constant ``target``.

    The features are drawn from the global ``np.random`` state, ``feature1``
    first and ``feature2`` second, so the values depend on the seed and on the
    order in which this helper is called.
    """
    return pd.DataFrame({
        'feature1': np.random.normal(loc=0, scale=1, size=n_rows),
        'feature2': np.random.normal(loc=0, scale=1, size=n_rows),
        'target': [label]*n_rows})


# Class 0 is generated before class 1; keep this order so the seeded draws
# (and therefore the printed values) stay the same.
# NOTE: both classes use the same N(0, 1) feature distribution, so only the
# label and the row count distinguish them.
class_0=_make_class_frame(n_class_0, 0)
class_1=_make_class_frame(n_class_1, 1)
print(class_0)
print(class_1)

     feature1  feature2  target
0   -1.085631  0.551302       0
1    0.997345  0.419589       0
2    0.282978  1.815652       0
3   -1.506295 -0.252750       0
4   -0.578600 -0.292004       0
..        ...       ...     ...
895  0.238761 -0.003155       0
896 -1.106386 -0.430660       0
897  0.366732 -0.146416       0
898  1.023906  1.160176       0
899 -0.210056 -0.641512       0

[900 rows x 3 columns]
    feature1  feature2  target
0  -0.300232  0.139033       1
1  -0.632261  0.025577       1
2  -0.204317 -0.196443       1
3   0.213696  1.312255       1
4   1.033878  1.187417       1
..       ...       ...     ...
95 -0.623629  0.845701       1
96  0.239810 -1.119923       1
97 -0.868240 -0.359297       1
98  0.902006 -1.609695       1
99  0.697490  0.013570       1

[100 rows x 3 columns]


In [4]:
# Stack the class-0 rows on top of the class-1 rows and renumber the index
# 0..n-1 in one step (ignore_index=True discards the per-class indices).
df = pd.concat([class_0, class_1], ignore_index=True)
print(df)

     feature1  feature2  target
0   -1.085631  0.551302       0
1    0.997345  0.419589       0
2    0.282978  1.815652       0
3   -1.506295 -0.252750       0
4   -0.578600 -0.292004       0
..        ...       ...     ...
995 -0.623629  0.845701       1
996  0.239810 -1.119923       1
997 -0.868240 -0.359297       1
998  0.902006 -1.609695       1
999  0.697490  0.013570       1

[1000 rows x 3 columns]


In [5]:
# Count the rows per class to see how skewed the target distribution is.
df["target"].value_counts()

target
0    900
1    100
Name: count, dtype: int64

#### UP SAMPLING

In [13]:
# Split the rows by label: target == 1 is the minority class,
# target == 0 is the majority class.
df_minority = df.loc[df['target'] == 1]
df_majority = df.loc[df['target'] == 0]


In [8]:
from sklearn.utils import resample
# Up-sample the minority class: draw rows with replacement until there are
# as many as in the majority class. random_state pins the draw.
df_minority_upsampled = resample(
    df_minority,
    replace=True,
    n_samples=len(df_majority),
    random_state=42,
)
print(df_minority)
# Last expression of the cell: preview the first rows of the up-sampled frame.
df_minority_upsampled.head()

     feature1  feature2  target
900 -0.300232  0.139033       1
901 -0.632261  0.025577       1
902 -0.204317 -0.196443       1
903  0.213696  1.312255       1
904  1.033878  1.187417       1
..        ...       ...     ...
995 -0.623629  0.845701       1
996  0.239810 -1.119923       1
997 -0.868240 -0.359297       1
998  0.902006 -1.609695       1
999  0.697490  0.013570       1

[100 rows x 3 columns]


Unnamed: 0,feature1,feature2,target
951,-0.874146,-0.156083,1
992,0.19657,-0.602575,1
914,-0.06783,0.998053,1
971,0.272825,1.034197,1
960,0.870056,-0.449515,1


In [12]:
# Recombine the up-sampled minority rows with the untouched majority rows,
# renumbering the index 0..n-1 in the same step.
df_upsampled = pd.concat([df_minority_upsampled, df_majority], ignore_index=True)
# Show the per-class row counts of the combined frame.
df_upsampled['target'].value_counts()

target
1    900
0    900
Name: count, dtype: int64

In [15]:
# DOWN SAMPLING: draw majority rows without replacement until there are only
# as many as in the minority class. random_state pins the draw.
df_majority_downsampled = resample(
    df_majority,
    replace=False,
    n_samples=len(df_minority),
    random_state=42,
)
# Combine the reduced majority rows with all minority rows under a fresh index.
df_downsampled = pd.concat(
    [df_majority_downsampled, df_minority],
    ignore_index=True,
)
print(df_downsampled['target'].value_counts())


target
0    100
1    100
Name: count, dtype: int64
