## Handling Imbalanced Dataset

- Upsampling (increase the number of data points in the minority class by resampling it with replacement)
- Downsampling (decrease the number of data points in the majority class by sampling a subset of it)

In [1]:
import pandas as pd
import numpy as np

In [18]:
# Create an imbalanced dataset: 900 rows labelled 0 (majority class) and
# 100 rows labelled 1 (minority class). Both features are drawn from a
# standard normal distribution for every row.
# A seeded Generator keeps the data identical across notebook runs without
# touching NumPy's global random state.
rng = np.random.default_rng(42)
n_majority, n_minority = 900, 100

class_0 = pd.DataFrame(
    {
        "feature_1": rng.normal(loc=0, scale=1, size=n_majority),
        "feature_2": rng.normal(loc=0, scale=1, size=n_majority),
        "output": [0] * n_majority,
    }
)
class_1 = pd.DataFrame(
    {
        "feature_1": rng.normal(loc=0, scale=1, size=n_minority),
        "feature_2": rng.normal(loc=0, scale=1, size=n_minority),
        "output": [1] * n_minority,
    }
)

In [19]:
# Stack the class 0 rows on top of the class 1 rows into one DataFrame.
# Each frame keeps its own row labels here, so the index is not yet unique.
df = pd.concat([class_0, class_1], axis=0)

In [20]:
# Replace the row labels with a fresh 0..n-1 index; drop=True discards the
# old labels instead of keeping them as a new column.
df = df.reset_index(drop=True)

In [21]:
# Display the combined DataFrame (class 0 rows followed by class 1 rows).
df

Unnamed: 0,feature_1,feature_2,output
0,-0.946411,-0.338244,0
1,2.573494,2.068229,0
2,0.567595,-0.014185,0
3,0.608014,-0.450398,0
4,-1.448536,1.046531,0
...,...,...,...
995,-0.402436,0.877370,1
996,-0.640223,0.393521,1
997,1.008489,-0.186941,1
998,-0.451465,0.486222,1


In [24]:
# Split the data by label: rows with output == 1 form the minority class,
# rows with output == 0 form the majority class.
df_minority = df.loc[df["output"].eq(1)]
df_majority = df.loc[df["output"].eq(0)]

## Upsampling

In [25]:
from sklearn.utils import resample

# Upsample the minority class: draw rows with replacement until it has as
# many rows as the majority class. A fixed random_state makes the draw
# reproducible across runs.
df_minority_upsampled = resample(
    df_minority,
    replace=True,
    n_samples=len(df_majority),
    random_state=42,
)
df_minority_upsampled

Unnamed: 0,feature_1,feature_2,output
970,0.101685,-2.932326,1
928,-0.718959,0.040947,1
958,0.593964,0.703884,1
961,-0.407447,1.173482,1
940,1.085149,0.030509,1
...,...,...,...
977,0.418722,0.265075,1
995,-0.402436,0.877370,1
981,-0.071524,-0.620216,1
934,-1.568119,-1.681088,1


In [26]:
# Combine the untouched majority rows with the upsampled minority rows.
# Sampling with replacement repeats row labels, so ignore_index=True assigns
# a fresh 0..n-1 index; this matches the index reset applied to
# df_downsampled.
df_upsampled = pd.concat([df_majority, df_minority_upsampled], ignore_index=True)
# Confirm both classes now have the same number of rows.
df_upsampled["output"].value_counts()

output
0    900
1    900
Name: count, dtype: int64

## Downsampling

In [27]:
# Downsample the majority class: draw rows without replacement until it has
# as many rows as the minority class. A fixed random_state makes the draw
# reproducible across runs.
df_majority_downsampled = resample(
    df_majority,
    replace=False,
    n_samples=len(df_minority),
    random_state=42,
)
df_majority_downsampled

Unnamed: 0,feature_1,feature_2,output
107,1.072237,-1.631422,0
689,-0.034580,1.323532,0
372,-1.145680,-0.966432,0
506,0.600962,-1.017133,0
775,-0.013469,-0.397567,0
...,...,...,...
888,-0.723748,-0.068082,0
369,-0.535908,0.391210,0
465,1.716079,0.169422,0
593,-0.923660,-0.270930,0


In [28]:
# Combine the downsampled majority rows with all minority rows and give the
# result a fresh 0..n-1 index.
df_downsampled = pd.concat(
    [df_majority_downsampled, df_minority],
    ignore_index=True,
)


In [29]:
# Display the downsampled DataFrame (majority sample followed by minority rows).
df_downsampled

Unnamed: 0,feature_1,feature_2,output
0,1.072237,-1.631422,0
1,-0.034580,1.323532,0
2,-1.145680,-0.966432,0
3,0.600962,-1.017133,0
4,-0.013469,-0.397567,0
...,...,...,...
195,-0.402436,0.877370,1
196,-0.640223,0.393521,1
197,1.008489,-0.186941,1
198,-0.451465,0.486222,1


In [30]:
# Count rows per label to confirm both classes are now the same size.
df_downsampled["output"].value_counts()

output
0    100
1    100
Name: count, dtype: int64