## Handling Imbalanced Dataset
### 1. Upsampling
### 2. Downsampling

In [None]:
import numpy as np
import  pandas as pd
from pygments.unistring import xid_continue

# Fix the NumPy seed so the synthetic data below is reproducible.
np.random.seed(123)

# 1000 rows in total; 90% go to class 0 and the remainder to class 1.
n_sample, class_0_ratio = 1000, 0.9
n_class_0 = int(class_0_ratio * n_sample)
n_class_1 = n_sample - n_class_0

In [None]:
# Show the two class sizes as a (class 0, class 1) tuple.
n_class_0,n_class_1

## Create a DataFrame with an imbalanced dataset


In [None]:
def _make_class_frame(center, size, label):
    """Build `size` rows with both features drawn from N(center, 1) and a constant target."""
    return pd.DataFrame({
        "feature_1": np.random.normal(loc=center, scale=1, size=size),
        "feature_2": np.random.normal(loc=center, scale=1, size=size),
        "target": [label] * size,
    })


# Class 0 is centred at 0 and class 1 at 2. Class 0 is drawn first so the
# sequence of random draws (and therefore the data) stays the same.
class_0 = _make_class_frame(center=0, size=n_class_0, label=0)
class_1 = _make_class_frame(center=2, size=n_class_1, label=1)

In [None]:
# Stack both classes into a single frame with a fresh 0..n-1 index.
df = pd.concat([class_0, class_1], ignore_index=True)

In [None]:
# Preview the first rows of the combined frame.
df.head()

In [None]:
# Count rows per target label to see the class imbalance.
df["target"].value_counts()

In [None]:
## upsampling
# Split the frame by label: target 1 is the minority, target 0 the majority.
df_minority = df[df["target"].eq(1)]
df_majority = df[df["target"].eq(0)]

In [None]:
from sklearn.utils import resample
# Draw minority rows with replacement until there are as many as majority rows.
df_minority_resample = resample(
    df_minority,
    n_samples=len(df_majority),
    replace=True,
    random_state=42,
)

In [None]:
# Preview the first rows of the resampled minority frame.
df_minority_resample.head()

In [None]:
# Keep the balanced result in a variable instead of discarding it, so later
# steps can use it. Sampling with replacement repeats index labels, so the
# index is rebuilt to keep label-based lookups unambiguous.
df_upsampled = pd.concat([df_minority_resample, df_majority], ignore_index=True)
df_upsampled

In [None]:
# Check the balance on the combined frame. The resampled minority on its own
# contains only label 1, so counting it alone cannot show whether the two
# classes ended up the same size.
pd.concat([df_minority_resample, df_majority])["target"].value_counts()

In [None]:
## downsample
import numpy as np

# Reset the seed and the size constants to the values used at the top of the notebook.
np.random.seed(123)
n_sample, class_0_ratio = 1000, 0.9
n_class_0 = int(class_0_ratio * n_sample)
n_class_1 = n_sample - n_class_0

In [None]:
# Class counts of the existing frame before any down-sampling is applied.
df["target"].value_counts()

In [None]:
# Regenerate the per-class frames. Column names follow the `feature_1` /
# `feature_2` convention used when `df` was first built, so these frames
# share its schema (the previous `feature1` / `feature2` spelling did not).
class_0 = pd.DataFrame({
    "feature_1": np.random.normal(loc=0, scale=1, size=n_class_0),
    "feature_2": np.random.normal(loc=0, scale=1, size=n_class_0),
    "target": [0] * n_class_0,
})
class_1 = pd.DataFrame({
    "feature_1": np.random.normal(loc=2, scale=1, size=n_class_1),
    "feature_2": np.random.normal(loc=2, scale=1, size=n_class_1),
    "target": [1] * n_class_1,
})

In [None]:
# Split the frame by label again: target 1 is the minority, target 0 the majority.
df_minority = df[df["target"].eq(1)]
df_majority = df[df["target"].eq(0)]

In [None]:
from sklearn.utils import resample

# Down-sample the majority class to the size of the minority class.
# Sampling WITHOUT replacement keeps every selected row unique; drawing with
# replacement while shrinking would discard data and add duplicates.
# NOTE: despite its name this variable holds majority-class rows; the name is
# kept because the following cell reads it.
df_minority_resample_down = resample(
    df_majority,
    replace=False,
    n_samples=len(df_minority),
    random_state=42,
)

In [None]:
# Check the balance on the combined frame. The down-sampled majority on its
# own contains only label 0, so it is joined with the untouched minority rows
# before counting.
pd.concat([df_minority_resample_down, df_minority])["target"].value_counts()

## SMOTE (Synthetic Minority Oversampling Technique)

In [None]:
from sklearn.datasets import make_classification

In [None]:
# Generate a two-feature, non-redundant dataset with one cluster per class;
# the 0.90 weight puts about 90% of the samples in the first class.
x, y = make_classification(
    n_samples=1000,
    n_features=2,
    n_redundant=0,
    n_clusters_per_class=1,
    weights=[0.90],
    random_state=12,
)

In [None]:
import pandas as pd
# Put the features and the labels side by side in one frame.
df1 = pd.DataFrame(x, columns=["f1", "f2"])
df2 = pd.DataFrame(y, columns=["target"])
final_df = pd.concat([df1, df2], axis="columns")

In [None]:
# Preview the first rows of the assembled feature/target frame.
final_df.head()

In [None]:
import matplotlib.pyplot as plt
# Plot f1 against f2, colouring each point by its target label.
plt.scatter(
    x=final_df["f1"],
    y=final_df["f2"],
    c=final_df["target"],
)

In [None]:
from imblearn.over_sampling import SMOTE

In [None]:
# Seed SMOTE so the synthetic minority points are identical on every run,
# matching the fixed seeds used for the other random steps in this notebook.
oversample = SMOTE(random_state=42)
# NOTE: x and y are rebound to the resampled data; the original arrays are
# still available through final_df.
x, y = oversample.fit_resample(final_df[["f1", "f2"]], final_df["target"])

In [None]:
# Shape of the feature data returned by fit_resample.
x.shape

In [None]:
# Shape of the target data returned by fit_resample.
y.shape

In [None]:
# Rebuild a single frame from the resampled features and labels.
df1 = pd.DataFrame(x, columns=["f1", "f2"])
df2 = pd.DataFrame(y, columns=["target"])
oversample_df = pd.concat([df1, df2], axis="columns")

In [None]:
# Plot the oversampled data, colouring each point by its target label.
plt.scatter(
    x=oversample_df["f1"],
    y=oversample_df["f2"],
    c=oversample_df["target"],
)