Classification --> Supervised ML

Output --> Categorical feature

2 Categories ---> Yes/No

1000 Datapoints in a dataset

900 Yes and 100 No

900 : 100 --> 9 : 1 [Imbalanced Dataset]

500 : 500 --> 1 : 1 [Balanced Dataset]

600 : 400 --> 6 : 4 [Balanced Dataset]

700 : 300 --> 7 : 3 [Can be imbalanced or balanced dataset according to the situation]

900 Yes and 100 No (Dumb Model) 90% accuracy

There are two techniques to handle the imbalanced dataset. They are :
- Upsampling,
- Downsampling

In [1]:
import numpy as np # import numpy with alias np
import pandas as pd # import pandas with alias pd

In [2]:
# Create a DataFrame with two classes
n_samples = 1000 # total number of samples
class_0_ratio = 0.9 # ratio of class 0 samples
n_class_0 = int(n_samples * class_0_ratio) # number of class 0 samples
n_class_1 = n_samples - n_class_0 # number of class 1 samples

In [3]:
n_class_0, n_class_1 # print number of samples in each class

(900, 100)

In [5]:
class_0 = pd.DataFrame({  # create a DataFrame for class 0
    'feature_1': np.random.normal(0, 1, n_class_0), # create a column with normally distributed random numbers
    'feature_2': np.random.normal(0, 1, n_class_0), # create a column with normally distributed random numbers
    'target': np.zeros(n_class_0) # create a column with zeros
})

class_1 = pd.DataFrame({ # create a DataFrame for class 1
    'feature_1': np.random.normal(2, 1, n_class_1), # create a column with normally distributed random numbers
    'feature_2': np.random.normal(2, 1, n_class_1), # create a column with normally distributed random numbers
    'target': np.ones(n_class_1) # create a column with ones
})

In [6]:
class_0

Unnamed: 0,feature_1,feature_2,target
0,0.942390,-1.021770,0.0
1,0.768230,-1.041758,0.0
2,0.563683,0.017475,0.0
3,0.416299,1.220931,0.0
4,0.855431,-0.565060,0.0
...,...,...,...
895,-0.387227,-0.290256,0.0
896,2.495781,-0.896558,0.0
897,-0.196688,1.239618,0.0
898,0.438073,-0.657172,0.0


In [7]:
class_1

Unnamed: 0,feature_1,feature_2,target
0,1.870976,2.178982,1.0
1,2.941922,1.633209,1.0
2,3.465323,4.028808,1.0
3,1.802537,1.715019,1.0
4,2.606540,1.937554,1.0
...,...,...,...
95,0.732410,2.143346,1.0
96,3.601863,2.092956,1.0
97,1.303944,2.565438,1.0
98,2.362654,1.960148,1.0


In [8]:
class_0.shape, class_1.shape # print the shape of each DataFrame

((900, 3), (100, 3))

In [9]:
df = pd.concat([class_0, class_1]).reset_index(drop=True) # concatenate the two DataFrames and reset the index

In [10]:
df.head()

Unnamed: 0,feature_1,feature_2,target
0,0.94239,-1.02177,0.0
1,0.76823,-1.041758,0.0
2,0.563683,0.017475,0.0
3,0.416299,1.220931,0.0
4,0.855431,-0.56506,0.0


In [11]:
df['target'] = df['target'].astype(int)  # convert the target column to integer

In [12]:
df.head()

Unnamed: 0,feature_1,feature_2,target
0,0.94239,-1.02177,0
1,0.76823,-1.041758,0
2,0.563683,0.017475,0
3,0.416299,1.220931,0
4,0.855431,-0.56506,0


In [13]:
df['target'].value_counts() # count the number of samples in each class

target
0    900
1    100
Name: count, dtype: int64

Upsampling

In [14]:
df_minority = df[df['target'] == 1] # select the minority class
df_majority = df[df['target'] == 0] # select the majority class

In [17]:
df_majority.shape, df_minority.shape # print the shape of each class

((900, 3), (100, 3))

In [1]:
len(df_majority)

NameError: name 'df_majority' is not defined

In [None]:
# Upsampling the minority class
from sklearn.utils import resample # import resample from sklearn.utils
df_minority_upsampled = resample(df_minority, # resample the minority class
                                 replace=True , # sample with replacement create additional samples
                                 n_samples=len(df_majority), # match the number of samples in the majority class
)

In [None]:
df_minority_upsampled.shape # print the shape of the upsampled minority class

(900, 3)

In [None]:
df_minority_upsampled.head()

Unnamed: 0,feature_1,feature_2,target
940,4.364528,3.23632,1
984,1.21394,4.050079,1
955,3.446997,-0.377332,1
955,3.446997,-0.377332,1
928,3.171312,1.151545,1


In [None]:
df_minority_upsampled['target'].value_counts() # count the number of samples 

target
1    900
Name: count, dtype: int64

In [None]:
df_upsampled = pd.concat([df_majority, df_minority_upsampled]) # concatenate the majority class and the upsampled minority class

In [None]:
df_upsampled['target'].value_counts() # count the number of samples in each class

target
0    900
1    900
Name: count, dtype: int64

In [None]:
df_upsampled.shape

(1800, 3)

In [None]:
df_upsampled.head()

Unnamed: 0,feature_1,feature_2,target
0,0.94239,-1.02177,0
1,0.76823,-1.041758,0
2,0.563683,0.017475,0
3,0.416299,1.220931,0
4,0.855431,-0.56506,0


Downsampling

In [None]:
class_0 = pd.DataFrame({  # create a DataFrame for class 0
    'feature_1': np.random.normal(0, 1, n_class_0), # create a column with normally distributed random numbers
    'feature_2': np.random.normal(0, 1, n_class_0), # create a column with normally distributed random numbers
    'target': np.zeros(n_class_0) # create a column with zeros
})

class_1 = pd.DataFrame({ # create a DataFrame for class 1
    'feature_1': np.random.normal(2, 1, n_class_1), # create a column with normally distributed random numbers
    'feature_2': np.random.normal(2, 1, n_class_1), # create a column with normally distributed random numbers
    'target': np.ones(n_class_1) # create a column with ones
})

In [None]:
df = pd.concat([class_0, class_1]).reset_index(drop=True) # concatenate the two DataFrames and reset the index

In [None]:
df_minority = df[df['target'] == 1] # select the minority class
df_majority = df[df['target'] == 0] # select the majority class

In [None]:
df_majority_downsampled = resample(df_majority, # resample the majority class
                                   replace=False, # sample without replacement
                                   n_samples=len(df_minority), # match the number of samples in the minority class
    )

In [None]:
df_majority_downsampled.shape

(100, 3)

In [None]:
df_downsampled = pd.concat([df_minority, df_majority_downsampled]) # concatenate the minority class and the downsampled majority class

In [None]:
df_downsampled['target'].value_counts() # count the number of samples in each class 

target
1.0    100
0.0    100
Name: count, dtype: int64