In [12]:
import pandas as pd
from sklearn.utils import resample
df=pd.DataFrame({
    'Age':[22,25,27,28,30,35,40,45,50,55,60,65,70],
    'Income':[2000,2500,2700,3200,3500,3800,4000,4200,4300,4500,5000,5500,6000],
    'Class':['High','Low','Low','High','High','Low','High','High','Low','Low','High','High','Low']
})
df['Class'].value_counts()

Class
High    7
Low     6
Name: count, dtype: int64

In [13]:
#Seperate majority and minority classes
df_high=df[df['Class']=='High']
df_low=df[df['Class']=='Low']

In [6]:
#Downsample majority class
df_high_downsampled=resample(df_high,replace=False,n_samples=len(df_low),random_state=42)

In [8]:
#combine downsampled majority with minority class
df_balanced=pd.concat([df_high_downsampled,df_low])

In [9]:
print(df_balanced['Class'].value_counts())

Class
High    6
Low     6
Name: count, dtype: int64


In [14]:
#Upsample minority class
df_low_upsampled=resample(df_low,replace=True,n_samples=len(df_high),random_state=42)

In [15]:
df_balanced=pd.concat([df_low_upsampled,df_high])
print(df_balanced['Class'].value_counts())

Class
Low     7
High    7
Name: count, dtype: int64


In [17]:
df = pd.DataFrame({
    'Age': [22, 25, 27, 28, 30, 35, 40, 45, 50, 55, 60, 65, 70],
    'Income': [2000, 2500, 2700, 3200, 3500, 3800, 4000, 4200, 4300, 4500, 5000, 5500, 6000],
    'Class': ['Minority', 'Majority', 'Majority', 'Majority', 'Majority', 'Minority', 'Minority', 'Minority', 'Majority',  'Majority', 'Majority', 'Majority', 'Majority']
})
df['Class'].value_counts()

#separate majority and minority classes
df_majority = df[df['Class'] == 'Majority']
df_minority = df[df['Class'] == 'Minority']

#Downsample majority class
df_majority_downsampled = resample(df_majority, replace=False, n_samples=len(df_minority), random_state=42)

#combine downsampled majority with the minority class
df_balanced = pd.concat([df_majority_downsampled, df_minority])
print(df_balanced['Class'].value_counts())

#Upsample minority class
df_minority_upsampled = resample(df_minority, replace=True, n_samples=len(df_majority), random_state=42)

#combine lowsampled minority with the majority class
df_balanced = pd.concat([df_minority_upsampled, df_majority])
print(df_balanced['Class'].value_counts())


Class
Majority    4
Minority    4
Name: count, dtype: int64
Class
Minority    9
Majority    9
Name: count, dtype: int64


In [18]:
pip install imbalanced-learn

Defaulting to user installation because normal site-packages is not writeable
Note: you may need to restart the kernel to use updated packages.


**1.SMOTE(synthetic minority over-sampling techniqe) to generate synthetic samples instead of duplicating existing ones**

2.Convert categorical class labels into numeric from for SMOTE to work.

3.Apply SMOTE to balance the dataset .

4.Convert back to original categorical labels.

5.Combine the resampled data into a final balanced dataset.

In [22]:
import pandas as pd
from imblearn.over_sampling import SMOTE
df = pd.DataFrame({
    'Age': [22, 25, 27, 28, 30, 35, 40, 45, 50, 55, 60, 65, 70],
    'Income': [2000, 2500, 2700, 3200, 3500, 3800, 4000, 4200, 4300, 4500, 5000, 5500, 6000],
    'Class': ['Minority', 'Majority', 'Majority', 'Majority', 'Majority', 'Minority', 'Minority', 'Minority', 'Majority',  'Majority', 'Majority', 'Majority', 'Majority']
})        
df['Class']=df['Class'].map({'Majority':0,'Minority':1})
X=df[['Age','Income']]
Y=df['Class']
smote=SMOTE(sampling_strategy='auto',random_state=42,k_neighbors=3)
x_resampled,y_resampled=smote.fit_resample(X,Y)
y_resampled,y_resampled.map({0:'Majority',1:'Minority'})
df_balanced=pd.concat([pd.DataFrame(x_resampled,columns=['Age','Income']),pd.DataFrame(y_resampled,columns=['Class'])],axis=1)
print(df_balanced['Class'].value_counts())
print(df_balanced)



Class
1    9
0    9
Name: count, dtype: int64
    Age  Income  Class
0    22    2000      1
1    25    2500      0
2    27    2700      0
3    28    3200      0
4    30    3500      0
5    35    3800      1
6    40    4000      1
7    45    4200      1
8    50    4300      0
9    55    4500      0
10   60    5000      0
11   65    5500      0
12   70    6000      0
13   40    4031      1
14   35    3831      1
15   44    4176      1
16   35    3826      1
17   41    4040      1
