In [12]:
import pandas as pd
from sklearn.utils import resample
# Toy data set for the down-sampling demo: 12 rows, of which 7 are labelled
# 'high' and 5 are labelled 'low', so the two classes are imbalanced.
df = pd.DataFrame(
    data={
        'age': [22, 25, 28, 30, 35, 40, 45, 50, 55, 60, 65, 70],
        'income': [2000, 2500, 2700, 3200, 3500, 3800,
                   4000, 4200, 4300, 4500, 5000, 5500],
        'class': ['high', 'low', 'high', 'high', 'low', 'low',
                  'high', 'low', 'high', 'high', 'low', 'high'],
    },
    columns=['age', 'income', 'class'],
)
print(df)

    age  income class
0    22    2000  high
1    25    2500   low
2    28    2700  high
3    30    3200  high
4    35    3500   low
5    40    3800   low
6    45    4000  high
7    50    4200   low
8    55    4300  high
9    60    4500  high
10   65    5000   low
11   70    5500  high


In [13]:
# Split the frame into one sub-frame per class label; the original row index
# is kept, so each row can still be traced back to `df`.
df_high = df.loc[df['class'].eq('high')]
df_low = df.loc[df['class'].eq('low')]
print(df_high, df_low, sep='\n')

    age  income class
0    22    2000  high
2    28    2700  high
3    30    3200  high
6    45    4000  high
8    55    4300  high
9    60    4500  high
11   70    5500  high
    age  income class
1    25    2500   low
4    35    3500   low
5    40    3800   low
7    50    4200   low
10   65    5000   low


In [14]:
# Down-sample the larger 'high' class: draw as many rows as 'low' has, without
# replacement so no row is picked twice. The fixed random_state makes the
# draw repeatable.
df_high_downsample = resample(
    df_high,
    n_samples=len(df_low),
    replace=False,
    random_state=42,
)
df_high_downsample

Unnamed: 0,age,income,class
0,22,2000,high
2,28,2700,high
9,60,4500,high
3,30,3200,high
8,55,4300,high


In [15]:
# Stack the down-sampled 'high' rows on top of the untouched 'low' rows.
# Row labels from the original frame are preserved.
df_balanced = pd.concat(
    objs=[df_high_downsample, df_low],
    axis=0,
)
df_balanced

Unnamed: 0,age,income,class
0,22,2000,high
2,28,2700,high
9,60,4500,high
3,30,3200,high
8,55,4300,high
1,25,2500,low
4,35,3500,low
5,40,3800,low
7,50,4200,low
10,65,5000,low


In [16]:
# Sanity check: count how many rows each label has after down-sampling.
print(df_balanced.loc[:, 'class'].value_counts())

class
high    5
low     5
Name: count, dtype: int64


**Upsampling the minority class**

In [17]:
import pandas as pd
from sklearn.utils import resample
# Toy data set for the up-sampling demo: 12 rows, of which 7 are labelled
# 'majority' and 5 are labelled 'minority'.
# Note: this rebinds `df`, replacing the frame used in the down-sampling cells.
df = pd.DataFrame(
    data={
        'age': [22, 25, 28, 30, 35, 40, 45, 50, 55, 60, 65, 70],
        'income': [2000, 2500, 2700, 3200, 3500, 3800,
                   4000, 4200, 4300, 4500, 5000, 5500],
        'class': ['minority', 'majority', 'majority', 'majority',
                  'majority', 'minority', 'minority', 'majority',
                  'majority', 'minority', 'minority', 'majority'],
    },
    columns=['age', 'income', 'class'],
)
print(df)

    age  income     class
0    22    2000  minority
1    25    2500  majority
2    28    2700  majority
3    30    3200  majority
4    35    3500  majority
5    40    3800  minority
6    45    4000  minority
7    50    4200  majority
8    55    4300  majority
9    60    4500  minority
10   65    5000  minority
11   70    5500  majority


In [18]:
# Separate the rows by label so the minority class can be resampled on its
# own; the original row index is kept on both sub-frames.
df_majority = df.loc[df['class'].eq('majority')]
df_minority = df.loc[df['class'].eq('minority')]
print(df_majority, df_minority, sep='\n')

    age  income     class
1    25    2500  majority
2    28    2700  majority
3    30    3200  majority
4    35    3500  majority
7    50    4200  majority
8    55    4300  majority
11   70    5500  majority
    age  income     class
0    22    2000  minority
5    40    3800  minority
6    45    4000  minority
9    60    4500  minority
10   65    5000  minority


In [19]:
# Up-sample the smaller 'minority' class: draw as many rows as 'majority' has,
# with replacement, so existing minority rows can be repeated. The fixed
# random_state makes the draw repeatable.
df_minority_upsampled = resample(
    df_minority,
    n_samples=len(df_majority),
    replace=True,
    random_state=42,
)
df_minority_upsampled

Unnamed: 0,age,income,class
9,60,4500,minority
10,65,5000,minority
6,45,4000,minority
10,65,5000,minority
10,65,5000,minority
5,40,3800,minority
6,45,4000,minority


In [20]:
# Stack the untouched 'majority' rows on top of the up-sampled 'minority' rows.
# Row labels are preserved, so rows drawn more than once share an index label.
df_upsampled = pd.concat(
    objs=[df_majority, df_minority_upsampled],
    axis=0,
)
df_upsampled

Unnamed: 0,age,income,class
1,25,2500,majority
2,28,2700,majority
3,30,3200,majority
4,35,3500,majority
7,50,4200,majority
8,55,4300,majority
11,70,5500,majority
9,60,4500,minority
10,65,5000,minority
6,45,4000,minority


In [21]:
# Sanity check: count how many rows each label has after up-sampling.
print(df_upsampled.loc[:, 'class'].value_counts())

class
majority    7
minority    7
Name: count, dtype: int64


In [1]:
pip install imbalanced-learn

Note: you may need to restart the kernel to use updated packages.


In [2]:
import pandas as pd
from imblearn.over_sampling import SMOTE

# Imbalanced toy data: 7 'majority' rows versus 5 'minority' rows.
df = pd.DataFrame(
    data={
        'age': [22, 25, 28, 30, 35, 40, 45, 50, 55, 60, 65, 70],
        'income': [2000, 2500, 2700, 3200, 3500, 3800,
                   4000, 4200, 4300, 4500, 5000, 5500],
        'class': ['minority', 'majority', 'majority', 'majority',
                  'majority', 'minority', 'minority', 'majority',
                  'majority', 'minority', 'minority', 'majority'],
    },
    columns=['age', 'income', 'class'],
)

# Encode the string labels as integers: 'majority' -> 0, 'minority' -> 1.
# Any label missing from this mapping would become NaN.
df['class_label'] = df['class'].map({'majority': 0, 'minority': 1})

# Feature matrix (two numeric columns) and integer target.
X = df.loc[:, ['age', 'income']]
y = df.loc[:, 'class_label']

# Over-sample with SMOTE. k_neighbors=3 is below the number of minority rows
# (5), and random_state fixes the generated samples.
smote = SMOTE(
    k_neighbors=3,
    random_state=42,
    sampling_strategy='auto',
)
X_resampled, y_resampled = smote.fit_resample(X, y)

# Re-assemble features and target side by side in a single frame.
features_balanced = pd.DataFrame(X_resampled, columns=['age', 'income'])
target_balanced = pd.DataFrame(y_resampled, columns=['class_label'])
df_balanced = pd.concat([features_balanced, target_balanced], axis=1)

# Class distribution after resampling.
print(df_balanced['class_label'].value_counts())

# Full balanced frame, with the generated rows appended after the originals.
print(df_balanced)


class_label
1    7
0    7
Name: count, dtype: int64
    age  income  class_label
0    22    2000            1
1    25    2500            0
2    28    2700            0
3    30    3200            0
4    35    3500            0
5    40    3800            1
6    45    4000            1
7    50    4200            0
8    55    4300            0
9    60    4500            1
10   65    5000            1
11   70    5500            0
12   40    3809            1
13   43    3946            1
