In [10]:
import pandas as pd
from sklearn.utils import resample
df = pd.DataFrame({
    'age': [22, 25, 28, 30, 35, 40, 45, 50, 55, 60, 65, 70],
    'income': [2000, 2500, 2700, 3200, 3500, 3800, 4000, 4200, 4300, 4500, 5000, 5500],
    'class': ['high', 'low', 'low', 'high', 'low', 'high', 'high', 'low', 'low', 'high', 'low', 'high']
})
print(df)

    age  income class
0    22    2000  high
1    25    2500   low
2    28    2700   low
3    30    3200  high
4    35    3500   low
5    40    3800  high
6    45    4000  high
7    50    4200   low
8    55    4300   low
9    60    4500  high
10   65    5000   low
11   70    5500  high


In [11]:
df_high=df[df['class']=='high']
df_low=df[df['class']=='low']
print(df_high)
print(df_low)

    age  income class
0    22    2000  high
3    30    3200  high
5    40    3800  high
6    45    4000  high
9    60    4500  high
11   70    5500  high
    age  income class
1    25    2500   low
2    28    2700   low
4    35    3500   low
7    50    4200   low
8    55    4300   low
10   65    5000   low


In [12]:
df_high_downsample=resample(df_high,replace=False,n_samples=len(df_low),random_state=42)
print(df_high_downsample)

    age  income class
0    22    2000  high
3    30    3200  high
11   70    5500  high
5    40    3800  high
9    60    4500  high
6    45    4000  high


In [13]:
df_balanced=pd.concat([df_high_downsample,df_low])
print(df_balanced)
print(df_balanced['class'].value_counts())

    age  income class
0    22    2000  high
3    30    3200  high
11   70    5500  high
5    40    3800  high
9    60    4500  high
6    45    4000  high
1    25    2500   low
2    28    2700   low
4    35    3500   low
7    50    4200   low
8    55    4300   low
10   65    5000   low
class
high    6
low     6
Name: count, dtype: int64


In [14]:
df1=pd.DataFrame({
    'age': [22, 25, 28, 30, 35, 40, 45, 50, 55, 60, 65, 70],
    'income': [2000, 2500, 2700, 3200, 3500, 3800, 4000, 4200, 4300, 4500, 5000, 5500],
    'class': ['min', 'maj', 'maj', 'maj', 'maj', 'min', ',min', 'maj', 'min', 'maj', 'min', 'maj']
})
df1


Unnamed: 0,age,income,class
0,22,2000,min
1,25,2500,maj
2,28,2700,maj
3,30,3200,maj
4,35,3500,maj
5,40,3800,min
6,45,4000,",min"
7,50,4200,maj
8,55,4300,min
9,60,4500,maj


In [19]:
df_maj=df1[df1['class']=='maj']
print(df_maj)
df_min=df1[df1['class']=='min']
print(df_min)

    age  income class
1    25    2500   maj
2    28    2700   maj
3    30    3200   maj
4    35    3500   maj
7    50    4200   maj
9    60    4500   maj
11   70    5500   maj
    age  income class
0    22    2000   min
5    40    3800   min
8    55    4300   min
10   65    5000   min


In [25]:
# Upsampling the minority class to match the size of the majority class
df_min_upsample = resample(df_min, replace=True, n_samples=len(df_maj), random_state=42)
df_min_upsample

Unnamed: 0,age,income,class
8,55,4300,min
10,65,5000,min
0,22,2000,min
8,55,4300,min
8,55,4300,min
10,65,5000,min
0,22,2000,min


In [31]:
# Combining back to create a balanced dataset
df_balanced = pd.concat([df_maj, df_min_upsample])
print(df_balanced)
print(df_balanced['class'].value_counts())

    age  income class
1    25    2500   maj
2    28    2700   maj
3    30    3200   maj
4    35    3500   maj
7    50    4200   maj
9    60    4500   maj
11   70    5500   maj
8    55    4300   min
10   65    5000   min
0    22    2000   min
8    55    4300   min
8    55    4300   min
10   65    5000   min
0    22    2000   min
class
maj    7
min    7
Name: count, dtype: int64


In [27]:
# Shuffle the dataset to mix up the rows
df_balanced = df_balanced.sample(frac=1, random_state=42).reset_index(drop=True)

# Display the new balanced DataFrame
print(df_balanced)

    age  income class
0    22    2000   min
1    55    4300   min
2    25    2500   maj
3    65    5000   min
4    60    4500   maj
5    65    5000   min
6    30    3200   maj
7    28    2700   maj
8    22    2000   min
9    50    4200   maj
10   55    4300   min
11   55    4300   min
12   35    3500   maj
13   70    5500   maj


In [1]:
pip install imbalanced-learn




In [2]:
import pandas as pd
from imblearn.over_sampling import SMOTE

# Fixing the dataset
df1 = pd.DataFrame({
    'age': [22, 25, 28, 30, 35, 40, 45, 50, 55, 60, 65, 70],
    'income': [2000, 2500, 2700, 3200, 3500, 3800, 4000, 4200, 4300, 4500, 5000, 5500],
    'class': ['min', 'maj', 'maj', 'maj', 'maj', 'min', 'min', 'maj', 'min', 'maj', 'min', 'maj']  # Fixed ',min'
})

# Encoding 'class' column: Convert categorical to numerical (SMOTE requires numerical labels)
df1['class'] = df1['class'].map({'min': 0, 'maj': 1})

# Splitting features and target
X = df1[['age', 'income']]  # Features
y = df1['class']  # Target

# Applying SMOTE to balance classes
smote = SMOTE(sampling_strategy='auto', random_state=42,k_neighbors=3)
X_resampled, y_resampled = smote.fit_resample(X, y)
y_resampled= y_resampled.map({0:'maj',1:'min'})

# Creating the new balanced DataFrame
df_balanced =pd.concat([pd.DataFrame(X_resampled, columns=['age', 'income']),pd.DataFrame(y_resampled, columns=['class'])])
print(df_balanced['class'].value_counts())
# Display the new balanced dataset
print(df_balanced)


class
maj    7
min    7
Name: count, dtype: int64
     age  income class
0   22.0  2000.0   NaN
1   25.0  2500.0   NaN
2   28.0  2700.0   NaN
3   30.0  3200.0   NaN
4   35.0  3500.0   NaN
5   40.0  3800.0   NaN
6   45.0  4000.0   NaN
7   50.0  4200.0   NaN
8   55.0  4300.0   NaN
9   60.0  4500.0   NaN
10  65.0  5000.0   NaN
11  70.0  5500.0   NaN
12  40.0  3809.0   NaN
13  43.0  3946.0   NaN
0    NaN     NaN   maj
1    NaN     NaN   min
2    NaN     NaN   min
3    NaN     NaN   min
4    NaN     NaN   min
5    NaN     NaN   maj
6    NaN     NaN   maj
7    NaN     NaN   min
8    NaN     NaN   maj
9    NaN     NaN   min
10   NaN     NaN   maj
11   NaN     NaN   min
12   NaN     NaN   maj
13   NaN     NaN   maj
