# Step - 1 Generate Synthetic Dataset

In [2]:
# Import libraries for step-1

from sklearn.datasets import make_classification

import numpy as np
import pandas as pd

from collections import Counter

In [3]:
# Create data

# Synthetic binary-classification problem with a deliberately skewed class ratio.
X, y = make_classification(
    n_samples=1000,      # 1000 data points
    n_features=10,       # 10 feature columns in total
    n_informative=5,     # 5 features carry signal for the target
    n_redundant=2,       # 2 features are linear combinations of the informative ones
    n_repeated=0,        # no duplicated feature columns
    n_classes=2,         # binary target
    weights=[0.9, 0.1],  # ~90% of samples in class 0, ~10% in class 1
    random_state=42,     # fixed seed so every run generates the same data
)
# Convert to Dataframe

# Column names are derived from the array width so they stay in sync with n_features.
data = pd.DataFrame(X, columns=[f"Feature_{i}" for i in range(X.shape[1])])
print(data.head())

data['Target'] = y
# Data distribution

print("\n")

print(Counter(data['Target']))

   Feature_0  Feature_1  Feature_2  Feature_3  Feature_4  Feature_5  \
0   1.108936   1.540492   0.493516  -0.529740   0.982175   1.177544   
1  -0.564641   3.638629  -1.522415  -1.541705   1.616697   4.781310   
2   0.516313   2.165426  -0.628486  -0.386923   0.492518   1.442381   
3   1.476534   0.548523  -0.115420  -0.875408   1.301216   0.410295   
4   0.278385   1.065828  -1.724917  -2.235667   0.715107   0.731249   

   Feature_6  Feature_7  Feature_8  Feature_9  
0   1.623025   1.357325   0.966041  -0.504924  
1   3.190292  -0.890254   1.438826  -3.828748  
2   1.332905  -1.958175  -0.348803  -1.804124  
3   1.171878  -1.034471  -1.654176   1.344601  
4  -0.674119   0.598330  -0.524283   1.047610  


Counter({0: 896, 1: 104})


# Step - 2 Train a Logistic Regression Model on the imbalanced dataset to check how it performs


In [4]:
# import libraries

from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report,confusion_matrix

In [47]:
# Separate the label column from the feature columns.
y = data["Target"]
X = data.drop(columns="Target")

# Hold out 30% of the rows for testing; stratifying on y keeps the class
# ratio the same in the train and test parts.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.3,
    random_state=42,
    stratify=y,
)

# Shapes of both partitions, then the class counts of the training part.
print(X_train.shape, y_train.shape)
print("\n")
print(X_test.shape, y_test.shape)

print(Counter(y_train))

(700, 10) (700,)


(300, 10) (300,)
Counter({0: 627, 1: 73})


In [6]:
# Baseline: logistic regression fitted on the untouched, imbalanced training split.
model = LogisticRegression()
y_pred = model.fit(X_train, y_train).predict(X_test)

# Confusion matrix first, then the per-class precision / recall / F1 table.
print(confusion_matrix(y_true=y_test, y_pred=y_pred))
print("\n")
print(classification_report(y_true=y_test, y_pred=y_pred))

[[263   6]
 [ 22   9]]


              precision    recall  f1-score   support

           0       0.92      0.98      0.95       269
           1       0.60      0.29      0.39        31

    accuracy                           0.91       300
   macro avg       0.76      0.63      0.67       300
weighted avg       0.89      0.91      0.89       300



In [7]:
# High accuracy but poor recall for the minority class.
# We need to handle the class imbalance to avoid this situation.

# Step - 3 Apply Imbalance Handling Methods

**Method** 1 : Random OverSampling

Duplicate minority class to balance the dataset

In [27]:

from imblearn.over_sampling import RandomOverSampler

# Resample the training split only; the test split is left as it is.
ROS = RandomOverSampler(random_state=42)
x_ROS, y_ROS = ROS.fit_resample(X_train, y_train)

# Class counts after resampling.
print(Counter(y_ROS))

print("\n")

# Refit the shared classifier on the resampled data, then predict on the
# original test split so the comparison stays fair.
y_pred_ROS = model.fit(x_ROS, y_ROS).predict(X_test)

print(confusion_matrix(y_true=y_test, y_pred=y_pred_ROS))
print("\n")

print(classification_report(y_true=y_test, y_pred=y_pred_ROS))

Counter({0: 627, 1: 627})


[[198  71]
 [  4  27]]


              precision    recall  f1-score   support

           0       0.98      0.74      0.84       269
           1       0.28      0.87      0.42        31

    accuracy                           0.75       300
   macro avg       0.63      0.80      0.63       300
weighted avg       0.91      0.75      0.80       300



**Method** 2 : Random UnderSampling

Removes Majority class samples to match the number of Minority class.

In [26]:

from imblearn.under_sampling import RandomUnderSampler

# Bound to RUS (not ROS) so the RandomOverSampler from the previous method is
# not overwritten and the name matches what the object actually is.
RUS = RandomUnderSampler(random_state=42)

# Resample the training split only; the test split is left as it is.
x_RUS,y_RUS = RUS.fit_resample(X_train,y_train)

# Class counts after resampling.
print(Counter(y_RUS))
print("\n")

# fit the model

model.fit(x_RUS,y_RUS)

# Predict on the original test split so results are comparable across methods.
y_pred_RUS = model.predict(X_test)

print(confusion_matrix(y_test,y_pred_RUS))

print("\n")

print(classification_report(y_test,y_pred_RUS))

Counter({0: 73, 1: 73})


[[185  84]
 [  5  26]]


              precision    recall  f1-score   support

           0       0.97      0.69      0.81       269
           1       0.24      0.84      0.37        31

    accuracy                           0.70       300
   macro avg       0.61      0.76      0.59       300
weighted avg       0.90      0.70      0.76       300



**Method** 3 : Synthetic Minority Oversampling Technique (SMOTE)

Creates synthetic minority-class points by interpolating between existing minority samples in feature space.

In [25]:
from imblearn.over_sampling import SMOTE

# Apply SMOTE to the training split only.
smote = SMOTE(random_state=42)
X_smote, y_smote = smote.fit_resample(X_train, y_train)

# Class counts after resampling.
print(Counter(y_smote))

print("\n")

# Refit the shared classifier on the resampled data and predict on the
# original test split.
y_pred_smote = model.fit(X_smote, y_smote).predict(X_test)

print(confusion_matrix(y_true=y_test, y_pred=y_pred_smote))

print("\n")

print(classification_report(y_true=y_test, y_pred=y_pred_smote))

Counter({0: 627, 1: 627})


[[205  64]
 [  4  27]]


              precision    recall  f1-score   support

           0       0.98      0.76      0.86       269
           1       0.30      0.87      0.44        31

    accuracy                           0.77       300
   macro avg       0.64      0.82      0.65       300
weighted avg       0.91      0.77      0.81       300



**Method** 4 : Adaptive Synthetic Sampling (ADASYN)


In [33]:
from imblearn.over_sampling import ADASYN

# Apply ADASYN to the training split only.
adaptive = ADASYN(random_state=42)
X_adaptive, y_adaptive = adaptive.fit_resample(X_train, y_train)

# Class counts after resampling.
print(Counter(y_adaptive))
print("\n")

# Refit the shared classifier on the resampled data and predict on the
# original test split.
y_pred_adaptive = model.fit(X_adaptive, y_adaptive).predict(X_test)

print(confusion_matrix(y_true=y_test, y_pred=y_pred_adaptive))
print("\n")

print(classification_report(y_true=y_test, y_pred=y_pred_adaptive))


Counter({0: 627, 1: 621})


[[193  76]
 [  4  27]]


              precision    recall  f1-score   support

           0       0.98      0.72      0.83       269
           1       0.26      0.87      0.40        31

    accuracy                           0.73       300
   macro avg       0.62      0.79      0.62       300
weighted avg       0.91      0.73      0.78       300



**Method** 5 : NearMiss (UnderSampling)

select Majority points which are closest to Minority points


In [42]:
from imblearn.under_sampling import NearMiss

# NearMiss with its default settings, applied to the training split only.
NM = NearMiss()
X_NM, y_NM = NM.fit_resample(X_train, y_train)

# Class counts after resampling.
print(Counter(y_NM))
print("\n")

# Refit the shared classifier on the reduced data and predict on the
# original test split.
y_pred_NM = model.fit(X_NM, y_NM).predict(X_test)

print(confusion_matrix(y_true=y_test, y_pred=y_pred_NM))
print("\n")

print(classification_report(y_true=y_test, y_pred=y_pred_NM))


Counter({0: 73, 1: 73})


[[180  89]
 [  9  22]]


              precision    recall  f1-score   support

           0       0.95      0.67      0.79       269
           1       0.20      0.71      0.31        31

    accuracy                           0.67       300
   macro avg       0.58      0.69      0.55       300
weighted avg       0.87      0.67      0.74       300



**Method** 6 : TomekLinks

Removes Overlapping points between classes.

In [45]:
from imblearn.under_sampling import TomekLinks

# TomekLinks with its default settings, applied to the training split only.
TL = TomekLinks()
X_TL, y_TL = TL.fit_resample(X_train, y_train)

# Class counts after cleaning.
print(Counter(y_TL))
print("\n")

# Refit the shared classifier on the cleaned data and predict on the
# original test split.
y_pred_TL = model.fit(X_TL, y_TL).predict(X_test)

print(confusion_matrix(y_true=y_test, y_pred=y_pred_TL))

print("\n")


print(classification_report(y_true=y_test, y_pred=y_pred_TL))

Counter({0: 622, 1: 73})


[[263   6]
 [ 22   9]]


              precision    recall  f1-score   support

           0       0.92      0.98      0.95       269
           1       0.60      0.29      0.39        31

    accuracy                           0.91       300
   macro avg       0.76      0.63      0.67       300
weighted avg       0.89      0.91      0.89       300



**Method** 7 : Edited Nearest Neighbours (ENN)

Removes majority points misclassified by k-nearest neighbors.

In [46]:
from imblearn.under_sampling import EditedNearestNeighbours

# EditedNearestNeighbours with its default settings, applied to the training split only.
enn = EditedNearestNeighbours()
X_enn, y_enn = enn.fit_resample(X_train, y_train)

# Class counts after cleaning.
print(Counter(y_enn))

print("\n")

# Refit the shared classifier on the cleaned data and predict on the
# original test split.
y_pred_enn = model.fit(X_enn, y_enn).predict(X_test)

print(confusion_matrix(y_true=y_test, y_pred=y_pred_enn))

print("\n")


print(classification_report(y_true=y_test, y_pred=y_pred_enn))

Counter({0: 594, 1: 73})


[[261   8]
 [ 21  10]]


              precision    recall  f1-score   support

           0       0.93      0.97      0.95       269
           1       0.56      0.32      0.41        31

    accuracy                           0.90       300
   macro avg       0.74      0.65      0.68       300
weighted avg       0.89      0.90      0.89       300



# Summary of Synthetic Dataset

### Primary Method: SMOTE → balances the dataset, improves minority recall, and avoids the overfitting caused by simple duplication.
### (Minority-class Precision: 0.30, Recall: 0.87)

### Other Option: ADASYN → if we want to focus more on “hard” minority samples.
### (Minority-class Precision: 0.26, Recall: 0.87)

### Of the 7 methods applied to the synthetic dataset above, these 2 should be the primary and secondary choices respectively, because they give the best combination of precision and recall.