# Sample Data generation 

In [6]:
import numpy as np
import pandas as pd
from sklearn.datasets import make_classification
from sklearn.model_selection import train_test_split

In [7]:
X, y = make_classification(
    n_samples=5000,
    n_features=20,
    n_informative=5,
    n_redundant=2,
    n_clusters_per_class=1,
    weights=[0.95, 0.05],  # 95% majority, 5% minority
    flip_y=0.01,
    random_state=42
)

print("Class distribution:", np.bincount(y))

Class distribution: [4728  272]


In [8]:
feature_names = [f"feature_{i}" for i in range(X.shape[1])]

df = pd.DataFrame(X, columns=feature_names)
df["target"] = y

df.head()

Unnamed: 0,feature_0,feature_1,feature_2,feature_3,feature_4,feature_5,feature_6,feature_7,feature_8,feature_9,...,feature_11,feature_12,feature_13,feature_14,feature_15,feature_16,feature_17,feature_18,feature_19,target
0,0.236179,0.81582,0.602165,0.613386,-0.085951,-0.886111,-0.610953,-0.606518,0.941244,0.348102,...,-0.090008,0.619392,-0.456496,-0.762341,-0.68816,0.34434,1.053982,1.236126,-1.342138,0
1,-0.628026,-0.185069,0.951226,-0.006255,-0.650244,-0.171599,0.25539,1.099402,0.175376,0.759842,...,0.328234,0.54255,-0.889682,-1.313641,-0.705418,1.099972,0.763906,0.366379,-1.927125,0
2,1.452648,1.833036,1.694847,1.105225,0.821881,-0.617934,-0.092835,0.231641,2.773789,-0.786718,...,-0.469851,-0.822251,0.259034,0.485787,-2.092048,2.159352,-0.724884,0.725924,-0.824736,0
3,-1.280538,-1.479898,1.100834,0.526953,-0.49136,0.554835,0.253773,2.107121,-0.352507,2.468952,...,-0.14218,-1.245939,-0.030414,-2.615287,0.505393,0.866894,-0.975886,1.594022,-1.841713,0
4,3.705998,-1.03198,1.475879,1.260169,2.254988,-1.175555,-0.462636,1.761419,5.152403,1.575749,...,-1.173701,-0.969839,1.066537,3.850294,2.493364,0.937582,-2.379689,-0.428042,0.031392,0


# spliting data into training and testing set

In [9]:
x = df.drop("target", axis=1)
y = df["target"]    

x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.2, random_state=42)

# under sampling majority class

In [10]:
from imblearn.under_sampling import RandomUnderSampler
rus = RandomUnderSampler(random_state=42)
x_resampled, y_resampled = rus.fit_resample(x_train, y_train)

print("Before:", np.bincount(y_train))
print("After :", np.bincount(y_resampled))

Before: [3792  208]
After : [208 208]


# Over-sampling minority class

In [12]:
from imblearn.over_sampling import RandomOverSampler
ros = RandomOverSampler(random_state=42)
x_resampled, y_resampled = ros.fit_resample(x_train, y_train)

print("Before:", np.bincount(y_train))
print("After :", np.bincount(y_resampled))

Before: [3792  208]
After : [3792 3792]


# over sampling using smote

In [14]:
from imblearn.over_sampling import SMOTE
smote = SMOTE(random_state=42)
x_resampled, y_resampled = smote.fit_resample(x_train, y_train)

print("Before:", np.bincount(y_train))
print("After :", np.bincount(y_resampled))

Before: [3792  208]
After : [3792 3792]


# Ensemble Method(Balanced Random Forest)


In [15]:
from imblearn.ensemble import BalancedRandomForestClassifier
from sklearn.metrics import classification_report

brf = BalancedRandomForestClassifier(n_estimators=100, random_state=42)

brf.fit(x_train, y_train)
y_pred = brf.predict(x_test)

print(classification_report(y_test, y_pred))

              precision    recall  f1-score   support

           0       0.99      0.96      0.97       936
           1       0.58      0.89      0.70        64

    accuracy                           0.95      1000
   macro avg       0.79      0.92      0.84      1000
weighted avg       0.97      0.95      0.96      1000



# focal loss

In [None]:
import torch
import torch.nn as nn
import torch.nn.functional as F

class Model(nn.Module):
    def __init__(self):
        super().__init__()
        self.fc = nn.Linear(20, 2)  # 20 features, 2 classes

    def forward(self, x):
        return self.fc(x)

# alpha → gives more importance to minority class
# gamma → forces model to focus on hard examples

class FocalLoss(nn.Module):
    def __init__(self, alpha=0.25, gamma=2):
        super().__init__()
        self.alpha = alpha
        self.gamma = gamma

    def forward(self, logits, targets):
        ce = F.cross_entropy(logits, targets, reduction='none')
        pt = torch.exp(-ce)
        return (self.alpha * (1 - pt) ** self.gamma * ce).mean()

    
inputs = torch.randn(32, 20)      # 32 samples, 20 features
labels = torch.randint(0, 2, (32,))

model = Model()
loss_fn = FocalLoss()
optimizer = torch.optim.Adam(model.parameters(), lr=0.001)

for epoch in range(5):
    optimizer.zero_grad()
    outputs = model(inputs)
    loss = loss_fn(outputs, labels)
    loss.backward()
    optimizer.step()
    print("Loss:", loss.item())





Loss: 0.0810733437538147
Loss: 0.08022473007440567
Loss: 0.07938287407159805
Loss: 0.07854793220758438
Loss: 0.07772009819746017
