# Sampling Assignment (Imbalanced Classification)

##Arushi Khanna
## 102317147

In [1]:
# 1
import pandas as pd

dataset="/content/Creditcard_data.csv"
df = pd.read_csv(dataset)

print(df.shape)
df.head()


(772, 31)


Unnamed: 0,Time,V1,V2,V3,V4,V5,V6,V7,V8,V9,...,V21,V22,V23,V24,V25,V26,V27,V28,Amount,Class
0,0,-1.359807,-0.072781,2.536347,1.378155,-0.338321,0.462388,0.239599,0.098698,0.363787,...,-0.018307,0.277838,-0.110474,0.066928,0.128539,-0.189115,0.133558,-0.021053,149.62,0
1,0,1.191857,0.266151,0.16648,0.448154,0.060018,-0.082361,-0.078803,0.085102,-0.255425,...,-0.225775,-0.638672,0.101288,-0.339846,0.16717,0.125895,-0.008983,0.014724,2.69,1
2,1,-1.358354,-1.340163,1.773209,0.37978,-0.503198,1.800499,0.791461,0.247676,-1.514654,...,0.247998,0.771679,0.909412,-0.689281,-0.327642,-0.139097,-0.055353,-0.059752,378.66,0
3,1,-0.966272,-0.185226,1.792993,-0.863291,-0.010309,1.247203,0.237609,0.377436,-1.387024,...,-0.1083,0.005274,-0.190321,-1.175575,0.647376,-0.221929,0.062723,0.061458,123.5,0
4,2,-1.158233,0.877737,1.548718,0.403034,-0.407193,0.095921,0.592941,-0.270533,0.817739,...,-0.009431,0.798278,-0.137458,0.141267,-0.20601,0.502292,0.219422,0.215153,69.99,0


In [2]:
# 2
from imblearn.over_sampling import SMOTE

X = df.drop("Class", axis=1)
y = df["Class"]

print("Before:\n", y.value_counts())

smote = SMOTE(random_state=42)
X_bal, y_bal = smote.fit_resample(X, y)

print("\nAfter:\n", pd.Series(y_bal).value_counts())


Before:
 Class
0    763
1      9
Name: count, dtype: int64

After:
 Class
0    763
1    763
Name: count, dtype: int64


In [3]:
# 3
from sklearn.model_selection import train_test_split

seeds = [11, 22, 33, 44, 55]
samples = []

for seed in seeds:
    X_train, X_test, y_train, y_test = train_test_split(
        X_bal, y_bal, test_size=0.25, random_state=seed, stratify=y_bal
    )
    samples.append((X_train, X_test, y_train, y_test))

print("Total samples created:", len(samples))


Total samples created: 5


In [6]:
# 4
import numpy as np

data_bal = X_bal.copy()
data_bal["Class"] = y_bal

N = len(data_bal)

# Sample size (you can keep 60% for each sample)
n_sample = int(0.60 * N)

# 1) Simple Random Sampling
sample_simple_random = data_bal.sample(n=n_sample, random_state=1)

# 2) Systematic Sampling
k = max(1, N // n_sample)
sample_systematic = data_bal.iloc[::k].head(n_sample)

# 3) Stratified Sampling (equal pick from both classes)
sample_stratified = (
    data_bal.groupby("Class", group_keys=False)
    .apply(lambda x: x.sample(n=n_sample//2, random_state=1))
)

# 4) Cluster Sampling (clusters made using bins of Amount)
# (works simply and is easy to explain)
data_bal["Amount_bin"] = pd.qcut(data_bal["Amount"], q=5, duplicates="drop")
clusters = data_bal["Amount_bin"].unique()

np.random.seed(1)
chosen_cluster = np.random.choice(clusters, size=2, replace=False)

sample_cluster = data_bal[data_bal["Amount_bin"].isin(chosen_cluster)].drop(columns=["Amount_bin"])
sample_cluster = sample_cluster.sample(n=min(n_sample, len(sample_cluster)), random_state=1)

data_bal = data_bal.drop(columns=["Amount_bin"])

# 5) Bootstrap Sampling (sampling with replacement)
sample_bootstrap = data_bal.sample(n=n_sample, replace=True, random_state=1)

samples = {
    "Simple Random Sampling": sample_simple_random,
    "Systematic Sampling": sample_systematic,
    "Stratified Sampling": sample_stratified,
    "Cluster Sampling": sample_cluster,
    "Bootstrap Sampling": sample_bootstrap
}

for name, s in samples.items():
    print(name, "->", s.shape)


Simple Random Sampling -> (915, 31)
Systematic Sampling -> (915, 31)
Stratified Sampling -> (914, 31)
Cluster Sampling -> (610, 31)
Bootstrap Sampling -> (915, 31)


  .apply(lambda x: x.sample(n=n_sample//2, random_state=1))


In [7]:
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler

from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC

models = {
    "Logistic Regression": Pipeline([
        ("scaler", StandardScaler()),
        ("model", LogisticRegression(max_iter=2000))
    ]),
    "Decision Tree": DecisionTreeClassifier(random_state=42),
    "Random Forest": RandomForestClassifier(n_estimators=200, random_state=42),
    "KNN": Pipeline([
        ("scaler", StandardScaler()),
        ("model", KNeighborsClassifier(n_neighbors=7))
    ]),
    "SVM": Pipeline([
        ("scaler", StandardScaler()),
        ("model", SVC(probability=True, random_state=42))
    ])
}

import pandas as pd

acc_table = pd.DataFrame(index=models.keys(), columns=samples.keys())

for model_name, model in models.items():
    for sample_name, sample_df in samples.items():

        X_s = sample_df.drop("Class", axis=1)
        y_s = sample_df["Class"]

        X_train, X_test, y_train, y_test = train_test_split(
            X_s, y_s, test_size=0.2, random_state=42, stratify=y_s
        )

        model.fit(X_train, y_train)
        preds = model.predict(X_test)

        acc = accuracy_score(y_test, preds)
        acc_table.loc[model_name, sample_name] = round(acc * 100, 2)

acc_table


Unnamed: 0,Simple Random Sampling,Systematic Sampling,Stratified Sampling,Cluster Sampling,Bootstrap Sampling
Logistic Regression,93.44,92.35,92.35,93.44,93.99
Decision Tree,97.27,96.72,94.54,97.54,98.36
Random Forest,99.45,99.45,99.45,99.18,99.45
KNN,90.16,95.63,90.71,95.9,95.08
SVM,97.27,98.91,97.81,97.54,99.45


In [8]:
# 5
best_sampling = acc_table.astype(float).idxmax(axis=1)
best_accuracy = acc_table.astype(float).max(axis=1)

final_result = pd.DataFrame({
    "Best_Sampling": best_sampling,
    "Best_Accuracy(%)": best_accuracy
})

final_result


Unnamed: 0,Best_Sampling,Best_Accuracy(%)
Logistic Regression,Bootstrap Sampling,93.99
Decision Tree,Bootstrap Sampling,98.36
Random Forest,Simple Random Sampling,99.45
KNN,Cluster Sampling,95.9
SVM,Bootstrap Sampling,99.45


This assignment uses the Credit Card Fraud dataset where:

Class = 0 → Normal transaction

Class = 1 → Fraud transaction

The dataset is imbalanced, meaning fraud cases are very less compared to normal cases. Because of this, models may give wrong results by predicting mostly Class = 0.

So first, we made the dataset balanced using Random UnderSampling, where we randomly reduced the normal transactions to match fraud transactions.

After balancing, we created 5 samples using probabilistic sampling methods:

Simple Random Sampling: randomly picks records

Stratified Sampling: keeps same class ratio in sample

Systematic Sampling: selects every k-th record

Cluster Sampling: divides into groups and picks some groups randomly

Bootstrap Sampling: random sampling with replacement

Then, we trained 5 models (M1 to M5) on each sample and calculated accuracy.

Conclusion

Different sampling methods give different accuracy results.
So, we compare all results and find:

Best sampling method for each model

Best model for each sampling method