In [6]:
import numpy as np
import pandas as pd
from sklearn.utils import resample
from imblearn.over_sampling import SMOTE
from collections import Counter

# Load your uploaded dataset
df = pd.read_csv("creditcard.csv")

# Drop rows with missing target values
df = df.dropna(subset=['Class'])

# Convert target column to integer
df['Class'] = df['Class'].astype(int)

# Check class distribution
print("Original class counts:", Counter(df['Class']))

# Entropy function
def entropy(y):
    vals, counts = np.unique(y, return_counts=True)
    probs = counts / counts.sum()
    return -np.sum(probs * np.log2(probs + 1e-12))

# Entropy of original (imbalanced)
entropy_orig = entropy(df['Class'])
print("Entropy (Original / Imbalanced):", entropy_orig)

# -----------------------
# 1️⃣ UNDERSAMPLING
# -----------------------
df_majority = df[df.Class == 0]
df_minority = df[df.Class == 1]
df_majority_down = resample(df_majority,
                            replace=False,
                            n_samples=len(df_minority),
                            random_state=42)
df_under = pd.concat([df_majority_down, df_minority])
print("Under-sampled class counts:", Counter(df_under['Class']))
entropy_under = entropy(df_under['Class'])
print("Entropy (Undersampled):", entropy_under)

# -----------------------
# 2️⃣ OVERSAMPLING
# -----------------------
df_minority_up = resample(df_minority,
                          replace=True,
                          n_samples=len(df_majority),
                          random_state=42)
df_over = pd.concat([df_majority, df_minority_up])
print("Over-sampled class counts:", Counter(df_over['Class']))
entropy_over = entropy(df_over['Class'])
print("Entropy (Oversampled):", entropy_over)

# -----------------------
# 3️⃣ SMOTE
# -----------------------
X = df.drop('Class', axis=1)
y = df['Class']
sm = SMOTE(random_state=42)
X_smote, y_smote = sm.fit_resample(X, y)
print("SMOTE-sampled class counts:", Counter(y_smote))
entropy_smote = entropy(y_smote)
print("Entropy (SMOTE):", entropy_smote)

# -----------------------
# 4️⃣ PURE DATASET
# -----------------------
y_pure = df_majority['Class']
entropy_pure = entropy(y_pure)
print("Entropy (Pure non-fraud):", entropy_pure)

# -----------------------
# Summary
# -----------------------
results = {
    "Dataset": ["Original", "Undersampled", "Oversampled", "SMOTE", "Pure"],
    "Entropy": [entropy_orig, entropy_under, entropy_over, entropy_smote, entropy_pure],
    "Class Counts": [
        Counter(df['Class']),
        Counter(df_under['Class']),
        Counter(df_over['Class']),
        Counter(y_smote),
        Counter(y_pure)
    ]
}

summary = pd.DataFrame(results)
print("\nSummary:\n", summary)


Original class counts: Counter({0: 284315, 1: 492})
Entropy (Original / Imbalanced): 0.01834340770695327
Under-sampled class counts: Counter({0: 492, 1: 492})
Entropy (Undersampled): 0.9999999999971146
Over-sampled class counts: Counter({0: 284315, 1: 284315})
Entropy (Oversampled): 0.9999999999971146
SMOTE-sampled class counts: Counter({0: 284315, 1: 284315})
Entropy (SMOTE): 0.9999999999971146
Entropy (Pure non-fraud): -1.4428232973175175e-12

Summary:
         Dataset       Entropy            Class Counts
0      Original  1.834341e-02     {0: 284315, 1: 492}
1  Undersampled  1.000000e+00        {0: 492, 1: 492}
2   Oversampled  1.000000e+00  {0: 284315, 1: 284315}
3         SMOTE  1.000000e+00  {0: 284315, 1: 284315}
4          Pure -1.442823e-12             {0: 284315}
