In [1]:
import pandas as pd
import numpy as np

# Set seed for reproducibility
np.random.seed(42)

# Generate synthetic dataset
n_samples = 1000
fraud_ratio = 0.05  # 5% fraud

data = {
    "Time": np.random.randint(0, 172800, n_samples),  # up to 48 hrs
    "V1": np.random.normal(0, 1, n_samples),
    "V2": np.random.normal(0, 1, n_samples),
    "V3": np.random.normal(0, 1, n_samples),
    "V4": np.random.normal(0, 1, n_samples),
    "V5": np.random.normal(0, 1, n_samples),
    "Amount": np.random.uniform(1, 500, n_samples),
    "Class": np.random.choice([0, 1], size=n_samples, p=[1-fraud_ratio, fraud_ratio])
}

df = pd.DataFrame(data)

print(df.head())
print("\nClass distribution:\n", df["Class"].value_counts())


     Time        V1        V2        V3        V4        V5      Amount  Class
0  121958 -2.289061 -1.313758 -0.452562 -0.392802  0.224787  160.734960      0
1  146867  1.432482 -1.095302 -0.129910 -1.362911 -1.017335  419.338099      0
2  131932  1.214722 -0.168797  0.581433  0.699020  0.964415  327.460759      0
3  103694 -0.880864  0.110956 -0.203236 -0.243510  0.362543  138.439522      0
4  119879 -0.881879  0.786970  1.110118  0.015365 -1.135162  483.902237      0

Class distribution:
 Class
0    945
1     55
Name: count, dtype: int64


In [2]:
X = df.drop("Class", axis=1)
y = df["Class"]


In [3]:
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42, stratify=y
)


In [4]:
from imblearn.over_sampling import SMOTE

smote = SMOTE(random_state=42)
X_res, y_res = smote.fit_resample(X_train, y_train)
print("Before SMOTE:", y_train.value_counts())
print("After SMOTE:", y_res.value_counts())


Before SMOTE: Class
0    756
1     44
Name: count, dtype: int64
After SMOTE: Class
0    756
1    756
Name: count, dtype: int64


In [5]:
from sklearn.ensemble import RandomForestClassifier

model = RandomForestClassifier(random_state=42)
model.fit(X_res, y_res)


In [6]:
from sklearn.metrics import classification_report, confusion_matrix, roc_auc_score

y_pred = model.predict(X_test)

print(confusion_matrix(y_test, y_pred))
print(classification_report(y_test, y_pred))
print("ROC-AUC Score:", roc_auc_score(y_test, y_pred))


[[173  16]
 [  8   3]]
              precision    recall  f1-score   support

           0       0.96      0.92      0.94       189
           1       0.16      0.27      0.20        11

    accuracy                           0.88       200
   macro avg       0.56      0.59      0.57       200
weighted avg       0.91      0.88      0.89       200

ROC-AUC Score: 0.594035594035594
