In [None]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
import warnings
warnings.filterwarnings('ignore')

# Fetch the card-transaction dataset straight from its public GitHub location.
url = "https://raw.githubusercontent.com/data-bootcamp-v4/data/main/card_transdata.csv"
data = pd.read_csv(url)

# Preview the first rows, then the per-class row counts of the 'fraud' column.
print(data.head(), data['fraud'].value_counts(), sep="\n")

# Mean of the label column, reported as the fraud ratio
# (assumes 'fraud' is a 0/1 indicator -- TODO confirm against the dataset).
print(f"Fraud ratio: {data['fraud'].mean():.3f}")

In [None]:
#SPLIT DATA
# Target is the 'fraud' column; the features are every other column.
y = data['fraud']
X = data.drop(columns='fraud')

# Hold out 20% of the rows for testing. stratify=y asks scikit-learn to keep
# the class proportions of y in both splits; random_state fixes the shuffle.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
    stratify=y,
)

print(f"Train fraud ratio: {y_train.mean():.3f}")

In [None]:
#MODEL BASELINE
# Reference model: a random forest fitted on the training split exactly as
# produced by train_test_split (no resampling), scored on the test split.
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report, confusion_matrix

# fit() returns the estimator itself, so construction and training can be chained.
rf_base = RandomForestClassifier(random_state=42).fit(X_train, y_train)
y_pred_base = rf_base.predict(X_test)

# Header ("sem resampling" = "without resampling") followed by the report.
print("BASELINE (sem resampling):", classification_report(y_test, y_pred_base), sep="\n")
print("Confusion Matrix:\n", confusion_matrix(y_test, y_pred_base))

In [None]:

#UNDERSAMPLING
# Resample only the training split with RandomUnderSampler, then train the
# same random-forest configuration as the baseline and score it on the
# untouched test split.
from imblearn.under_sampling import RandomUnderSampler

rus = RandomUnderSampler(random_state=42)
X_train_rus, y_train_rus = rus.fit_resample(X_train, y_train)
print(f"Undersampling - Train shape: {X_train_rus.shape}, Fraud ratio: {y_train_rus.mean():.3f}")

rf_rus = RandomForestClassifier(random_state=42)
rf_rus.fit(X_train_rus, y_train_rus)
y_pred_rus = rf_rus.predict(X_test)
print("\nUNDERSAMPLING:")
print(classification_report(y_test, y_pred_rus))
# Report the confusion matrix as the baseline cell does, so the raw
# false-positive / false-negative counts can be compared between runs.
# (confusion_matrix is imported in the baseline cell alongside classification_report.)
print("Confusion Matrix:\n", confusion_matrix(y_test, y_pred_rus))

In [None]:
#SMOTE
# Resample only the training split with SMOTE, then train the same
# random-forest configuration as the baseline and score it on the
# untouched test split.
from imblearn.over_sampling import SMOTE

smote = SMOTE(random_state=42)
X_train_smote, y_train_smote = smote.fit_resample(X_train, y_train)
# Report the post-resampling fraud ratio too, matching the undersampling cell.
print(f"SMOTE - Train shape: {X_train_smote.shape}, Fraud ratio: {y_train_smote.mean():.3f}")

rf_smote = RandomForestClassifier(random_state=42)
rf_smote.fit(X_train_smote, y_train_smote)
y_pred_smote = rf_smote.predict(X_test)
# NOTE(review): the "(BEST)" label is hard-coded; nothing in this notebook
# computes or verifies that ranking -- re-check it whenever data or models change.
print("\nSMOTE (BEST):")
print(classification_report(y_test, y_pred_smote))
# Report the confusion matrix as the baseline cell does, so the raw
# false-positive / false-negative counts can be compared between runs.
# (confusion_matrix is imported in the baseline cell alongside classification_report.)
print("Confusion Matrix:\n", confusion_matrix(y_test, y_pred_smote))