In [1]:
import pandas as pd
import numpy as np

# Load the raw transactions table.
df = pd.read_csv('creditcard.csv')

from sklearn.preprocessing import StandardScaler, RobustScaler

# FEATURE SCALING:
#   - Amount -> RobustScaler. Transaction amounts often contain extreme
#     outliers (e.g. very large purchases); RobustScaler centres on the median
#     and scales by the IQR (interquartile range), so those outliers do not
#     dominate the scaling.
#   - Time -> StandardScaler (zero mean, unit variance).
# NOTE(review): both scalers are fitted on the full table, i.e. before the
# train/test split performed later in the notebook, so the scaling statistics
# include test rows. Consider fitting on the training portion only.
amount_values = df['Amount'].to_numpy().reshape(-1, 1)
time_values = df['Time'].to_numpy().reshape(-1, 1)

amount_scaler = RobustScaler()
time_scaler = StandardScaler()

df['scaled_amount'] = amount_scaler.fit_transform(amount_values)
df['scaled_time'] = time_scaler.fit_transform(time_values)

# The raw columns are replaced by their scaled counterparts.
df = df.drop(columns=['Time', 'Amount'])

In [2]:
from sklearn.model_selection import train_test_split, StratifiedKFold

# Separate the target column from the predictor columns.
y = df['Class']
X = df.drop(columns=['Class'])

# Hold out 20% of the rows as the test set. Stratifying on the target keeps
# the class ratio the same in both parts; the fixed seed makes the split
# reproducible.
X_base, X_test, y_base, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
    stratify=y,
)

In [3]:
import xgboost as xgb
from imblearn.over_sampling import SMOTE
from sklearn.metrics import classification_report, confusion_matrix, roc_curve, average_precision_score, precision_recall_curve
import time


def evaluate_sampling_strategy(sampling_method, X_train, y_train, X_val, y_val):
    """Train an XGBoost classifier under one resampling strategy and score it.

    Resampling is applied to the training data only; the validation data is
    scored exactly as passed in.

    Parameters
    ----------
    sampling_method : str
        'SMOTE' oversamples the training data with SMOTE.
        'Under-Sampling' keeps every Class == 1 row and draws an equally
        sized random sample of Class == 0 rows.
        Any other value (e.g. 'No-Sampling') trains on the data unchanged.
    X_train, y_train
        Training features and labels. The 'Under-Sampling' branch concatenates
        them and filters on a column named 'Class', so ``y_train`` must be a
        Series with that name.
    X_val, y_val
        Validation features and labels used for all reported metrics.

    Returns
    -------
    dict
        Keys: 'method', 'recall', 'precision', 'f1', 'pr_auc',
        'training_time' (seconds spent in ``model.fit`` only; resampling time
        is not included) and 'confusion_matrix' (2x2, label order [0, 1]).
    """
    # n_estimators is not set here, so the library default is used.
    model = xgb.XGBClassifier(
        objective='binary:logistic',
        eval_metric='aucpr',
        learning_rate=0.01,
        max_depth=5,
        subsample=0.8,
        colsample_bytree=0.8,
        gamma=0.5,
        random_state=42,
        n_jobs=-1,
    )

    if sampling_method == 'SMOTE':
        smote = SMOTE(sampling_strategy='auto', random_state=42, k_neighbors=5)
        X_resampled, y_resampled = smote.fit_resample(X_train, y_train)
    elif sampling_method == 'Under-Sampling':
        # Sampling is done on the training fold only, which is why a local
        # df_train is built here instead of using the global df.
        df_train = pd.concat([X_train, y_train], axis=1)
        df_normal = df_train[df_train['Class'] == 0]
        df_fraud = df_train[df_train['Class'] == 1]
        df_normal_sampled = df_normal.sample(n=len(df_fraud), random_state=42)
        df_balanced = pd.concat([df_normal_sampled, df_fraud])

        X_resampled = df_balanced.drop('Class', axis=1)
        y_resampled = df_balanced['Class']
    else:
        X_resampled, y_resampled = X_train, y_train

    start_time = time.time()
    model.fit(X_resampled, y_resampled)
    training_time = time.time() - start_time

    # Hard predictions feed the confusion-matrix metrics; positive-class
    # probabilities feed the threshold-free PR-AUC.
    y_pred = model.predict(X_val)
    y_pred_proba = model.predict_proba(X_val)[:, 1]

    # labels=[0, 1] pins the matrix to 2x2 even when y_val and y_pred happen to
    # contain a single class; without it the four-way unpacking below fails.
    conf_matrix = confusion_matrix(y_val, y_pred, labels=[0, 1])
    TN, FP, FN, TP = conf_matrix.ravel()
    # Every ratio is guarded the same way so an empty denominator yields 0
    # rather than a NaN plus a runtime warning.
    recall = TP / (TP + FN) if (TP + FN) > 0 else 0
    precision = TP / (TP + FP) if (TP + FP) > 0 else 0
    f1 = 2 * (precision * recall) / (precision + recall) if (precision + recall) > 0 else 0
    pr_auc = average_precision_score(y_val, y_pred_proba)

    return {
        'method': sampling_method,
        'recall': recall,
        'precision': precision,
        'f1': f1,
        'pr_auc': pr_auc,
        'training_time': training_time,
        'confusion_matrix': conf_matrix
    }


In [4]:
# Compare the sampling strategies with stratified 5-fold cross-validation on
# the base split. Every strategy is evaluated on the same folds.
sampling_methods = ['SMOTE', 'Under-Sampling', 'No-Sampling']

skf = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)

# One list of per-fold result dicts per strategy.
results = {method: [] for method in sampling_methods}

for fold_idx, (train_idx, val_idx) in enumerate(skf.split(X_base, y_base)):
    # The outer quotes are double quotes so the single-quoted expressions
    # inside the braces also parse on Python versions before 3.12.
    print(f"\n{'='*30} Fold {fold_idx} {'='*30}")
    X_train, X_val = X_base.iloc[train_idx], X_base.iloc[val_idx]
    y_train, y_val = y_base.iloc[train_idx], y_base.iloc[val_idx]

    for method in sampling_methods:
        fold_result = evaluate_sampling_strategy(method, X_train, y_train, X_val, y_val)
        results[method].append(fold_result)
        print(f"{method} :Recall = {fold_result['recall']:.4f}, Precision = {fold_result['precision']:.4f}, F1 = {fold_result['f1']:.4f}")


SMOTE :Recall = 0.8974, Precision = 0.1354, F1 = 0.2353
Under-Sampling :Recall = 0.9359, Precision = 0.0385, F1 = 0.0739
No-Sampling :Recall = 0.6667, Precision = 0.9630, F1 = 0.7879

SMOTE :Recall = 0.8101, Precision = 0.1432, F1 = 0.2433
Under-Sampling :Recall = 0.8861, Precision = 0.0459, F1 = 0.0873
No-Sampling :Recall = 0.5190, Precision = 1.0000, F1 = 0.6833

SMOTE :Recall = 0.8608, Precision = 0.1001, F1 = 0.1794
Under-Sampling :Recall = 0.8734, Precision = 0.0589, F1 = 0.1103
No-Sampling :Recall = 0.6835, Precision = 0.9643, F1 = 0.8000

SMOTE :Recall = 0.9241, Precision = 0.1308, F1 = 0.2292
Under-Sampling :Recall = 0.9367, Precision = 0.0460, F1 = 0.0876
No-Sampling :Recall = 0.6582, Precision = 0.9123, F1 = 0.7647

SMOTE :Recall = 0.8987, Precision = 0.1376, F1 = 0.2387
Under-Sampling :Recall = 0.8987, Precision = 0.0415, F1 = 0.0793
No-Sampling :Recall = 0.5823, Precision = 0.9583, F1 = 0.7244


In [5]:
# Build one per-fold table for each sampling strategy and print the
# fold-averaged metrics.
summary_dfs = {}
for method, fold_results in results.items():
    fold_numbers = range(1, len(fold_results) + 1)
    df_method = pd.DataFrame(fold_results).assign(fold=fold_numbers)
    summary_dfs[method] = df_method

    # Print summary
    print(f"\n{method} Performance:")
    for label, column in (
        ("Recall", "recall"),
        ("Precision", "precision"),
        ("F1-Score", "f1"),
        ("PR-AUC", "pr_auc"),
    ):
        print(f"Average {label}: {df_method[column].mean():.4f}")
    mean_training_time = df_method['training_time'].mean()
    print(f"Average Training Time: {mean_training_time:.2f} sec")

# =============================================================================
# FINAL MODEL TRAINING WITH BEST SAMPLING METHOD
# =============================================================================
print("\n" + "="*50)
print("Training Final XGBoost Model with Best Sampling Strategy")
print("="*50)

# The winner is the strategy with the highest mean F1 across the CV folds.
best_method = max(results, key=lambda k: np.mean([r['f1'] for r in results[k]]))
print(f"Selected best sampling method: {best_method}")

# Re-apply the winning strategy to the whole base split. The string compared
# here must match the keys of `results` exactly ('Under-Sampling', capital S);
# a mismatch would silently fall through to the no-sampling branch.
if best_method == 'SMOTE':
    smote = SMOTE(sampling_strategy='auto', random_state=42, k_neighbors=5)
    X_resampled, y_resampled = smote.fit_resample(X_base, y_base)
elif best_method == 'Under-Sampling':
    df_base = pd.concat([X_base, y_base], axis=1)
    df_normal = df_base[df_base['Class'] == 0]
    df_fraud = df_base[df_base['Class'] == 1]
    df_normal_sampled = df_normal.sample(n=len(df_fraud), random_state=42)
    balanced_df = pd.concat([df_normal_sampled, df_fraud])
    X_resampled = balanced_df.drop('Class', axis=1)
    y_resampled = balanced_df['Class']
else:
    X_resampled, y_resampled = X_base, y_base

# Train final model on full base data
# NOTE(review): learning_rate, max_depth and n_estimators differ from the
# model scored during cross-validation, so the CV metrics describe a different
# configuration than the one trained here - confirm this is intended.
final_model = xgb.XGBClassifier(
    objective='binary:logistic',
    eval_metric='aucpr',
    learning_rate=0.05,
    max_depth=6,
    subsample=0.8,
    colsample_bytree=0.8,
    gamma=0.5,
    n_estimators=500,
    random_state=42,
    n_jobs=-1
)

start_time = time.time()
final_model.fit(X_resampled, y_resampled)
training_time = time.time() - start_time
print(f"Final model trained in {training_time:.2f} seconds")


SMOTE Performance:
Average Recall: 0.8782
Average Precision: 0.1294
Average F1-Score: 0.2252
Average PR-AUC: 0.7346
Average Training Time: 0.49 sec

Under-Sampling Performance:
Average Recall: 0.9062
Average Precision: 0.0461
Average F1-Score: 0.0877
Average PR-AUC: 0.7243
Average Training Time: 0.18 sec

No-Sampling Performance:
Average Recall: 0.6219
Average Precision: 0.9596
Average F1-Score: 0.7521
Average PR-AUC: 0.8207
Average Training Time: 0.31 sec

Training Final XGBoost Model with Best Sampling Strategy
Selected best sampling method: No-Sampling
Final model trained in 1.34 seconds
