In [2]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.ensemble import IsolationForest
from sklearn.decomposition import PCA
from sklearn.metrics import precision_recall_curve, accuracy_score, precision_score, recall_score, f1_score, classification_report
from xgboost import XGBClassifier
import kagglehub

RANDOM_SEED = 42
TEST_SIZE = 0.3

def evaluate(y_true, y_pred, model_name="Model"):
    print(f'\n{model_name} Evaluation:')
    print(f'Accuracy: {accuracy_score(y_true, y_pred):.4f}')
    print(f'Precision: {precision_score(y_true, y_pred):.4f}')
    print(f'Recall: {recall_score(y_true, y_pred):.4f}')
    print(f'F1-score: {f1_score(y_true, y_pred):.4f}')
    print("\nClassification Report:")
    print(classification_report(y_true, y_pred, target_names=["Normal", "Fraud"]))

# Load and preprocess data
path = kagglehub.dataset_download("mlg-ulb/creditcardfraud")
data = pd.read_csv(f"{path}/creditcard.csv")
data['Class'] = data['Class'].astype(int)
data = data.drop(columns=['Time'])
data['Amount'] = StandardScaler().fit_transform(data['Amount'].values.reshape(-1, 1))

X = data.drop(columns=['Class']).values
y = data['Class'].values

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=TEST_SIZE, random_state=RANDOM_SEED, stratify=y)

scaler = StandardScaler()
X_train_std = scaler.fit_transform(X_train)
X_test_std = scaler.transform(X_test)


  from .autonotebook import tqdm as notebook_tqdm


In [7]:
# Isolation Forest scores
iso = IsolationForest(n_estimators=200, contamination=0.0017, random_state=RANDOM_SEED)
iso.fit(X_train_std[y_train == 0])
score_train = iso.decision_function(X_train_std).reshape(-1, 1)
score_test = iso.decision_function(X_test_std).reshape(-1, 1)

# PCA features
pca = PCA(n_components=12, random_state=RANDOM_SEED)
X_train_pca = pca.fit_transform(X_train_std)
X_test_pca = pca.transform(X_test_std)

# Combine features
X_train_fused = np.hstack([X_train_std, score_train, X_train_pca])
X_test_fused = np.hstack([X_test_std, score_test, X_test_pca])

# XGBoost classifier
model = XGBClassifier(
    n_estimators=200,
    max_depth=6,
    learning_rate=0.1,
    subsample=0.6,
    colsample_bytree=1.0,
    min_child_weight=5,
    gamma=0.5,
    alpha=0.1,
    scale_pos_weight=3,
    use_label_encoder=False,
    eval_metric='aucpr',
    tree_method='hist',
    random_state=RANDOM_SEED
)
model.fit(X_train_fused, y_train)

# Threshold optimization
y_prob = model.predict_proba(X_test_fused)[:, 1]
prec, rec, thresholds = precision_recall_curve(y_test, y_prob)
f1 = 2 * prec[:-1] * rec[:-1] / (prec[:-1] + rec[:-1] + 1e-9)
opt_thresh = thresholds[np.argmax(f1)]

# Final prediction
y_pred = (y_prob > opt_thresh).astype(int)
evaluate(y_test, y_pred, model_name="Hybrid")


Parameters: { "use_label_encoder" } are not used.




Hybrid Evaluation:
Accuracy: 0.9995
Precision: 0.9127
Recall: 0.7770
F1-score: 0.8394

Classification Report:
              precision    recall  f1-score   support

      Normal       1.00      1.00      1.00     85295
       Fraud       0.91      0.78      0.84       148

    accuracy                           1.00     85443
   macro avg       0.96      0.89      0.92     85443
weighted avg       1.00      1.00      1.00     85443

