In [2]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA
from sklearn.cluster import KMeans, DBSCAN
from sklearn.metrics import classification_report, confusion_matrix, roc_auc_score, accuracy_score
from tensorflow.keras.models import Model
from tensorflow.keras.layers import Input, Dense
from tensorflow.keras.optimizers import Adam
import joblib

# Load dataset
df = pd.read_csv("../datasets/creditcard_2023.csv")
if 'id' in df.columns:
    df.drop(columns=['id'], inplace=True)

X = df.drop(columns=['Class'])
y = df['Class']

# --------------------------
# Standard Scaling
# --------------------------
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X)
joblib.dump(scaler, "scaler_99999.joblib")

# --------------------------
# PCA to retain 99.999% variance
# --------------------------
pca = PCA(n_components=0.9999, svd_solver='full', random_state=42)
X_pca = pca.fit_transform(X_scaled)
print(f"PCA components used: {X_pca.shape[1]}")
joblib.dump(pca, "pca_99999_model.joblib")

# ==========================
# Evaluation Helper Function
# ==========================
def evaluate_model(name, true, pred, score_source=None):
    acc = accuracy_score(true, pred)
    auc = roc_auc_score(true, score_source if score_source is not None else pred)
    print(f"\n{name} Evaluation")
    print("=" * len(name))
    print("Accuracy:", round(acc, 4))
    print("ROC AUC:", round(auc, 4))
    print("Confusion Matrix:\n", confusion_matrix(true, pred))
    print("Classification Report:\n", classification_report(true, pred))

# --------------------------
# 1. Autoencoder
# --------------------------
print("\n[Autoencoder]")
input_dim = X_pca.shape[1]
input_layer = Input(shape=(input_dim,))
encoder = Dense(int(input_dim/2), activation="relu")(input_layer)
encoder = Dense(int(input_dim/4), activation="relu")(encoder)
decoder = Dense(int(input_dim/2), activation="relu")(encoder)
decoder = Dense(input_dim, activation="sigmoid")(decoder)

autoencoder = Model(inputs=input_layer, outputs=decoder)
autoencoder.compile(optimizer=Adam(learning_rate=1e-3), loss='mse')
autoencoder.fit(X_pca, X_pca, epochs=20, batch_size=256, validation_split=0.2, shuffle=True, verbose=0)

autoencoder.save("autoencoder_99999_model.keras")

# Evaluate
reconstructions = autoencoder.predict(X_pca)
mse = np.mean(np.power(X_pca - reconstructions, 2), axis=1)
threshold = np.percentile(mse[y == 0], 95)
auto_preds = (mse > threshold).astype(int)

evaluate_model("Autoencoder", y, auto_preds, mse)



PCA components used: 29

[Autoencoder]
[1m17770/17770[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m7s[0m 397us/step

Autoencoder Evaluation
Accuracy: 0.6393
ROC AUC: 0.691
Confusion Matrix:
 [[270099  14216]
 [190871  93444]]
Classification Report:
               precision    recall  f1-score   support

           0       0.59      0.95      0.72    284315
           1       0.87      0.33      0.48    284315

    accuracy                           0.64    568630
   macro avg       0.73      0.64      0.60    568630
weighted avg       0.73      0.64      0.60    568630



In [3]:
# --------------------------
# 2. DBSCAN
# --------------------------
print("\n[DBSCAN]")
dbscan = DBSCAN(eps=1.5, min_samples=5)
db_labels = dbscan.fit_predict(X_pca)
binary_dbscan_labels = (db_labels == -1).astype(int)
joblib.dump(dbscan, "dbscan_99999_model.joblib")

evaluate_model("DBSCAN", y, binary_dbscan_labels)

# --------------------------
# 3. KMeans
# --------------------------
print("\n[KMeans]")
kmeans = KMeans(n_clusters=2, random_state=42)
kmeans_labels = kmeans.fit_predict(X_pca)

# Flip labels if needed
if np.mean(y[kmeans_labels == 0]) > np.mean(y[kmeans_labels == 1]):
    kmeans_labels = 1 - kmeans_labels

joblib.dump(kmeans, "kmeans_99999_model.joblib")

evaluate_model("KMeans", y, kmeans_labels)



[DBSCAN]

DBSCAN Evaluation
Accuracy: 0.3826
ROC AUC: 0.3826
Confusion Matrix:
 [[217568  66747]
 [284315      0]]
Classification Report:
               precision    recall  f1-score   support

           0       0.43      0.77      0.55    284315
           1       0.00      0.00      0.00    284315

    accuracy                           0.38    568630
   macro avg       0.22      0.38      0.28    568630
weighted avg       0.22      0.38      0.28    568630


[KMeans]

KMeans Evaluation
Accuracy: 0.8824
ROC AUC: 0.8824
Confusion Matrix:
 [[284254     61]
 [ 66827 217488]]
Classification Report:
               precision    recall  f1-score   support

           0       0.81      1.00      0.89    284315
           1       1.00      0.76      0.87    284315

    accuracy                           0.88    568630
   macro avg       0.90      0.88      0.88    568630
weighted avg       0.90      0.88      0.88    568630

