In [None]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA
from sklearn.cluster import KMeans, DBSCAN
from sklearn.metrics import confusion_matrix, classification_report, roc_auc_score
from tensorflow.keras.models import Model
from tensorflow.keras.layers import Input, Dense
from tensorflow.keras.optimizers import Adam
from tensorflow.keras.models import load_model
import joblib

# Load and prepare data
df = pd.read_csv("../datasets/creditcard_2023.csv")
# 'id' is presumably a row identifier rather than a feature; drop it when present.
if 'id' in df.columns:
    df.drop(columns=['id'], inplace=True)

# Everything except the 'Class' column is a feature; 'Class' is the target.
X = df.drop(columns=['Class'])
y = df['Class']

# Scale the data
# Standardize every feature (zero mean, unit variance) and persist the fitted
# scaler so the same transform can be re-applied to new data later.
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X)
joblib.dump(scaler, "scaler_pca.joblib")

# --------------------------
# Apply PCA
# --------------------------
pca = PCA(n_components=10, random_state=42)
# Fit on the standardized features, not the raw frame. PCA centers its input
# but does not rescale it, so fitting on raw X lets the largest-scale column
# dominate the components and leaves the saved scaler out of the pipeline.
X_pca = pca.fit_transform(X_scaled)
print("Explained Variance Ratio (PCA):", pca.explained_variance_ratio_.sum())

# Save PCA
# Inference order for new data: scaler_pca.joblib first, then pca_model.joblib.
joblib.dump(pca, "pca_model.joblib")

# --------------------------
# 1. Autoencoder on PCA Data
# --------------------------
# Train a small dense autoencoder to reconstruct the PCA scores and use the
# per-row reconstruction error as an anomaly score.
print("\n[Autoencoder]")
input_dim = X_pca.shape[1]

# Symmetric bottleneck: input_dim -> 6 -> 3 -> 6 -> input_dim.
input_layer = Input(shape=(input_dim,))
encoder = Dense(6, activation="relu")(input_layer)
encoder = Dense(3, activation="relu")(encoder)
decoder = Dense(6, activation="relu")(encoder)
# The output layer must be linear: PCA scores are zero-centered and unbounded,
# while a sigmoid output is confined to (0, 1) and therefore can never
# reproduce negative or large targets, which makes the error meaningless.
decoder = Dense(input_dim, activation="linear")(decoder)

autoencoder = Model(inputs=input_layer, outputs=decoder)
autoencoder.compile(optimizer=Adam(learning_rate=1e-3), loss='mse')

# Input and target are the same matrix (reconstruction objective).
# NOTE(review): training uses every row, including Class == 1; confirm whether
# fitting on Class == 0 rows only was intended for anomaly detection.
autoencoder.fit(X_pca, X_pca, epochs=20, batch_size=256, validation_split=0.2, shuffle=True, verbose=0)

# Save Autoencoder
autoencoder.save("autoencoder_pca_model.keras")

# Predictions and evaluation
reconstructions = autoencoder.predict(X_pca)
# Per-row mean squared reconstruction error serves as the anomaly score.
mse = np.mean(np.power(X_pca - reconstructions, 2), axis=1)
# Flag rows whose error exceeds the 95th percentile of the Class == 0 errors.
threshold = np.percentile(mse[y == 0], 95)
auto_preds = (mse > threshold).astype(int)

print("Autoencoder Confusion Matrix:")
print(confusion_matrix(y, auto_preds))
# AUC is computed on the continuous score, so it is threshold-independent.
print("Autoencoder ROC AUC:", round(roc_auc_score(y, mse), 4))

# --------------------------
# 2. DBSCAN on PCA Data
# --------------------------
# Density-based clustering of the PCA scores; points that DBSCAN leaves
# unclustered (label -1) are treated as the positive / anomalous class.
print("\n[DBSCAN]")
dbscan = DBSCAN(eps=1.5, min_samples=5)
db_labels = dbscan.fit_predict(X_pca)

# Collapse the cluster ids into a 0/1 vector: 1 for noise, 0 for any cluster.
is_noise = db_labels == -1
binary_dbscan_labels = is_noise.astype(int)

print("DBSCAN Confusion Matrix:")
dbscan_cm = confusion_matrix(y, binary_dbscan_labels)
print(dbscan_cm)

# The score passed to ROC AUC is the hard 0/1 label, not a continuous score.
dbscan_auc = roc_auc_score(y, binary_dbscan_labels)
print("DBSCAN ROC AUC:", round(dbscan_auc, 4))

# Save DBSCAN model
joblib.dump(dbscan, "dbscan_pca_model.joblib")

# --------------------------
# 3. KMeans on PCA Data
# --------------------------
# Two-cluster KMeans on the PCA scores; the cluster ids are then aligned with
# the 'Class' labels so they can be scored like a binary classifier.
print("\n[KMeans]")
kmeans = KMeans(n_clusters=2, random_state=42)
kmeans_labels = kmeans.fit_predict(X_pca)

# Flip label if needed
# Cluster ids are arbitrary: make cluster 1 the one with the higher mean of y.
class_rate_cluster0 = np.mean(y[kmeans_labels == 0])
class_rate_cluster1 = np.mean(y[kmeans_labels == 1])
if class_rate_cluster0 > class_rate_cluster1:
    kmeans_labels = 1 - kmeans_labels

print("KMeans Confusion Matrix:")
kmeans_cm = confusion_matrix(y, kmeans_labels)
print(kmeans_cm)

# ROC AUC is computed on the hard (possibly flipped) cluster assignment.
kmeans_auc = roc_auc_score(y, kmeans_labels)
print("KMeans ROC AUC:", round(kmeans_auc, 4))

# Save KMeans model
joblib.dump(kmeans, "kmeans_pca_model.joblib")


Explained Variance Ratio (PCA): 0.9999998630333607

[Autoencoder]
[1m17770/17770[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m7s[0m 371us/step
Autoencoder Confusion Matrix:
[[270099  14216]
 [270480  13835]]
Autoencoder ROC AUC: 0.4983

[DBSCAN]
DBSCAN Confusion Matrix:
[[ 47558 236757]
 [  3708 280607]]
DBSCAN ROC AUC: 0.5771

[KMeans]
KMeans Confusion Matrix:
[[142723 141592]
 [141950 142365]]
KMeans ROC AUC: 0.5014


['kmeans_pca_model.joblib']