In [1]:
!pip install tensorflow

Defaulting to user installation because normal site-packages is not writeable



[notice] A new release of pip is available: 24.3.1 -> 25.1.1
[notice] To update, run: python.exe -m pip install --upgrade pip


In [None]:
import pandas as pd
import numpy as np
import joblib
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import classification_report, confusion_matrix, roc_auc_score
from sklearn.cluster import DBSCAN
from xgboost import XGBClassifier
from tensorflow.keras.models import Model
from tensorflow.keras.layers import Input, Dense
from tensorflow.keras.optimizers import Adam
from tensorflow.keras.models import load_model

# Load dataset
df = pd.read_csv("../datasets/creditcard_2023.csv")

if 'id' in df.columns:
    df.drop(columns=['id'], inplace=True)

X = df.drop(columns=['Class'])
y = df['Class']

scaler = StandardScaler()
X_scaled = scaler.fit_transform(X)

X_train, X_test, y_train, y_test = train_test_split(X_scaled, y, test_size=0.3, stratify=y, random_state=42)

# Save the scaler
joblib.dump(scaler, "scaler.joblib")

# --------------------------
# Supervised Models
# --------------------------
models = {
    "logistic_regression": LogisticRegression(max_iter=1000),
    "decision_tree": DecisionTreeClassifier(),
    "random_forest": RandomForestClassifier(n_estimators=100),
    "xgboost": XGBClassifier(use_label_encoder=False, eval_metric='logloss')
}

for name, model in models.items():
    model.fit(X_train, y_train)
    y_pred = model.predict(X_test)
    y_prob = model.predict_proba(X_test)[:, 1]

    print(f"\n{name.replace('_', ' ').title()}")
    print("-" * 30)
    print("Confusion Matrix:")
    print(confusion_matrix(y_test, y_pred))
    print("ROC AUC:", round(roc_auc_score(y_test, y_prob), 4))
    
    # Save the model
    joblib.dump(model, f"{name}.joblib")



# --------------------------
# Autoencoder (Unsupervised)
# --------------------------
print("\nAutoencoder Anomaly Detection")

input_dim = X_train.shape[1]
encoding_dim = 14

input_layer = Input(shape=(input_dim,))
encoder = Dense(encoding_dim, activation="relu")(input_layer)
encoder = Dense(int(encoding_dim / 2), activation="relu")(encoder)
decoder = Dense(int(encoding_dim / 2), activation="relu")(encoder)
decoder = Dense(input_dim, activation="sigmoid")(decoder)

autoencoder = Model(inputs=input_layer, outputs=decoder)
autoencoder.compile(optimizer=Adam(learning_rate=1e-3), loss='mse')

autoencoder.fit(X_train, X_train, epochs=10, batch_size=256, shuffle=True, validation_split=0.2, verbose=1)




Logistic Regression
------------------------------
Confusion Matrix:
[[83422  1873]
 [ 4139 81155]]
ROC AUC: 0.9936

Decision Tree
------------------------------
Confusion Matrix:
[[85005   290]
 [  118 85176]]
ROC AUC: 0.9976

Random Forest
------------------------------
Confusion Matrix:
[[85264    31]
 [    0 85294]]
ROC AUC: 1.0


Parameters: { "use_label_encoder" } are not used.




Xgboost
------------------------------
Confusion Matrix:
[[85242    53]
 [    0 85294]]
ROC AUC: 1.0

DBSCAN Clustering
Unique Cluster Labels: [ -1   0   1   2   3   4   5   6   7   8   9  10  11  12  13  14  15  16
  17  18  19  20  21  22  23  24  25  26  27  28  29  30  31  32  33  34
  35  36  37  38  39  40  41  42  43  44  45  46  47  48  49  50  51  52
  53  54  55  56  57  58  59  60  61  62  63  64  65  66  67  68  69  70
  71  72  73  74  75  76  77  78  79  80  81  82  83  84  85  86  87  88
  89  90  91  92  93  94  95  96  97  98  99 100 101 102 103 104 105 106
 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124
 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142
 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160
 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178
 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196
 197 198 199 200 201 202 203 204 205 206 207 208 209 

ValueError: Invalid filepath extension for saving. Please add either a `.keras` extension for the native Keras format (recommended) or a `.h5` extension. Use `model.export(filepath)` if you want to export a SavedModel for use with TFLite/TFServing/etc. Received: filepath=autoencoder_model.

In [4]:
# Save Autoencoder
autoencoder.save("autoencoder_model.keras")

# Evaluate
reconstructions = autoencoder.predict(X_test)
mse = np.mean(np.power(X_test - reconstructions, 2), axis=1)
threshold = np.percentile(mse[y_test == 0], 95)
preds = (mse > threshold).astype(int)

print("Confusion Matrix:")
print(confusion_matrix(y_test, preds))
print("ROC AUC:", round(roc_auc_score(y_test, mse), 4))


[1m5331/5331[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 394us/step
Confusion Matrix:
[[81030  4265]
 [61601 23693]]
ROC AUC: 0.6537


In [5]:
# Convert to binary: 0 = cluster (legit), 1 = noise (possible fraud)
binary_labels = (dbscan_labels == -1).astype(int)

print("DBSCAN Clustering (binary: noise vs. cluster)")
print("Cluster Counts:", np.bincount(binary_labels))

# Evaluate against actual labels if available
print("Confusion Matrix:")
print(confusion_matrix(y, binary_labels))
print("Classification Report:")
print(classification_report(y, binary_labels))
print("ROC AUC Score:", round(roc_auc_score(y, binary_labels), 4))

DBSCAN Clustering (binary: noise vs. cluster)
Cluster Counts: [542636  25994]
Confusion Matrix:
[[258321  25994]
 [284315      0]]
Classification Report:
              precision    recall  f1-score   support

           0       0.48      0.91      0.62    284315
           1       0.00      0.00      0.00    284315

    accuracy                           0.45    568630
   macro avg       0.24      0.45      0.31    568630
weighted avg       0.24      0.45      0.31    568630

ROC AUC Score: 0.4543


In [6]:
# --------------------------
# DBSCAN (Unsupervised)
# --------------------------
print("\nDBSCAN Clustering")
dbscan = DBSCAN(eps=2, min_samples=500)
dbscan_labels = dbscan.fit_predict(X_scaled)
print("Unique Cluster Labels:", np.unique(dbscan_labels))

# Save DBSCAN
joblib.dump(dbscan, "dbscan_model.joblib")


DBSCAN Clustering
Unique Cluster Labels: [-1  0  1  2  3  4  5  6  7  8  9 10 11 12 13]


['dbscan_model.joblib']

In [7]:
# Convert to binary: 0 = cluster (legit), 1 = noise (possible fraud)
binary_labels = (dbscan_labels == -1).astype(int)

print("DBSCAN Clustering (binary: noise vs. cluster)")
print("Cluster Counts:", np.bincount(binary_labels))

# Evaluate against actual labels if available
print("Confusion Matrix:")
print(confusion_matrix(y, binary_labels))
print("Classification Report:")
print(classification_report(y, binary_labels))
print("ROC AUC Score:", round(roc_auc_score(y, binary_labels), 4))

DBSCAN Clustering (binary: noise vs. cluster)
Cluster Counts: [420113 148517]
Confusion Matrix:
[[170154 114161]
 [249959  34356]]
Classification Report:
              precision    recall  f1-score   support

           0       0.41      0.60      0.48    284315
           1       0.23      0.12      0.16    284315

    accuracy                           0.36    568630
   macro avg       0.32      0.36      0.32    568630
weighted avg       0.32      0.36      0.32    568630

ROC AUC Score: 0.3597


In [None]:
# Load the scaler and models
scaler = joblib.load("scaler.joblib")
logistic_model = joblib.load("logistic_regression.joblib")
decision_tree_model = joblib.load("decision_tree.joblib")
random_forest_model = joblib.load("random_forest.joblib")
xgboost_model = joblib.load("xgboost.joblib")
dbscan_model = joblib.load("dbscan_model.joblib")
autoencoder_model = load_model("autoencoder_model")

# Load new or test data
df = pd.read_csv("creditcard_2023.csv")
if 'id' in df.columns:
    df.drop(columns=['id'], inplace=True)

X = df.drop(columns=['Class'])
y = df['Class']
X_scaled = scaler.transform(X)

# --- Supervised Models Prediction ---
models = {
    "Logistic Regression": logistic_model,
    "Decision Tree": decision_tree_model,
    "Random Forest": random_forest_model,
    "XGBoost": xgboost_model,
}

for name, model in models.items():
    y_pred = model.predict(X_scaled)
    y_prob = model.predict_proba(X_scaled)[:, 1]
    
    print(f"\n{name}")
    print("-" * len(name))
    print("Confusion Matrix:")
    print(confusion_matrix(y, y_pred))
    print("Classification Report:")
    print(classification_report(y, y_pred))
    print("ROC AUC Score:", round(roc_auc_score(y, y_prob), 4))

# --- DBSCAN Clustering (Unsupervised) ---
print("\nDBSCAN Clustering")
dbscan_labels = dbscan_model.fit_predict(X_scaled)
print("Cluster Labels:", np.unique(dbscan_labels))

# --- Autoencoder (Unsupervised) ---
print("\nAutoencoder Anomaly Detection")
reconstructions = autoencoder_model.predict(X_scaled)
mse = np.mean(np.power(X_scaled - reconstructions, 2), axis=1)

# Threshold (recomputed or hardcoded — example below)
threshold = np.percentile(mse[y == 0], 95)
preds = (mse > threshold).astype(int)

print("Confusion Matrix:")
print(confusion_matrix(y, preds))
print("Classification Report:")
print(classification_report(y, preds))
print("ROC AUC Score:", round(roc_auc_score(y, mse), 4))