# Setup and installing libraries

In [None]:
# Remove any existing umap-learn / protobuf installs before pinning versions.
!pip uninstall -y umap-learn protobuf
# Install the exact protobuf and umap-learn versions this notebook uses.
!pip install -q protobuf==3.20.3 umap-learn==0.5.5
# Pin scikit-learn and numpy as well.
!pip install -q scikit-learn==1.4.2 numpy==1.26.4


In [None]:
# Smoke test: the import raises if umap-learn is not importable in this kernel.
import umap
print("UMAP installed ")


In [None]:
import psutil

# Report the percentage of system memory currently in use.
print(f"RAM used: {psutil.virtual_memory().percent} %")


In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA
from sklearn.manifold import TSNE

from sklearn.cluster import (
    KMeans, DBSCAN, AgglomerativeClustering, SpectralClustering,
    MeanShift, Birch
)
from sklearn.mixture import GaussianMixture
from sklearn.covariance import EllipticEnvelope

from sklearn.ensemble import IsolationForest
from sklearn.neighbors import LocalOutlierFactor
from sklearn.svm import OneClassSVM

from sklearn.metrics import (
    roc_auc_score, average_precision_score,
    precision_recall_curve, roc_curve, confusion_matrix
)

import umap
import torch
import torch.nn as nn
from torch.utils.data import DataLoader, TensorDataset

import os
import time
import random

# Single seed shared by every stochastic component in the notebook.
RANDOM_STATE = 42


def set_seeds(seed=RANDOM_STATE):
    """Seed Python's `random`, NumPy and PyTorch (CPU, plus CUDA when available).

    Args:
        seed: integer seed applied to all three libraries.
    """
    random.seed(seed)
    np.random.seed(seed)
    torch.manual_seed(seed)
    cuda_ready = torch.cuda.is_available()
    if cuda_ready:
        torch.cuda.manual_seed_all(seed)


set_seeds()

# Use the GPU when PyTorch can see one, otherwise fall back to the CPU.
DEVICE = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")
DEVICE


# Load Data and EDA

In [None]:
# Read the raw credit-card transaction table.
data_path = "/kaggle/input/creditcardfraud/creditcard.csv"
df = pd.read_csv(data_path)

print("Shape:", df.shape)
display(df.head())

# Row count per value of the Class column.
print("\nClass distribution:")
class_counts = df["Class"].value_counts()
print(class_counts)

# Rows with Class == 1 as a percentage of all rows.
fraud_rate = 100 * class_counts[1] / df.shape[0]
print("\nFraud rate: {:.5f}%".format(fraud_rate))

# Summary statistics, one row per column, first ten columns only.
display(df.describe().T.head(10))


In [None]:
# Class distribution
# Bar chart of the per-class row counts held in `class_counts`.
plt.figure(figsize=(4,3))
sns.barplot(x=class_counts.index, y=class_counts.values)
plt.xticks([0, 1], ['Non-fraud', 'Fraud'])
plt.title("Class distribution")
plt.show()

# Amount distribution (log y-scale)
# Histogram of the Amount column; only the count axis is log-scaled.
plt.figure(figsize=(5,4))
sns.histplot(df['Amount'], bins=50, log_scale=(False, True))
plt.title("Transaction amount distribution (log Y)")
plt.show()


# Train/Test split and Scaling

In [None]:
import gc

# Separate the features from the target column.
X = df.drop("Class", axis=1)
y = df["Class"]
feature_names = X.columns

# Stratify so both splits keep the same class proportions.
X_train, X_test, y_train, y_test = train_test_split(
    X, y,
    test_size=0.2,
    stratify=y,
    random_state=RANDOM_STATE,
)

print("Train shape:", X_train.shape)
print("Test shape :", X_test.shape)

# Fit the scaler on the training split only, then apply it to both splits.
# Casting the scaled arrays to float32 before wrapping them avoids holding an
# extra float64 DataFrame copy of each split.
scaler = StandardScaler()
X_train = pd.DataFrame(
    scaler.fit_transform(X_train).astype("float32"), columns=feature_names
)
X_test = pd.DataFrame(
    scaler.transform(X_test).astype("float32"), columns=feature_names
)

# The rebuilt feature frames carry a fresh 0..n-1 index, while the targets
# still carried the shuffled row labels of the raw table. Give the targets
# the same positional index so X and y stay aligned under label-based
# operations (concat, boolean masks, .loc).
y_train = y_train.reset_index(drop=True)
y_test = y_test.reset_index(drop=True)

# Release the raw table and the unscaled feature frame.
del df, X
gc.collect()

print("Freed raw dataset memory")
print("Ready for smart sampling")


In [None]:

# Directory that holds the processed splits.
os.makedirs("/kaggle/working/saved_data", exist_ok=True)

# Write each processed split to its own CSV, without the row index.
for _split, _csv_path in (
    (X_train, "/kaggle/working/saved_data/X_train.csv"),
    (X_test, "/kaggle/working/saved_data/X_test.csv"),
    (y_train, "/kaggle/working/saved_data/y_train.csv"),
    (y_test, "/kaggle/working/saved_data/y_test.csv"),
):
    _split.to_csv(_csv_path, index=False)




# Smart sampling (for heavy algorithms)

In [None]:
def fixed_stratified_sample(X, y, n, random_state=42):
    """Draw about `n` rows from `X`, keeping each class's share of the data.

    Each class contributes ``int(n * class_size / len(X))`` rows (rounded
    down, so the total can be slightly below `n`), capped at the class size.
    Rows are picked by position, so the result does not depend on the index
    labels of `X` or `y`, and a feature column that happens to be called
    "label" is left untouched.

    Args:
        X: feature DataFrame.
        y: class labels aligned row-by-row with `X` (Series or array-like).
        n: target number of rows for the whole sample.
        random_state: seed passed to every per-class draw.

    Returns:
        (X_out, y_out): sampled features and labels, grouped by class in
        sorted label order, both with a fresh 0..k-1 index. `y_out` is a
        Series named "label".

    Raises:
        ValueError: if `X` and `y` have different lengths.
    """
    import numpy as np

    labels = pd.Series(np.asarray(y), name="label")
    total = len(X)
    if len(labels) != total:
        raise ValueError(
            f"X and y must have the same length (got {total} and {len(labels)})"
        )

    # `labels` has a 0..total-1 index, so the index of each drawn sub-series
    # is the set of row positions to take from X.
    picked = []
    for _, group in labels.groupby(labels, sort=True):
        quota = min(int(n * len(group) / total), len(group))
        picked.append(group.sample(quota, random_state=random_state))

    if not picked:
        return X.iloc[0:0].reset_index(drop=True), labels.iloc[0:0]

    y_out = pd.concat(picked)
    X_out = X.iloc[y_out.index.to_numpy()]

    return X_out.reset_index(drop=True), y_out.reset_index(drop=True)


In [None]:
# Row budgets for the expensive algorithms.
MAX_DR = 10000
MAX_LOF = 8000
MAX_TSNE = 6000

# One stratified sample of the training split per budget.
X_train_dr, y_train_dr = fixed_stratified_sample(X=X_train, y=y_train, n=MAX_DR)
X_train_lof, y_train_lof = fixed_stratified_sample(X=X_train, y=y_train, n=MAX_LOF)
X_train_tsne, y_train_tsne = fixed_stratified_sample(X=X_train, y=y_train, n=MAX_TSNE)

print(X_train_dr.shape, X_train_lof.shape, X_train_tsne.shape, sep="\n")


# Dimensionality Reduction Techniques:-

In [None]:
import gc

# Drop embeddings left over from an earlier run so the cells below recompute
# them. The sampled inputs (X_train_dr / X_train_lof / X_train_tsne) are
# deliberately kept: the PCA, UMAP, t-SNE and LOF cells that follow read
# them, and deleting them here made those cells fail with NameError.
for var in [
    "X_train_pca2", "X_train_pca10",
    "X_train_umap2", "X_train_tsne2",
]:
    globals().pop(var, None)

gc.collect()

print("üßπ Clean slate: ready for correct sampling")


## PCA 

In [None]:
from sklearn.decomposition import PCA

# 2-component PCA of the DR sample, used only for plotting.
pca_2 = PCA(n_components=2, random_state=RANDOM_STATE)
X_train_pca2 = pca_2.fit_transform(X_train_dr)

# 10-component PCA fitted on the same DR sample; this projection feeds the
# clustering models below.
pca_10 = PCA(n_components=10, random_state=RANDOM_STATE)
X_train_pca10 = pca_10.fit_transform(X_train_dr)

# Project the full test set with the PCA fitted above (no refit on test data).
X_test_pca10 = pca_10.transform(X_test)

print("PCA 2D shape:", X_train_pca2.shape)
print("PCA 10D train shape:", X_train_pca10.shape)
print("PCA 10D test shape:", X_test_pca10.shape)


## UMAP

In [None]:
# 2-component UMAP of the DR sample, seeded with the notebook-wide seed.
umap_model = umap.UMAP(
    n_components=2,
    n_neighbors=30,
    min_dist=0.1,
    metric="euclidean",
    random_state=RANDOM_STATE,
    low_memory=True
)

# Rows of the embedding line up with X_train_dr / y_train_dr.
X_train_umap2 = umap_model.fit_transform(X_train_dr)

print("UMAP shape:", X_train_umap2.shape)


## tsne

In [None]:
# 2-component t-SNE of the (smaller) t-SNE sample, for visualisation only.
tsne = TSNE(
    n_components=2,
    perplexity=30,        # author's choice for the t-SNE sample (MAX_TSNE rows)
    learning_rate="auto",
    n_iter=400,           # iteration budget; NOTE(review): newer scikit-learn renames this to max_iter - confirm against the pinned version
    init="pca",           # start from a PCA layout instead of a random one
    method="barnes_hut",  # approximate gradient method - presumably chosen for speed
    angle=0.5,            # Barnes-Hut speed/accuracy trade-off, acceptable for viz
    random_state=RANDOM_STATE,
)

# Rows of the embedding line up with X_train_tsne / y_train_tsne.
X_train_tsne2 = tsne.fit_transform(X_train_tsne)

print("t-SNE shape:", X_train_tsne2.shape)


## Visualizations-

In [None]:
os.makedirs("/kaggle/working/github_outputs", exist_ok=True)

def visualize_and_save(embedding, labels, title, filename):
    """Scatter-plot a 2-D embedding by class, then save the plot and the array.

    Args:
        embedding: array-like with one row per sample; columns 0 and 1 are
            plotted.
        labels: array-like of class labels aligned row-by-row with
            `embedding`; 0 is drawn as "Non-fraud", 1 as "Fraud".
        title: figure title.
        filename: base name (no extension) of the .png and .npy files written
            under /kaggle/working/github_outputs.

    Raises:
        AssertionError: if `embedding` and `labels` differ in length.
    """
    assert len(embedding) == len(labels), "Embedding and labels size mismatch!"

    # Work on plain arrays so the boolean masks select by position even when
    # `labels` is a list or a Series with a non-default index.
    embedding = np.asarray(embedding)
    labels = np.asarray(labels)

    fig = plt.figure(figsize=(6,5))

    normal = labels == 0
    fraud = labels == 1

    # Majority class first (small, faint) so the fraud points are drawn on top.
    plt.scatter(
        embedding[normal, 0], embedding[normal, 1],
        s=3, alpha=0.2, c="blue", label="Non-fraud"
    )
    plt.scatter(
        embedding[fraud, 0], embedding[fraud, 1],
        s=40, alpha=0.9, c="red", label="Fraud"
    )

    plt.title(title)
    plt.legend()
    plt.tight_layout()

    # Save figure (before show, which may release the canvas)
    img_path = f"/kaggle/working/github_outputs/{filename}.png"
    plt.savefig(img_path, dpi=300)

    plt.show()
    # Release the figure explicitly; a no-op if the backend already closed it.
    plt.close(fig)

    # Save embedding
    emb_path = f"/kaggle/working/github_outputs/{filename}.npy"
    np.save(emb_path, embedding)

    print(f"‚úÖ Image saved ‚Üí {img_path}")
    print(f"‚úÖ Embedding saved ‚Üí {emb_path}")


In [None]:
# PCA and UMAP were computed on the DR sample, t-SNE on its own sample, so
# each embedding is paired with the labels of the sample it came from.
visualize_and_save(
    embedding=X_train_pca2, labels=y_train_dr,
    title="PCA 2D Visualization", filename="pca_2d",
)
visualize_and_save(
    embedding=X_train_umap2, labels=y_train_dr,
    title="UMAP 2D Visualization", filename="umap_2d",
)
visualize_and_save(
    embedding=X_train_tsne2, labels=y_train_tsne,
    title="t-SNE 2D Visualization", filename="tsne_2d",
)


# Standard evaluations

## Score evaluation function

In [None]:
# STEP 5A: Anomaly score evaluation

def evaluate_scores(y_true, anomaly_scores, model_name):
    """Print and return ROC-AUC and average precision for one set of scores.

    Args:
        y_true: ground-truth labels.
        anomaly_scores: one score per sample, passed unchanged to both
            metric functions.
        model_name: label used in the printed line and the returned record.

    Returns:
        dict with keys "model", "roc_auc" and "pr_auc".
    """
    auc_roc = roc_auc_score(y_true, anomaly_scores)
    auc_pr = average_precision_score(y_true, anomaly_scores)
    record = {"model": model_name, "roc_auc": auc_roc, "pr_auc": auc_pr}

    print(f"{model_name}: ROC-AUC = {auc_roc:.4f}, PR-AUC = {auc_pr:.4f}")
    return record


## Cluster to score mapping

In [None]:
def cluster_labels_to_scores(cluster_labels, y_true):
    """Score every sample with the mean label of the cluster it belongs to.

    Args:
        cluster_labels: array-like of cluster ids, one per sample.
        y_true: array-like (Series, ndarray or list) of labels aligned
            row-by-row with `cluster_labels`.

    Returns:
        (scores, mapping): `scores` is an array with one value per sample,
        the mean of `y_true` within that sample's cluster; `mapping` is the
        ``{cluster_id: mean_label}`` dict, which can be applied to cluster
        assignments of other data.
    """
    # Plain arrays: rows are paired by position, whatever index the inputs carry.
    temp = pd.DataFrame({
        "cluster": np.asarray(cluster_labels),
        "label": np.asarray(y_true),
    })

    fraud_rate = temp.groupby("cluster")["label"].mean()
    mapping = fraud_rate.to_dict()

    # Vectorised lookup instead of a Python-level loop over every sample.
    scores = temp["cluster"].map(fraud_rate).to_numpy()
    return scores, mapping


# Clustering Algorithms

## Basic Clusterings

### A. K-means

In [None]:
# Fresh registries (these are lost after a kernel restart):
# fitted models keyed by name, and one metrics record per evaluated model.
all_metrics = []
trained_models = {}

In [None]:
from sklearn.cluster import KMeans

kmeans = KMeans(
    n_clusters=5,
    random_state=RANDOM_STATE,
    n_init=10
)

kmeans.fit(X_train_pca10)
trained_models["kmeans_pca10"] = kmeans

# Cluster assignments for the training sample and the held-out test set.
train_clusters = kmeans.predict(X_train_pca10)
test_clusters = kmeans.predict(X_test_pca10)

# Per-cluster fraud rate, learned from the training sample only.
scores_kmeans, mapping_kmeans = cluster_labels_to_scores(train_clusters, y_train_dr)

# In-sample check. These scores are derived from the same labels they are
# evaluated against, so this number is optimistic by construction.
evaluate_scores(y_train_dr, scores_kmeans, "KMeans_PCA10")

# Held-out evaluation: apply the training-derived fraud rates to the test
# clusters (a cluster with no training members scores 0.0) and record the
# result alongside the other models.
scores_test_kmeans = (
    pd.Series(test_clusters).map(mapping_kmeans).fillna(0.0).to_numpy()
)
metrics = evaluate_scores(y_test, scores_test_kmeans, "KMeans_PCA10_test")
all_metrics.append(metrics)




### B. Gaussian mixture models and expectation maximization

In [None]:
from sklearn.mixture import GaussianMixture
gmm = GaussianMixture(
    n_components=5,
    covariance_type="full",
    random_state=RANDOM_STATE
)

gmm.fit(X_train_pca10)

# Hard component assignments for the training sample and the test set.
train_clusters_gmm = gmm.predict(X_train_pca10)
test_clusters_gmm  = gmm.predict(X_test_pca10)

trained_models["gmm_pca10"] = gmm

# Per-component fraud rate, learned from the training sample only.
scores_train_gmm, mapping_gmm = cluster_labels_to_scores(train_clusters_gmm, y_train_dr)

# Score the test set with the TRAINING mapping. Building the mapping from
# y_test would leak the test labels into the scores being evaluated. A
# component with no training members scores 0.0.
scores_test_gmm = (
    pd.Series(test_clusters_gmm).map(mapping_gmm).fillna(0.0).to_numpy()
)

metrics = evaluate_scores(y_test, scores_test_gmm, "GMM_PCA10")
all_metrics.append(metrics)



## Advanced Clustering 

In [None]:
# Recreate the registries only when they are missing (e.g. after a kernel
# restart). Resetting them unconditionally discarded the models and metrics
# registered by the cells above.
if "trained_models" not in globals():
    trained_models = {}
if "all_metrics" not in globals():
    all_metrics = []



### A. DBSCAN on UMAP-2

In [None]:
# Density-based clustering of the 2-D UMAP embedding of the DR sample.
dbscan = DBSCAN(eps=0.5, min_samples=10, n_jobs=-1)
db_labels = dbscan.fit_predict(X_train_umap2)

# Score each point by the fraud rate of the label DBSCAN assigned to it.
# The evaluation is on the same sample the scores were derived from.
scores_db, mapping_db = cluster_labels_to_scores(
    cluster_labels=db_labels, y_true=y_train_dr
)
evaluate_scores(
    y_true=y_train_dr, anomaly_scores=scores_db, model_name="DBSCAN_UMAP"
)

trained_models["dbscan_umap"] = dbscan


###  B. Spectral clustering

In [None]:
# Spectral clustering was skipped: scikit-learn warns "Graph is not fully connected, spectral embedding may not work".
# Several parameter combinations were tried, and the reasons for the failure were researched, without finding a working setup.

In [None]:
import os
import numpy as np
import pandas as pd
import joblib

SAVE_DIR = "/kaggle/working/github_outputs"
os.makedirs(SAVE_DIR, exist_ok=True)

print("üìÅ Saving all dimensionality reduction & clustering outputs...")


def _write_npy(value, path):
    """Save `value` as a .npy file at `path`."""
    np.save(path, value)


def _write_pkl(value, path):
    """Pickle `value` to `path` with joblib."""
    joblib.dump(value, path)


def _save_optional(file_name, candidates, writer):
    """Save the first notebook global named in `candidates`, if any exists.

    Prints which name was used, or that the file was skipped, so a missing
    output is visible instead of being silently swallowed.
    Returns True when a file was written, False otherwise.
    """
    for var_name in candidates:
        if var_name in globals():
            writer(globals()[var_name], f"{SAVE_DIR}/{file_name}")
            print(f"Saved {file_name} (from '{var_name}')")
            return True
    print(f"Skipped {file_name}: none of {list(candidates)} is defined")
    return False


# 1. Save DR embeddings (required: a missing one is an error, as before)
np.save(f"{SAVE_DIR}/pca_2d.npy", X_train_pca2)
np.save(f"{SAVE_DIR}/pca_10d_train.npy", X_train_pca10)
np.save(f"{SAVE_DIR}/pca_10d_test.npy", X_test_pca10)

np.save(f"{SAVE_DIR}/umap_2d.npy", X_train_umap2)
np.save(f"{SAVE_DIR}/tsne_2d.npy", X_train_tsne2)

# 2. Save smart sampling indices, if a sampling step produced them.
# The sampling helper in this notebook returns re-indexed frames, not
# indices, so these are normally absent and are skipped rather than raising
# NameError.
for _idx_name in ("idx_dr", "idx_lof", "idx_tsne"):
    _save_optional(f"{_idx_name}.npy", (_idx_name,), _write_npy)

# 3. Save clustering labels if they exist. The second candidate is the name
# the clustering cells of this notebook actually assign.
_save_optional("kmeans_labels.npy", ("kmeans_labels", "train_clusters"), _write_npy)
_save_optional("gmm_labels.npy", ("gmm_labels", "train_clusters_gmm"), _write_npy)
_save_optional("dbscan_labels.npy", ("dbscan_labels", "db_labels"), _write_npy)
_save_optional("spec_labels.npy", ("spec_labels",), _write_npy)

# 4. Save clustering models if they exist
_save_optional("kmeans_model.pkl", ("kmeans",), _write_pkl)
_save_optional("gmm_model.pkl", ("gmm",), _write_pkl)

# 5. Save metrics collected till now
if "all_metrics" in globals():
    metrics_df = pd.DataFrame(all_metrics)
    metrics_df.to_csv(f"{SAVE_DIR}/clustering_metrics.csv", index=False)
else:
    print("Skipped clustering_metrics.csv: 'all_metrics' is not defined")

print("‚úÖ All results saved successfully to github_outputs/")


In [None]:
import gc

# Free only the 2-D embeddings, which nothing below reads. The sampled
# frames stay alive: X_train_lof is the training set of the LOF cell, and
# X_train_dr / X_train_tsne are released by the dedicated cleanup cell after
# the anomaly detectors have run.
for var in ["X_train_pca2", "X_train_umap2"]:
    globals().pop(var, None)

gc.collect()

print("üßπ Cleanup done. Safe to start anomaly detection.")


# Anomaly Detection

### Isolation Forest

In [None]:
# Isolation Forest fitted on the full scaled training split.
iso = IsolationForest(
    n_estimators=200, contamination="auto",
    random_state=RANDOM_STATE, n_jobs=-1,
)
iso.fit(X_train)
trained_models["isolation_forest"] = iso

# Negated decision_function is used as the anomaly score fed to the metrics.
iso_test_scores = -iso.decision_function(X_test)

metrics = evaluate_scores(
    y_true=y_test, anomaly_scores=iso_test_scores, model_name="IsolationForest"
)
all_metrics.append(metrics)


### Local outlier factor

In [None]:
# LOF in novelty mode, so it can score data it was not fitted on.
lof = LocalOutlierFactor(n_neighbors=40, novelty=True, n_jobs=-1)

# Fitted on the sampled training set rather than the full split.
lof.fit(X_train_lof)
trained_models["lof"] = lof

# Negated decision_function is used as the anomaly score fed to the metrics.
lof_test_scores = -lof.decision_function(X_test)

metrics = evaluate_scores(
    y_true=y_test, anomaly_scores=lof_test_scores, model_name="LOF_sampled"
)
all_metrics.append(metrics)


### one class SVM 

In [None]:
# Project the LOF-sized training sample with the already-fitted 10-component
# PCA. The previous `X_train_pca10[idx_lof]` used an index array that is
# never defined, and X_train_pca10 holds the DR sample, not the rows of
# X_train_lof.
X_train_pca10_lof = pca_10.transform(X_train_lof)

ocsvm = OneClassSVM(
    kernel='rbf',
    nu=0.01,
    gamma='scale'
)
ocsvm.fit(X_train_pca10_lof)

trained_models["oneclass_svm"] = ocsvm

# Negated decision_function is used as the anomaly score fed to the metrics;
# the test set was projected with the same PCA.
ocsvm_test_scores = -ocsvm.decision_function(X_test_pca10)

metrics = evaluate_scores(y_test, ocsvm_test_scores, "OneClassSVM")
all_metrics.append(metrics)


In [None]:
# Refit the 10-component PCA on the full scaled training split (replacing
# the earlier fit on the DR sample) and re-project the test set with it.
# Models fitted before this cell on the sample-based projection are not
# compatible with the arrays produced here.
pca_10 = PCA(n_components=10, random_state=RANDOM_STATE)
X_train_pca10 = pca_10.fit_transform(X_train)
X_test_pca10  = pca_10.transform(X_test)

# LOF sample in the refitted PCA space. The sample is projected directly,
# because no `idx_lof` index array exists in this notebook.
X_train_pca10_lof = pca_10.transform(X_train_lof)


In [None]:
# Project the LOF sample with the current PCA. `idx_lof` is never defined in
# this notebook, so indexing X_train_pca10 with it raised NameError.
X_train_pca10_lof = pca_10.transform(X_train_lof)


#### Elliptic Envelope

In [None]:
# Elliptic envelope fitted on the current 10-component PCA projection.
ell = EllipticEnvelope(contamination=0.0017, random_state=RANDOM_STATE)
ell.fit(X_train_pca10)
trained_models["elliptic_envelope"] = ell

# Negated decision_function is used as the anomaly score fed to the metrics.
ell_test_scores = -ell.decision_function(X_test_pca10)

metrics = evaluate_scores(
    y_true=y_test, anomaly_scores=ell_test_scores, model_name="EllipticEnvelope"
)
all_metrics.append(metrics)


## Saving metrics

In [None]:
# One row per evaluated model, highest PR-AUC first, with a fresh row index.
metrics_df = (
    pd.DataFrame(all_metrics)
    .sort_values(by="pr_auc", ascending=False)
    .reset_index(drop=True)
)

metrics_df


In [None]:
import joblib
import os
import zipfile

BASE = "/kaggle/working"

# 1. Pickle every registered model as <BASE>/<name>.pkl
for name, model in trained_models.items():
    joblib.dump(model, f"{BASE}/{name}.pkl")

# 2. Write the metrics table
metrics_df.to_csv(f"{BASE}/anomaly_detection_results.csv", index=False)

# 3. Bundle every .pkl / .csv that sits directly under BASE into one archive
zip_path = f"{BASE}/anomaly_detection_outputs.zip"
with zipfile.ZipFile(zip_path, "w") as zipf:
    for file in os.listdir(BASE):
        if file.endswith((".pkl", ".csv")):
            zipf.write(os.path.join(BASE, file), arcname=file)

zip_path


In [None]:
# Clear unnecessary objects to free RAM
import gc

# Remove each sample only if it still exists. Some of these names may
# already have been released by an earlier cleanup cell, and a bare `del`
# on a missing name raises NameError.
for _name in (
    "X_train_lof", "y_train_lof",
    "X_train_dr", "y_train_dr",
    "X_train_tsne", "y_train_tsne",
):
    globals().pop(_name, None)

gc.collect()


In [None]:
# RAM check
import psutil

def check_ram():
    """Print the system memory currently in use, converted from bytes with 1024**3."""
    used_bytes = psutil.virtual_memory().used
    print(f"RAM Used: {used_bytes / (1024**3):.2f} GB")

check_ram()


# Supervised Baseline model

In [None]:
from sklearn.linear_model import LogisticRegression

# Supervised reference model: class-weighted logistic regression trained on
# the full scaled training split with its labels.
log_reg = LogisticRegression(class_weight="balanced", max_iter=1000, n_jobs=-1)
log_reg.fit(X_train, y_train)
trained_models["logistic_regression"] = log_reg

# Column 1 of predict_proba is used as the score.
log_proba = log_reg.predict_proba(X_test)[:, 1]

metrics = evaluate_scores(
    y_true=y_test, anomaly_scores=log_proba, model_name="LogisticRegression"
)
all_metrics.append(metrics)


# Compare models and plots

In [None]:
# Rebuild the comparison table now that the supervised baseline is included:
# one row per model, highest PR-AUC first, with a fresh row index.
metrics_df = (
    pd.DataFrame(all_metrics)
    .sort_values(by="pr_auc", ascending=False)
    .reset_index(drop=True)
)
metrics_df

In [None]:
def plot_pr(models_dict, y_true=None):
    """Draw one precision-recall curve per model on a shared figure.

    Args:
        models_dict: mapping of display name -> score array, one score per
            evaluated sample.
        y_true: ground-truth labels the scores are compared against.
            Defaults to the notebook-level `y_test`, which is what this
            function always used before the parameter existed.
    """
    if y_true is None:
        y_true = y_test

    fig = plt.figure(figsize=(7,6))

    for name, scores in models_dict.items():
        prec, rec, _ = precision_recall_curve(y_true, scores)
        plt.plot(rec, prec, label=name)

    plt.xlabel("Recall")
    plt.ylabel("Precision")
    plt.title("Precision-Recall Curves")
    # A legend with no curves only produces a warning, so skip it when empty.
    if models_dict:
        plt.legend()
    plt.show()
    # Release the figure explicitly; a no-op if the backend already closed it.
    plt.close(fig)

# Test-set scores of every detector evaluated above, plus the supervised
# baseline. EllipticEnvelope was evaluated with the others but had been left
# out of this comparison.
scores_dict = {
    "IsolationForest": iso_test_scores,
    "LOF": lof_test_scores,
    "OneClassSVM": ocsvm_test_scores,
    "EllipticEnvelope": ell_test_scores,
    "LogReg": log_proba
}

plot_pr(scores_dict)


# Final save all models and results

In [None]:
# FINAL SAVE CELL

import joblib
import os
import zipfile

BASE = "/kaggle/working"

# 1. Save models: names containing "autoencoder" are written as a torch
# state_dict (.pt); everything else is pickled with joblib (.pkl).
for name, model in trained_models.items():
    if "autoencoder" not in name:
        joblib.dump(model, f"{BASE}/{name}.pkl")
        continue
    torch.save(model.state_dict(), f"{BASE}/{name}.pt")

# 2. Save metrics
metrics_df.to_csv(f"{BASE}/final_metrics.csv", index=False)

# 3. Zip every .pkl / .pt / .csv that sits directly under BASE
zip_path = f"{BASE}/credit_fraud_outputs.zip"
with zipfile.ZipFile(zip_path, 'w') as z:
    for file in os.listdir(BASE):
        if file.endswith((".pkl", ".pt", ".csv")):
            z.write(os.path.join(BASE, file), arcname=file)

zip_path
