In [None]:
!pip install streamlit pyngrok --quiet

[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m10.0/10.0 MB[0m [31m65.2 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m6.9/6.9 MB[0m [31m75.8 MB/s[0m eta [36m0:00:00[0m
[?25h

In [None]:
# Mount Google Drive; the app later reads its artifacts from under this mount.
from google.colab import drive

DRIVE_MOUNT_POINT = "/content/drive"
drive.mount(DRIVE_MOUNT_POINT)

Mounted at /content/drive


In [None]:
code_blocks = """
import os
import numpy as np
import pandas as pd
import streamlit as st
import torch
import torch.nn as nn
import torch.nn.functional as F
import joblib
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.metrics import (
    confusion_matrix, roc_auc_score, average_precision_score,
    precision_score, recall_score, f1_score,
    RocCurveDisplay, PrecisionRecallDisplay
)

# ------------------------------
# App
# ------------------------------
# Page chrome: configure the page first, then render the title.
st.set_page_config(
    page_title="Credit Card Fraud Detection",
    layout="wide",
)
st.title("🕵️ Credit Card Fraud Detection (SCARF Embeddings + Anomaly Detectors)")

# ------------------------------
# Sidebar (fixed options)
# ------------------------------
EMBEDDING_CHOICES = ["InfoNCE", "BarlowTwins", "VICReg"]
DETECTOR_CHOICES = ["DeepSVDD", "Autoencoder", "IsolationForest"]

sidebar = st.sidebar
sidebar.header("🔧 Select Model and Method")
embedding_type = sidebar.selectbox("Choose SCARF Embedding", EMBEDDING_CHOICES)
detector_type = sidebar.selectbox("Choose Anomaly Detection Method", DETECTOR_CHOICES)

# The uploader is rendered in the main page area, not in the sidebar.
uploaded_file = st.file_uploader("Upload a CSV file", type=["csv"])

# ------------------------------
# Paths (must match training)
# ------------------------------
ARTIFACTS_DIR = "/content/drive/MyDrive/credit_fraud_scarf/artifacts"

# One encoder checkpoint per embedding choice; the file name embeds the
# UI label verbatim (scarf_encoder_<label>.pth).
ENCODER_PATHS = {
    label: os.path.join(ARTIFACTS_DIR, f"scarf_encoder_{label}.pth")
    for label in ("InfoNCE", "BarlowTwins", "VICReg")
}

# Map UI name -> your saved lowercase suffixes
EMB_SUFFIX = {
    "InfoNCE": "infonce",
    "BarlowTwins": "barlow",
    "VICReg": "vicreg",
}


def get_detector_paths(selected_embedding: str):
    # Build the artifact path of every detector for the chosen embedding.
    # Returns a dict keyed by detector UI name. An embedding that is not a
    # key of EMB_SUFFIX raises KeyError.
    suffix = EMB_SUFFIX[selected_embedding]
    file_names = {
        "DeepSVDD": f"deepsvdd_center_{suffix}.npy",
        "Autoencoder": f"ae_best_{suffix}.pth",
        "IsolationForest": f"iforest_model_{suffix}.pkl",
    }
    return {
        detector: os.path.join(ARTIFACTS_DIR, file_name)
        for detector, file_name in file_names.items()
    }

# CSV side-artifacts stored in the same directory as the model files.
SCALER_CSV, THRESHOLDS_CSV = (
    os.path.join(ARTIFACTS_DIR, csv_name)
    for csv_name in ("scaling_params.csv", "thresholds.csv")
)

# ------------------------------
# Models
# ------------------------------
class MLPEncoder(nn.Module):
    # MLP encoder: num_layers blocks of Linear -> BatchNorm1d -> ReLU,
    # followed by one final Linear projection.
    # num_layers=2 to match your training checkpoints
    def __init__(self, input_dim=30, hidden_dim=128, num_layers=2, output_dim=128):
        super().__init__()
        stack = []
        width_in = input_dim
        for _ in range(num_layers):
            stack.extend((
                nn.Linear(width_in, hidden_dim),
                nn.BatchNorm1d(hidden_dim),
                nn.ReLU(),
            ))
            # every block after the first maps hidden_dim -> hidden_dim
            width_in = hidden_dim
        # final projection used for embeddings
        stack.append(nn.Linear(hidden_dim, output_dim))
        # The attribute name and module order determine the state_dict keys.
        self.net = nn.Sequential(*stack)

    def forward(self, x):
        # Run the input through the whole stack and return the projection output.
        return self.net(x)

class AE(nn.Module):
    # Autoencoder with a single 64-unit bottleneck: dim -> 64 (ReLU) -> dim.
    # matches your checkpoint: 128 -> 64 -> 128
    def __init__(self, dim):
        super().__init__()
        bottleneck = 64
        compress = nn.Linear(dim, bottleneck)
        expand = nn.Linear(bottleneck, dim)
        # Both halves stay wrapped in Sequential so parameter names keep the
        # "encoder.0.*" / "decoder.0.*" form.
        self.encoder = nn.Sequential(compress, nn.ReLU())
        self.decoder = nn.Sequential(expand)

    def forward(self, x):
        # Encode to the bottleneck, then reconstruct at the input width.
        latent = self.encoder(x)
        return self.decoder(latent)

# ------------------------------
# Helpers
# ------------------------------
def must_exist(path, label):
    # Require that `path` exists. If it does not, show an error naming
    # `label` and the path, then stop the Streamlit script.
    if os.path.exists(path):
        return
    st.error(f"Missing {label}: {path}")
    st.stop()

def load_fixed_threshold(thr_csv, emb_name, det_name):
    \"\"\"Fixed threshold only (no fallback), tolerant to common aliases.\"\"\"
    must_exist(thr_csv, "thresholds.csv")
    df = pd.read_csv(thr_csv)

    # normalize headers
    df.columns = [c.strip().lower() for c in df.columns]
    col_emb = "embedding_type" if "embedding_type" in df.columns else ("embedding" if "embedding" in df.columns else None)
    col_met = "method"         if "method" in df.columns         else ("detector" if "detector" in df.columns else None)
    if not col_emb or not col_met or "threshold" not in df.columns:
        st.error("thresholds.csv must contain columns: embedding_type (or embedding), method (or detector), threshold")
        st.stop()

    # alias maps (lowercase)
    embed_map = {"infonce":"infonce","barlow":"barlowtwins","barlowtwins":"barlowtwins","vicreg":"vicreg"}
    method_map = {"deepsvdd":"deepsvdd","deep_svdd":"deepsvdd","svdd":"deepsvdd",
                  "autoencoder":"autoencoder","ae":"autoencoder",
                  "isolationforest":"isolationforest","iforest":"isolationforest","isoforest":"isolationforest"}

    # normalize CSV values
    df[col_emb] = df[col_emb].astype(str).str.strip().str.lower().map(lambda x: embed_map.get(x, x))
    df[col_met] = df[col_met].astype(str).str.strip().str.lower().map(lambda x: method_map.get(x, x))

    # normalize keys from UI
    key_emb = embed_map.get(emb_name.lower().strip(), emb_name.lower().strip())
    key_met = method_map.get(det_name.lower().strip(), det_name.lower().strip())

    row = df[(df[col_emb] == key_emb) & (df[col_met] == key_met)]
    if len(row) != 1:
        with st.expander("Thresholds debug"):
            st.write("Unique embeddings:", sorted(df[col_emb].unique()))
            st.write("Unique methods:", sorted(df[col_met].unique()))
            st.write("Looking for:", {"embedding_type": key_emb, "method": key_met})
            st.dataframe(df[[col_emb, col_met, "threshold"]])
        st.error(f"No fixed threshold found for {emb_name} + {det_name} in thresholds.csv.")
        st.stop()

    with st.expander("Preview thresholds.csv"):
        st.dataframe(pd.read_csv(thr_csv))
        st.write("**Using threshold row:**")
        st.dataframe(row.rename(columns={col_emb:"embedding_type", col_met:"method"})[["embedding_type","method","threshold"]])

    return float(row["threshold"].values[0])

# ------------------------------
# Main flow
# ------------------------------
if uploaded_file is not None:
    # Pipeline for an uploaded CSV:
    # read -> scale -> embed -> score -> threshold -> (optional) evaluate -> download.
    df = pd.read_csv(uploaded_file)
    if df.empty:
        st.warning("The uploaded CSV contains no rows.")
        st.stop()

    # scale Time/Amount if scaler csv exists
    if os.path.exists(SCALER_CSV):
        sc = pd.read_csv(SCALER_CSV)
        for col in ["Time", "Amount"]:
            if col in df.columns and col in sc["column"].values:
                mean = sc.loc[sc["column"] == col, "mean"].values[0]
                std = sc.loc[sc["column"] == col, "std"].values[0]
                # Fall back to 1.0 for a zero or missing std. NaN is truthy,
                # so a plain "std or 1.0" would let a NaN through.
                if not np.isfinite(std) or std == 0:
                    std = 1.0
                df[col] = (df[col] - mean) / std
        st.success("✅ Data uploaded and standardized.")
    else:
        st.info("ℹ️ scaling_params.csv not found — continuing without scaling.")

    st.write("### Preview of Processed Data:")
    st.dataframe(df.head())

    # load encoder
    enc_path = ENCODER_PATHS[embedding_type]
    must_exist(enc_path, f"{embedding_type} encoder")
    encoder = MLPEncoder()
    state = torch.load(enc_path, map_location="cpu")
    encoder.load_state_dict(state, strict=True)
    encoder.eval()
    st.success(f"✅ Loaded SCARF encoder: {embedding_type}")

    # validate the feature matrix before it reaches the encoder, so a bad
    # upload gives a readable message instead of a tensor shape error
    features = df.drop(columns=["Class"], errors="ignore")
    expected_dim = encoder.net[0].in_features
    if features.shape[1] != expected_dim:
        st.error(
            f"Encoder expects {expected_dim} feature columns but the uploaded CSV has "
            f"{features.shape[1]} (excluding 'Class')."
        )
        st.stop()
    try:
        X = features.values.astype(np.float32)
    except (TypeError, ValueError):
        st.error("All feature columns must be numeric.")
        st.stop()
    if not np.isfinite(X).all():
        # Non-finite inputs give NaN scores, which never exceed the threshold.
        st.error("Feature columns contain missing or non-finite values.")
        st.stop()

    # embeddings
    with torch.no_grad():
        embeddings = encoder(torch.from_numpy(X)).numpy()
    st.success("✅ Embeddings generated.")
    st.write("### Sample Embeddings (first 5 rows):")
    st.dataframe(pd.DataFrame(embeddings[:5]))

    # fixed threshold (no fallback)
    threshold = load_fixed_threshold(THRESHOLDS_CSV, embedding_type, detector_type)

    # detector scoring (uses lowercase suffix paths)
    DETECTOR_PATHS = get_detector_paths(embedding_type)
    det_path = DETECTOR_PATHS[detector_type]
    must_exist(det_path, f"{detector_type} model")

    if detector_type == "IsolationForest":
        # NOTE: joblib.load unpickles the file; only load artifacts you trust.
        clf = joblib.load(det_path)
        scores = -clf.decision_function(embeddings)  # higher = more anomalous

    elif detector_type == "DeepSVDD":
        center = np.load(det_path)
        if center.shape[0] != embeddings.shape[1]:
            st.error(f"DeepSVDD center dim {center.shape[0]} != embedding dim {embeddings.shape[1]}")
            st.stop()
        # squared distance to align with training thresholds
        diff = embeddings - center
        scores = (diff * diff).sum(axis=1)

    elif detector_type == "Autoencoder":
        ae = AE(dim=embeddings.shape[1])
        ae.load_state_dict(torch.load(det_path, map_location="cpu"), strict=True)
        ae.eval()
        with torch.no_grad():
            t = torch.tensor(embeddings, dtype=torch.float32)
            recon = ae(t)
            # Use SSE per sample (sum of squared error) to match training thresholds
            diff = recon - t
            scores = (diff * diff).sum(dim=1).cpu().numpy()

    else:
        # Without this branch, scores would be undefined for an unknown detector.
        st.error(f"Unsupported detector: {detector_type}")
        st.stop()

    # predictions with fixed threshold
    df_results = df.copy()
    df_results["anomaly_score"] = scores
    df_results["is_fraud"] = (df_results["anomaly_score"] > threshold).astype(int)

    st.success(f"✅ Flagged {int(df_results['is_fraud'].sum())} transactions (fixed threshold = {threshold:.6f})")
    st.write("### 🔍 Top-K Flagged Transactions")
    st.dataframe(df_results[df_results["is_fraud"] == 1]
                 .sort_values(by="anomaly_score", ascending=False)
                 .head(10))

    # evaluation (only if labels provided)
    if "Class" in df.columns:
        y_true = df["Class"].values.astype(int)
        y_pred = df_results["is_fraud"].values
        st.subheader("📊 Evaluation Metrics (fixed threshold)")

        # Ranking metrics need both classes in the labels; roc_auc_score
        # raises ValueError when only one class is present.
        has_both_classes = np.unique(y_true).size > 1
        if has_both_classes:
            st.write(f"**ROC-AUC:** {roc_auc_score(y_true, scores):.4f}")
            st.write(f"**PR-AUC:** {average_precision_score(y_true, scores):.4f}")
        else:
            st.info("ROC-AUC, PR-AUC and their curves are skipped because 'Class' contains a single class.")
        st.write(f"**Precision:** {precision_score(y_true, y_pred, zero_division=0):.4f}")
        st.write(f"**Recall:** {recall_score(y_true, y_pred, zero_division=0):.4f}")
        st.write(f"**F1-score:** {f1_score(y_true, y_pred, zero_division=0):.4f}")

        # labels=[0, 1] keeps the matrix 2x2 even when a class is absent
        cm = confusion_matrix(y_true, y_pred, labels=[0, 1])
        fig_cm, ax_cm = plt.subplots()
        sns.heatmap(cm, annot=True, fmt="d", cmap="Blues", ax=ax_cm)
        ax_cm.set_xlabel("Predicted"); ax_cm.set_ylabel("Actual"); ax_cm.set_title("Confusion Matrix")
        st.pyplot(fig_cm)
        plt.close(fig_cm)  # release the figure; the script re-runs on every interaction

        if has_both_classes:
            fig_roc, ax_roc = plt.subplots()
            RocCurveDisplay.from_predictions(y_true, scores, ax=ax_roc)
            st.pyplot(fig_roc)
            plt.close(fig_roc)

            fig_pr, ax_pr = plt.subplots()
            PrecisionRecallDisplay.from_predictions(y_true, scores, ax=ax_pr)
            st.pyplot(fig_pr)
            plt.close(fig_pr)

    # download predictions
    st.download_button(
        "⬇️ Download predictions CSV",
        df_results.to_csv(index=False).encode("utf-8"),
        file_name=f"predictions_{embedding_type}_{detector_type}.csv",
        mime="text/csv",
        use_container_width=True
    )
"""



In [None]:
# Write the embedded app source to streamlit_app.py in the working directory.
# The source contains emoji, so the encoding is pinned to UTF-8 instead of
# relying on the platform default.
with open("streamlit_app.py", "w", encoding="utf-8") as f:
    f.write(code_blocks)

In [None]:
# Save your code to Drive
# Done in Python rather than with shell escapes so that a failed copy raises
# and the success message below is only printed when the file was written.
import shutil
from pathlib import Path

app_src = Path("/content/streamlit_app.py")
drive_app_dir = Path("/content/drive/MyDrive/credit_fraud_scarf")
drive_app_dir.mkdir(parents=True, exist_ok=True)
saved_to = shutil.copy(app_src, drive_app_dir / app_src.name)
print(f"✅ Saved to {saved_to}")

✅ Saved to /content/drive/MyDrive/credit_fraud_scarf/streamlit_app.py


In [None]:
!pip install streamlit pyngrok --quiet
from pyngrok import ngrok

In [None]:
import os
from getpass import getpass

from pyngrok import ngrok

# Keep the token out of the notebook source: take it from the NGROK_AUTHTOKEN
# environment variable, or ask for it with a prompt that does not echo input.
ngrok_auth_token = os.environ.get("NGROK_AUTHTOKEN") or getpass("ngrok authtoken: ")
ngrok.set_auth_token(ngrok_auth_token)



In [None]:
# Kill any previous tunnels
!pkill streamlit

# Start the app in the background
!streamlit run /content/drive/MyDrive/credit_fraud_scarf/streamlit_app.py &> /dev/null &

# Wait a moment to let the server start
import time
time.sleep(5)

# Create a public URL using ngrok
from pyngrok import ngrok
public_url = ngrok.connect(8501)
print(f"🚀 Your Streamlit app is live: {public_url}")

🚀 Your Streamlit app is live: NgrokTunnel: "https://e61238f0961f.ngrok-free.app" -> "http://localhost:8501"
