In [1]:
!pip install torch_geometric

Collecting torch_geometric
  Downloading torch_geometric-2.6.1-py3-none-any.whl.metadata (63 kB)
[?25l     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/63.1 kB[0m [31m?[0m eta [36m-:--:--[0m[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m63.1/63.1 kB[0m [31m2.8 MB/s[0m eta [36m0:00:00[0m
Downloading torch_geometric-2.6.1-py3-none-any.whl (1.1 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.1/1.1 MB[0m [31m28.3 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: torch_geometric
Successfully installed torch_geometric-2.6.1


In [2]:
#imporing the neccecery lib
import torch
import torch.nn as nn
import torch.nn.functional as F
import random
import numpy as np
import pandas as pd
from torch_geometric.nn import GATConv, GCNConv, GraphConv
from torch_geometric.data import Data, DataLoader
from sklearn.model_selection import StratifiedKFold, train_test_split
from sklearn.preprocessing import StandardScaler, RobustScaler
from sklearn.metrics import precision_score, recall_score, f1_score, roc_auc_score, average_precision_score
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.manifold import TSNE
import matplotlib.pyplot as plt
from torch.optim.lr_scheduler import ReduceLROnPlateau
import time


In [3]:
# Seed setting for reproducibility
def set_seed(seed=42):
    torch.manual_seed(seed)
    torch.cuda.manual_seed_all(seed)
    np.random.seed(seed)
    random.seed(seed)
    torch.backends.cudnn.deterministic = True
    torch.backends.cudnn.benchmark = False

set_seed()


In [4]:
# Attention module for feature enhancement
class FeatureAttention(nn.Module):
    def __init__(self, in_features):
        super(FeatureAttention, self).__init__()
        self.attention = nn.Sequential(
            nn.Linear(in_features, in_features // 2),
            nn.LeakyReLU(0.2),
            nn.Linear(in_features // 2, in_features),
            nn.Sigmoid()
        )

    def forward(self, x):
        att_weights = self.attention(x)
        return x * att_weights


In [5]:
# Hybrid GNN-VAE-GRU Model
class EnhancedHybridGNNVAEGRU(nn.Module):
    def __init__(self, in_channels, hidden_channels, out_channels, latent_dim, heads=4, dropout=0.3):
        super(EnhancedHybridGNNVAEGRU, self).__init__()

        # Feature attention
        self.feature_attention = FeatureAttention(in_channels)

        # Multi-layer GNN with residual connections
        self.conv1 = GATConv(in_channels, hidden_channels, heads=heads, dropout=dropout)
        self.conv2 = GCNConv(hidden_channels * heads, hidden_channels * heads)
        self.conv3 = GraphConv(hidden_channels * heads, out_channels)

        # Normalization layers
        self.norm1 = nn.LayerNorm(hidden_channels * heads)
        self.norm2 = nn.LayerNorm(hidden_channels * heads)
        self.norm3 = nn.LayerNorm(out_channels)

        # VAE components
        self.fc_mu = nn.Sequential(
            nn.Linear(out_channels, latent_dim * 2),
            nn.LeakyReLU(0.2),
            nn.Dropout(dropout),
            nn.Linear(latent_dim * 2, latent_dim)
        )

        self.fc_logvar = nn.Sequential(
            nn.Linear(out_channels, latent_dim * 2),
            nn.LeakyReLU(0.2),
            nn.Dropout(dropout),
            nn.Linear(latent_dim * 2, latent_dim)
        )

        # Bidirectional GRU for temporal modeling
        self.gru = nn.GRU(
            latent_dim,
            latent_dim,
            num_layers=3,
            batch_first=True,
            bidirectional=True,
            dropout=dropout
        )

        # MLP for final prediction
        self.mlp = nn.Sequential(
            nn.Linear(latent_dim * 2, latent_dim),
            nn.LeakyReLU(0.2),
            nn.Dropout(dropout),
            nn.Linear(latent_dim, latent_dim // 2),
            nn.LeakyReLU(0.2),
            nn.Linear(latent_dim // 2, 1)
        )

        # Auxiliary classifier
        self.aux_classifier = nn.Sequential(
            nn.Linear(out_channels, latent_dim),
            nn.LeakyReLU(0.2),
            nn.Dropout(dropout),
            nn.Linear(latent_dim, 1)
        )

        # Reconstruction decoder
        self.decoder = nn.Sequential(
            nn.Linear(latent_dim, latent_dim * 2),
            nn.LeakyReLU(0.2),
            nn.Dropout(dropout),
            nn.Linear(latent_dim * 2, out_channels),
            nn.LeakyReLU(0.2),
            nn.Linear(out_channels, in_channels)
        )

    def encode(self, x, edge_index, edge_attr):
        # Apply feature attention
        x = self.feature_attention(x)

        # First graph convolution
        x1 = self.norm1(F.leaky_relu(self.conv1(x, edge_index, edge_attr), 0.2))

        # Second graph convolution with residual connection
        x2 = self.norm2(F.leaky_relu(self.conv2(x1, edge_index), 0.2) + x1)

        # Third graph convolution
        x3 = self.norm3(F.leaky_relu(self.conv3(x2, edge_index), 0.2))

        # VAE encoding
        mu = self.fc_mu(x3)
        logvar = self.fc_logvar(x3)

        return x3, mu, logvar

    def reparameterize(self, mu, logvar):
        if self.training:
            std = torch.exp(0.5 * logvar)
            eps = torch.randn_like(std)
            return mu + eps * std
        else:
            return mu

    def decode(self, z):
        return self.decoder(z)

    def forward(self, x, edge_index, edge_attr, h=None):
        # Encode
        x_encoded, mu, logvar = self.encode(x, edge_index, edge_attr)

        # Auxiliary prediction directly from encoded features
        aux_out = self.aux_classifier(x_encoded)

        # Reparameterize
        z = self.reparameterize(mu, logvar)

        # Decode for reconstruction
        x_recon = self.decode(z)

        # GRU processing
        z_sequence = z.unsqueeze(1)  # Add sequence dimension

        if h is None:
            # Initialize hidden state if not provided
            h = torch.zeros(2 * 3, z.size(0), mu.size(1), device=x.device)
             # 2 for bidirectional, 3 for num_layers

        gru_out, h = self.gru(z_sequence, h)

        # Final MLP prediction
        main_out = self.mlp(gru_out.squeeze(1))

        return main_out, aux_out, x_recon, h, mu, logvar


In [6]:
#loss function

def enhanced_loss_function(main_pred, aux_pred, x_recon, x_orig, target, mu, logvar,
                           gamma=2.0, alpha=0.25, recon_weight=0.2, kld_weight=0.1, aux_weight=0.3):
    # Binary cross entropy with logits for main prediction with focal loss
    pt = torch.sigmoid(main_pred.squeeze())
    pt = torch.where(target.float() == 1, pt, 1-pt)
    focal_weight = alpha * torch.pow(1 - pt, gamma)
    # Detach focal_weight to make it non-differentiable
    focal_weight = focal_weight.detach()
    main_bce = F.binary_cross_entropy_with_logits(
        main_pred.squeeze(),
        target.float(),
        weight=focal_weight,
        reduction='mean'
    )

    # Auxiliary prediction loss
    aux_bce = F.binary_cross_entropy_with_logits(
        aux_pred.squeeze(),
        target.float(),
        weight=focal_weight,
        reduction='mean'
    )

    # Reconstruction loss
    recon_loss = F.mse_loss(x_recon, x_orig, reduction='mean')

    # KL divergence
    kld = -0.5 * torch.mean(1 + logvar - mu.pow(2) - logvar.exp())

    # Total loss
    total_loss = main_bce + aux_weight * aux_bce + recon_weight * recon_loss + kld_weight * kld

    return total_loss, main_bce, aux_bce, recon_loss, kld


In [7]:
# Load and preprocess data
def load_creditcard_data(path='creditcard.csv'):
    print("Loading and preprocessing data...")
    df = pd.read_csv(path)
    df.dropna(inplace=True)

    # Keep Time as a feature after normalization
    time_scaler = StandardScaler()
    df['NormTime'] = time_scaler.fit_transform(df[['Time']])

    # Use RobustScaler for Amount to handle outliers better
    amount_scaler = RobustScaler()
    df['Amount'] = amount_scaler.fit_transform(df[['Amount']])

    # Feature engineering: add hour of day as cyclical feature
    df['Hour'] = df['Time'] / 3600 % 24
    df['Hour_sin'] = np.sin(2 * np.pi * df['Hour'] / 24)
    df['Hour_cos'] = np.cos(2 * np.pi * df['Hour'] / 24)

    # Add interaction features for V columns
    df['V1_V3'] = df['V1'] * df['V3']
    df['V4_V12'] = df['V4'] * df['V12']
    df['V14_V17'] = df['V14'] * df['V17']

    # Drop original Time column
    df.drop(columns=['Time', 'Hour'], inplace=True)

    # Scale all features
    feature_cols = [col for col in df.columns if col != 'Class']
    features = df[feature_cols].values

    # Standardize all features
    scaler = StandardScaler()
    features = scaler.fit_transform(features)

    labels = df['Class'].values.astype(int)
    print(f"Data loaded: {features.shape[0]} transactions, {features.shape[1]} features")
    print(f"Fraud cases: {np.sum(labels)} ({np.mean(labels)*100:.4f}%)")

    return features, labels, feature_cols


In [8]:
# Create dynamic graph from features using distance and similarity
def create_graph_from_batch(features, labels, k=10, epsilon=0.5):
    # Compute cosine similarity
    sim = cosine_similarity(features)

    # Create edges based on similarity threshold and k-nearest neighbors
    edge_index = []
    edge_weights = []

    for i in range(sim.shape[0]):
        # Get indices of top-k similar nodes
        neighbors_idx = sim[i].argsort()[-k-1:-1]

        for j in neighbors_idx:
            # Add edge if similarity is above threshold
            if sim[i, j] > epsilon:
                edge_index.append([i, j])
                edge_weights.append([sim[i, j]])  # Use similarity as edge weight

    # If no edges above threshold, fall back to basic k-nn
    if len(edge_index) == 0:
        for i in range(sim.shape[0]):
            neighbors_idx = sim[i].argsort()[-k-1:-1]
            for j in neighbors_idx:
                edge_index.append([i, j])
                edge_weights.append([sim[i, j]])

    edge_index = torch.tensor(edge_index, dtype=torch.long).t().contiguous()
    edge_attr = torch.tensor(edge_weights, dtype=torch.float)
    x = torch.tensor(features, dtype=torch.float)

    return Data(x=x, edge_index=edge_index, edge_attr=edge_attr), torch.tensor(labels)


In [9]:
# Compute anomaly scores with ensemble approach
def compute_anomaly_scores(main_pred, aux_pred, recon_error, mu, logvar, alpha=0.6, beta=0.2, gamma=0.2):
    # Convert predictions to probabilities
    main_prob = torch.sigmoid(main_pred).squeeze()
    aux_prob = torch.sigmoid(aux_pred).squeeze()

    # Compute reconstruction probability (1 - normalized error)
    recon_prob = 1 - torch.nn.functional.normalize(recon_error, p=2, dim=0)

    # Compute KL divergence
    kl_div = -0.5 * torch.sum(1 + logvar - mu.pow(2) - logvar.exp(), dim=1)
    kl_prob = 1 - torch.nn.functional.normalize(kl_div, p=2, dim=0)

    # Ensemble score
    ensemble_score = alpha * main_prob + beta * aux_prob + gamma * (recon_prob + kl_prob) / 2

    return ensemble_score.detach().cpu().numpy()


In [10]:
# Enhanced evaluation function
def evaluate_model(threshold, main_pred, aux_pred, x_recon, x_orig, mu, logvar, true_labels):
    # Compute reconstruction error
    recon_error = torch.mean((x_recon - x_orig).pow(2), dim=1)

    # Calculate ensemble anomaly scores
    anomaly_scores = compute_anomaly_scores(main_pred, aux_pred, recon_error, mu, logvar)

    # Binary predictions based on threshold
    preds = (anomaly_scores > threshold).astype(int)

    # Calculate metrics
    precision = precision_score(true_labels, preds, zero_division=0)
    recall = recall_score(true_labels, preds, zero_division=0)
    f1 = f1_score(true_labels, preds, zero_division=0)
    auc = roc_auc_score(true_labels, anomaly_scores)
    avg_precision = average_precision_score(true_labels, anomaly_scores)

    return precision, recall, f1, auc, avg_precision, anomaly_scores


In [11]:
# Enhanced training function with early stopping and learning rate scheduling
def train(model, optimizer, features, labels, batch_size=128, epochs=50, patience=10, device='cpu'):
    model.train()
    model = model.to(device)

    # Create scheduler for learning rate reduction
    scheduler = ReduceLROnPlateau(optimizer, mode='min', factor=0.5, patience=3, verbose=True)

    # Setup early stopping
    best_loss = float('inf')
    no_improve_epochs = 0

    # Training statistics
    train_losses = []

    # Use stratified k-fold for batch creation
    skf = StratifiedKFold(n_splits=int(len(labels) / batch_size), shuffle=True, random_state=42)

    print(f"Starting training for {epochs} epochs...")
    start_time = time.time()

    for epoch in range(epochs):
        epoch_loss = 0.0
        epoch_main_loss = 0.0
        epoch_aux_loss = 0.0
        epoch_recon_loss = 0.0
        epoch_kld_loss = 0.0
        batch_count = 0

        for _, batch_idx in skf.split(features, labels):
            batch_features = features[batch_idx]
            batch_labels = labels[batch_idx]

            # Create graph data
            graph_data, true_labels = create_graph_from_batch(batch_features, batch_labels)

            # Move data to device
            graph_data = graph_data.to(device)
            true_labels = true_labels.to(device)

            # Zero gradients
            optimizer.zero_grad()

            # Forward pass
            main_out, aux_out, x_recon, _, mu, logvar = model(
                graph_data.x, graph_data.edge_index, graph_data.edge_attr
            )

            # Compute loss
            loss, main_loss, aux_loss, recon_loss, kld = enhanced_loss_function(
                main_out, aux_out, x_recon, graph_data.x, true_labels, mu, logvar
            )

            # Backward pass and optimize
            loss.backward()

            # Gradient clipping to prevent exploding gradients
            torch.nn.utils.clip_grad_norm_(model.parameters(), max_norm=1.0)

            optimizer.step()

            # Accumulate losses
            epoch_loss += loss.item()
            epoch_main_loss += main_loss.item()
            epoch_aux_loss += aux_loss.item()
            epoch_recon_loss += recon_loss.item()
            epoch_kld_loss += kld.item()
            batch_count += 1

        # Calculate average losses
        avg_epoch_loss = epoch_loss / batch_count if batch_count > 0 else 0
        avg_main_loss = epoch_main_loss / batch_count if batch_count > 0 else 0
        avg_aux_loss = epoch_aux_loss / batch_count if batch_count > 0 else 0
        avg_recon_loss = epoch_recon_loss / batch_count if batch_count > 0 else 0
        avg_kld_loss = epoch_kld_loss / batch_count if batch_count > 0 else 0

        # Update learning rate
        scheduler.step(avg_epoch_loss)

        # Print progress
        if (epoch + 1) % 5 == 0 or epoch == 0:
            print(f"Epoch {epoch + 1}/{epochs}, Avg Loss: {avg_epoch_loss:.4f}, "
                  f"Main Loss: {avg_main_loss:.4f}, Aux Loss: {avg_aux_loss:.4f}, "
                  f"Recon Loss: {avg_recon_loss:.4f}, KLD: {avg_kld_loss:.4f}")

        train_losses.append(avg_epoch_loss)

        # Check for early stopping
        if avg_epoch_loss < best_loss:
            best_loss = avg_epoch_loss
            torch.save(model.state_dict(), 'best_model.pt')
            no_improve_epochs = 0
        else:
            no_improve_epochs += 1
            if no_improve_epochs >= patience:
                print(f"Early stopping at epoch {epoch + 1}")
                break

    training_time = time.time() - start_time
    print(f"Training completed in {training_time:.2f} seconds")

    # Load the best model
    model.load_state_dict(torch.load('best_model.pt'))
    return model, train_losses


In [12]:
def evaluate(model, features, labels, batch_size=128, device='cpu'):
    model.eval()
    model = model.to(device)

    print("\nEvaluating model...")

    # Setup stratified k-fold for consistent batching
    skf = StratifiedKFold(n_splits=max(1, int(len(labels) / batch_size)), shuffle=False)

    all_main_outputs = []
    all_aux_outputs = []
    all_recon_outputs = []
    all_originals = []
    all_mus = []
    all_logvars = []
    all_labels = []

    with torch.no_grad():
        for _, batch_idx in skf.split(features, labels):
            batch_features = features[batch_idx]
            batch_labels = labels[batch_idx]

            # Create graph data
            graph_data, true_labels = create_graph_from_batch(batch_features, batch_labels)

            # Move data to device
            graph_data = graph_data.to(device)
            true_labels = true_labels.to(device)

            # Forward pass
            main_out, aux_out, x_recon, _, mu, logvar = model(
                graph_data.x, graph_data.edge_index, graph_data.edge_attr
            )

            # Collect all outputs
            all_main_outputs.append(main_out)
            all_aux_outputs.append(aux_out)
            all_recon_outputs.append(x_recon)
            all_originals.append(graph_data.x)
            all_mus.append(mu)
            all_logvars.append(logvar)
            all_labels.extend(true_labels.cpu().numpy())

    # Concatenate all results
    if all_main_outputs:
        all_main_outputs = torch.cat(all_main_outputs, dim=0)
        all_aux_outputs = torch.cat(all_aux_outputs, dim=0)
        all_recon_outputs = torch.cat(all_recon_outputs, dim=0)
        all_originals = torch.cat(all_originals, dim=0)
        all_mus = torch.cat(all_mus, dim=0)
        all_logvars = torch.cat(all_logvars, dim=0)
        all_labels = np.array(all_labels)

        # Find optimal threshold
        anomaly_scores = compute_anomaly_scores(
            all_main_outputs, all_aux_outputs,
            torch.mean((all_recon_outputs - all_originals).pow(2), dim=1),
            all_mus, all_logvars
        )

        # Try multiple thresholds to find optimal
        best_f1 = 0
        best_threshold = 0
        best_results = None

        for percentile in range(90, 99):
            threshold = np.percentile(anomaly_scores, percentile)
            results = evaluate_model(
                threshold, all_main_outputs, all_aux_outputs,
                all_recon_outputs, all_originals,
                all_mus, all_logvars, all_labels
            )

            if results[2] > best_f1:  # F1 score is at index 2
                best_f1 = results[2]
                best_threshold = threshold
                best_results = results

        # Report results with best threshold
        print(f"\n--- Final Evaluation Results ---")
        print(f"Optimal Threshold: {best_threshold:.6f}")
        print(f"Precision: {best_results[0]:.4f}")
        print(f"Recall: {best_results[1]:.4f}")
        print(f"F1-score: {best_results[2]:.4f}")
        print(f"AUC: {best_results[3]:.4f}")
        print(f"Average Precision: {best_results[4]:.4f}")


In [13]:
# Main execution
if __name__ == "__main__":
    set_seed()

    # Check for GPU
    device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
    print(f"Using device: {device}")

    # Load data with improved preprocessing
    features, labels, feature_names = load_creditcard_data()

    # Split data with stratification
    X_train, X_test, y_train, y_test = train_test_split(
        features, labels, test_size=0.2, random_state=42, stratify=labels
    )

    print(f"Training data: {X_train.shape[0]} samples")
    print(f"Test data: {X_test.shape[0]} samples")
    print(f"Fraud cases in training: {np.sum(y_train)} ({np.mean(y_train)*100:.2f}%)")
    print(f"Fraud cases in test: {np.sum(y_test)} ({np.mean(y_test)*100:.2f}%)")

    # Initialize enhanced model
    model = EnhancedHybridGNNVAEGRU(
        in_channels=X_train.shape[1],
        hidden_channels=64,
        out_channels=32,
        latent_dim=16,
        heads=4,
        dropout=0.3
    )

    # Use Adam optimizer with weight decay for regularization
    optimizer = torch.optim.Adam(model.parameters(), lr=0.001, weight_decay=1e-5)

    # Training
    print("\n--- Training Started ---")
    model, losses = train(model, optimizer, X_train, y_train, batch_size=128, epochs=50, device=device)

    # Evaluation on test set
    print("\n--- Evaluating on Test Set ---")
    test_results = evaluate(model, X_test, y_test, batch_size=128, device=device)

Using device: cpu
Loading and preprocessing data...
Data loaded: 284807 transactions, 35 features
Fraud cases: 492 (0.1727%)
Training data: 227845 samples
Test data: 56962 samples
Fraud cases in training: 394 (0.17%)
Fraud cases in test: 98 (0.17%)

--- Training Started ---
Starting training for 50 epochs...




Epoch 1/50, Avg Loss: 0.1944, Main Loss: 0.0012, Aux Loss: 0.0007, Recon Loss: 0.9600, KLD: 0.0104




Epoch 5/50, Avg Loss: 0.1821, Main Loss: 0.0004, Aux Loss: 0.0003, Recon Loss: 0.8805, KLD: 0.0550




Epoch 10/50, Avg Loss: 0.1806, Main Loss: 0.0004, Aux Loss: 0.0003, Recon Loss: 0.8724, KLD: 0.0567




Epoch 15/50, Avg Loss: 0.1803, Main Loss: 0.0004, Aux Loss: 0.0003, Recon Loss: 0.8695, KLD: 0.0589




Epoch 20/50, Avg Loss: 0.1762, Main Loss: 0.0004, Aux Loss: 0.0003, Recon Loss: 0.8444, KLD: 0.0685




Epoch 25/50, Avg Loss: 0.1752, Main Loss: 0.0004, Aux Loss: 0.0003, Recon Loss: 0.8394, KLD: 0.0685




Epoch 30/50, Avg Loss: 0.1744, Main Loss: 0.0004, Aux Loss: 0.0003, Recon Loss: 0.8353, KLD: 0.0691




Epoch 35/50, Avg Loss: 0.1737, Main Loss: 0.0004, Aux Loss: 0.0003, Recon Loss: 0.8314, KLD: 0.0695




Epoch 40/50, Avg Loss: 0.1731, Main Loss: 0.0004, Aux Loss: 0.0003, Recon Loss: 0.8280, KLD: 0.0707




Epoch 45/50, Avg Loss: 0.1729, Main Loss: 0.0004, Aux Loss: 0.0003, Recon Loss: 0.8263, KLD: 0.0711




Epoch 50/50, Avg Loss: 0.1727, Main Loss: 0.0004, Aux Loss: 0.0003, Recon Loss: 0.8251, KLD: 0.0720
Training completed in 3589.66 seconds

--- Evaluating on Test Set ---

Evaluating model...





--- Final Evaluation Results ---
Optimal Threshold: 0.312348
Precision: 0.0737
Recall: 0.8571
F1-score: 0.1357
AUC: 0.9518
Average Precision: 0.6337
