In [None]:
import pandas as pd
import numpy as np
from tqdm import tqdm
import torch
from sklearn.preprocessing import StandardScaler
import time
import gc
from scipy import sparse

"""
Financial Transaction Feature Engineering Pipeline

This script performs high-performance, vectorized feature engineering on financial 
transaction data. It generates:
1. Basic aggregation features (sum, mean, std, counts).
2. Advanced temporal features (linear trends, volatility, density).
3. Network/Graph features (PageRank, degrees) using sparse matrices.
4. Interaction features (ratios, net flows).
5. Registered/Whitelist features (Static connection counts and transactional verification).

The output is a normalized PyTorch tensor suitable for Graph Neural Networks 
or Tabular models.
"""

# Configuration
# Disable pandas' chained-assignment check for the column assignments below.
pd.set_option('mode.chained_assignment', None)

def fast_pagerank(A, d=0.85, tol=1e-4, max_iter=20):
    """
    Calculates PageRank using a sparse adjacency matrix and power iteration.

    Entry ``A[i, j]`` is the weight of the edge ``i -> j``; duplicate COO
    entries are summed, so repeated edges act as heavier edges. Rows are
    normalised to sum to one. Nodes without outgoing edges ("dangling" nodes)
    redistribute their rank uniformly over all nodes, so the returned scores
    always sum to 1.

    Args:
        A (scipy.sparse.coo_matrix): Adjacency matrix of the graph (any
            scipy sparse format or a dense 2-D array is accepted).
        d (float): Damping factor.
        tol (float): Convergence tolerance on the L1 change between iterates.
        max_iter (int): Maximum number of iterations.

    Returns:
        np.array: PageRank scores for all nodes (float64, length ``A.shape[0]``).
        An empty array is returned for an empty graph.
    """
    A = sparse.csr_matrix(A)
    n = A.shape[0]
    if n == 0:
        # Nothing to rank; also avoids the 1/n division below.
        return np.zeros(0)

    # Normalize matrix: row stochastic. Dangling rows stay all-zero here and
    # are handled explicitly inside the iteration.
    out_degree = np.array(A.sum(axis=1), dtype=np.float64).ravel()
    dangling = out_degree == 0
    out_degree[dangling] = 1.0  # avoid division by zero; these rows are empty anyway
    # Transpose once (hoisted out of the loop) so each step is one mat-vec.
    M_T = (sparse.diags(1.0 / out_degree) @ A).T.tocsr()

    # Initialization
    teleport = np.full(n, 1.0 / n)
    r = teleport.copy()

    for _ in range(max_iter):
        # Rank held by dangling nodes would otherwise vanish from the system.
        dangling_mass = r[dangling].sum()
        r_new = d * (M_T @ r + dangling_mass * teleport) + (1 - d) * teleport
        converged = np.abs(r_new - r).sum() < tol
        # Always keep the newest iterate so the converged vector is returned.
        r = r_new
        if converged:
            break
    return r

### --- 0. GPU Detection ---
print("--- Part 1: Feature Engineering ---")
print("Step 0: Checking for GPU...")
# Pick CUDA when PyTorch can see a device, otherwise fall back to the CPU.
device = torch.device('cuda') if torch.cuda.is_available() else torch.device('cpu')
print(
    f"GPU '{torch.cuda.get_device_name(0)}' is available."
    if device.type == 'cuda'
    else "No GPU found. Running on CPU."
)

# --- 1. Data Loading ---
print("\nStep 1: Loading data...")
try:
    # Prefer the pyarrow CSV engine for faster IO. Any pyarrow-specific failure
    # (package missing, unsupported option, parse error) falls back to the
    # default engine below.
    try:
        df_trans = pd.read_csv("data/acct_transaction.csv", dtype={'from_acct': str, 'to_acct': str}, engine='pyarrow')
        df_alert = pd.read_csv("data/acct_alert.csv", dtype={'acct': str}, engine='pyarrow')
        # [NEW] Load Register Data
        df_reg = pd.read_csv("data/acct_register.csv", dtype={'from_acct': str, 'to_acct': str}, engine='pyarrow')
    except FileNotFoundError:
        # A missing file is missing for every engine: skip the pointless retry
        # and let the outer handler report it.
        raise
    except Exception as exc:
        # `Exception` (not a bare except) so Ctrl-C / SystemExit still propagate.
        print(f"Pyarrow engine unavailable ({type(exc).__name__}), using default...")
        df_trans = pd.read_csv("data/acct_transaction.csv", dtype={'from_acct': str, 'to_acct': str})
        df_alert = pd.read_csv("data/acct_alert.csv", dtype={'acct': str})
        # [NEW] Load Register Data
        df_reg = pd.read_csv("data/acct_register.csv", dtype={'from_acct': str, 'to_acct': str})

    print("Data loaded successfully.")
except FileNotFoundError as e:
    print(f"Error loading files: {e}")
    # Stop with a non-zero status; `exit()` is the interactive helper from
    # `site` and would report success.
    raise SystemExit(1) from e

# --- 2. Feature Engineering ---
print("\nStep 2: Performing vectorized feature engineering...")

# Time of day: values that do not match HH:MM:SS become NaT (hour -> NaN).
df_trans['txn_time'] = pd.to_datetime(df_trans['txn_time'], format='%H:%M:%S', errors='coerce')
df_trans['txn_hour'] = df_trans['txn_time'].dt.hour
# Late night = hours 0 through 5; a missing hour compares False and yields 0.
df_trans['is_late_night'] = (df_trans['txn_hour'].ge(0) & df_trans['txn_hour'].lt(6)).astype(int)

# NOTE(review): this assumes txn_date holds parseable date strings. If the
# column is a plain integer day counter, pd.to_datetime reads the numbers as
# nanoseconds since the epoch and every row lands on the same day - TODO confirm
# against the data dictionary.
df_trans['txn_date'] = pd.to_datetime(df_trans['txn_date'], errors='coerce')

# Numeric day index, 1-based so that the earliest transaction date is day 1
# (the register file is compared against this index later on).
min_date = df_trans['txn_date'].min()
df_trans['day_idx'] = df_trans['txn_date'].sub(min_date).dt.days.add(1)

# One-hot encode the channel, then flag amounts that are exact multiples of 1000.
df_trans = pd.get_dummies(df_trans, columns=['channel_type'], prefix='channel', dtype=int)
df_trans['is_round_number_txn'] = df_trans['txn_amt'].mod(1000).eq(0).astype(int)

# ==========================================
# [NEW] Processing Register (Whitelist) Logic
# ==========================================
print("Processing Register (Whitelist) Features...")

# 1. Clean Register Data (Clamp dates as per instructions)
max_txn_day = df_trans['day_idx'].max()
# Unparseable / missing bounds become the open-ended sentinels -1 and 999.
df_reg['start_date'] = pd.to_numeric(df_reg['start_date'], errors='coerce').fillna(-1).astype(int)
df_reg['end_date'] = pd.to_numeric(df_reg['end_date'], errors='coerce').fillna(999).astype(int)

# Logic: start_date < 1 -> -1, end_date > max -> 999
# NOTE(review): the 999 sentinel only means "open-ended" while the observed
# day index stays below 999.
df_reg.loc[df_reg['start_date'] < 1, 'start_date'] = -1
df_reg.loc[df_reg['end_date'] > max_txn_day, 'end_date'] = 999

# 2. Static Register Features (Trust Graph Structure)
# Count how many beneficiaries this account has registered (Trusted Out-degree)
reg_static_out = df_reg.groupby('from_acct')['to_acct'].nunique().to_frame('num_registered_beneficiaries')
# Count how many people have registered this account (Trusted In-degree)
reg_static_in = df_reg.groupby('to_acct')['from_acct'].nunique().to_frame('num_registered_sources')

# 3. Dynamic Transaction Flagging
# Left-join transactions to the register on (from, to), then test the date window.
print("   -> Merging register data with transactions...")
txn_keys = df_trans[['from_acct', 'to_acct', 'day_idx']].copy()
# Remember each transaction's row position: a pair registered more than once
# yields several merged rows per transaction, so the merged frame cannot be
# written back row-for-row.
txn_keys['_txn_pos'] = np.arange(len(txn_keys))
df_trans_reg = txn_keys.merge(
    df_reg[['from_acct', 'to_acct', 'start_date', 'end_date']],
    on=['from_acct', 'to_acct'],
    how='left'
)

# A transaction is registered if the link exists AND the date is within the window
in_window = (
    df_trans_reg['start_date'].notna()
    & (df_trans_reg['day_idx'] >= df_trans_reg['start_date'])
    & (df_trans_reg['day_idx'] <= df_trans_reg['end_date'])
)

# Collapse back to one flag per transaction: registered if ANY window matches.
registered_flags = np.zeros(len(df_trans), dtype=int)
registered_flags[df_trans_reg.loc[in_window, '_txn_pos'].to_numpy()] = 1
# Positional assignment keeps the flags aligned whatever df_trans' index is.
df_trans['is_registered_txn'] = registered_flags
del df_trans_reg, txn_keys, df_reg # Free memory

# --- Vectorized Out-degree Features ---
print("Calculating out-degree features (Vectorized)...")

# Per-sender aggregations, keyed by source column.
out_agg_funcs = {
    'txn_amt': ['sum', 'mean', 'std', 'max', 'count'],
    'to_acct': ['nunique'],
    'day_idx': ['min', 'max', 'nunique'],
    'is_late_night': ['sum', 'mean'],
    'is_round_number_txn': ['mean'],
    'is_registered_txn': ['sum', 'mean'],  # [NEW] registered stats
}
# Every one-hot channel column is summed (transactions per channel).
out_agg_funcs.update({name: ['sum'] for name in df_trans.columns if 'channel_' in name})

out_features = df_trans.groupby('from_acct').agg(out_agg_funcs)
# Flatten the (column, statistic) MultiIndex into "column_statistic" names.
out_features.columns = ['_'.join(parts).strip() for parts in out_features.columns.to_flat_index()]

# Days between an account's first and last outgoing transaction.
out_features['account_lifespan_days'] = out_features['day_idx_max'].sub(out_features['day_idx_min'])

# The min/max helpers are no longer needed; give the rest readable names.
out_features = out_features.drop(columns=['day_idx_min', 'day_idx_max']).rename(columns={
    'txn_amt_sum': 'total_out_amount',
    'txn_amt_count': 'total_out_txns',
    'txn_amt_mean': 'avg_out_amount',
    'txn_amt_std': 'std_out_amount',
    'txn_amt_max': 'max_out_amount',
    'to_acct_nunique': 'unique_to_accts',
    'day_idx_nunique': 'unique_out_txn_days',
    'is_late_night_sum': 'late_night_out_txn_count',
    'is_late_night_mean': 'late_night_out_txn_ratio',
    'is_round_number_txn_mean': 'round_number_out_txn_ratio',
    'is_registered_txn_sum': 'registered_out_txn_count',  # [NEW]
    'is_registered_txn_mean': 'registered_out_txn_ratio',  # [NEW]
})

# --- Vectorized In-degree Features ---
print("Calculating in-degree features (Vectorized)...")
# Per-receiver statistics, named directly through pandas named aggregation.
in_features = df_trans.groupby('to_acct').agg(
    total_in_amount=('txn_amt', 'sum'),
    avg_in_amount=('txn_amt', 'mean'),
    total_in_txns=('txn_amt', 'count'),
    unique_from_accts=('from_acct', 'nunique'),
)

# --- Advanced Temporal Features (Linear Regression Slope) ---
print("Extracting advanced temporal features (Vectorized Math)...")

# Small constant guarding the ratio features against division by zero.
epsilon = 1e-9

# Calculate Slope using closed-form linear regression formula:
# Slope = (N * Sum(XY) - Sum(X)*Sum(Y)) / (N * Sum(X^2) - Sum(X)^2)
# Work on a narrow copy so df_trans is left untouched, and keep only rows where
# both the day and the amount are known so N, Sum(X) and Sum(Y) all describe
# the same set of transactions.
trend_rows = df_trans[['from_acct', 'day_idx', 'txn_amt']].dropna(subset=['day_idx', 'txn_amt'])
# Integer days keep the denominator exact, so "== 0" is a reliable test below.
trend_rows = trend_rows.assign(day_idx=trend_rows['day_idx'].astype('int64'))
trend_rows['xy'] = trend_rows['day_idx'] * trend_rows['txn_amt']
trend_rows['xx'] = trend_rows['day_idx'] ** 2

trend_stats = trend_rows.groupby('from_acct').agg(
    y_sum=('txn_amt', 'sum'),
    n=('txn_amt', 'count'),
    x_sum=('day_idx', 'sum'),
    xy_sum=('xy', 'sum'),
    xx_sum=('xx', 'sum'),
)

numerator = (trend_stats['n'] * trend_stats['xy_sum']) - (trend_stats['x_sum'] * trend_stats['y_sum'])
denominator = (trend_stats['n'] * trend_stats['xx_sum']) - (trend_stats['x_sum'] ** 2)
# All activity on a single day gives a zero denominator: the slope is undefined
# and reported as 0. Dividing by (denominator + epsilon) instead would magnify
# floating-point residue in the numerator by 1e9.
amt_trend = (numerator / denominator.where(denominator > 0)).fillna(0.0)

# Assemble on the out_features index so every sending account gets a row.
adv_temporal_features = pd.DataFrame(index=out_features.index)
adv_temporal_features['amt_trend'] = amt_trend

# Activity Density: share of days with activity inside the inclusive
# first-to-last-day window.
span = out_features['account_lifespan_days'] + 1
adv_temporal_features['activity_density'] = out_features['unique_out_txn_days'] / span

# Volatility (Coefficient of Variation)
adv_temporal_features['amt_volatility'] = out_features['std_out_amount'] / (out_features['avg_out_amount'] + epsilon)

# std is NaN for single-transaction accounts; treat undefined values as 0.
adv_temporal_features = adv_temporal_features.fillna(0)
del trend_stats, trend_rows # Memory cleanup

# --- Behavioral Anomaly Z-Scores ---
print("Calculating behavioral anomaly Z-scores (Vectorized)...")
# Attach each sender's mean/std to its transactions (key column joined against
# the stats index) so every transaction can be scored against its own account.
temp_stats = out_features[['avg_out_amount', 'std_out_amount']]
df_trans = df_trans.join(temp_stats, on='from_acct', how='left')
# A missing std (single-transaction account) counts as 0; epsilon keeps the
# division finite.
df_trans['txn_zscore'] = (
    df_trans['txn_amt']
    .sub(df_trans['avg_out_amount'])
    .div(df_trans['std_out_amount'].fillna(0).add(epsilon))
)
# The most extreme transaction per sender.
max_zscore_features = df_trans.groupby('from_acct')['txn_zscore'].max().rename('max_txn_zscore').to_frame()

# --- Sparse PageRank Calculation ---
print("Calculating PageRank (Sparse Matrix Method)...")
# Every account seen as sender or receiver, in first-appearance order; the
# position in this array is the node id.
all_accts = pd.unique(np.concatenate([df_trans['from_acct'].astype(str), df_trans['to_acct'].astype(str)]))
n_nodes = len(all_accts)
acct_map = dict(zip(all_accts, range(n_nodes)))

# One COO entry of weight 1 per transaction (row = sender, column = receiver).
from_idx = df_trans['from_acct'].map(acct_map).values
to_idx = df_trans['to_acct'].map(acct_map).values
data = np.ones(from_idx.shape[0])
A = sparse.coo_matrix(
    (data, (from_idx, to_idx)),
    shape=(n_nodes, n_nodes),
    dtype=np.float32,
)

pr_scores = fast_pagerank(A)
pagerank_features = pd.DataFrame({'pagerank': pr_scores}, index=all_accts)

# --- Feature Integration ---
print("Integrating all features...")
# Start from one empty row per account and left-join each feature table on the
# account id; accounts absent from a table get 0 for its columns.
all_accts_df = pd.DataFrame(index=all_accts)
node_features = (
    all_accts_df
    .join(out_features)
    .join(in_features)
    .join(adv_temporal_features)
    .join(max_zscore_features)
    .join(pagerank_features)
    .join(reg_static_out)
    .join(reg_static_in)  # [NEW] static register features
    .fillna(0)
)

# --- Interaction & Ratio Features ---
print("Adding interaction features...")
node_features['net_flow'] = node_features['total_in_amount'] - node_features['total_out_amount']
# NOTE(review): for accounts with no outgoing amount this is total_in / epsilon,
# i.e. a very large number rather than a ratio - confirm that is intended.
node_features['in_out_amount_ratio'] = node_features['total_in_amount'] / (node_features['total_out_amount'] + epsilon)
node_features['hub_score'] = node_features['unique_from_accts'] * node_features['unique_to_accts']
node_features['risk_weighted_volume'] = node_features['late_night_out_txn_ratio'] * node_features['total_out_amount']
node_features['volatility_weighted_volume'] = node_features['amt_volatility'] * node_features['total_out_amount']
node_features['pagerank_weighted_flow'] = node_features['pagerank'] * node_features['net_flow']

# Daily averages use the inclusive activity window (lifespan + 1 days), the
# same span the activity-density feature uses. Dividing by lifespan + epsilon
# would scale every single-day or receive-only account (lifespan 0) by ~1e9.
active_days = node_features['account_lifespan_days'] + 1
node_features['avg_daily_out_txns'] = node_features['total_out_txns'] / active_days
node_features['avg_daily_out_amount'] = node_features['total_out_amount'] / active_days
node_features['avg_daily_in_txns'] = node_features['total_in_txns'] / active_days
node_features['anomaly_magnitude'] = node_features['max_txn_zscore'] * node_features['avg_out_amount']

# [NEW] Registered interaction features
# Calculate "Unregistered" (Riskier) volume and counts
node_features['unregistered_out_count'] = node_features['total_out_txns'] - node_features['registered_out_txn_count']
# Accounts without any outgoing transaction have nothing unregistered, so they
# score 0 instead of the 1.0 that "1 - filled-in ratio" would give them.
node_features['unregistered_ratio'] = (1.0 - node_features['registered_out_txn_ratio']).where(
    node_features['total_out_txns'] > 0, 0.0
)
# High volatility combined with high unregistered ratio is suspicious
node_features['risk_unregistered_volatility'] = node_features['unregistered_ratio'] * node_features['amt_volatility']

# Final Cleanup
# Infinite values (from divisions) and any remaining gaps are both mapped to 0.
node_features = node_features.replace([np.inf, -np.inf], 0).fillna(0)
print(f"âœ… Final feature count: {node_features.shape[1]}")

# Drop the large intermediates and collect garbage to free RAM before scaling.
del df_trans
del out_features, in_features, adv_temporal_features
del max_zscore_features, pagerank_features, A
gc.collect()

# --- 3. Feature Scaling ---
print("\nStep 3: Scaling node features...")
scaler = StandardScaler()
# float32 halves the memory footprint of the feature matrix.
features_val = node_features.to_numpy().astype(np.float32)
# Standardise each column to zero mean / unit variance.
scaled_features_np = scaler.fit(features_val).transform(features_val)
node_features_scaled = pd.DataFrame(
    scaled_features_np,
    index=node_features.index,
    columns=node_features.columns,
)

# --- 4. Saving Artifacts ---
print("\nStep 4: Saving processed features to file...")
save_path = 'processed_features_with_interactions.pt'
# Account id -> row position in the feature tensor.
acct_to_idx = dict(zip(node_features_scaled.index, range(len(node_features_scaled.index))))
feature_tensor = torch.tensor(node_features_scaled.values, dtype=torch.float)

# Bundle the tensor with what is needed to interpret it: the account mapping,
# the column names and the fitted scaler.
torch.save(
    dict(
        feature_tensor=feature_tensor,
        acct_to_idx=acct_to_idx,
        feature_names=node_features.columns.tolist(),
        scaler=scaler,
    ),
    save_path,
)
print(f"âœ… Features saved to '{save_path}'")
print("\n--- Completed Successfully ---")

In [None]:
"""
Trains and evaluates a GraphSAGE model for fraud detection.

This script constitutes Part 2 of the pipeline, focusing on model training.
It performs the following steps:
1.  Loads node features (from Part 1) and raw data for graph/labels.
2.  Prepares a PyTorch Geometric (PyG) `Data` object, including graph
    structure, features, labels, and train/val/test splits.
3.  Defines a 3-layer GraphSAGE model with batch normalization and dropout.
4.  Implements training and testing functions, using a dampened weighted
    CrossEntropyLoss to handle class imbalance.
5.  Executes the training loop, tracking validation F1-score for
    early stopping and model checkpointing.
6.  Loads the best-performing model and evaluates it on the hold-out
    test set.
7.  Generates and displays plots of training metrics (loss, AUC, F1, etc.).
"""

import torch
import torch.nn as nn
import torch.nn.functional as F
from torch_geometric.nn import SAGEConv
from torch_geometric.data import Data
from torch_geometric.utils import to_undirected
from sklearn.model_selection import train_test_split
from sklearn.metrics import roc_auc_score, f1_score, recall_score, precision_score, confusion_matrix
import pandas as pd
import numpy as np
import time
import matplotlib.pyplot as plt

### --- 0. Setup and Data Loading ---
print("--- Part 2: Model Training (Optimized) ---")
print("Step 0: Setup and Data Loading...")
device = torch.device('cuda') if torch.cuda.is_available() else torch.device('cpu')
print(f"Using device: {device}")
# Fixed seeds for NumPy and PyTorch (plus the CUDA generator when one exists).
np.random.seed(42)
torch.manual_seed(42)
if device.type == 'cuda':
    torch.cuda.manual_seed(42)

try:
    # Load pre-processed features from Part 1.
    # NOTE: weights_only=False unpickles arbitrary objects (the file also holds
    # the fitted scaler). Only load files produced by a trusted Part 1 run.
    saved_data = torch.load('processed_features_with_interactions.pt', map_location=device, weights_only=False)
    feature_tensor = saved_data['feature_tensor']
    acct_to_idx = saved_data['acct_to_idx']
    print("Pre-processed features loaded successfully.")
except FileNotFoundError as e:
    print("Error: 'processed_features_with_interactions.pt' not found. Please run Part 1 first.")
    # Stop with a non-zero status; `exit()` is the interactive helper from
    # `site` and would report success.
    raise SystemExit(1) from e

try:
    # Load raw data for graph construction and labels
    df_trans = pd.read_csv("data/acct_transaction.csv", dtype={'from_acct': str, 'to_acct': str})
    df_alert = pd.read_csv("data/acct_alert.csv", dtype={'acct': str})
    print("Raw transaction and alert data loaded successfully.")
except FileNotFoundError as e:
    print(f"Error loading raw data files: {e}")
    raise SystemExit(1) from e

### --- 1. Prepare Graph Data and Splits ---
print("\nStep 1: Preparing Graph Data and Splits...")
# Filter transactions where both accounts are in our feature map
valid_trans = df_trans[df_trans['from_acct'].isin(acct_to_idx) & df_trans['to_acct'].isin(acct_to_idx)]
source_nodes = valid_trans['from_acct'].map(acct_to_idx).values
target_nodes = valid_trans['to_acct'].map(acct_to_idx).values

# Create edge index and make the graph undirected.
# Stack into one (2, E) int64 array first: building a tensor from a Python
# list of ndarrays takes a very slow element-wise path.
edge_array = np.vstack((source_nodes, target_nodes)).astype(np.int64, copy=False)
edge_index = torch.from_numpy(edge_array)
edge_index = to_undirected(edge_index)
num_nodes = len(acct_to_idx)

# Create labels (y) tensor: 1 for alerted accounts, 0 otherwise.
y = torch.zeros(num_nodes, dtype=torch.long)
alert_indices_list = [acct_to_idx[acct] for acct in df_alert['acct'] if acct in acct_to_idx]
y[alert_indices_list] = 1

# Create stratified train/validation/test splits (70 / 15 / 15).
# scikit-learn works on array-likes, so it gets NumPy views, not tensors.
indices = np.arange(num_nodes)
labels_np = y.numpy()
train_idx, temp_idx, y_train, y_temp = train_test_split(
    indices, labels_np, train_size=0.7, stratify=labels_np, random_state=42
)
val_idx, test_idx, _, _ = train_test_split(
    temp_idx, y_temp, test_size=0.5, stratify=y_temp, random_state=42
)

# Create boolean masks
train_mask = torch.zeros(num_nodes, dtype=torch.bool)
train_mask[torch.from_numpy(train_idx)] = True
val_mask = torch.zeros(num_nodes, dtype=torch.bool)
val_mask[torch.from_numpy(val_idx)] = True
test_mask = torch.zeros(num_nodes, dtype=torch.bool)
test_mask[torch.from_numpy(test_idx)] = True

# Create the PyG Data object
graph_data = Data(x=feature_tensor, edge_index=edge_index, y=y,
                  train_mask=train_mask, val_mask=val_mask, test_mask=test_mask).to(device)
print("Graph data object created.")

### --- 2. GNN Model Definition ---
class GraphSAGE(nn.Module):
    """
    Three-layer GraphSAGE network for node classification.

    Two hidden stages of the form
    SAGEConv -> BatchNorm -> ReLU -> Dropout
    are followed by a final SAGEConv that emits one raw logit per class.

    Args:
        in_channels (int): Size of each input node feature vector.
        hidden_channels (int): Width of both hidden stages.
        out_channels (int): Number of output logits (classes).
        dropout (float): Dropout probability applied after each hidden stage.

    Attributes:
        conv1, conv2 (SAGEConv): Hidden graph convolutions.
        bn1, bn2 (nn.BatchNorm1d): Batch normalisation for each hidden stage.
        conv3 (SAGEConv): Output graph convolution.
        dropout (nn.Dropout): Shared dropout layer.
    """

    def __init__(self, in_channels, hidden_channels, out_channels, dropout=0.5):
        """Create the layers; attribute names define the checkpoint keys."""
        super().__init__()
        # Hidden stage 1
        self.conv1 = SAGEConv(in_channels, hidden_channels)
        self.bn1 = nn.BatchNorm1d(hidden_channels)
        # Hidden stage 2
        self.conv2 = SAGEConv(hidden_channels, hidden_channels)
        self.bn2 = nn.BatchNorm1d(hidden_channels)
        # Output layer
        self.conv3 = SAGEConv(hidden_channels, out_channels)
        self.dropout = nn.Dropout(dropout)

    def forward(self, x, edge_index):
        """
        Run the network over the graph.

        Args:
            x (torch.Tensor): Node feature tensor.
            edge_index (torch.Tensor): Edge index of the graph.

        Returns:
            torch.Tensor: Raw (unnormalised) logits for every node.
        """
        hidden_stages = ((self.conv1, self.bn1), (self.conv2, self.bn2))
        for conv, norm in hidden_stages:
            x = self.dropout(F.relu(norm(conv(x, edge_index))))
        return self.conv3(x, edge_index)

### --- 3. Training and Evaluation Flow ---
print("\nStep 3: Training and Evaluation...")
def train(model, data, optimizer, loss_fn):
    """
    Run one full-graph optimisation step.

    The forward pass covers the whole graph; only nodes selected by
    ``data.train_mask`` contribute to the loss.

    Args:
        model (nn.Module): The GNN model to train.
        data (Data): Graph object providing ``x``, ``edge_index``, ``y`` and
            ``train_mask``.
        optimizer (torch.optim.Optimizer): Optimizer that updates the model.
        loss_fn (torch.nn.Module): Loss applied to the training nodes.

    Returns:
        float: Loss value of this step, measured before the weight update.
    """
    model.train()
    optimizer.zero_grad()
    train_nodes = data.train_mask
    logits = model(data.x, data.edge_index)
    step_loss = loss_fn(logits[train_nodes], data.y[train_nodes])
    step_loss.backward()
    optimizer.step()
    return step_loss.item()

@torch.no_grad()
def test(model, data, mask):
    """
    Score the model on the nodes selected by ``mask``.

    Gradients are disabled and the model is switched to eval mode. Class
    predictions come from the argmax over the logits; AUC uses the softmax
    probability of class 1.

    Args:
        model (nn.Module): The GNN model to evaluate.
        data (Data): Graph object providing ``x``, ``edge_index`` and ``y``.
        mask (torch.Tensor): Boolean node mask (e.g. ``data.val_mask`` or
                             ``data.test_mask``).

    Returns:
        tuple: ``(auc, recall, precision, f1)`` for the masked nodes.
    """
    model.eval()
    masked_logits = model(data.x, data.edge_index)[mask]

    # Move everything to the CPU once; the sklearn metrics work on host data.
    y_true = data.y[mask].cpu()
    pos_proba = F.softmax(masked_logits, dim=1)[:, 1].cpu()
    pred_class = masked_logits.argmax(dim=1).cpu()

    auc = roc_auc_score(y_true, pos_proba)
    # zero_division=0: a metric with an empty denominator scores 0.
    recall, precision, f1 = (
        metric(y_true, pred_class, zero_division=0)
        for metric in (recall_score, precision_score, f1_score)
    )
    return auc, recall, precision, f1

# Calculate dampened class weights for imbalance
# Class counts are taken from the training nodes only.
num_positives = graph_data.y[graph_data.train_mask].sum().item()
num_negatives = graph_data.train_mask.sum().item() - num_positives
pos_weight_ratio = num_negatives / num_positives
# The square root softens the raw negative/positive ratio so the positive
# class is up-weighted without dominating the loss.
dampened_weight = np.sqrt(pos_weight_ratio)
class_weights = torch.tensor([1.0, dampened_weight], dtype=torch.float, device=device)
loss_fn = nn.CrossEntropyLoss(weight=class_weights)
print(f"Original weight ratio: {pos_weight_ratio:.2f}")
print(f"Dampened positive class weight (sqrt): {dampened_weight:.2f}")


# --- Hyperparameters and Initialization ---
HIDDEN_CHANNELS = 128
LEARNING_RATE = 1e-3
WEIGHT_DECAY = 1e-5
EPOCHS = 50000
EARLY_STOPPING_PATIENCE = 10  # counted in evaluations, not epochs

model = GraphSAGE(
    in_channels=graph_data.num_node_features,
    hidden_channels=HIDDEN_CHANNELS,
    out_channels=2,
).to(device)
optimizer = torch.optim.Adam(
    model.parameters(),
    lr=LEARNING_RATE,
    weight_decay=WEIGHT_DECAY,
)

# Halve the learning rate when the monitored (maximised) metric stalls.
scheduler = torch.optim.lr_scheduler.ReduceLROnPlateau(optimizer, mode='max', factor=0.5, patience=5)

# One list per tracked series: the epoch plus train/val values of each metric.
history = {
    'epoch': [],
    **{
        f'{split}_{metric}': []
        for metric in ('loss', 'auc', 'recall', 'precision', 'f1')
        for split in ('train', 'val')
    },
}

# Early-stopping / checkpoint bookkeeping.
best_val_f1 = -1
best_epoch = 0
patience_counter = 0

start_time = time.time()
for epoch in range(1, EPOCHS + 1):
    loss = train(model, graph_data, optimizer, loss_fn)

    # Evaluation, logging and early stopping only happen every 10th epoch.
    if epoch % 10 != 0:
        continue

    # Metrics on the validation nodes, then on the training nodes.
    val_auc, val_recall, val_precision, val_f1 = test(model, graph_data, graph_data.val_mask)
    train_auc, train_recall, train_precision, train_f1 = test(model, graph_data, graph_data.train_mask)

    # Validation loss needs its own forward pass in eval mode.
    model.eval()
    with torch.no_grad():
        out = model(graph_data.x, graph_data.edge_index)
        val_loss = loss_fn(out[graph_data.val_mask], graph_data.y[graph_data.val_mask]).item()

    print(f'Epoch: {epoch:04d}, Train Loss: {loss:.4f}, Val Loss: {val_loss:.4f}, '
          f'Val F1: {val_f1:.4f}, Val AUC: {val_auc:.4f}')

    # Record this evaluation point in every history series.
    epoch_record = {
        'epoch': epoch,
        'train_loss': loss, 'val_loss': val_loss,
        'train_auc': train_auc, 'val_auc': val_auc,
        'train_recall': train_recall, 'val_recall': val_recall,
        'train_precision': train_precision, 'val_precision': val_precision,
        'train_f1': train_f1, 'val_f1': val_f1,
    }
    for series_name, value in epoch_record.items():
        history[series_name].append(value)

    # The scheduler follows validation F1.
    scheduler.step(val_f1)

    # Checkpoint on improvement, otherwise count towards early stopping.
    current_metric = val_f1
    if current_metric > best_val_f1:
        best_val_f1, best_epoch = current_metric, epoch
        torch.save(model.state_dict(), 'best_model.pt')
        patience_counter = 0
    else:
        patience_counter += 1

    if patience_counter >= EARLY_STOPPING_PATIENCE:
        print(f"\n--- Early stopping triggered at epoch {epoch} ---")
        break

print(f"\n--- Training Finished ---")
print(f"Total time: {(time.time() - start_time)/60:.2f} minutes")
print(f"Best validation F1-score: {best_val_f1:.4f} at epoch {best_epoch}")

### --- 4. Final Evaluation ---
print("\nStep 4: Evaluating on Test Set...")
# Restore the checkpoint with the best validation F1 before scoring.
model.load_state_dict(torch.load('best_model.pt'))

# Test-set metrics; class predictions use argmax (equivalent to a 0.5 cut-off).
final_auc, final_recall, final_precision, final_f1 = test(model, graph_data, graph_data.test_mask)

print(f"\n--- Test Set Performance (using default argmax/0.5 threshold) ---")
print("\n".join([
    f"AUC: {final_auc:.4f}",
    f"Recall: {final_recall:.4f}",
    f"Precision: {final_precision:.4f}",
    f"F1-Score: {final_f1:.4f}",
]))


### --- 5. Visualization ---
print("\nStep 5: Visualizing Training Process (Train vs Val)...")
fig, ax = plt.subplots(2, 2, figsize=(16, 12))
fig.suptitle('Training and Validation Metrics Comparison')

# Each panel: (axes, title, [(history key, plot keyword arguments), ...]).
# In the last two panels solid lines are training, dashed lines validation.
panels = [
    (ax[0, 0], 'Loss', [
        ('train_loss', dict(label='Training Loss')),
        ('val_loss', dict(label='Validation Loss')),
    ]),
    (ax[0, 1], 'AUC', [
        ('train_auc', dict(label='Training AUC', color='blue')),
        ('val_auc', dict(label='Validation AUC', color='orange')),
    ]),
    (ax[1, 0], 'Recall & Precision', [
        ('train_recall', dict(label='Train Recall', color='green', linestyle='-')),
        ('val_recall', dict(label='Val Recall', color='green', linestyle='--')),
        ('train_precision', dict(label='Train Precision', color='red', linestyle='-')),
        ('val_precision', dict(label='Val Precision', color='red', linestyle='--')),
    ]),
    (ax[1, 1], 'F1-Score', [
        ('train_f1', dict(label='Training F1', color='purple', linestyle='-')),
        ('val_f1', dict(label='Validation F1', color='purple', linestyle='--')),
    ]),
]

for panel_ax, panel_title, series in panels:
    for history_key, line_style in series:
        panel_ax.plot(history['epoch'], history[history_key], **line_style)
    panel_ax.set_title(panel_title)
    panel_ax.legend()
    panel_ax.grid(True)

# Leave room at the top for the figure title.
plt.tight_layout(rect=[0, 0.03, 1, 0.95])
plt.show()
print("\n--- Part 2 Completed ---")

In [None]:
"""
Generates predictions using a trained GraphSAGE model on a target list.

This script executes Part 3 of the pipeline: Prediction.
It performs the following steps:
1.  Loads the pre-processed features and account mappings (from Part 1).
2.  Loads the raw transaction data to reconstruct the full graph structure.
3.  Loads the list of accounts requiring predictions.
4.  Re-defines the GraphSAGE model architecture (must match Part 2).
5.  Loads the saved weights of the best trained model (from Part 2).
6.  Performs a forward pass on the entire graph to get probabilities for all
    nodes.
7.  Extracts the "suspicious" probabilities for the target accounts.
8.  Applies an optimal threshold to classify accounts as 0 (normal) or
    1 (suspicious).
9.  Saves the final results to 'submission.csv'.
"""

import torch
import torch.nn as nn
import torch.nn.functional as F
from torch_geometric.nn import SAGEConv
from torch_geometric.data import Data
from torch_geometric.utils import to_undirected
import pandas as pd
import numpy as np

### --- 0. Environment Setup and Model Definition ---
print("--- Part 3: Prediction ---")
print("Step 0: Setup Environment and Model Definition...")
# Use CUDA when PyTorch can see a device, otherwise the CPU.
device = torch.device('cuda') if torch.cuda.is_available() else torch.device('cpu')
print(f"Using device: {device}")

# Model definition (must be identical to Part 2)
class GraphSAGE(nn.Module):
    """
    Three-layer GraphSAGE network for node classification.

    Two hidden stages of the form
    SAGEConv -> BatchNorm -> ReLU -> Dropout
    are followed by a final SAGEConv that emits one raw logit per class.

    Args:
        in_channels (int): Size of each input node feature vector.
        hidden_channels (int): Width of both hidden stages.
        out_channels (int): Number of output logits (classes).
        dropout (float): Dropout probability applied after each hidden stage.

    Attributes:
        conv1, conv2 (SAGEConv): Hidden graph convolutions.
        bn1, bn2 (nn.BatchNorm1d): Batch normalisation for each hidden stage.
        conv3 (SAGEConv): Output graph convolution.
        dropout (nn.Dropout): Shared dropout layer.
    """

    def __init__(self, in_channels, hidden_channels, out_channels, dropout=0.5):
        """Create the layers; attribute names define the checkpoint keys."""
        super().__init__()
        # Hidden stage 1
        self.conv1 = SAGEConv(in_channels, hidden_channels)
        self.bn1 = nn.BatchNorm1d(hidden_channels)
        # Hidden stage 2
        self.conv2 = SAGEConv(hidden_channels, hidden_channels)
        self.bn2 = nn.BatchNorm1d(hidden_channels)
        # Output layer
        self.conv3 = SAGEConv(hidden_channels, out_channels)
        self.dropout = nn.Dropout(dropout)

    def forward(self, x, edge_index):
        """
        Run the network over the graph.

        Args:
            x (torch.Tensor): Node feature tensor.
            edge_index (torch.Tensor): Edge index of the graph.

        Returns:
            torch.Tensor: Raw (unnormalised) logits for every node.
        """
        hidden_stages = ((self.conv1, self.bn1), (self.conv2, self.bn2))
        for conv, norm in hidden_stages:
            x = self.dropout(F.relu(norm(conv(x, edge_index))))
        return self.conv3(x, edge_index)

### --- 1. Loading Necessary Data ---
print("\nStep 1: Loading necessary data...")
# Feature file generated by Part 1 (with interaction features). Defined before
# the try block so the error message can always reference it.
feature_file = 'processed_features_with_interactions.pt'
try:
    # NOTE: weights_only=False unpickles arbitrary objects; only load files
    # produced by a trusted Part 1 run.
    saved_data = torch.load(feature_file, map_location=device, weights_only=False)

    feature_tensor = saved_data['feature_tensor']
    acct_to_idx = saved_data['acct_to_idx']
    print(f"Pre-processed features '{feature_file}' and mappings loaded.")
except FileNotFoundError as e:
    print(f"Error: '{feature_file}' not found. Please run Part 1 first.")
    # Stop with a non-zero status; `exit()` is the interactive helper from
    # `site` and would report success.
    raise SystemExit(1) from e

try:
    # Ensure these paths are correct
    predict_df = pd.read_csv("data/acct_predict.csv", dtype={'acct': str})
    df_trans = pd.read_csv("data/acct_transaction.csv", dtype={'from_acct': str, 'to_acct': str})
    print("Prediction account list and raw transactions loaded.")
except FileNotFoundError as e:
    print(f"Error loading data files: {e}")
    raise SystemExit(1) from e

### --- 2. Reconstructing the Full Graph ---
print("\nStep 2: Reconstructing the full graph...")
# Ensure only accounts present in the feature mapping are used to build edges
valid_trans = df_trans[df_trans['from_acct'].isin(acct_to_idx) & df_trans['to_acct'].isin(acct_to_idx)]
source_nodes = valid_trans['from_acct'].map(acct_to_idx).values
target_nodes = valid_trans['to_acct'].map(acct_to_idx).values
# Stack into one (2, E) int64 array first: building a tensor from a Python
# list of ndarrays takes a very slow element-wise path.
edge_array = np.vstack((source_nodes, target_nodes)).astype(np.int64, copy=False)
edge_index = torch.from_numpy(edge_array)
edge_index = to_undirected(edge_index) # Maintain consistency with training

# Create the Data object using the feature_tensor loaded in Step 1
full_graph_data = Data(x=feature_tensor, edge_index=edge_index).to(device)
print(f"Full graph data object reconstructed ({full_graph_data.num_nodes} nodes).")

### --- 3. Loading Trained Model and Making Predictions ---
print("\nStep 3: Loading trained model and making predictions...")
# Hyperparameters must match those used in Part 2
HIDDEN_CHANNELS = 128
model = GraphSAGE(in_channels=full_graph_data.num_node_features,
                  hidden_channels=HIDDEN_CHANNELS,
                  out_channels=2).to(device)

# Trained weights written by Part 2. Defined before the try block so the error
# message can always reference it.
model_file = 'best_model.pt'
try:
    model.load_state_dict(torch.load(model_file, map_location=device))
    print(f"Trained model weights loaded from '{model_file}'.")
except FileNotFoundError as e:
    print(f"Error: '{model_file}' not found. Please run Part 2 to train the model first.")
    # Stop with a non-zero status; `exit()` is the interactive helper from
    # `site` and would report success.
    raise SystemExit(1) from e

# Inference only: eval mode and no gradient tracking. One forward pass scores
# every node in the graph.
model.eval()
with torch.no_grad():
    all_logits = model(full_graph_data.x, full_graph_data.edge_index)
    all_probs = F.softmax(all_logits, dim=1)
print("Predictions generated for all nodes in the graph.")

### --- 4. Extracting Results and Generating Submission File ---
print("\nStep 4: Extracting results and generating submission file...")

OPTIMAL_THRESHOLD = 0.5
print(f"Using threshold: {OPTIMAL_THRESHOLD}")

# Probability of class 1 ("suspicious") for every node in the graph.
all_pos_probs = all_probs[:, 1].cpu().numpy()

# Row position of each requested account; -1 marks accounts the graph has
# never seen.
predict_indices = [acct_to_idx.get(acct, -1) for acct in predict_df['acct']]

# Unknown accounts default to probability 0.0, which becomes label 0.
target_pos_probs = [
    all_pos_probs[idx] if idx != -1 else 0.0
    for idx in predict_indices
]

# Strictly above the threshold means suspicious (1); everything else is 0.
target_pos_probs_np = np.array(target_pos_probs)
pred_labels = (target_pos_probs_np > OPTIMAL_THRESHOLD).astype(int)

# Write the submission: one row per requested account with its label.
submission_df = predict_df[['acct']].assign(label=pred_labels)
submission_df.to_csv("submission.csv", index=False)

print("\nsubmission.csv created successfully!")
print("Predicted label distribution (based on threshold):")
print(pd.Series(pred_labels).value_counts())
print("\n--- Part 3 Completed ---")