In [1]:
import pandas as pd
import numpy as np
import torch
import torch.nn.functional as F
from torch_geometric.data import Data
from torch_geometric.nn import SAGEConv
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
import joblib

print("Libraries imported successfully.")
print(f"PyTorch version: {torch.__version__}")

Libraries imported successfully.
PyTorch version: 2.5.0+cpu


 Load and Preprocess Data

In [2]:
try:
    df = pd.read_csv('../data/raw/PS_20174392719_1491204439457_log.csv')
    print("Dataset loaded successfully.")
except FileNotFoundError:
    print("Error: Dataset file not found.")
    df = None

if df is not None:
    # Use a smaller, balanced sample for faster training in this example
    df_fraud = df[df['isFraud'] == 1]
    df_normal = df[df['isFraud'] == 0].sample(n=len(df_fraud) * 5, random_state=42)
    df_sample = pd.concat([df_fraud, df_normal]).reset_index(drop=True)
    print(f"Using a sample of {len(df_sample)} transactions for GNN training.")


Dataset loaded successfully.
Using a sample of 49278 transactions for GNN training.


Construct the Graph

In [3]:
if df is not None:
    print("Constructing graph data structure...")
    
    # Create a mapping from account names to integer indices
    all_accounts = pd.concat([df_sample['nameOrig'], df_sample['nameDest']]).unique()
    account_map = {name: i for i, name in enumerate(all_accounts)}
    num_nodes = len(all_accounts)

    # Create edge index
    source_nodes = df_sample['nameOrig'].map(account_map).values
    dest_nodes = df_sample['nameDest'].map(account_map).values
    edge_index = torch.tensor([source_nodes, dest_nodes], dtype=torch.long)

    # Create edge features (attributes)
    edge_features = df_sample[['amount', 'oldbalanceOrg', 'newbalanceOrig', 'oldbalanceDest', 'newbalanceDest']].values
    scaler = StandardScaler()
    edge_features = scaler.fit_transform(edge_features)
    edge_attr = torch.tensor(edge_features, dtype=torch.float)
    
    # Create node features (using transaction amounts for simplicity)
    # A more advanced approach would aggregate features per node
    node_features = np.zeros((num_nodes, 1))
    df_sample['amount_log'] = np.log1p(df_sample['amount'])
    
    # Aggregate features for each node (e.g., total amount sent/received)
    sender_amounts = df_sample.groupby('nameOrig')['amount_log'].sum()
    receiver_amounts = df_sample.groupby('nameDest')['amount_log'].sum()
    
    for name, idx in account_map.items():
        node_features[idx, 0] = sender_amounts.get(name, 0) + receiver_amounts.get(name, 0)

    node_features = scaler.fit_transform(node_features)
    x = torch.tensor(node_features, dtype=torch.float)

    # Create labels (for transactions/edges)
    y = torch.tensor(df_sample['isFraud'].values, dtype=torch.float)

    # Create the PyG Data object
    graph_data = Data(x=x, edge_index=edge_index, edge_attr=edge_attr, y=y)
    
    print("Graph data object created:")
    print(graph_data)


Constructing graph data structure...


  edge_index = torch.tensor([source_nodes, dest_nodes], dtype=torch.long)


Graph data object created:
Data(x=[96883, 1], edge_index=[2, 49278], edge_attr=[49278, 5], y=[49278])


Define the GNN Model

In [4]:
class GNN(torch.nn.Module):
    def __init__(self, num_node_features, num_edge_features, hidden_channels):
        super(GNN, self).__init__()
        self.conv1 = SAGEConv(num_node_features, hidden_channels)
        self.conv2 = SAGEConv(hidden_channels, hidden_channels)
        # Final layer predicts a single value per edge (transaction)
        self.output_layer = torch.nn.Linear(2 * hidden_channels + num_edge_features, 1)

    def forward(self, x, edge_index, edge_attr):
        # Get node embeddings
        x = self.conv1(x, edge_index)
        x = x.relu()
        x = self.conv2(x, edge_index)

        # For each edge, concatenate the source node, dest node, and edge features
        source_node_embed = x[edge_index[0]]
        dest_node_embed = x[edge_index[1]]
        
        combined_features = torch.cat([source_node_embed, dest_node_embed, edge_attr], dim=1)
        
        # Predict
        out = self.output_layer(combined_features)
        return torch.sigmoid(out).squeeze()


Training the GNN 

In [5]:
if 'graph_data' in locals():
    model = GNN(
        num_node_features=graph_data.num_node_features,
        num_edge_features=graph_data.num_edge_features,
        hidden_channels=64
    )
    optimizer = torch.optim.Adam(model.parameters(), lr=0.01)
    criterion = torch.nn.BCELoss() # Binary Cross-Entropy Loss

    # Create a mask for training (we're training on edges)
    train_mask = torch.ones(graph_data.num_edges, dtype=torch.bool)

    print("\nStarting GNN training...")
    for epoch in range(100):
        model.train()
        optimizer.zero_grad()
        out = model(graph_data.x, graph_data.edge_index, graph_data.edge_attr)
        loss = criterion(out[train_mask], graph_data.y[train_mask])
        loss.backward()
        optimizer.step()
        if (epoch + 1) % 10 == 0:
            print(f'Epoch: {epoch+1:03d}, Loss: {loss:.4f}')
    print("GNN training completed.")



Starting GNN training...
Epoch: 010, Loss: 0.3620
Epoch: 020, Loss: 0.3350
Epoch: 030, Loss: 0.3295
Epoch: 040, Loss: 0.3247
Epoch: 050, Loss: 0.3214
Epoch: 060, Loss: 0.3189
Epoch: 070, Loss: 0.3169
Epoch: 080, Loss: 0.3153
Epoch: 090, Loss: 0.3138
Epoch: 100, Loss: 0.3124
GNN training completed.


Evaluate the GNN

In [6]:
if 'model' in locals():
    model.eval()
    with torch.no_grad():
        out = model(graph_data.x, graph_data.edge_index, graph_data.edge_attr)
        preds = (out > 0.5).float()
        correct = (preds == graph_data.y).sum()
        accuracy = int(correct) / len(graph_data.y)
        print(f'\nGNN Model Accuracy: {accuracy:.4f}')


GNN Model Accuracy: 0.8867


Save the GNN Model and Artifacts

In [7]:
if 'model' in locals():
    # Save the trained GNN model
    model_path = '../models/gnn_model.pt'
    torch.save(model.state_dict(), model_path)
    print(f"\nGNN model saved to {model_path}")

    # Save the account map needed for inference
    map_path = '../models/gnn_account_map.pkl'
    joblib.dump(account_map, map_path)
    print(f"Account map saved to {map_path}")



GNN model saved to ../models/gnn_model.pt
Account map saved to ../models/gnn_account_map.pkl
