In [None]:
import pandas as pd
import torch
import torch.nn.functional as F
from torch_geometric.data import Data
from torch_geometric.nn import GATv2Conv
from sklearn.metrics import classification_report
import numpy as np
# Locations of the three dataset CSV files, relative to the working directory.
DATA_PATH_FEATURES = '../data/elliptic_txs_features.csv'
DATA_PATH_CLASSES = '../data/elliptic_txs_classes.csv'
DATA_PATH_EDGES = '../data/elliptic_txs_edgelist.csv'

print("Loading Data... (This takes ~1 minute)")

# The features file is read without a header row; its first two columns are
# named here as the transaction id and the time step.
df_classes = pd.read_csv(DATA_PATH_CLASSES)
df_features = pd.read_csv(DATA_PATH_FEATURES, header=None)
df_features.rename(columns={0: 'txId', 1: 'time_step'}, inplace=True)
df_merged = pd.merge(df_features, df_classes, on='txId', how='left')

# Node index == row position in df_merged; map_id translates txId -> node index.
nodes = df_merged['txId'].values
map_id = {j: i for i, j in enumerate(nodes)}

# Keep only edges whose both endpoints are known nodes, then re-index them.
df_edges = pd.read_csv(DATA_PATH_EDGES)
df_edges = df_edges[df_edges['txId1'].isin(nodes) & df_edges['txId2'].isin(nodes)]
src = df_edges['txId1'].map(map_id).values
dst = df_edges['txId2'].map(map_id).values
# Stack into a single (2, num_edges) ndarray first: building a tensor from a
# Python list of ndarrays is very slow and makes PyTorch emit a UserWarning.
edge_index = torch.tensor(np.stack([src, dst]), dtype=torch.long)

# Every column except the id, the label and the time step is a node feature.
node_features = df_merged.drop(columns=['txId', 'class', 'time_step']).values
x = torch.tensor(node_features, dtype=torch.float)

# Label encoding: '1' -> 1, '2' -> 0, 'unknown' -> -1.
# Rows left without a label by the left merge (or carrying an unexpected
# value) become NaN after the map; treat them as unlabelled (-1) so they are
# excluded by the masks below instead of corrupting the integer tensor.
df_merged['class'] = (
    df_merged['class']
    .map({'1': 1, '2': 0, 'unknown': -1})
    .fillna(-1)
    .astype('int64')
)
y = torch.tensor(df_merged['class'].values, dtype=torch.long)

# Temporal split: labelled nodes up to time step 34 train, later ones test.
node_time = torch.tensor(df_merged['time_step'].values, dtype=torch.long)
train_mask = (node_time <= 34) & (y != -1)
test_mask = (node_time > 34) & (y != -1)

data = Data(x=x, edge_index=edge_index, y=y)
data.train_mask = train_mask
data.test_mask = test_mask

print(f"Graph Built! Nodes: {data.num_nodes}, Edges: {data.num_edges}")
class FraudGAT(torch.nn.Module):
    """Two-layer GATv2 node classifier that outputs per-node log-probabilities.

    Args:
        hidden_channels: Output width of each attention head in the first layer.
        heads: Number of attention heads in the first layer.
        in_channels: Number of input features per node. When ``None`` it is
            read from the module-level ``data`` object, which must already
            exist at construction time (the original behaviour).
        num_classes: Number of output classes.
    """

    def __init__(self, hidden_channels=128, heads=4, in_channels=None, num_classes=2):
        super().__init__()
        if in_channels is None:
            # Backward-compatible fallback: size the input from the global graph.
            in_channels = data.num_node_features
        self.conv1 = GATv2Conv(in_channels, hidden_channels, heads=heads, dropout=0.3)
        # The second layer takes hidden_channels * heads inputs, i.e. the
        # combined width of all first-layer heads.
        self.conv2 = GATv2Conv(hidden_channels * heads, num_classes, heads=1, concat=False, dropout=0.3)

    def forward(self, data) -> torch.Tensor:
        """Return log-softmax class scores for every node in ``data``.

        Args:
            data: Graph object exposing ``x`` (node features) and
                ``edge_index`` (edge list).

        Returns:
            Tensor of log-probabilities, normalised over dimension 1.
        """
        x, edge_index = data.x, data.edge_index
        x = self.conv1(x, edge_index)
        x = F.elu(x)
        # Feature dropout is active only in training mode.
        x = F.dropout(x, p=0.4, training=self.training)
        x = self.conv2(x, edge_index)
        return F.log_softmax(x, dim=1)
# Use the GPU when one is available, otherwise fall back to the CPU.
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
print(f"Training on: {device}")

model = FraudGAT().to(device)
data = data.to(device)

# Labels of the training nodes never change, so select them once up front
# instead of re-indexing on every epoch.
train_labels = data.y[data.train_mask]
num_licit = (train_labels == 0).sum().item()
num_fraud = (train_labels == 1).sum().item()
if num_fraud == 0:
    # Fail with a clear message rather than a bare ZeroDivisionError below.
    raise ValueError(
        "No fraud-labelled nodes in the training mask; cannot compute class weights."
    )
# Up-weight the fraud class by the licit/fraud ratio to offset class imbalance.
weight = torch.tensor([1.0, num_licit / num_fraud], dtype=torch.float, device=device)

optimizer = torch.optim.Adam(model.parameters(), lr=0.005, weight_decay=5e-4)
# NLLLoss pairs with the log_softmax output of the model.
criterion = torch.nn.NLLLoss(weight=weight)

print("Starting GAT Training...")
for epoch in range(151):
    model.train()
    optimizer.zero_grad()
    # Full-batch forward pass over the whole graph; the loss only uses
    # the labelled training nodes.
    out = model(data)
    loss = criterion(out[data.train_mask], train_labels)
    loss.backward()
    optimizer.step()
    if epoch % 20 == 0:
        print(f'Epoch {epoch:03d}, Loss: {loss.item():.4f}')
print("\n--- GAT Evaluation (Temporal Split) ---")
model.eval()
# Inference only: disable autograd so no computation graph is kept for the
# full-graph forward pass.
with torch.no_grad():
    pred = model(data).argmax(dim=1)

# Score only the labelled nodes selected by the test mask.
y_true = data.y[data.test_mask].cpu().numpy()
y_pred = pred[data.test_mask].cpu().numpy()

# Pin the label order so target_names always lines up with classes 0 and 1,
# even if one class happens to be absent from the test slice.
print(classification_report(y_true, y_pred, labels=[0, 1], target_names=['Licit (0)', 'Fraud (1)']))

Loading Data... (This takes ~1 minute)


  edge_index = torch.tensor([src, dst], dtype=torch.long)


Graph Built! Nodes: 203769, Edges: 234355
Training on: cpu
Starting GAT Training...
Epoch 000, Loss: 0.8611
Epoch 020, Loss: 0.3681
Epoch 040, Loss: 0.3134
Epoch 060, Loss: 0.2843
Epoch 080, Loss: 0.2782
Epoch 100, Loss: 0.2698
Epoch 120, Loss: 0.2484
Epoch 140, Loss: 0.2501

--- GAT Evaluation (Temporal Split) ---
              precision    recall  f1-score   support

   Licit (0)       0.98      0.88      0.93     15587
   Fraud (1)       0.30      0.74      0.42      1083

    accuracy                           0.87     16670
   macro avg       0.64      0.81      0.67     16670
weighted avg       0.94      0.87      0.89     16670

