In [None]:
import pandas as pd
import torch
from torch_geometric.data import Data
import numpy as np
from sklearn.model_selection import train_test_split
# ---------------------------------------------------------------------------
# Build the transaction graph as a PyTorch Geometric `Data` object and attach
# random train/test masks over the labeled nodes.
#
# Everything this cell needs is created in this cell, in dependency order, so
# it runs on a fresh kernel (Restart & Run All).
# ---------------------------------------------------------------------------
DATA_PATH_FEATURES = '../data/elliptic_txs_features.csv'
DATA_PATH_CLASSES = '../data/elliptic_txs_classes.csv'
DATA_PATH_EDGES = '../data/elliptic_txs_edgelist.csv'

print("Loading Data... (This will take 1-2 minutes)")
df_classes = pd.read_csv(DATA_PATH_CLASSES)
# The features file is read without a header row; columns 0 and 1 are named
# explicitly, the remaining integer-named columns are used as node features.
df_features = pd.read_csv(DATA_PATH_FEATURES, header=None).rename(
    columns={0: 'txId', 1: 'time_step'}
)
df_merged = pd.merge(df_features, df_classes, on='txId', how='left')

# Map every transaction id to its row position; that position is the node index.
nodes = df_merged['txId'].values
map_id = {j: i for i, j in enumerate(nodes)}

print("Loading Edges and mapping IDs...")
df_edges = pd.read_csv(DATA_PATH_EDGES)
# Keep only edges whose two endpoints are both known nodes.
df_edges = df_edges[df_edges['txId1'].isin(nodes) & df_edges['txId2'].isin(nodes)]
src = df_edges['txId1'].map(map_id).values
dst = df_edges['txId2'].map(map_id).values
# Stack into a single (2, num_edges) ndarray first: building a tensor from a
# Python list of ndarrays is slow and triggers a PyTorch warning.
edge_index = torch.tensor(np.vstack([src, dst]), dtype=torch.long)

node_features = df_merged.drop(columns=['txId', 'class', 'time_step']).values
x = torch.tensor(node_features, dtype=torch.float)

# Encode labels: '1' -> 1, '2' -> 0, 'unknown' -> -1. Rows that received no
# class from the left merge (or carry an unmapped value) become NaN after the
# map; treat them as unlabeled (-1) instead of casting NaN to an integer.
df_merged['class'] = (
    df_merged['class']
    .map({'1': 1, '2': 0, 'unknown': -1})
    .fillna(-1)
    .astype('int64')
)
y = torch.tensor(df_merged['class'].values, dtype=torch.long)
node_time = torch.tensor(df_merged['time_step'].values, dtype=torch.long)

# The graph object must exist before anything reads `data.num_nodes`.
data = Data(x=x, edge_index=edge_index, y=y)

print("Switching to Random Split...")
all_indices = np.arange(data.num_nodes)
# Only nodes with a known label (y != -1) take part in the split.
labeled_indices = all_indices[(y != -1).numpy()]
train_idx, test_idx = train_test_split(labeled_indices, test_size=0.3, random_state=42)

train_mask = torch.zeros(data.num_nodes, dtype=torch.bool)
test_mask = torch.zeros(data.num_nodes, dtype=torch.bool)
train_mask[torch.as_tensor(train_idx, dtype=torch.long)] = True
test_mask[torch.as_tensor(test_idx, dtype=torch.long)] = True
data.train_mask = train_mask
data.test_mask = test_mask
print(f"Random Split Created! Training on {len(train_idx)} nodes.")

print("Graph Constructed Successfully!")
print(f"Number of Nodes: {data.num_nodes}")
print(f"Number of Edges: {data.num_edges}")
print(f"Training Nodes: {data.train_mask.sum().item()}")
print(f"Test Nodes: {data.test_mask.sum().item()}")

Loading Data... (This will take 1-2 minutes)
Loading Edges and mapping IDs...
Switching to Random Split...
Random Split Created! Training on 32594 nodes.
Graph Constructed Successfully!
Number of Nodes: 203769
Number of Edges: 234355
Training Nodes: 29894
Test Nodes: 16670


In [None]:
import torch.nn.functional as F
from torch_geometric.nn import SAGEConv
from sklearn.metrics import classification_report
class FraudGNN(torch.nn.Module):
    """Two-layer GraphSAGE model that outputs per-node class log-probabilities.

    Args:
        in_channels: Number of input features per node. When ``None`` (the
            default) it is read from the notebook-level ``data`` object via
            ``data.num_node_features``, which keeps ``FraudGNN()`` working.
        hidden_channels: Output width of the first SAGEConv layer.
        out_channels: Output width of the second SAGEConv layer, i.e. the
            number of classes scored per node.
        dropout: Dropout probability applied after the first layer's ReLU
            (only active in training mode).
    """

    def __init__(self, in_channels=None, hidden_channels=128, out_channels=2, dropout=0.5):
        super().__init__()
        if in_channels is None:
            # Backward-compatible fallback to the global graph object.
            in_channels = data.num_node_features
        self.dropout = dropout
        self.conv1 = SAGEConv(in_channels, hidden_channels)
        self.conv2 = SAGEConv(hidden_channels, out_channels)

    def forward(self, data):
        """Run both SAGEConv layers over ``data.x`` / ``data.edge_index``.

        Returns:
            Tensor of log-softmax scores taken over dim 1, one row per node.
        """
        x, edge_index = data.x, data.edge_index
        x = self.conv1(x, edge_index)
        x = F.relu(x)
        x = F.dropout(x, p=self.dropout, training=self.training)
        x = self.conv2(x, edge_index)
        return F.log_softmax(x, dim=1)
# ---------------------------------------------------------------------------
# Train FraudGNN full-batch on the training mask, then report test metrics.
# ---------------------------------------------------------------------------
SEED = 42          # seeds weight init and dropout so reruns are repeatable
NUM_EPOCHS = 101   # epochs 0..100 inclusive, so the final epoch is logged
LOG_EVERY = 10     # print the loss every N epochs
torch.manual_seed(SEED)

device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
print(f"Training on: {device}")
model = FraudGNN().to(device)
data = data.to(device)

# Class weights: up-weight the fraud class (label 1) by the licit/fraud ratio
# observed in the training nodes.
train_labels = data.y[data.train_mask]
num_licit = (train_labels == 0).sum().item()
num_fraud = (train_labels == 1).sum().item()
if num_fraud == 0:
    raise ValueError(
        "No fraud (class 1) nodes in the training mask; cannot compute class weights."
    )
weight = torch.tensor([1.0, num_licit / num_fraud], dtype=torch.float).to(device)
optimizer = torch.optim.Adam(model.parameters(), lr=0.01, weight_decay=5e-4)
# The model returns log-probabilities, hence NLLLoss rather than CrossEntropyLoss.
criterion = torch.nn.NLLLoss(weight=weight)

print("Starting Graph Training...")
for epoch in range(NUM_EPOCHS):
    model.train()
    optimizer.zero_grad()
    out = model(data)
    # The forward pass covers every node; the loss only uses training nodes.
    loss = criterion(out[data.train_mask], data.y[data.train_mask])
    loss.backward()
    optimizer.step()
    if epoch % LOG_EVERY == 0:
        print(f'Epoch {epoch:03d}, Loss: {loss.item():.4f}')

print("\n--- GraphSAGE Evaluation ---")
model.eval()
# Inference only: skip autograd bookkeeping to save memory and time.
with torch.no_grad():
    pred = model(data).argmax(dim=1)
y_true = data.y[data.test_mask].cpu().numpy()
y_pred = pred[data.test_mask].cpu().numpy()

# Pin `labels` so the report lines up with `target_names` even if one class is
# absent from the test nodes or the predictions; `zero_division=0` reports 0
# instead of warning when a class is never predicted.
print(classification_report(
    y_true,
    y_pred,
    labels=[0, 1],
    target_names=['Licit (0)', 'Fraud (1)'],
    zero_division=0,
))

Training on: cpu
Starting Graph Training...
Epoch 000, Loss: 0.8629
Epoch 010, Loss: 0.2525
Epoch 020, Loss: 0.1772
Epoch 030, Loss: 0.1403
Epoch 040, Loss: 0.1232
Epoch 050, Loss: 0.1090
Epoch 060, Loss: 0.0948
Epoch 070, Loss: 0.0840
Epoch 080, Loss: 0.0775
Epoch 090, Loss: 0.0735
Epoch 100, Loss: 0.0683

--- GraphSAGE Evaluation ---
              precision    recall  f1-score   support

   Licit (0)       0.98      0.96      0.97     15587
   Fraud (1)       0.53      0.65      0.59      1083

    accuracy                           0.94     16670
   macro avg       0.75      0.81      0.78     16670
weighted avg       0.95      0.94      0.94     16670

