<a href="https://colab.research.google.com/github/BayramovaNazrin/illicit-btc-detection/blob/main/graphsage_final.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# **Load Data**

In [1]:
!git clone https://github.com/BayramovaNazrin/illicit-btc-detection.git
%cd /content/illicit-btc-detection

Cloning into 'illicit-btc-detection'...
remote: Enumerating objects: 196, done.[K
remote: Counting objects: 100% (38/38), done.[K
remote: Compressing objects: 100% (24/24), done.[K
remote: Total 196 (delta 26), reused 14 (delta 14), pack-reused 158 (from 1)[K
Receiving objects: 100% (196/196), 6.82 MiB | 23.13 MiB/s, done.
Resolving deltas: 100% (79/79), done.
/content/illicit-btc-detection


In [4]:
import sys
sys.path.append('/content/illicit-btc-detection')

from load_data import load_data
features, edges, classes, merged_df = load_data()

# **Imports**

In [None]:
import os, torch

# Remove any old or incompatible builds
!pip uninstall -y torch torchvision torchaudio torch-geometric torch-scatter torch-sparse torch-cluster torch-spline-conv > /dev/null

# Install supported versions (PyTorch 2.5 + cu121)
!pip install -q torch==2.5.0 torchvision==0.20.0 torchaudio==2.5.0 --index-url https://download.pytorch.org/whl/cu121

# Install matching PyTorch Geometric wheels
!pip install -q torch-scatter torch-sparse torch-cluster torch-spline-conv torch-geometric \
    -f https://data.pyg.org/whl/torch-2.5.0+cu121.html

# Verify installation
import torch_geometric
from torch_geometric.nn import SAGEConv

print("Torch:", torch.__version__)
print("PyG origin:", torch_geometric.__file__)
print("SAGEConv import OK")

[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m780.4/780.4 MB[0m [31m601.3 kB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m7.3/7.3 MB[0m [31m90.9 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m3.4/3.4 MB[0m [31m60.2 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m10.9/10.9 MB[0m [31m90.2 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m5.1/5.1 MB[0m [31m64.7 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m3.4/3.4 MB[0m [31m66.5 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m994.8/994.8 kB[0m [31m48.3 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m63.7/63.7 kB[0m [31m2.8 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━

In [None]:
import os, torch
os.environ["CUDA_LAUNCH_BLOCKING"] = "1"
import pandas as pd
import numpy as np
import torch.nn.functional as F
from torch_geometric.data import Data
from torch_geometric.nn import SAGEConv
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import roc_auc_score, average_precision_score, f1_score, precision_score, recall_score, accuracy_score

# **Data Loading and Preprocessing**

In [None]:

def load_data(features_path, edges_path, classes_path):
    print("Loading data...")
    features_df = pd.read_csv(features_path).fillna(0)
    edges_df = pd.read_csv(edges_path)
    classes_df = pd.read_csv(classes_path)

    # Map textual labels to numeric
    classes_df['class'] = classes_df['class'].map({'unknown': 3, '1': 1, '2': 2})
    return features_df, edges_df, classes_df

def preprocess_and_create_graph(features_df, edges_df, classes_df):
    print("Preprocessing and constructing graph...")

    combined_df = pd.merge(features_df, classes_df, on='txId', how='inner')
    labeled_df = combined_df[combined_df['class'].isin([1, 2])].copy()
    labeled_df['class'] = labeled_df['class'].map({1: 1, 2: 0})
    labeled_df = labeled_df.sort_values('txId').reset_index(drop=True)

    txid_map = {txid: i for i, txid in enumerate(labeled_df['txId'])}
    feature_cols = labeled_df.columns.drop(['txId', 'Time step', 'class'])
    node_features = labeled_df[feature_cols].values
    scaler = StandardScaler()
    node_features_scaled = scaler.fit_transform(node_features)
    x = torch.tensor(node_features_scaled, dtype=torch.float)

    valid_edges = edges_df[edges_df['txId1'].isin(txid_map) & edges_df['txId2'].isin(txid_map)].copy()
    valid_edges['txId1_idx'] = valid_edges['txId1'].map(txid_map)
    valid_edges['txId2_idx'] = valid_edges['txId2'].map(txid_map)
    edge_index = torch.tensor(valid_edges[['txId1_idx', 'txId2_idx']].values.T, dtype=torch.long)

    y = torch.tensor(labeled_df['class'].values, dtype=torch.long)

    time_steps = torch.tensor(labeled_df['Time step'].values)
    train_mask = (time_steps <= 34)
    test_mask = (time_steps > 34)

    graph_data = Data(x=x, edge_index=edge_index, y=y)
    graph_data.train_mask = train_mask
    graph_data.test_mask = test_mask

    print("\nGraph construction complete:")
    print(f"  - Nodes: {graph_data.num_nodes:,}")
    print(f"  - Edges: {graph_data.num_edges:,}")
    print(f"  - Labeled for Training: {graph_data.train_mask.sum():,}")
    print(f"  - Labeled for Testing: {graph_data.test_mask.sum():,}\n")

    return graph_data


# **Model Definition: GraphSAGE**

In [None]:

class GraphSAGEModel(torch.nn.Module):
    def __init__(self, in_channels, hidden_channels, out_channels):
        super(GraphSAGEModel, self).__init__()
        self.conv1 = SAGEConv(in_channels, hidden_channels)
        self.conv2 = SAGEConv(hidden_channels, out_channels)

    def forward(self, x, edge_index):
        x = self.conv1(x, edge_index)
        x = F.relu(x)
        x = F.dropout(x, p=0.5, training=self.training)
        x = self.conv2(x, edge_index)
        return x


# **Training and Evaluation Functions**

In [None]:

def train(model, data, optimizer, criterion):
    model.train()
    optimizer.zero_grad()
    out = model(data.x, data.edge_index)
    loss = criterion(out[data.train_mask], data.y[data.train_mask])
    loss.backward()
    optimizer.step()
    return loss.item()

@torch.no_grad()
def test(model, data):
    model.eval()
    out = model(data.x, data.edge_index)
    pred_probs = F.softmax(out, dim=1)
    preds = pred_probs.argmax(dim=1)

    test_preds = preds[data.test_mask].cpu().numpy()
    test_labels = data.y[data.test_mask].cpu().numpy()
    test_probs = pred_probs[data.test_mask][:, 1].cpu().numpy()

    accuracy = accuracy_score(test_labels, test_preds)
    f1 = f1_score(test_labels, test_preds, average='binary')
    precision = precision_score(test_labels, test_preds, average='binary')
    recall = recall_score(test_labels, test_preds, average='binary')
    roc_auc = roc_auc_score(test_labels, test_probs)
    pr_auc = average_precision_score(test_labels, test_probs)

    return {
        "accuracy": accuracy,
        "f1_score": f1,
        "precision": precision,
        "recall": recall,
        "roc_auc": roc_auc,
        "pr_auc": pr_auc
    }


# **Main Training Execution**

In [None]:

if __name__ == '__main__':
    features_path = '/content/drive/MyDrive/anomaly_detection/txs_features.csv'
    edges_path = '/content/drive/MyDrive/anomaly_detection/elliptic_txs_edgelist.csv'
    classes_path = '/content/drive/MyDrive/anomaly_detection/elliptic_txs_classes.csv'

    features_df, edges_df, classes_df = load_data(features_path, edges_path, classes_path)
    graph_data = preprocess_and_create_graph(features_df, edges_df, classes_df)

    device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
    print(f"Using device: {device}\n")
    graph_data = graph_data.to(device)

    num_node_features = graph_data.num_node_features
    num_classes = 2
    hidden_dim = 128

    model = GraphSAGEModel(in_channels=num_node_features, hidden_channels=hidden_dim, out_channels=num_classes).to(device)
    optimizer = torch.optim.Adam(model.parameters(), lr=0.01, weight_decay=5e-4)
    criterion = torch.nn.CrossEntropyLoss()

    epochs = 200
    print("Starting training...")
    for epoch in range(1, epochs + 1):
        loss = train(model, graph_data, optimizer, criterion)
        if epoch % 10 == 0:
            metrics = test(model, graph_data)
            print(f"Epoch: {epoch:03d}, Loss: {loss:.4f}, Test Accuracy: {metrics['accuracy']:.4f}, Test F1: {metrics['f1_score']:.4f}")

    print("\nTraining finished. Final evaluation:")
    final_metrics = test(model, graph_data)
    for metric, value in final_metrics.items():
        print(f"  - {metric.replace('_', ' ').title()}: {value:.4f}")


Loading data...
Preprocessing and constructing graph...

Graph construction complete:
  - Nodes: 46,564
  - Edges: 36,624
  - Labeled for Training: 29,894
  - Labeled for Testing: 16,670

Using device: cpu

Starting training...
Epoch: 010, Loss: 0.1971, Test Accuracy: 0.9423, Test F1: 0.5458
Epoch: 020, Loss: 0.1513, Test Accuracy: 0.9422, Test F1: 0.5665
Epoch: 030, Loss: 0.1174, Test Accuracy: 0.9425, Test F1: 0.5565
Epoch: 040, Loss: 0.0981, Test Accuracy: 0.9443, Test F1: 0.5664
Epoch: 050, Loss: 0.0862, Test Accuracy: 0.9458, Test F1: 0.5784
Epoch: 060, Loss: 0.0772, Test Accuracy: 0.9493, Test F1: 0.6006
Epoch: 070, Loss: 0.0727, Test Accuracy: 0.9536, Test F1: 0.6242
Epoch: 080, Loss: 0.0696, Test Accuracy: 0.9564, Test F1: 0.6403
Epoch: 090, Loss: 0.0651, Test Accuracy: 0.9581, Test F1: 0.6442
Epoch: 100, Loss: 0.0641, Test Accuracy: 0.9647, Test F1: 0.6849
Epoch: 110, Loss: 0.0616, Test Accuracy: 0.9645, Test F1: 0.6838
Epoch: 120, Loss: 0.0588, Test Accuracy: 0.9638, Test F1: