In [None]:
import pandas as pd
import torch
import numpy as np
from xgboost import XGBClassifier
from sklearn.metrics import classification_report
from torch_geometric.data import Data
from torch_geometric.nn import GATv2Conv
import torch.nn.functional as F
# Input CSV locations, relative to the notebook's working directory.
DATA_PATH_FEATURES = '../data/elliptic_txs_features.csv'
DATA_PATH_CLASSES = '../data/elliptic_txs_classes.csv'
DATA_PATH_EDGES = '../data/elliptic_txs_edgelist.csv'

# Last time step (inclusive) that goes into the training split; every later
# time step is held out for testing.
TRAIN_MAX_TIME_STEP = 34

print("Loading Data for Hybrid Model...")
df_classes = pd.read_csv(DATA_PATH_CLASSES)
# The features file is read without a header row; column 0 is used as the
# transaction id and column 1 as the time step, the rest are node features.
df_features = pd.read_csv(DATA_PATH_FEATURES, header=None)
df_features.rename(columns={0: 'txId', 1: 'time_step'}, inplace=True)
df_merged = pd.merge(df_features, df_classes, on='txId', how='left')

# Map each transaction id to its row position so edges can be expressed as
# integer node indices.
nodes = df_merged['txId'].values
map_id = {j: i for i, j in enumerate(nodes)}

df_edges = pd.read_csv(DATA_PATH_EDGES)
# Keep only edges whose both endpoints are known nodes, so the index mapping
# below never produces NaN.
df_edges = df_edges[df_edges['txId1'].isin(nodes) & df_edges['txId2'].isin(nodes)]
src = df_edges['txId1'].map(map_id).values
dst = df_edges['txId2'].map(map_id).values
# Stack into one ndarray first: building a tensor from a Python list of
# ndarrays is very slow and makes PyTorch emit a UserWarning.
edge_index = torch.as_tensor(np.stack((src, dst)), dtype=torch.long)

node_features = df_merged.drop(columns=['txId', 'class', 'time_step']).values
x = torch.tensor(node_features, dtype=torch.float)

# Encode labels: '1' -> 1, '2' -> 0, 'unknown' -> -1 (unlabelled).
# Any value outside the mapping (including rows the left merge left without a
# class) becomes NaN after .map(); treat those as unlabelled too, otherwise
# NaN would pass the `y != -1` test and leak into the train/test masks.
df_merged['class'] = (
    df_merged['class'].map({'1': 1, '2': 0, 'unknown': -1}).fillna(-1).astype(int)
)
y = df_merged['class'].values
node_time = df_merged['time_step'].values

# Temporal split over labelled nodes only.
train_mask = (node_time <= TRAIN_MAX_TIME_STEP) & (y != -1)
test_mask = (node_time > TRAIN_MAX_TIME_STEP) & (y != -1)

data = Data(x=x, edge_index=edge_index)
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
data = data.to(device)

class FraudGAT(torch.nn.Module):
    """Two-layer GATv2 model used as a node-embedding extractor.

    ``forward`` returns the ELU-activated output of the first attention layer
    (``hidden_channels * heads`` values per node). The second layer,
    ``conv2``, maps those embeddings to 2 outputs per node; it is constructed
    here but is NOT applied inside ``forward`` -- callers that want class
    logits must call ``model.conv2(embeddings, edge_index)`` themselves.

    Args:
        hidden_channels: Output width of each attention head in ``conv1``.
        heads: Number of attention heads in ``conv1`` (their outputs are
            concatenated).
        in_channels: Number of input features per node. When ``None``
            (the default), falls back to ``data.num_node_features`` of the
            module-level ``data`` object, which is what the original code
            always did.
    """

    def __init__(self, hidden_channels=128, heads=4, in_channels=None):
        super().__init__()
        if in_channels is None:
            # Backward-compatible default: read the width from the global graph.
            in_channels = data.num_node_features
        self.conv1 = GATv2Conv(in_channels, hidden_channels, heads=heads, dropout=0.3)
        self.conv2 = GATv2Conv(hidden_channels * heads, 2, heads=1, concat=False, dropout=0.3)

    def forward(self, data):
        """Return per-node embeddings for the graph in ``data``.

        Args:
            data: Object exposing ``x`` (node features) and ``edge_index``.

        Returns:
            The ELU-activated ``conv1`` output, one row per node.
        """
        x, edge_index = data.x, data.edge_index
        x = self.conv1(x, edge_index)
        x = F.elu(x)
        return x

print("Extracting Graph Embeddings from GAT...")
# Seed the torch RNG so weight initialisation (and therefore the extracted
# embeddings) is reproducible, matching the fixed random_state used for XGBoost.
torch.manual_seed(42)
model = FraudGAT().to(device)
optimizer = torch.optim.Adam(model.parameters(), lr=0.005)

# Build the supervision tensors once instead of re-creating them every epoch.
train_idx = torch.as_tensor(train_mask, dtype=torch.bool, device=device)
train_labels = torch.as_tensor(y[train_mask], dtype=torch.long, device=device)

model.train()
for epoch in range(51):
    optimizer.zero_grad()
    embeddings = model(data)
    # Use the model's own trainable 2-class layer as the classification head.
    # The previous code multiplied by a freshly sampled random matrix on every
    # epoch, so the head was never learned and the gradients reaching conv1
    # pointed at a different random target each step.
    out = model.conv2(embeddings, data.edge_index)
    loss = F.cross_entropy(out[train_idx], train_labels)
    loss.backward()
    optimizer.step()

# Switch to eval mode (disables dropout) and extract embeddings for all nodes.
model.eval()
with torch.no_grad():
    graph_embeddings = model(data).cpu().numpy()
print(f"Embeddings Extracted. Shape: {graph_embeddings.shape}")
# Hybrid feature matrix: raw node features followed by the GAT embeddings,
# joined column-wise so each row still corresponds to one node.
X_hybrid = np.concatenate([node_features, graph_embeddings], axis=1)

print("Training Hybrid XGBoost...")

# Slice the labelled nodes into the temporal train / test partitions.
X_train, y_train = X_hybrid[train_mask], y[train_mask]
X_test, y_test = X_hybrid[test_mask], y[test_mask]

# Gradient-boosted trees; scale_pos_weight up-weights the positive (fraud) class.
xgb = XGBClassifier(
    random_state=42,
    scale_pos_weight=5.0,
    max_depth=6,
    learning_rate=0.1,
    n_estimators=200,
)
xgb.fit(X_train, y_train)

print("\n--- Hybrid Ensemble Evaluation (Temporal Split) ---")
y_pred = xgb.predict(X_test)
report = classification_report(
    y_test,
    y_pred,
    target_names=['Licit (0)', 'Fraud (1)'],
)
print(report)

Loading Data for Hybrid Model...


  edge_index = torch.tensor([src, dst], dtype=torch.long)


Extracting Graph Embeddings from GAT...
Embeddings Extracted. Shape: (203769, 512)
Training Hybrid XGBoost...

--- Hybrid Ensemble Evaluation (Temporal Split) ---
              precision    recall  f1-score   support

   Licit (0)       0.98      0.99      0.99     15587
   Fraud (1)       0.89      0.73      0.80      1083

    accuracy                           0.98     16670
   macro avg       0.94      0.86      0.89     16670
weighted avg       0.98      0.98      0.98     16670

