In [1]:
import torch
import torch.nn.functional as F
from torch_geometric.data import Data
from torch_geometric.nn import SAGEConv
from torch_geometric.transforms import RandomLinkSplit
from torch_geometric.utils import to_undirected
from sklearn.metrics import roc_auc_score
from sklearn.preprocessing import StandardScaler
import numpy as np
import pandas as pd
from rdflib import Graph, URIRef, Literal, Namespace
from rdflib.namespace import RDF, RDFS, XSD
import rdflib
from pathlib import Path

from tqdm import tqdm 

print(f"PyTorch version: {torch.__version__}")
try:
    import torch_geometric
    print(f"PyG version: {torch_geometric.__version__}")
except ImportError:
    print("PyTorch Geometric not found. Please install it.")

PROJ = Namespace("http://example.com/catalonia-ontology/")

# --- Configuration ---
KG_PATH = "../../data/exploitation/knowledge_graph.ttl"
EMBEDDING_DIM = 64 # Final dimension of the node embeddings
HIDDEN_CHANNELS = 128 # Number of channels in the hidden GNN layer

  Referenced from: <D39B31F4-BCFB-3005-A82F-3F010BF36435> /Users/rogerbaigess/Desktop/IA/3r/BDA/large-scale-data-engineering-ai/.venv/lib/python3.10/site-packages/libpyg.so
  Expected in:     <6B8AC17B-04CC-36D0-BD01-780381EFB0CC> /Users/rogerbaigess/Desktop/IA/3r/BDA/large-scale-data-engineering-ai/.venv/lib/python3.10/site-packages/torch/lib/libtorch_cpu.dylib
  Referenced from: <B61D6BDA-DF58-31FC-B7D3-5F1A6FBC154B> /Users/rogerbaigess/Desktop/IA/3r/BDA/large-scale-data-engineering-ai/.venv/lib/python3.10/site-packages/torch_scatter/_version_cpu.so
  Expected in:     <6B8AC17B-04CC-36D0-BD01-780381EFB0CC> /Users/rogerbaigess/Desktop/IA/3r/BDA/large-scale-data-engineering-ai/.venv/lib/python3.10/site-packages/torch/lib/libtorch_cpu.dylib
  Referenced from: <7A612FE8-A2BB-3D1C-A452-5845B6B47F4D> /Users/rogerbaigess/Desktop/IA/3r/BDA/large-scale-data-engineering-ai/.venv/lib/python3.10/site-packages/torch_sparse/_version_cpu.so
  Expected in:     <6B8AC17B-04CC-36D0-BD01-780381EFB0CC> 

PyTorch version: 2.7.1
PyG version: 2.6.1


In [2]:
# --- Load the full graph ---
print(f"Loading full Knowledge Graph from: {KG_PATH}")
g = Graph()
g.parse(KG_PATH, format="turtle")
print(f"Graph loaded successfully with {len(g)} triples.")

# --- Prepare data structures for PyG ---
print("\n--- Preparing data for PyTorch Geometric ---")

# 1. Map all unique entity URIs to integer indices
all_uris = sorted([node for node in set(g.subjects()).union(set(g.objects())) if isinstance(node, rdflib.URIRef)])
node_to_idx = {node: i for i, node in enumerate(all_uris)}
idx_to_node = {i: node for i, node in enumerate(all_uris)}
num_nodes = len(all_uris)
print(f"Mapped {num_nodes} unique entity nodes to integer indices.")

# 2. Create the edge index tensor
edge_list = []
for s, p, o in g:
    if isinstance(s, rdflib.URIRef) and isinstance(o, rdflib.URIRef):
        s_idx, o_idx = node_to_idx.get(s), node_to_idx.get(o)
        if s_idx is not None and o_idx is not None:
            edge_list.append([s_idx, o_idx])

edge_index = to_undirected(torch.tensor(edge_list, dtype=torch.long).t())
print(f"Created undirected edge_index tensor with shape: {edge_index.shape}")

# 3. Create Node Features (the 'x' matrix) from literals
print("\nExtracting numerical features for each node...")
features_df = pd.DataFrame(index=range(num_nodes))

feature_properties = {
    PROJ.hasValue: 'idescat_value',
    PROJ.avgMonthlyRent: 'annual_rent',
    PROJ.householdIncome: 'annual_income',
    PROJ.totalContracts: 'annual_contracts'
}

for feat_name in feature_properties.values():
    features_df[feat_name] = np.nan

for i in tqdm(range(num_nodes), desc="Extracting Node Features"):
    node_uri = idx_to_node[i]
    for p, o in g.predicate_objects(subject=node_uri):
        if p in feature_properties:
            feat_name = feature_properties[p]
            try:
                features_df.loc[i, feat_name] = float(o)
            except (ValueError, TypeError):
                continue

# --- KEY FIX: Robustly fill NaN values with 0 ---
# Using fillna(0) is safer than fillna(mean()) when some columns might be all NaN.
features_df = features_df.fillna(0)
print("\nFilled NaN values with 0.")
# -----------------------------------------------

print(f"Created features DataFrame with shape: {features_df.shape}")
print("Sample of extracted features before scaling:")
print(features_df.describe())

# 4. Normalize features
print("\nNormalizing node features...")
scaler = StandardScaler()
features_scaled = scaler.fit_transform(features_df)
x = torch.tensor(features_scaled, dtype=torch.float)
print(f"Created node feature tensor 'x' with shape: {x.shape}")

# 5. Create the final PyG Data object
data = Data(x=x, edge_index=edge_index)
print("\nPyG Data object created successfully:")
print(data)

Loading full Knowledge Graph from: ../../data/exploitation/knowledge_graph.ttl
Graph loaded successfully with 180513 triples.

--- Preparing data for PyTorch Geometric ---
Mapped 29332 unique entity nodes to integer indices.
Created undirected edge_index tensor with shape: torch.Size([2, 150632])

Extracting numerical features for each node...


Extracting Node Features: 100%|██████████| 29332/29332 [00:01<00:00, 16270.85it/s]



Filled NaN values with 0.
Created features DataFrame with shape: (29332, 4)
Sample of extracted features before scaling:
       idescat_value   annual_rent  annual_income  annual_contracts
count        29332.0  29332.000000        29332.0      29332.000000
mean             0.0    250.394189            0.0        124.615096
std              0.0    282.898497            0.0       1935.808171
min              0.0      0.000000            0.0          0.000000
25%              0.0      0.000000            0.0          0.000000
50%              0.0      0.000000            0.0          0.000000
75%              0.0    456.335000            0.0         14.000000
max              0.0   1649.991000            0.0     165779.000000

Normalizing node features...
Created node feature tensor 'x' with shape: torch.Size([29332, 4])

PyG Data object created successfully:
Data(x=[29332, 4], edge_index=[2, 150632])


In [3]:
# Cell 3: Manual Edge Split for Link Prediction

import torch
from torch_geometric.utils import negative_sampling

print("\n--- Manually Splitting Edges for Link Prediction ---")

# 1. Get all positive edges from the original graph
num_nodes = data.num_nodes
all_pos_edges = data.edge_index

# 2. Split positive edges into train, validation, and test sets
num_total_pos_edges = all_pos_edges.size(1)
perm = torch.randperm(num_total_pos_edges) # Shuffle the edges

# Define split sizes
num_val = int(num_total_pos_edges * 0.1)
num_test = int(num_total_pos_edges * 0.1)
num_train = num_total_pos_edges - num_val - num_test

# Assign edges to splits
train_pos_edges = all_pos_edges[:, perm[:num_train]]
val_pos_edges = all_pos_edges[:, perm[num_train : num_train + num_val]]
test_pos_edges = all_pos_edges[:, perm[num_train + num_val:]]

print(f"Split positive edges: Train={train_pos_edges.size(1)}, Val={val_pos_edges.size(1)}, Test={test_pos_edges.size(1)}")

# 3. Create the training graph (IMPORTANT: it should contain only the training edges for message passing)
# This prevents data leakage from validation/test edges into the training embeddings.
train_data = Data(x=data.x, edge_index=train_pos_edges)
val_data = Data(x=data.x, edge_index=train_pos_edges) # Val and Test also use train_pos_edges for message passing
test_data = Data(x=data.x, edge_index=train_pos_edges)


# 4. Generate negative edges and labels for each split
print("Generating negative samples for each split...")

# For the training set
train_neg_edges = negative_sampling(
    edge_index=all_pos_edges, # Sample from all edges to avoid sampling validation/test positives
    num_nodes=num_nodes,
    num_neg_samples=train_pos_edges.size(1) # Match number of positive samples
)
train_data.edge_label_index = torch.cat([train_pos_edges, train_neg_edges], dim=-1)
train_data.edge_label = torch.cat([torch.ones(train_pos_edges.size(1)), torch.zeros(train_neg_edges.size(1))], dim=0)

# For the validation set
val_neg_edges = negative_sampling(
    edge_index=all_pos_edges,
    num_nodes=num_nodes,
    num_neg_samples=val_pos_edges.size(1)
)
val_data.edge_label_index = torch.cat([val_pos_edges, val_neg_edges], dim=-1)
val_data.edge_label = torch.cat([torch.ones(val_pos_edges.size(1)), torch.zeros(val_neg_edges.size(1))], dim=0)

# For the test set
test_neg_edges = negative_sampling(
    edge_index=all_pos_edges,
    num_nodes=num_nodes,
    num_neg_samples=test_pos_edges.size(1)
)
test_data.edge_label_index = torch.cat([test_pos_edges, test_neg_edges], dim=-1)
test_data.edge_label = torch.cat([torch.ones(test_pos_edges.size(1)), torch.zeros(test_neg_edges.size(1))], dim=0)


print("\nData split and labeled successfully:")
print("\n--- Training Data ---")
print(train_data)
print("Edge labels shape:", train_data.edge_label.shape)
print("Edge label index shape:", train_data.edge_label_index.shape)

print("\n--- Validation Data ---")
print(val_data)
print("Edge labels shape:", val_data.edge_label.shape)
print("Edge label index shape:", val_data.edge_label_index.shape)

print("\n--- Testing Data ---")
print(test_data)
print("Edge labels shape:", test_data.edge_label.shape)
print("Edge label index shape:", test_data.edge_label_index.shape)


--- Manually Splitting Edges for Link Prediction ---
Split positive edges: Train=120506, Val=15063, Test=15063
Generating negative samples for each split...

Data split and labeled successfully:

--- Training Data ---
Data(x=[29332, 4], edge_index=[2, 120506], edge_label_index=[2, 241012], edge_label=[241012])
Edge labels shape: torch.Size([241012])
Edge label index shape: torch.Size([2, 241012])

--- Validation Data ---
Data(x=[29332, 4], edge_index=[2, 120506], edge_label_index=[2, 30126], edge_label=[30126])
Edge labels shape: torch.Size([30126])
Edge label index shape: torch.Size([2, 30126])

--- Testing Data ---
Data(x=[29332, 4], edge_index=[2, 120506], edge_label_index=[2, 30126], edge_label=[30126])
Edge labels shape: torch.Size([30126])
Edge label index shape: torch.Size([2, 30126])


In [4]:
# Define GNN Encoder and Link Prediction Classifier

class GNNEncoder(torch.nn.Module):
    """GNN to learn node embeddings."""
    def __init__(self, in_channels, hidden_channels, out_channels):
        super().__init__()
        self.conv1 = SAGEConv(in_channels, hidden_channels)
        self.conv2 = SAGEConv(hidden_channels, out_channels)

    def forward(self, x, edge_index):
        x = self.conv1(x, edge_index).relu()
        x = self.conv2(x, edge_index)
        return x

class LinkClassifier(torch.nn.Module):
    """Classifier to predict edge probability from node embeddings."""
    def forward(self, z, edge_label_index):
        # z: node embeddings [num_nodes, embedding_dim]
        # edge_label_index: edges to predict [2, num_edges_to_predict]
        edge_feat_src = z[edge_label_index[0]]
        edge_feat_dst = z[edge_label_index[1]]
        # Simple dot product as a score
        return (edge_feat_src * edge_feat_dst).sum(dim=-1)

# Initialize the models
encoder = GNNEncoder(data.num_node_features, HIDDEN_CHANNELS, EMBEDDING_DIM)
classifier = LinkClassifier()

# Move models to device if available
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
print(f"\nUsing device: {device}")
encoder = encoder.to(device)
classifier = classifier.to(device)
data = data.to(device)
train_data, val_data, test_data = train_data.to(device), val_data.to(device), test_data.to(device)

# Initialize optimizer
optimizer = torch.optim.Adam(
    list(encoder.parameters()) + list(classifier.parameters()), lr=0.01
)
criterion = torch.nn.BCEWithLogitsLoss()

print("\nGNN Encoder and Link Classifier models defined.")


Using device: cpu

GNN Encoder and Link Classifier models defined.


In [5]:
# Training Loop and Evaluation Function

def train():
    encoder.train()
    classifier.train()
    optimizer.zero_grad()
    
    # Use all edges from the original graph for message passing
    z = encoder(train_data.x, train_data.edge_index)
    
    # Predict on the edges in the training set (positive and negative)
    out = classifier(z, train_data.edge_label_index)
    
    loss = criterion(out, train_data.edge_label)
    loss.backward()
    optimizer.step()
    return float(loss)

@torch.no_grad()
def test(data_split):
    encoder.eval()
    classifier.eval()
    
    # Generate embeddings using message passing edges
    z = encoder(data_split.x, data_split.edge_index)
    
    # Predict on the edges in the validation/test set
    out = classifier(z, data_split.edge_label_index)
    
    # Use AUC as the evaluation metric
    return roc_auc_score(data_split.edge_label.cpu().numpy(), out.cpu().numpy())

# --- Training ---
print("\n--- Starting GNN Training for Link Prediction ---")
best_val_auc = 0
for epoch in range(1, 201):
    loss = train()
    val_auc = test(val_data)
    test_auc = test(test_data)
    
    # Simple early stopping logic
    if val_auc > best_val_auc:
        best_val_auc = val_auc
        print(f"✨ New best validation AUC: {val_auc:.4f}")
        # Optionally save the best model
        # torch.save(encoder.state_dict(), 'best_gnn_encoder.pt')
    
    if epoch % 10 == 0:
        print(f"Epoch: {epoch:03d}, Loss: {loss:.4f}, Val AUC: {val_auc:.4f}, Test AUC: {test_auc:.4f}")

print("--- GNN Training Finished ---")


--- Starting GNN Training for Link Prediction ---
✨ New best validation AUC: 0.3108
✨ New best validation AUC: 0.3856
✨ New best validation AUC: 0.7375
✨ New best validation AUC: 0.7514
Epoch: 010, Loss: 0.8071, Val AUC: 0.5494, Test AUC: 0.5481
Epoch: 020, Loss: 0.6609, Val AUC: 0.7058, Test AUC: 0.7026
✨ New best validation AUC: 0.7767
✨ New best validation AUC: 0.8081
✨ New best validation AUC: 0.8329
✨ New best validation AUC: 0.8478
✨ New best validation AUC: 0.8529
Epoch: 030, Loss: 0.5822, Val AUC: 0.8268, Test AUC: 0.8214
✨ New best validation AUC: 0.8536
✨ New best validation AUC: 0.8638
✨ New best validation AUC: 0.8733
✨ New best validation AUC: 0.8817
✨ New best validation AUC: 0.8877
✨ New best validation AUC: 0.8910
✨ New best validation AUC: 0.8918
Epoch: 040, Loss: 0.5576, Val AUC: 0.8918, Test AUC: 0.8873
✨ New best validation AUC: 0.8921
✨ New best validation AUC: 0.8972
✨ New best validation AUC: 0.9014
✨ New best validation AUC: 0.9036
✨ New best validation AUC: 0.