In [8]:
import json
import math
import re

import numpy as np
import networkx as nx
from sklearn.preprocessing import StandardScaler, MultiLabelBinarizer
from sklearn.feature_extraction.text import TfidfVectorizer
import torch
from torch_geometric.data import Data

Graph Building Function

In [9]:
def safe_float(val):
    """Parse a coordinate value such as ``"7.2906° N"`` into a signed float.

    Accepted inputs:
      * ``int`` / ``float`` values, returned as ``float``;
      * strings holding a decimal number, optionally followed by any run of
        non-alphanumeric characters (a degree sign, a mis-encoded degree sign,
        spaces) and one trailing hemisphere letter ``N``/``S``/``E``/``W``.

    ``S`` and ``W`` yield a negative value, following the usual signed-degrees
    convention; ``N``, ``E`` or no letter leave the sign as written.

    Args:
        val: The raw value (number, string, or ``None``).

    Returns:
        The parsed float, or ``0.0`` when ``val`` is ``None``, of an unsupported
        type, not finite, or not parseable as described above.
    """
    if val is None or isinstance(val, bool):
        return 0.0
    if isinstance(val, (int, float)):
        number = float(val)
        return number if math.isfinite(number) else 0.0
    if not isinstance(val, str):
        return 0.0

    # number, then arbitrary symbol residue (degree sign / mojibake / blanks),
    # then an optional hemisphere letter, then end of string
    match = re.match(
        r'\s*([-+]?(?:\d+(?:\.\d*)?|\.\d+)(?:[eE][-+]?\d+)?)'
        r'[^0-9A-Za-z]*([NSEWnsew])?\s*$',
        val,
    )
    if match is None:
        return 0.0

    number = float(match.group(1))
    if not math.isfinite(number):
        return 0.0
    hemisphere = match.group(2)
    if hemisphere is not None and hemisphere.upper() in ('S', 'W'):
        number = -abs(number)
    return number

In [10]:
def _safari_key(safari):
    """Return the ``node_index`` key of a safari record: ``SAF:<id>``, else ``SAF:<title>``."""
    # 'title' is only looked up when 'id' is absent, so records with an id need no title.
    ident = safari['id'] if 'id' in safari else safari['title']
    return f"SAF:{ident}"


def _to_number(raw, default=0.0):
    """Convert a JSON scalar (a number, or a string such as ``"1,234"``) to a finite float."""
    if isinstance(raw, str):
        raw = raw.replace(',', '').strip()
    try:
        number = float(raw)
    except (TypeError, ValueError):
        return default
    return number if np.isfinite(number) else default


def build_graph_from_json(json_path, distance_threshold_km=50):
    """Build a safari/location graph with node features from a JSON file.

    Nodes are the safari records (first, in file order) followed by one node per
    distinct ``extracted_features['district']`` value (in first-seen order).

    Safari features are the column-wise concatenation of:
      * standardised ``[latitude, longitude, rating, log1p(total_reviews)]``,
      * a multi-hot encoding of ``tags``,
      * TF-IDF of ``description`` (at most 300 terms, no further reduction).
    A location node's feature vector is the mean of its safaris' vectors.

    Edges:
      * one ``safari -> location`` edge per safari (stored in that direction only),
      * ``safari <-> safari`` edges (both directions) for every pair sharing a tag.

    Args:
        json_path: Path to a UTF-8 JSON file holding a list of safari records.
        distance_threshold_km: Currently unused; kept so existing callers keep working.

    Returns:
        ``(graph, node_index)`` where ``graph`` is a ``torch_geometric.data.Data``
        with ``x`` (float32) and ``edge_index`` (int64, shape ``[2, E]``), and
        ``node_index`` maps ``"SAF:<id-or-title>"`` / ``"LOC:<district>"`` to row
        indices of ``x``.

    Raises:
        ValueError: If the file contains no safari records.
        KeyError: If a record lacks ``extracted_features``/``district``, or has
            neither ``id`` nor ``title``.
    """
    with open(json_path, 'r', encoding='utf-8') as fh:
        safari_nodes = list(json.load(fh))
    if not safari_nodes:
        raise ValueError(f"No safari records found in {json_path!r}")

    n_safaris = len(safari_nodes)
    safari_keys = [_safari_key(s) for s in safari_nodes]

    # district -> positions of its safaris in safari_nodes (dict keeps first-seen order)
    members_by_district = {}
    for pos, s in enumerate(safari_nodes):
        members_by_district.setdefault(s['extracted_features']['district'], []).append(pos)

    # Safaris occupy rows 0..n_safaris-1, locations follow. If two records share a
    # key, the later record's row wins for that key (same as assigning in a loop).
    node_index = {key: pos for pos, key in enumerate(safari_keys)}
    for offset, district in enumerate(members_by_district):
        node_index[f"LOC:{district}"] = n_safaris + offset
    N = n_safaris + len(members_by_district)

    # --- numeric features -------------------------------------------------
    lat = [safe_float(s['extracted_features'].get('latitude', '0')) for s in safari_nodes]
    lon = [safe_float(s['extracted_features'].get('longitude', '0')) for s in safari_nodes]
    rating = [_to_number(s.get('rating')) for s in safari_nodes]
    # clamp at 0 so log1p never sees a value below -1
    review_count = [max(0.0, _to_number(s.get('total_reviews'))) for s in safari_nodes]

    num_feats = np.column_stack([lat, lon, rating, np.log1p(review_count)])
    num_feats = StandardScaler().fit_transform(num_feats)

    # --- tags: multi-hot --------------------------------------------------
    all_tags = [s.get('tags') or [] for s in safari_nodes]
    tag_feats = MultiLabelBinarizer(sparse_output=False).fit_transform(all_tags)

    # --- descriptions: TF-IDF (up to 300 terms) ---------------------------
    descriptions = [s.get('description') or '' for s in safari_nodes]
    try:
        tfidf = TfidfVectorizer(max_features=300).fit_transform(descriptions).toarray()
    except ValueError:
        # TfidfVectorizer raises ValueError when no usable term is found
        # (e.g. every description is empty); fall back to zero text columns.
        tfidf = np.zeros((n_safaris, 0))

    safari_feat = np.hstack([num_feats, tag_feats, tfidf])

    # --- node feature matrix (safaris, then locations) --------------------
    feat_dim = safari_feat.shape[1]
    X = np.zeros((N, feat_dim), dtype=np.float32)
    for pos, key in enumerate(safari_keys):
        X[node_index[key]] = safari_feat[pos]
    # every district has at least one member by construction
    for district, members in members_by_district.items():
        X[node_index[f"LOC:{district}"]] = safari_feat[members].mean(axis=0)

    # --- edges ------------------------------------------------------------
    edges = [
        (node_index[key], node_index[f"LOC:{s['extracted_features']['district']}"])
        for key, s in zip(safari_keys, safari_nodes)
    ]
    tag_sets = [set(tags) for tags in all_tags]
    for i in range(n_safaris):
        for j in range(i + 1, n_safaris):
            if tag_sets[i] & tag_sets[j]:
                a = node_index[safari_keys[i]]
                b = node_index[safari_keys[j]]
                edges.append((a, b))
                edges.append((b, a))

    edge_index = torch.tensor(edges, dtype=torch.long).t().contiguous()
    graph = Data(x=torch.tensor(X, dtype=torch.float), edge_index=edge_index)
    return graph, node_index


GNN Training Code

In [11]:
import torch, torch.nn.functional as F
from torch_geometric.nn import SAGEConv
from torch_geometric.utils import train_test_split_edges
from sklearn.metrics import roc_auc_score, average_precision_score

In [12]:
# Input file, given relative to the notebook's working directory.
json_path = "safaris_30_items_enhanced.json"
# Build the graph object and the key -> node-row lookup from that file.
data, node_index = build_graph_from_json(json_path)

In [13]:
# Step 3: GNN training
import torch
import torch.nn as nn
import torch.nn.functional as F
from torch_geometric.nn import GCNConv
from torch_geometric.utils import train_test_split_edges, negative_sampling

In [14]:
# 1️⃣ Split edges into train/val/test sets
# Seed torch's global RNG before anything random happens in this cell: the edge
# split here, the model's weight initialisation and the per-epoch negative
# sampling further down all draw from it, so unseeded runs are not comparable.
torch.manual_seed(42)
# NOTE: `data` is rebound to the split graph (train/val/test edge attributes),
# so this line is meant to run once per freshly built graph.
data = train_test_split_edges(data)

# 2️⃣ Define a simple GCN model for link prediction
class GCNLinkPredictor(nn.Module):
    """Two-layer GCN encoder that turns node features into node embeddings.

    Args:
        in_channels: Width of the input node-feature vectors.
        hidden_channels: Width of the hidden layer and of the output embeddings.
    """

    def __init__(self, in_channels, hidden_channels):
        super().__init__()
        self.conv1 = GCNConv(in_channels, hidden_channels)
        self.conv2 = GCNConv(hidden_channels, hidden_channels)

    def forward(self, x, edge_index):
        """Apply conv1, a ReLU, then conv2, and return the result."""
        hidden = F.relu(self.conv1(x, edge_index))
        return self.conv2(hidden, edge_index)

# 3️⃣ Decoder: dot product similarity
def decode(z, edge_index):
    """Score each edge as the dot product of its two endpoint embeddings.

    Args:
        z: Node embedding matrix, indexed by node along dim 0.
        edge_index: Index tensor whose row 0 holds source nodes and row 1 targets.

    Returns:
        One raw (un-squashed) score per edge.
    """
    endpoints_product = z[edge_index[0]] * z[edge_index[1]]
    return endpoints_product.sum(dim=1)

# 4️⃣ Loss function
def compute_loss(z, pos_edge_index, neg_edge_index):
    """Binary cross-entropy (with logits) over positive and negative edge scores.

    Positive edges are labelled 1 and negative edges 0; scores come from ``decode``.
    """
    pos_logits = decode(z, pos_edge_index)
    neg_logits = decode(z, neg_edge_index)
    n_pos = pos_logits.size(0)
    n_neg = neg_logits.size(0)

    # targets: n_pos ones followed by n_neg zeros, moved to the embeddings' device
    targets = torch.zeros(n_pos + n_neg)
    targets[:n_pos] = 1.0
    targets = targets.to(z.device)

    logits = torch.cat([pos_logits, neg_logits])
    return F.binary_cross_entropy_with_logits(logits, targets)

# 5️⃣ Accuracy calculation
def compute_accuracy(z, pos_edge_index, neg_edge_index):
    """Fraction of edges classified correctly at a 0.5 probability threshold.

    Edge scores from ``decode`` are passed through a sigmoid; a positive edge counts
    as correct when its probability is above 0.5, a negative edge when it is not.

    Args:
        z: Node embedding matrix.
        pos_edge_index: Edges that should be predicted as present.
        neg_edge_index: Edges that should be predicted as absent.

    Returns:
        Accuracy as a float in ``[0, 1]``, or ``nan`` when both edge sets are empty
        (accuracy is undefined there; previously this raised ZeroDivisionError).
    """
    pos_scores = torch.sigmoid(decode(z, pos_edge_index))
    neg_scores = torch.sigmoid(decode(z, neg_edge_index))

    total = pos_scores.size(0) + neg_scores.size(0)
    if total == 0:
        # a very small graph can end up with an empty evaluation split
        return float('nan')

    preds = torch.cat([pos_scores, neg_scores])
    labels = torch.cat([torch.ones(pos_scores.size(0)),
                        torch.zeros(neg_scores.size(0))]).to(z.device)

    pred_labels = (preds > 0.5).float()
    correct = (pred_labels == labels).sum().item()
    return correct / total

# 6️⃣ Model + optimizer
# Input width is taken from the graph's node features; hidden/embedding width is 64.
model = GCNLinkPredictor(data.num_features, hidden_channels=64)
# Adam over all model parameters, learning rate 0.01.
optimizer = torch.optim.Adam(model.parameters(), lr=0.01)

# 7️⃣ Training loop with validation accuracy
for epoch in range(1, 101):
    model.train()
    optimizer.zero_grad()

    # Embeddings are computed by message passing over the training edges only.
    z = model(data.x, data.train_pos_edge_index)

    # Draw fresh negative edges each epoch, as many as there are positive training edges.
    neg_edge_index = negative_sampling(
        edge_index=data.train_pos_edge_index,
        num_nodes=data.num_nodes,
        num_neg_samples=data.train_pos_edge_index.size(1)
    )

    loss = compute_loss(z, data.train_pos_edge_index, neg_edge_index)
    loss.backward()
    optimizer.step()

    # 🔹 Validation step: only run on the epochs that are actually reported,
    # since the result is not used anywhere else.
    if epoch % 10 == 0:
        model.eval()
        with torch.no_grad():
            z = model(data.x, data.train_pos_edge_index)  # embeddings
            val_acc = compute_accuracy(z, data.val_pos_edge_index, data.val_neg_edge_index)
        # `loss` is the training loss measured before this epoch's optimizer step.
        print(f"Epoch {epoch}, Loss: {loss.item():.4f}, Val Acc: {val_acc:.4f}")

print("✅ Training finished!")



Epoch 10, Loss: 0.4906, Val Acc: 0.0000
Epoch 20, Loss: 0.3017, Val Acc: 0.5000
Epoch 30, Loss: 0.2726, Val Acc: 0.5000
Epoch 40, Loss: 0.3080, Val Acc: 0.5000
Epoch 50, Loss: 0.3085, Val Acc: 0.5000
Epoch 60, Loss: 0.3029, Val Acc: 1.0000
Epoch 70, Loss: 0.3081, Val Acc: 1.0000
Epoch 80, Loss: 0.3090, Val Acc: 0.5000
Epoch 90, Loss: 0.3911, Val Acc: 1.0000
Epoch 100, Loss: 0.3185, Val Acc: 0.0000
✅ Training finished!
