In [1]:
import json
import numpy as np
import networkx as nx
from sklearn.preprocessing import StandardScaler, MultiLabelBinarizer
from sklearn.feature_extraction.text import TfidfVectorizer
import torch
from torch_geometric.data import Data

Graph Building Function

In [2]:
def safe_float(val):
    """Parse a coordinate-like value into a float, best-effort.

    Accepts plain numbers (``7.25``), numeric strings (``' 12.5 '``) and
    strings carrying a hemisphere suffix (``'6.9271° N'``, ``'33.8688° S'``,
    ``'79.8612° E'``, ``'70.6693° W'``).

    Southern latitudes and western longitudes are returned as negative
    values, following the signed decimal-degree convention; simply dropping
    the suffix would place those points in the wrong hemisphere.

    Returns:
        The parsed float, or ``0.0`` when the value is missing or cannot be
        parsed.
    """
    if not isinstance(val, str):
        # Numeric JSON values (int/float) are used as-is; None and other
        # non-numeric types fall back to the default.
        try:
            return float(val)
        except (TypeError, ValueError):
            return 0.0

    text = val
    negative = False
    for marker, flips_sign in (('° N', False), ('° S', True),
                               ('° E', False), ('° W', True)):
        if marker in text:
            text = text.replace(marker, '')
            negative = negative or flips_sign

    try:
        number = float(text.strip())
    except ValueError:
        return 0.0  # unparseable text -> default

    # -abs() keeps an already-negative "S"/"W" value negative instead of
    # flipping it back to positive.
    return -abs(number) if negative else number

In [3]:
def _safari_key(safari):
    """Return the node-index key for a safari record (``SAF:<id>`` or ``SAF:<title>``)."""
    # 'title' is looked up only when 'id' is absent, so a record that has an
    # id but no title does not raise KeyError.
    ident = safari['id'] if 'id' in safari else safari['title']
    return f"SAF:{ident}"


def _parse_review_count(raw):
    """Convert a review count such as ``'1,234'`` or ``1234`` to int; ``None`` counts as 0."""
    if raw is None:
        return 0
    if isinstance(raw, (int, float)):
        return int(raw)
    return int(str(raw).replace(',', '').strip())


def build_graph_from_json(json_path, distance_threshold_km=50):
    """Build a safari/location graph from a JSON file of safari records.

    Nodes are the safaris (in file order) followed by the distinct districts
    (in first-seen order). Each safari gets a feature vector made of
    standardized numeric features (latitude, longitude, rating,
    log1p(review count)), a multi-hot encoding of its tags, and TF-IDF
    features of its description. Each location node gets the mean feature
    vector of the safaris in that district.

    Edges:
        * one directed edge safari -> location for every safari;
        * a pair of directed edges (both directions) between every two
          safaris that share at least one tag.

    Args:
        json_path: Path to a UTF-8 JSON file containing a list of safari
            records. Every record must have
            ``record['extracted_features']['district']`` and either an
            ``'id'`` or a ``'title'``.
        distance_threshold_km: Currently unused; accepted so existing callers
            keep working.

    Returns:
        ``(graph, node_index)`` where ``graph`` is a ``torch_geometric``
        ``Data`` object with ``x`` and ``edge_index``, and ``node_index`` maps
        ``'SAF:<id-or-title>'`` / ``'LOC:<district>'`` keys to row indices.

    Raises:
        ValueError: If the file contains no records.

    Note:
        Records sharing the same id/title map to a single key; the later
        record's index wins and the earlier row of ``x`` stays all-zero.
    """
    # Context manager so the file handle is closed deterministically.
    with open(json_path, 'r', encoding='utf-8') as fh:
        records = json.load(fh)
    if not records:
        raise ValueError(f"No safari records found in {json_path!r}")

    safari_keys = [_safari_key(s) for s in records]
    districts = [s['extracted_features']['district'] for s in records]
    n_safaris = len(records)

    # Map nodes to indices: safaris first, then locations (first-seen order).
    node_index = {key: i for i, key in enumerate(safari_keys)}
    rows_by_district = {}
    for row, district in enumerate(districts):
        rows_by_district.setdefault(district, []).append(row)
    for offset, district in enumerate(rows_by_district):
        node_index[f"LOC:{district}"] = n_safaris + offset
    n_nodes = n_safaris + len(rows_by_district)

    # Raw per-safari attributes; missing or null fields fall back to neutral defaults.
    lat = [safe_float(s['extracted_features'].get('latitude', '0')) for s in records]
    lon = [safe_float(s['extracted_features'].get('longitude', '0')) for s in records]
    rating = [float(s.get('rating') or 0.0) for s in records]
    review_count = [_parse_review_count(s.get('total_reviews', '0')) for s in records]
    descriptions = [s.get('description') or '' for s in records]
    all_tags = [s.get('tags') or [] for s in records]

    # Numeric features, standardized; review counts are log-scaled first.
    num_feats = np.vstack([lat, lon, rating, np.log1p(review_count)]).T
    num_feats = StandardScaler().fit_transform(num_feats)

    # Tags as a multi-hot matrix.
    tag_feats = MultiLabelBinarizer(sparse_output=False).fit_transform(all_tags)

    # Description TF-IDF, capped at 300 vocabulary terms (no further
    # dimensionality reduction is applied).
    tfidf = TfidfVectorizer(max_features=300).fit_transform(descriptions).toarray()

    safari_feat = np.hstack([num_feats, tag_feats, tfidf])

    # Feature matrix for all nodes (safaris then locations).
    X = np.zeros((n_nodes, safari_feat.shape[1]), dtype=np.float32)
    for row, key in enumerate(safari_keys):
        X[node_index[key]] = safari_feat[row]

    # Location features: mean over the safaris located in that district.
    # Every district comes from at least one safari, so `rows` is never empty.
    for district, rows in rows_by_district.items():
        X[node_index[f"LOC:{district}"]] = safari_feat[rows].mean(axis=0)

    # Edges: safari -> location (one direction only).
    edges = [
        (node_index[key], node_index[f"LOC:{district}"])
        for key, district in zip(safari_keys, districts)
    ]

    # Edges: safari <-> safari when they share at least one tag.
    # Tag sets are built once instead of once per pair.
    tag_sets = [set(tags) for tags in all_tags]
    for i in range(n_safaris):
        for j in range(i + 1, n_safaris):
            if tag_sets[i] & tag_sets[j]:
                a = node_index[safari_keys[i]]
                b = node_index[safari_keys[j]]
                edges.append((a, b))
                edges.append((b, a))

    edge_index = torch.tensor(edges, dtype=torch.long).t().contiguous()
    graph = Data(x=torch.tensor(X, dtype=torch.float), edge_index=edge_index)
    return graph, node_index
