# Generating the link-race dataset

For a given point A & B, find the possible paths, and for each, create a data structure containing the valid path, and options for each node along the way.

## Setup

In [51]:
from rdflib import Graph
from rdflib.extras.external_graph_libs import rdflib_to_networkx_graph
import networkx as nx
from networkx.algorithms.shortest_paths.generic import all_shortest_paths
from rdflib import URIRef
import json
import random

In [2]:
def get_neighbours(G, edge, n=3):
    """return the first n neighbours of a given node
    """
    return [i[:] for i in nx.neighbors(G, edge[0]) if (i[:] != edge[1][:]) and (i[:].startswith('http'))][:n]

In [3]:
path = "./hc_dump_latest-wdt&foaf-20211117-151212.nt"
rg = Graph().parse(path, format='nt')
print("rdflib Graph loaded successfully with {} triples".format(len(rg)))

rdflib Graph loaded successfully with 2199906 triples


In [4]:
G = rdflib_to_networkx_graph(rg)
print("networkx Graph loaded successfully with length {}".format(len(G)))

networkx Graph loaded successfully with length 1187202


In [49]:
nodes_vam = [i for i in nx.nodes(G) if 'collections.vam.ac.uk' in str(i)]
nodes_sci = [i for i in nx.nodes(G) if 'collection.sciencemuseumgroup' in str(i)]

## Path finding

In [70]:
# defining point A & B
A = random.choice(nodes_vam)
B = random.choice(nodes_sci)
print(f'{A} -> {B}')

ent_a = URIRef(A)
ent_b = URIRef(B)

http://collections.vam.ac.uk/item/O916083 -> https://collection.sciencemuseumgroup.org.uk/documents/aa110114868


In [71]:
# calculating the paths
all_sps = all_shortest_paths(G, ent_a, ent_b)
path_graphs = [nx.path_graph(sp) for sp in all_sps]

NetworkXNoPath: Target https://collection.sciencemuseumgroup.org.uk/documents/aa110114868 cannot be reachedfrom given sources

In [None]:
# for each found path
# list of dicts, where key = each step of the path
# within each dict is next-best-step, and 3 (random) connected 'neighbour' nodes
paths_json = {}
for ix, path in enumerate(path_graphs): 
    path_edges = [i for i in path.edges()]

    path_json = []
    for edge in path_edges:
        path_json.append({
            edge[0][:]: {
            'next-best-step': edge[1][:],
            'neighbours': get_neighbours(G, edge),
            }}
        )
    path_json.append({
        path_edges[-1][1][:]: {
            'next-best-step': 'END',
            'neighbours': [],
        }
    })
    paths_json[f'path_{ix}'] = path_json

In [None]:
# paths_json

{'path_0': [{'http://collections.vam.ac.uk/item/O1389838': {'next-best-step': 'https://api.vam.ac.uk/v2/objects/search?id_technique=AAT53271',
    'neighbours': ['https://api.vam.ac.uk/v2/objects/search?id_person=AUTH320983',
     'https://api.vam.ac.uk/v2/objects/search?id_person=AUTH316738',
     'https://api.vam.ac.uk/v2/objects/search?id_technique=AAT131119']}},
  {'https://api.vam.ac.uk/v2/objects/search?id_technique=AAT53271': {'next-best-step': 'http://collections.vam.ac.uk/item/O1519245',
    'neighbours': ['http://collections.vam.ac.uk/item/O690456',
     'http://collections.vam.ac.uk/item/O701261',
     'http://collections.vam.ac.uk/item/O689095']}},
  {'http://collections.vam.ac.uk/item/O1519245': {'next-best-step': 'made in england',
    'neighbours': ['https://api.vam.ac.uk/v2/objects/search?id_organisation=AUTH355221',
     'https://api.vam.ac.uk/v2/objects/search?id_organisation=AUTH355220',
     'https://api.vam.ac.uk/v2/objects/search?id_technique=AAT53271']}},
  {'mad

In [None]:
with open('paths.json', 'w') as paths_file:
    json.dump(paths_json, paths_file, indent=2, sort_keys=True)