# Generating the link-race dataset

Given a start node A and an end node B, find the shortest paths between them. For each path, create a data structure containing the steps of the valid path and, for each node along the way, a few neighbouring nodes that serve as alternative options.

## Setup

In [1]:
from rdflib import Graph
from rdflib.extras.external_graph_libs import rdflib_to_networkx_graph
import networkx as nx
from networkx.algorithms.shortest_paths.generic import all_shortest_paths
from rdflib import URIRef
import json
import random
from datetime import datetime
import requests

In [2]:
def neighbours(G, edge, n=3):
    """Return up to ``n`` alternative neighbours of the node an edge starts from.

    Args:
        G: graph handed straight to ``nx.neighbors``.
        edge: pair ``(source, target)``; the neighbours of ``source`` are
            listed and ``target`` itself is left out.
        n: maximum number of neighbours to return (default 3).

    Returns:
        The first ``n`` neighbours of ``edge[0]``, in the order
        ``nx.neighbors`` yields them, keeping only those that start with
        ``'http'`` and differ from ``edge[1]``. Each entry is the full slice
        (``[:]``) of the neighbour. If a ``TypeError`` is raised anywhere
        along the way, an empty list is returned instead.
    """
    try:
        matches = []
        for candidate in nx.neighbors(G, edge[0]):
            uri = candidate[:]
            # Skip the real next step and anything that is not an http(s) URI.
            if uri != edge[1][:] and uri.startswith('http'):
                matches.append(uri)
        return matches[:n]
    except TypeError:
        return []

In [3]:
def similarity(a, b, timeout=30):
    """Ask the remote ``/distance`` endpoint to compare two entities.

    Args:
        a: first entity, sent as ``entity_a`` (e.g. a collection item URI).
        b: second entity, sent as ``entity_b``.
        timeout: seconds to wait for the service before giving up.

    Returns:
        The decoded JSON body of the response.

    Raises:
        requests.exceptions.Timeout: if the service does not answer in time.
        requests.exceptions.HTTPError: if the service answers with a 4xx/5xx
            status.
    """
    url = 'https://d0rgkq.deta.dev/distance'
    data = {
        'entity_a': a,
        'entity_b': b,
    }
    # requests has no default timeout: without one, a stalled service would
    # hang the notebook indefinitely.
    response = requests.post(url, json=data, timeout=timeout)
    # Fail loudly on an error status instead of returning the error payload
    # as if it were a distance result.
    response.raise_for_status()
    return response.json()

# similarity(
#     "http://collections.vam.ac.uk/item/O1389838",
#     "https://collection.sciencemuseumgroup.org.uk/objects/co102121"
# )

In [4]:
def get_labels(entities, timeout=30):
    """Fetch labels for a list of entity URIs from the remote ``/labels`` endpoint.

    Args:
        entities: list of URIs, sent to the service under the ``uris`` key.
        timeout: seconds to wait for the service before giving up.

    Returns:
        The decoded JSON body of the response.

    Raises:
        requests.exceptions.Timeout: if the service does not answer in time.
        requests.exceptions.HTTPError: if the service answers with a 4xx/5xx
            status.
    """
    url = "https://d0rgkq.deta.dev/labels"
    # `json=` serialises the payload and sets the JSON Content-Type header,
    # matching how similarity() talks to the same service.
    # requests has no default timeout, so one is always passed explicitly.
    response = requests.post(url, json={"uris": entities}, timeout=timeout)
    # Surface HTTP errors rather than returning an error payload as labels.
    response.raise_for_status()
    return response.json()

# get_labels(['http://collections.vam.ac.uk/item/O1254669'])

In [5]:
# N-Triples dump to build the graph from (relative path).
path = "./hc_dump_latest-filtered-20211117-114506.nt"

# Parse the whole dump into an in-memory rdflib graph, then report its size.
rg = Graph()
rg.parse(path, format='nt')
print("rdflib Graph loaded successfully with {} triples".format(len(rg)))

rdflib Graph loaded successfully with 2030542 triples


In [6]:
# Convert the rdflib graph into a networkx graph for the path search below.
G = rdflib_to_networkx_graph(rg)
graph_size = len(G)
print("networkx Graph loaded successfully with length {}".format(graph_size))

networkx Graph loaded successfully with length 758632


In [7]:
# Split the graph's nodes by collection, based on the host name in the URI.
# A node is checked against the V&A pattern first; only nodes that do not
# match it are tested against the Science Museum Group pattern.
nodes_vam, nodes_sci = [], []
for node in nx.nodes(G):
    uri = str(node)
    if 'collections.vam.ac.uk' in uri:
        nodes_vam.append(node)
        continue
    if 'collection.sciencemuseumgroup' in uri:
        nodes_sci.append(node)

## Path finding

In [8]:
# Start (A) and end (B) points of the path search.
# To draw them at random from the two collections instead, use:
# A = random.choice(nodes_vam)
# B = random.choice(nodes_sci)
A = 'https://collection.sciencemuseumgroup.org.uk/objects/co91945'
B = 'https://collection.sciencemuseumgroup.org.uk/objects/co29819'

# URIRef versions of both endpoints, used for the graph lookups.
ent_a, ent_b = URIRef(A), URIRef(B)

print(f'{A} -> {B}')

https://collection.sciencemuseumgroup.org.uk/objects/co91945 -> https://collection.sciencemuseumgroup.org.uk/objects/co29819


In [9]:
# Find every shortest path from A to B and wrap each one in its own
# networkx path graph (one node per step, one edge per hop).
all_sps = all_shortest_paths(G, ent_a, ent_b)

path_graphs = []
for sp in all_sps:
    path_graphs.append(nx.path_graph(sp))

print(f'Found {len(path_graphs)} paths')

Found 16 paths


In [10]:
# For (at most) the first MAX_PATHS shortest paths, build a list of steps.
# Each step is a single-key dict {node: {...}} holding:
#   - 'nextBestStep': the following node on the path ('END' for the last node)
#   - 'neighbours': up to 3 other nodes connected to it, as returned by
#     neighbours() -- the first ones found, not a random sample
MAX_PATHS = 20

paths_json = []
for path_graph in path_graphs[:MAX_PATHS]:
    # The [:] slices presumably turn the graph's node objects into plain
    # strings so they serialise cleanly as JSON keys/values -- TODO confirm.
    path_json = [
        {
            edge[0][:]: {
                'nextBestStep': edge[1][:],
                'neighbours': neighbours(G, edge),
            }
        }
        for edge in path_graph.edges()
    ]

    # The final node has no onward step. It is read from the node list
    # (which nx.path_graph keeps in path order) rather than from the last
    # edge, so a single-node path (A == B, no edges) no longer raises
    # IndexError.
    last_node = list(path_graph.nodes())[-1]
    path_json.append({
        last_node[:]: {
            'nextBestStep': 'END',
            'neighbours': [],
        }
    })
    paths_json.append(path_json)

In [11]:
# Preview only the first path; the full structure (up to 20 paths) is written
# to disk at the end of the notebook and would flood the cell output here.
paths_json[:1]

[[{'https://collection.sciencemuseumgroup.org.uk/objects/co91945': {'nextBestStep': 'http://www.wikidata.org/entity/Q84',
    'neighbours': []}},
  {'http://www.wikidata.org/entity/Q84': {'nextBestStep': 'https://collection.sciencemuseumgroup.org.uk/objects/co430567',
    'neighbours': ['http://collections.vam.ac.uk/item/O1148335',
     'https://collection.sciencemuseumgroup.org.uk/objects/co135595',
     'https://collection.sciencemuseumgroup.org.uk/objects/co421653']}},
  {'https://collection.sciencemuseumgroup.org.uk/objects/co430567': {'nextBestStep': 'https://collection.sciencemuseumgroup.org.uk/people/ap12850',
    'neighbours': ['http://www.wikidata.org/entity/Q84',
     'https://collection.sciencemuseumgroup.org.uk/people/cp31792']}},
  {'https://collection.sciencemuseumgroup.org.uk/people/ap12850': {'nextBestStep': 'https://collection.sciencemuseumgroup.org.uk/objects/co29819',
    'neighbours': ['https://collection.sciencemuseumgroup.org.uk/objects/co30607',
     'https://col

In [12]:
# Collect every node URI mentioned anywhere in the paths (the path nodes,
# their neighbour options and their next steps) and give each one an empty
# metadata record to be filled in later.
#
# The local names deliberately avoid `neighbours`: rebinding that name here
# would replace the neighbours() helper with a list and break any re-run of
# the path-building cell ("'list' object is not callable").
path_node_uris = set()
for path_steps in paths_json:
    for step in path_steps:
        node_label = next(iter(step))  # each step is a single-key dict
        step_info = step[node_label]
        path_node_uris.add(node_label)
        path_node_uris.update(step_info['neighbours'])
        path_node_uris.add(step_info['nextBestStep'])

# 'END' is the terminal marker of a path, not a real node.
path_node_uris.discard('END')

all_paths_nodes = {uri: {'caption': '', 'img': ''} for uri in path_node_uris}

In [13]:
# Timestamp shared by both output files so they can be matched up later.
ts = datetime.now().strftime("%Y%m%d-%H%M%S")

# Output file name -> object to serialise; written in this order.
outputs = {
    f'{ts}-paths.json': paths_json,
    f'{ts}-pathnodes.json': all_paths_nodes,
}

for filename, payload in outputs.items():
    with open(filename, 'w') as out_file:
        json.dump(payload, out_file, indent=2, sort_keys=True)