# Generating the link-race dataset

For a given point A & B, find the possible paths, and for each, create a data structure containing the valid path, and options for each node along the way.

## Setup

In [1]:
from rdflib import Graph
from rdflib.extras.external_graph_libs import rdflib_to_networkx_graph
import networkx as nx
from networkx.algorithms.shortest_paths.generic import all_shortest_paths
from rdflib import URIRef
import json
import random
from datetime import datetime
import requests

In [2]:
def neighbours(G, edge, n=3):
    """return the first n neighbours of a given node
    """
    try:
        return [i[:] for i in nx.neighbors(G, edge[0]) if (i[:] != edge[1][:]) and (i[:].startswith('http'))][:n]
    except TypeError:
        return []

In [3]:
def similarity(a, b):
    url = 'https://d0rgkq.deta.dev/distance'
    data = {
        'entity_a': a,
        'entity_b': b,
    }
    return requests.post(url, json=data).json()

# similarity(
#     "http://collections.vam.ac.uk/item/O1389838",
#     "https://collection.sciencemuseumgroup.org.uk/objects/co102121"
# )

In [4]:
def get_labels(entities):
    url = "https://d0rgkq.deta.dev/labels"
    payload = json.dumps({
      "uris": entities
    })
    headers = {
      'Content-Type': 'application/json'
    }

    return requests.post(url, headers=headers, data=payload).json()

# get_labels(['http://collections.vam.ac.uk/item/O1254669'])

In [5]:
path = "./hc_dump_latest-filtered-20211117-114506.nt"
rg = Graph().parse(path, format='nt')
print("rdflib Graph loaded successfully with {} triples".format(len(rg)))

rdflib Graph loaded successfully with 2030542 triples


In [6]:
G = rdflib_to_networkx_graph(rg)
print("networkx Graph loaded successfully with length {}".format(len(G)))

networkx Graph loaded successfully with length 758632


In [7]:
nodes_vam = []
nodes_sci = []
for i in nx.nodes(G):
    if 'collections.vam.ac.uk' in str(i):
        nodes_vam.append(i)
    elif 'collection.sciencemuseumgroup' in str(i):
        nodes_sci.append(i)

## Path finding

In [8]:
# defining point A & B
# A = random.choice(nodes_vam)
# B = random.choice(nodes_sci)

A = 'https://collection.sciencemuseumgroup.org.uk/objects/co91945'
B = 'https://collection.sciencemuseumgroup.org.uk/objects/co29819'

print(f'{A} -> {B}')

ent_a = URIRef(A)
ent_b = URIRef(B)

https://collection.sciencemuseumgroup.org.uk/objects/co91945 -> https://collection.sciencemuseumgroup.org.uk/objects/co29819


In [9]:
# calculating the paths
all_sps = all_shortest_paths(G, ent_a, ent_b)
path_graphs = [nx.path_graph(sp) for sp in all_sps]

print(f'Found {len(path_graphs)} paths')

Found 16 paths


In [10]:
# for top 20 found path
# list of dicts, where key = each step of the path
# within each dict is nextBestStep, and 3 (random) connected 'neighbour' nodes
paths_json = []
for ix, path in enumerate(path_graphs[:20]): 
    path_edges = [i for i in path.edges()]

    path_json = []
    for edge in path_edges:
        neighbours_list = neighbours(G, edge)
        if not neighbours_list: neighbours_list = [i[:] for i in random.sample(list(G.nodes()), 3)]
        path_json.append({
            'url': edge[0][:],
            'nextBestStep': edge[1][:],
            'neighbours': neighbours_list,
            }
        )
    path_json.append({
        path_edges[-1][1][:]: {
            'nextBestStep': 'END',
            'neighbours': [],
        }
    })
    paths_json.append(path_json)

In [11]:
paths_json

[[{'url': 'https://collection.sciencemuseumgroup.org.uk/objects/co91945',
   'nextBestStep': 'http://www.wikidata.org/entity/Q84',
   'neighbours': ['charlotte bawden',
    'https://collection.sciencemuseumgroup.org.uk/objects/co131616',
    'm marley']},
  {'url': 'http://www.wikidata.org/entity/Q84',
   'nextBestStep': 'https://collection.sciencemuseumgroup.org.uk/documents/aa110017832',
   'neighbours': ['https://collection.sciencemuseumgroup.org.uk/objects/co422877',
    'http://collections.vam.ac.uk/item/O77348',
    'https://collection.sciencemuseumgroup.org.uk/objects/co8644745']},
  {'url': 'https://collection.sciencemuseumgroup.org.uk/documents/aa110017832',
   'nextBestStep': 'https://collection.sciencemuseumgroup.org.uk/people/ap12850',
   'neighbours': ['http://www.wikidata.org/entity/Q84',
    'https://collection.sciencemuseumgroup.org.uk/people/cp26280',
    'https://collection.sciencemuseumgroup.org.uk/people/ap13789']},
  {'url': 'https://collection.sciencemuseumgroup.o

In [12]:
ts = datetime.now().strftime("%Y%m%d-%H%M%S")

with open(f'{ts}-paths.json', 'w') as paths_file:
    json.dump(paths_json, paths_file, indent=2, sort_keys=True)