# Generating the link-race dataset

Given two points A and B, find the possible paths between them and, for each path, create a data structure containing the valid path and the options available at each node along the way.

## Setup

In [1]:
import json
import random
from datetime import datetime
from itertools import islice

import networkx as nx
import requests
from networkx.algorithms.shortest_paths.generic import all_shortest_paths
from rdflib import Graph
from rdflib import URIRef
from rdflib.extras.external_graph_libs import rdflib_to_networkx_graph

In [2]:
def neighbours(G, edge, n=3):
    """Return up to ``n`` neighbours of the first node of ``edge``.

    The second node of ``edge`` (the next step along the path) is left out,
    and so is any neighbour whose string form does not start with ``'http'``.
    Neighbours are taken in the order the graph yields them; this is not a
    random sample.

    Iteration is lazy and stops as soon as ``n`` matches have been collected,
    so the neighbours of a highly connected node are not all materialised
    just to keep the first few.

    Args:
        G: graph exposing ``neighbors(node)`` (e.g. a networkx graph).
        edge: pair ``(node, next_node)``.
        n: maximum number of neighbours to return; ``None`` returns all
            matches. Must not be negative.

    Returns:
        list[str]: neighbour identifiers as plain strings.
    """
    source = edge[0]
    # Compare as plain strings so the result does not depend on the node type.
    next_step = str(edge[1])
    candidates = (
        uri
        for uri in map(str, G.neighbors(source))
        if uri != next_step and uri.startswith('http')
    )
    return list(islice(candidates, n))

In [3]:
def similarity(a, b, timeout=30):
    """POST two entity identifiers to the remote ``/distance`` endpoint.

    Args:
        a: first entity, sent as ``entity_a``.
        b: second entity, sent as ``entity_b``.
        timeout: seconds to wait for the service before giving up. Without a
            timeout ``requests`` waits indefinitely, which would stall the
            whole notebook if the service is unreachable.

    Returns:
        The decoded JSON body of the response.

    Raises:
        requests.HTTPError: if the service answers with a 4xx/5xx status.
        requests.Timeout: if no response arrives within ``timeout`` seconds.
    """
    # NOTE(review): the endpoint is named "distance" while this helper is
    # named "similarity" -- confirm how the returned value should be read.
    url = 'https://d0rgkq.deta.dev/distance'
    data = {
        'entity_a': a,
        'entity_b': b,
    }
    response = requests.post(url, json=data, timeout=timeout)
    # Fail loudly instead of decoding an error page as if it were a result.
    response.raise_for_status()
    return response.json()

# similarity(
#     "http://collections.vam.ac.uk/item/O1389838",
#     "https://collection.sciencemuseumgroup.org.uk/objects/co102121"
# )

In [4]:
def get_labels(entities, timeout=30):
    """POST a list of entity URIs to the remote ``/labels`` endpoint.

    Args:
        entities: JSON-serialisable collection of URIs, sent under the
            ``uris`` key.
        timeout: seconds to wait for the service before giving up. Without a
            timeout ``requests`` waits indefinitely.

    Returns:
        The decoded JSON body of the response.

    Raises:
        requests.HTTPError: if the service answers with a 4xx/5xx status.
        requests.Timeout: if no response arrives within ``timeout`` seconds.
    """
    url = "https://d0rgkq.deta.dev/labels"
    # ``json=`` serialises the payload and sets the JSON Content-Type header,
    # matching how similarity() talks to the same service.
    response = requests.post(url, json={"uris": entities}, timeout=timeout)
    # Fail loudly instead of decoding an error page as if it were a result.
    response.raise_for_status()
    return response.json()

# get_labels(['http://collections.vam.ac.uk/item/O1254669'])

In [5]:
# N-Triples dump that the whole notebook is built from.
path = "./hc_dump_latest-filtered-20211117-114506.nt"

# Parse the dump into an in-memory rdflib graph and report its size.
rg = Graph()
rg.parse(path, format='nt')
print(f"rdflib Graph loaded successfully with {len(rg)} triples")

rdflib Graph loaded successfully with 2030542 triples


In [6]:
# Convert the rdflib graph into a networkx graph so that the path-finding
# algorithms below can run on it.
G = rdflib_to_networkx_graph(rg)
print(f"networkx Graph loaded successfully with length {len(G)}")

networkx Graph loaded successfully with length 758632


In [7]:
# Split the graph nodes by source collection, based on the host found in each
# node's string form. A node is put in at most one list; the V&A check wins.
nodes_vam, nodes_sci = [], []
for node in G.nodes():
    node_text = str(node)
    if 'collections.vam.ac.uk' in node_text:
        nodes_vam.append(node)
        continue
    if 'collection.sciencemuseumgroup' in node_text:
        nodes_sci.append(node)

## Path finding

In [8]:
# Start (A) and finish (B) of the race.
# To draw a random pair instead, replace the two assignments below with:
#   A = random.choice(nodes_vam)
#   B = random.choice(nodes_sci)
A = 'http://collections.vam.ac.uk/item/O1389838'
B = 'https://collection.sciencemuseumgroup.org.uk/objects/co102121'
print(f'{A} -> {B}')

# Wrap both endpoints as rdflib URIRef terms for the path search.
ent_a, ent_b = URIRef(A), URIRef(B)

http://collections.vam.ac.uk/item/O1389838 -> https://collection.sciencemuseumgroup.org.uk/objects/co102121


In [9]:
# Enumerate every shortest path between the two endpoints and turn each node
# sequence into its own small path graph.
all_sps = all_shortest_paths(G, ent_a, ent_b)
path_graphs = list(map(nx.path_graph, all_sps))

print(f'Found {len(path_graphs)} paths')

Found 50 paths


In [10]:
# For the first MAX_PATHS paths found, build a list of single-key dicts, one
# per node along the path (the key is that node's URI). Each value holds:
#   'next-best-step': the following node on the path ('END' for the last node)
#   'neighbours':     up to 3 other nodes connected to this one, as returned by
#                     neighbours() -- the first ones found, not a random sample
MAX_PATHS = 20

paths_json = {}
# The loop variable is deliberately not called `path`, so the data-file
# `path` defined when loading the graph is not overwritten.
for ix, path_graph in enumerate(path_graphs[:MAX_PATHS]):
    path_nodes = list(path_graph.nodes())

    path_json = []
    for edge in path_graph.edges():
        path_json.append({
            str(edge[0]): {
                'next-best-step': str(edge[1]),
                'neighbours': neighbours(G, edge),
            }
        })
    # Take the terminal node from the node list rather than from the last
    # edge, so a single-node path (A == B, no edges) still gets its END entry
    # instead of raising IndexError.
    path_json.append({
        str(path_nodes[-1]): {
            'next-best-step': 'END',
            'neighbours': [],
        }
    })
    paths_json[f'path_{ix}'] = path_json

In [11]:
# Number of entries recorded for each generated path.
{name: len(steps) for name, steps in paths_json.items()}

{'path_0': 5,
 'path_1': 5,
 'path_2': 5,
 'path_3': 5,
 'path_4': 5,
 'path_5': 5,
 'path_6': 5,
 'path_7': 5,
 'path_8': 5,
 'path_9': 5,
 'path_10': 5,
 'path_11': 5,
 'path_12': 5,
 'path_13': 5,
 'path_14': 5,
 'path_15': 5,
 'path_16': 5,
 'path_17': 5,
 'path_18': 5,
 'path_19': 5}

In [12]:
# Display the full structure for a visual check (the output is long).
paths_json

{'path_0': [{'http://collections.vam.ac.uk/item/O1389838': {'next-best-step': 'http://www.wikidata.org/entity/Q21',
    'neighbours': ['http://www.wikidata.org/entity/Q262755',
     'http://www.wikidata.org/entity/Q142',
     'http://www.wikidata.org/entity/Q130822']}},
  {'http://www.wikidata.org/entity/Q21': {'next-best-step': 'https://api.vam.ac.uk/v2/objects/search?id_person=N10459',
    'neighbours': ['https://collection.sciencemuseumgroup.org.uk/objects/co69011',
     'https://collection.sciencemuseumgroup.org.uk/objects/co8084897',
     'https://collection.sciencemuseumgroup.org.uk/objects/co433113']}},
  {'https://api.vam.ac.uk/v2/objects/search?id_person=N10459': {'next-best-step': 'http://www.wikidata.org/entity/Q124010',
    'neighbours': ['http://www.wikidata.org/entity/Q23220',
     'http://www.wikidata.org/entity/Q21',
     'http://www.wikidata.org/entity/Q84']}},
  {'http://www.wikidata.org/entity/Q124010': {'next-best-step': 'https://collection.sciencemuseumgroup.org.uk

In [13]:
# ts = datetime.now().strftime("%Y%m%d-%H%M%S")

# with open(f'paths-{ts}.json', 'w') as paths_file:
#     json.dump(paths_json, paths_file, indent=2, sort_keys=True)