# Generating the link-race dataset

1. For a given pair of points A and B, find a path between them and build a data structure containing each step of that path together with the alternative options (neighbouring nodes) available at that step.
1. Generate n paths for random A & B objects.

## Setup (run once per session)

This section loads the triples data into memory. It takes about 2 minutes to run, but only needs to be run once per session.

In [1]:
from rdflib import Graph
from rdflib.extras.external_graph_libs import rdflib_to_networkx_graph
import networkx as nx
from rdflib import URIRef
import json
import random
from datetime import datetime
import requests
import api

In [2]:
def neighbours(G, edge, n=3):
    """Return up to ``n`` alternative neighbours for the first node of ``edge``.

    ``edge`` is a ``(current, next)`` pair. Candidates are the nodes adjacent
    to ``current`` in ``G``; a candidate is dropped when it equals ``next`` or
    when its value does not start with ``'http'``. Each kept value is copied
    with ``[:]`` and the first ``n`` of them are returned.

    An empty list is returned if a ``TypeError`` is raised along the way.
    """
    try:
        kept = []
        for candidate in nx.neighbors(G, edge[0]):
            value = candidate[:]
            if value == edge[1][:]:
                # This is the step the path takes next, not an alternative.
                continue
            if value.startswith('http'):
                kept.append(value)
        return kept[:n]
    except TypeError:
        return []

In [3]:
def get_labels(entities, timeout=30):
    """Request labels for a list of entity URIs from the remote labels service.

    Args:
        entities: list of URI strings; sent as the ``"uris"`` field of the
            JSON request body.
        timeout: seconds to wait for the service before giving up. Without a
            timeout a stalled connection would block the notebook forever.

    Returns:
        The decoded JSON body of the response (its schema is defined by the
        service, not by this function).

    Raises:
        requests.exceptions.Timeout: the service did not answer in time.
        requests.exceptions.HTTPError: the service answered with a 4xx/5xx
            status, so the body is not a valid labels payload.
    """
    url = "https://d0rgkq.deta.dev/labels"

    # `json=` serialises the payload and sets the JSON Content-Type header.
    response = requests.post(url, json={"uris": entities}, timeout=timeout)
    # Fail loudly on an error status instead of returning an error body as labels.
    response.raise_for_status()
    return response.json()

# get_labels(['http://collections.vam.ac.uk/item/O1254669'])

In [4]:
def path_json(G, A, B, n_neighbours=3):
    """Describe one shortest path from ``A`` to ``B`` as a list of step dicts.

    Args:
        G: networkx graph to search.
        A: URI of the start node.
        B: URI of the end node.
        n_neighbours: how many alternative nodes to offer at each step.

    Returns:
        A list with one dict per node on the path, in order. Each dict has:
          * ``'url'``: the node at this step,
          * ``'nextBestStep'``: the following node on the path, or ``'END'``
            for the final node,
          * ``'neighbours'``: alternatives from ``neighbours(G, edge, n)``;
            if that is empty, ``n_neighbours`` nodes sampled at random from
            the whole graph. Always ``[]`` for the final node.

    Raises:
        ValueError: the path has a single node (``A`` and ``B`` are the same),
            so there is no step to describe.
        Whatever ``nx.all_shortest_paths`` raises when no path can be found.
    """
    ent_a = URIRef(A)
    ent_b = URIRef(B)

    # Only the first shortest path is used, so take just that one from the
    # generator rather than enumerating every shortest path.
    shortest = next(nx.all_shortest_paths(G, ent_a, ent_b))
    if len(shortest) < 2:
        raise ValueError(f'path from {A} to {B} has no steps (start and end are the same node)')

    all_nodes = None  # built lazily: only needed for the random fallback
    steps = []
    # Consecutive node pairs are the edges of the path, in travel order.
    for edge in zip(shortest, shortest[1:]):
        neighbours_list = neighbours(G, edge, n_neighbours)
        if not neighbours_list:
            # No usable real neighbours: offer random nodes as distractors.
            if all_nodes is None:
                all_nodes = list(G.nodes())
            neighbours_list = [node[:] for node in random.sample(all_nodes, n_neighbours)]
        steps.append({
            'url': edge[0][:],
            'nextBestStep': edge[1][:],
            'neighbours': neighbours_list,
        })

    # Final node: nothing left to choose.
    steps.append({
        'url': shortest[-1][:],
        'nextBestStep': 'END',
        'neighbours': [],
    })

    return steps

In [5]:
def unique_nodes(paths_json):
    """Collect every distinct value referenced by any step of any path.

    For each step, its ``'url'``, every entry of its ``'neighbours'`` list and
    its ``'nextBestStep'`` value are gathered. The result is a list with
    duplicates removed; its order is not defined.
    """
    seen = set()
    for steps in paths_json:
        for step in steps:
            url = step['url']
            alternatives = step['neighbours']
            onward = step['nextBestStep']
            seen.update([url] + alternatives + [onward])
    return list(seen)

In [6]:
def write_json(obj, name):
    """Serialise ``obj`` as JSON with 2-space indentation into ``<name>.json``.

    The file is opened in write mode, so an existing file of that name is
    overwritten.
    """
    target = f'{name}.json'
    with open(target, 'w') as handle:
        json.dump(obj, handle, indent=2)

In [7]:
# Parse the filtered N-Triples dump into an in-memory rdflib graph.
# This is the slow part of the setup.
path = "./hc_dump_latest-filtered-20211117-114506.nt"
rg = Graph().parse(path, format='nt')
triple_count = len(rg)
print("rdflib Graph loaded successfully with {} triples".format(triple_count))

rdflib Graph loaded successfully with 2030542 triples


In [8]:
# Convert the rdflib graph into a networkx graph so the path-finding
# functions can be run on it.
G = rdflib_to_networkx_graph(rg)
graph_length = len(G)
print("networkx Graph loaded successfully with length {}".format(graph_length))

networkx Graph loaded successfully with length 758632


In [9]:
# Split the graph's nodes by which collection's domain appears in their text:
# `nodes_vam` for collections.vam.ac.uk, `nodes_sci` for
# collection.sciencemuseumgroup. Nodes matching neither are ignored.
nodes_vam, nodes_sci = [], []
for graph_node in nx.nodes(G):
    uri_text = str(graph_node)
    if 'collections.vam.ac.uk' in uri_text:
        nodes_vam.append(graph_node)
        continue
    if 'collection.sciencemuseumgroup' in uri_text:
        nodes_sci.append(graph_node)

## Path finding

In [10]:
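# # Alternative to a fixed list of A/B pairs: n random paths from a
# # `nodes_vam` node to a `nodes_sci` node. Only successful paths are kept.
# n = 10

# paths_json = []
# for i in range(n):
#     A = random.choice(nodes_vam)
#     B = random.choice(nodes_sci)
#     try:
#         p = path_json(G, A, B)
#     except Exception:
#         print(f'Path {i} - FAIL - {A} -> {B}')
#     else:
#         print(f'Path {i} - SUCCESS - {A} -> {B}')
#         paths_json.append(p)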

In [11]:
# Fixed (start, end) pairs: each start is a collections.vam.ac.uk item and each
# end is a collection.sciencemuseumgroup.org.uk object or document.
ABs = [
  ('http://collections.vam.ac.uk/item/O755026', 'https://collection.sciencemuseumgroup.org.uk/objects/co8203386'),
  ('http://collections.vam.ac.uk/item/O675659', 'https://collection.sciencemuseumgroup.org.uk/objects/co85549'),
  ('http://collections.vam.ac.uk/item/O229512', 'https://collection.sciencemuseumgroup.org.uk/objects/co113666'),
  ('http://collections.vam.ac.uk/item/O1267685', 'https://collection.sciencemuseumgroup.org.uk/objects/co412022'),
  ('http://collections.vam.ac.uk/item/O354503', 'https://collection.sciencemuseumgroup.org.uk/objects/co107111'),
  ('http://collections.vam.ac.uk/item/O1114035', 'https://collection.sciencemuseumgroup.org.uk/objects/co8414500'),
  ('http://collections.vam.ac.uk/item/O1273343', 'https://collection.sciencemuseumgroup.org.uk/objects/co159111'),
  ('http://collections.vam.ac.uk/item/O1346307', 'https://collection.sciencemuseumgroup.org.uk/documents/aa110115036'),
  ('http://collections.vam.ac.uk/item/O224862', 'https://collection.sciencemuseumgroup.org.uk/objects/co181351'),
  ('http://collections.vam.ac.uk/item/O590605', 'https://collection.sciencemuseumgroup.org.uk/objects/co26352'),
]

# Build one path per pair. A pair whose path cannot be computed is reported and
# skipped, so `paths_json` only ever holds paths that were actually found.
# Appending outside the try would re-append the previous pair's path, or raise
# NameError if the very first pair failed.
paths_json = []
for i, (A, B) in enumerate(ABs):
    try:
        p = path_json(G, A, B)
    except Exception as err:  # not a bare except: Ctrl-C must still interrupt
        print(f'Path {i} - FAIL - {A} -> {B} ({type(err).__name__}: {err})')
    else:
        print(f'Path {i} - SUCCESS - {A} -> {B}')
        paths_json.append(p)


Path 0 - SUCCESS - http://collections.vam.ac.uk/item/O755026 -> https://collection.sciencemuseumgroup.org.uk/objects/co8203386
Path 1 - SUCCESS - http://collections.vam.ac.uk/item/O675659 -> https://collection.sciencemuseumgroup.org.uk/objects/co85549
Path 2 - SUCCESS - http://collections.vam.ac.uk/item/O229512 -> https://collection.sciencemuseumgroup.org.uk/objects/co113666
Path 3 - SUCCESS - http://collections.vam.ac.uk/item/O1267685 -> https://collection.sciencemuseumgroup.org.uk/objects/co412022
Path 4 - SUCCESS - http://collections.vam.ac.uk/item/O354503 -> https://collection.sciencemuseumgroup.org.uk/objects/co107111
Path 5 - SUCCESS - http://collections.vam.ac.uk/item/O1114035 -> https://collection.sciencemuseumgroup.org.uk/objects/co8414500
Path 6 - SUCCESS - http://collections.vam.ac.uk/item/O1273343 -> https://collection.sciencemuseumgroup.org.uk/objects/co159111
Path 7 - SUCCESS - http://collections.vam.ac.uk/item/O1346307 -> https://collection.sciencemuseumgroup.org.uk/docu

In [12]:
# Timestamp shared by the output files of this run.
ts = datetime.now().strftime("%Y%m%d-%H%M%S")

# Annotate every step with how far along its path it is: 0 at the first step,
# 100 at the last, evenly spaced in between.
for steps in paths_json:
    # A one-step path has no distance to cover; guard against dividing by zero.
    incr = 100 / (len(steps) - 1) if len(steps) > 1 else 0.0
    for ix, step in enumerate(steps):
        step['progress'] = ix * incr

# Writes '<ts>-paths.json' with 2-space indentation via the shared helper.
write_json(paths_json, f'{ts}-paths')

In [14]:
# Image used for any node whose metadata comes back without one.
PLACEHOLDER_IMG = 'https://i.pinimg.com/originals/17/fc/a3/17fca30c76af6019df655044072e21a5.jpg'

# Map every node referenced by the paths to a metadata dict with at least
# 'title', 'img' and 'description'.
# NOTE(review): the `api.*` helpers are assumed to return such a dict - confirm
# against the `api` module.
nodes_json = {}
try:
    for node in unique_nodes(paths_json):
        # The last URL segment is what the `api` helpers take as the identifier.
        node_id = node.rsplit('/', 1)[-1]

        # Order matters: the first matching pattern decides which helper is used.
        if 'sciencemuseumgroup.org.uk/objects' in node:
            node_meta = api.scimu('objects', node_id)
        elif 'sciencemuseumgroup.org.uk/documents' in node:
            node_meta = api.scimu('documents', node_id)
        elif 'sciencemuseumgroup.org.uk/people' in node:
            node_meta = api.scimu('people', node_id)
        elif 'collections.vam.ac.uk' in node:
            node_meta = api.vanda(node_id)
        elif 'https://api.vam.ac.uk/v2/objects/search?' in node:
            node_meta = api.vanda_obj(node)
        elif 'wikidata.org' in node:
            node_meta = api.wikidata(node_id)
        else:
            # Unknown source: start empty and let the fallbacks below fill in.
            node_meta = {
                'title': '',
                'img': '',
                'description': ''
            }

        # Fall back to the node's own URL as its title. The original used `==`
        # here, so the fallback was never applied.
        if not node_meta.get('title'):
            node_meta['title'] = node
        if not node_meta.get('img'):
            node_meta['img'] = PLACEHOLDER_IMG
        nodes_json[node] = node_meta
finally:
    # Write once, not on every iteration; `finally` still saves what was
    # gathered if a lookup fails part-way through.
    with open(f'{ts}-nodes.json', 'w') as nodes_file:
        json.dump(nodes_json, nodes_file, indent=2, sort_keys=True)