In [1]:
# verify sync between Wikidata/ACMI and identify exceptions.

import pandas
import pathlib
import pydash
import requests

def value_extract(row, column):

    ''' Return the 'value' entry of the object stored at row[column]. '''

    cell = row[column]
    return pydash.get(cell, 'value')

def sparql_query(query, service):

    ''' Send sparql request, and formulate results into a dataframe.

    query -- SPARQL query text, sent as the `query` parameter.
    service -- URL of the SPARQL endpoint.

    Returns a pandas DataFrame with one column per variable listed in the
    response's `head.vars` (when present) and one row per binding. Each cell
    holds the binding's 'value' entry, or None where a variable is unbound.
    An empty result set yields an empty frame that still has those columns.

    Raises requests.HTTPError when the endpoint answers with an error status.
    '''

    response = requests.get(service, params={'format': 'json', 'query': query}, timeout=120)

    # Fail on the HTTP status itself; otherwise an error page from the endpoint
    # only shows up later as a confusing JSON decoding error.
    response.raise_for_status()
    # NOTE(review): some public endpoints throttle or reject generic client
    # User-Agent strings - consider sending a descriptive one; confirm policy.

    payload = response.json()
    bindings = pydash.get(payload, 'results.bindings') or []
    # Taking column names from the header keeps them available when there are
    # no bindings, so callers can still address columns by name.
    variables = pydash.get(payload, 'head.vars')

    data_frame = pandas.DataFrame(bindings, columns=variables)
    for column in data_frame.columns:
        # Each cell is a binding object (or NaN when unbound); keep its value.
        data_frame[column] = data_frame[column].map(lambda cell: pydash.get(cell, 'value'))

    return data_frame

def query_entities(entity_type):

    ''' Compare entities from both datasets.

    Reads the `id` column of `<entity_type>.tsv` from `acmi-api/app/tsv` under
    the parent of the current working directory, fetches every P7003 value on
    Wikidata that matches `entity_type`, and prints how many identifiers are
    on both sides, only on the ACMI side, and only on the Wikidata side.

    entity_type -- TSV file stem, also used as the identifier prefix
                   (called with 'creators' and 'works' in this file).

    Returns nothing; the counts are written to stdout.
    '''

    tsv_path = pathlib.Path.cwd().parents[0] / 'acmi-api' / 'app' / 'tsv'

    # Only the id column is needed. Reading it as text prevents pandas from
    # promoting the column to float when a value is missing, which would render
    # 123 as '123.0' and make every identifier look unsynced.
    acmi_api = pandas.read_csv(
        tsv_path / f'{entity_type}.tsv', delimiter='\t', usecols=['id'], dtype={'id': str})
    acmi_side = {f'{entity_type}/{identifier}' for identifier in acmi_api['id'].dropna().unique()}

    query = '''
        select ?wikidata_id ?acmi_id
        where { 
            ?wikidata_id wdt:P7003 ?acmi_id . 
            filter(regex(str(?acmi_id), "'''+entity_type+'''"))
            } '''

    wikidata = sparql_query(query, 'https://query.wikidata.org/sparql')

    # A query with no matches may come back as a frame without columns, so
    # check for the column before reading it.
    if 'acmi_id' in wikidata.columns:
        wikidata_side = set(wikidata['acmi_id'].unique())
    else:
        wikidata_side = set()

    print('~')
    print(len(acmi_side & wikidata_side), f'{entity_type} are synced.')
    print(len(acmi_side - wikidata_side), f'{entity_type} only ACMI side.')
    print(len(wikidata_side - acmi_side), f'{entity_type} only Wikidata side.')

# Report sync status for each entity type in turn.
for entity_type in ('creators', 'works'):
    query_entities(entity_type)

~
14964 creators are synced.
5622 creators only ACMI side.
2173 creators only Wikidata side.
~
7044 works are synced.
36018 works only ACMI side.
62 works only Wikidata side.
