In [1]:
# pull and compare entity relationships, starting with director statements.

import json
import pandas
import pathlib
import pydash
import requests
import tqdm

def value_extract(row, column):

    ''' Extract dictionary values. '''
    
    return pydash.get(row[column], 'value')

def sparql_query(query, service):

    ''' Send sparql request, and formulate results into a dataframe. '''

    response = requests.get(service, params={'format': 'json', 'query': query}, timeout=120)
    results = pydash.get(response.json(), 'results.bindings')
    df = pandas.DataFrame.from_dict(results)
    for column in df.columns:
        df[column] = df.apply(value_extract, column=column, axis=1)
    
    return df

# first derive all extant acmi entities, wikidata side.
# in the future with full sync, this filter would not be required.

query = ''' 
    select ?acmi where { 
        ?wd wdt:P7003 ?acmi 
    } '''

extant = sparql_query(query, 'https://query.wikidata.org/sparql').drop_duplicates()

print(len(extant))
extant.head()

24220


Unnamed: 0,acmi
0,creators/80957
1,creators/78375
2,creators/54792
3,creators/62487
4,creators/32701


In [2]:
# acmi-side statements.

acmi_path = pathlib.Path.cwd().parents[0] / 'acmi-api' / 'app' / 'json' / 'works'
acmi_api = [f for f in acmi_path.iterdir() if f.suffix == '.json' and 'index' not in f.name]
acmi_roles = pandas.DataFrame(columns=['work', 'role', 'creator'])
for acmi_file in tqdm.tqdm(acmi_api):
    with open(acmi_file, encoding='utf-8') as acmi_data:
        acmi_data = json.load(acmi_data)
    if 'creators_primary' in acmi_data:
        for y in acmi_data['creators_primary']:
            if y['role'] == 'director':
                acmi_roles.loc[len(acmi_roles)] = [
                    (f"works/{acmi_data['id']}"), ('director'), (f"creators/{y['creator_id']}")]

acmi_roles = acmi_roles.loc[acmi_roles.work.isin(list(extant.acmi.unique()))]
acmi_roles = acmi_roles.loc[acmi_roles.creator.isin(list(extant.acmi.unique()))]

acmi_roles['acmi'] = True
print(len(acmi_roles))
acmi_roles.head()

100%|██████████| 43062/43062 [00:55<00:00, 772.47it/s] 

5766





Unnamed: 0,work,role,creator,acmi
4,works/94858,director,creators/4863,True
14,works/116035,director,creators/82975,True
15,works/93364,director,creators/31457,True
18,works/88178,director,creators/66844,True
21,works/116465,director,creators/83321,True


In [3]:
# wikidata-side statements.

query = ''' 
    select ?work_acmi ?director_acmi where {
        ?work wdt:P57 ?director .
        ?work wdt:P7003 ?work_acmi .
        ?director wdt:P7003 ?director_acmi .
        } '''

wikidata_roles = sparql_query(query, 'https://query.wikidata.org/sparql').drop_duplicates()
wikidata_roles['role'] = 'director'
wikidata_roles = wikidata_roles[['work_acmi', 'role', 'director_acmi']].rename(
    columns={'work_acmi':'work', 'director_acmi':'creator'})
wikidata_roles['wikidata'] = True

wikidata_roles = wikidata_roles.loc[wikidata_roles.work.isin(list(extant.acmi.unique()))]
wikidata_roles = wikidata_roles.loc[wikidata_roles.creator.isin(list(extant.acmi.unique()))]

print(len(wikidata_roles))
wikidata_roles.head()

7130


Unnamed: 0,work,role,creator,wikidata
0,works/88726,director,creators/10138,True
1,works/74464,director,creators/71814,True
2,works/115860,director,creators/25145,True
3,works/116173,director,creators/28528,True
4,works/90542,director,creators/23050,True


In [4]:
# merge and report numbers.

report = pandas.merge(acmi_roles, wikidata_roles, on=['work', 'role', 'creator'], how='outer')
print(len(report), 'total director statements.')
print('~')
print(len(report.loc[report.acmi.isin([True]) & report.wikidata.isin([True])]), 'shared director statements.')
print(len(report.loc[report.acmi.isin([True]) & ~report.wikidata.isin([True])]), 'ACMI-only director statements.')
print(len(report.loc[~report.acmi.isin([True]) & report.wikidata.isin([True])]), 'Wikidata-only director statements.')

7508 total director statements.
~
5389 shared director statements.
377 ACMI-only director statements.
1742 Wikidata-only director statements.
