In [1]:
# # preprocess data sources.

# parse acmi api to pull dataframe.

import json
import numpy
import pandas
import pathlib
import pydash
import requests
import tqdm

# Flatten the local ACMI works JSON files into one row per (work, primary creator)
# and persist the result as parquet.

acmi_path = pathlib.Path.cwd().parents[0] / 'acmi-api' / 'app' / 'json' / 'works'
acmi_files = [filename for filename in acmi_path.iterdir() if filename.suffix == '.json' and 'index' not in filename.name]

acmi_columns = [
    'acmi_work_id', 'acmi_work_title', 'acmi_work_wikidata',
    'acmi_creator_id', 'acmi_creator_name', 'acmi_creator_wikidata']

# Rows are collected in a plain list and turned into a dataframe once at the end;
# appending with `.loc[len(df)]` reallocates the frame on every row.
acmi_rows = []

for acmi_file in tqdm.tqdm(acmi_files):

    with open(acmi_file, encoding='utf-8') as acmi_handle:
        acmi_data = json.load(acmi_handle)

    # Files without a work id contribute no rows.
    if 'id' not in acmi_data:
        continue

    acmi_work_id = acmi_data['id']
    acmi_work_title = acmi_data.get('title', '')

    # If several Wikidata references are listed, the last one wins.
    acmi_work_wikidata = ''
    for external_reference in acmi_data.get('external_references', []):
        if pydash.get(external_reference, 'source.name') == 'Wikidata':
            acmi_work_wikidata = external_reference['source_identifier']

    # Works without primary creators contribute no rows.
    for creator in acmi_data.get('creators_primary', []):
        acmi_creator_id = creator['creator_id']
        acmi_creator_name = creator.get('name', '')
        # A missing or empty Wikidata id is stored as an empty string.
        acmi_creator_wikidata = creator.get('creator_wikidata_id') or ''

        acmi_rows.append([
            acmi_work_id, acmi_work_title, acmi_work_wikidata,
            acmi_creator_id, acmi_creator_name, acmi_creator_wikidata])

acmi_dataframe = pandas.DataFrame(acmi_rows, columns=acmi_columns)

acmi_data_path = pathlib.Path.cwd().parents[0] / 'data' / 'acmi_data.parquet'
acmi_data_path.parents[0].mkdir(exist_ok=True)
acmi_dataframe.to_parquet(acmi_data_path, index=False)

100%|██████████| 42901/42901 [06:02<00:00, 118.30it/s]


In [2]:
# pull wikidata -> acmi links from wikidata.

def value_extract(row, column):

    ''' Return the 'value' entry of the cell found at `column` in `row`. '''

    cell = row[column]
    return pydash.get(cell, 'value')

def sparql_query(query, service):

    ''' Send sparql request, and formulate results into a dataframe.

    query: sparql query text, sent as the `query` url parameter.
    service: url of the sparql endpoint.

    Returns a dataframe with one column per binding variable, each cell
    reduced to its 'value' entry. The dataframe is empty (no columns) when
    the response carries no bindings.

    Raises requests.HTTPError when the endpoint answers with an error status.
    '''

    response = requests.get(service, params={'format': 'json', 'query': query}, timeout=120)
    # Fail on the real HTTP error (rate limit, server timeout, ...) rather than
    # on a confusing JSON decode error further down.
    response.raise_for_status()

    # A missing or null bindings list is treated as "no results".
    results = pydash.get(response.json(), 'results.bindings') or []
    data_frame = pandas.DataFrame.from_dict(results)
    for column in data_frame.columns:
        data_frame[column] = data_frame.apply(value_extract, column=column, axis=1)

    return data_frame

# Fetch every (?wikidata_id, ?wikidata_acmi_id) pair connected by wdt:P7003.
query = ''' select ?wikidata_id ?wikidata_acmi_id where { ?wikidata_id wdt:P7003 ?wikidata_acmi_id } '''
wikidata_links = sparql_query(query, 'https://query.wikidata.org/sparql')

# Keep only the last '/'-separated segment of each wikidata_id value.
wikidata_id_segments = wikidata_links['wikidata_id'].str.split('/')
wikidata_links['wikidata_id'] = wikidata_id_segments.str[-1]

# Save the links next to the other preprocessed data.
wikidata_link_path = pathlib.Path.cwd().parents[0] / 'data' / 'wikidata_link_data.parquet'
wikidata_links.to_parquet(wikidata_link_path, index=False)

print(len(wikidata_links))
wikidata_links.head()

9272


Unnamed: 0,wikidata_id,wikidata_acmi_id
0,Q3822468,works/85037
1,Q703727,creators/72597
2,Q172837,works/115714
3,Q6940276,works/70682
4,Q189694,creators/59635


In [3]:
# pull wikidata creator data from linked works.

# Load the saved wikidata -> acmi links and keep only those pointing at works.
link_path = pathlib.Path.cwd().parents[0] / 'data' / 'wikidata_link_data.parquet'
link_data = pandas.read_parquet(link_path)
link_data = link_data.loc[link_data.wikidata_acmi_id.str.contains('works', na=False)]

# Query in 10 chunks so each `values` clause stays a manageable size; the
# per-chunk frames are collected and concatenated once after the loop.
chunk_frames = []
for chunk in numpy.array_split(link_data.wikidata_id.unique(), 10):
    # With fewer than 10 ids some chunks are empty; an empty chunk would
    # produce the invalid token `wd:` in the query, so skip it.
    if len(chunk) == 0:
        continue
    chunk_array = 'wd:'+' wd:'.join(chunk)
    query = """
        select distinct ?work ?workLabel ?creator ?creatorLabel where {
            values ?work {"""+chunk_array+"""}
            ?work ?prop ?creator . 
            ?creator wdt:P31 wd:Q5 .
            service wikibase:label { bd:serviceParam wikibase:language "[AUTO_LANGUAGE],en". }
            } """
    chunk_frame = sparql_query(query, "https://query.wikidata.org/sparql")
    if not chunk_frame.empty:
        chunk_frames.append(chunk_frame)

wikidata_columns = ['work', 'workLabel', 'creator', 'creatorLabel']
if chunk_frames:
    # ignore_index gives the combined frame one unique running index instead
    # of repeating 0..n for every chunk.
    wikidata_data = pandas.concat(chunk_frames, ignore_index=True)
else:
    # No results at all: keep the expected columns so the steps below still run.
    wikidata_data = pandas.DataFrame(columns=wikidata_columns)

# Keep only the last '/'-separated segment of the work and creator values.
for id_column in ['work', 'creator']:
    wikidata_data[id_column] = wikidata_data[id_column].str.split('/').str[-1]

wikidata_data = wikidata_data[wikidata_columns]
wikidata_data = wikidata_data.rename(columns={
    'work':'wikidata_work_id',
    'workLabel':'wikidata_work_title',
    'creator':'wikidata_creator_id',
    'creatorLabel':'wikidata_creator_name'})

wikidata_creator_path = pathlib.Path.cwd().parents[0] / 'data' /  'wikidata_creator_data.parquet'
wikidata_data.to_parquet(wikidata_creator_path, index=False)

print(len(wikidata_data))
wikidata_data.head()

86177


Unnamed: 0,wikidata_work_id,wikidata_work_title,wikidata_creator_id,wikidata_creator_name
0,Q29600,Aimée & Jaguar,Q982982,Inge Keller
1,Q29600,Aimée & Jaguar,Q1038736,Carl Heinz Choynski
2,Q18405,Nights of Cabiria,Q1451173,François Périer
3,Q29600,Aimée & Jaguar,Q1691197,Jochen Stern
4,Q18405,Nights of Cabiria,Q2832333,Aldo Silvani
