In [12]:

# Detect new wikidata <-> acmi matches: start from an already linked creator, then match on exact work title and exact colleague (creator) name.

import hashlib
import json
import numpy
import pandas
import pathlib
import pydash
import requests
import time
import tqdm
import unidecode

def value_extract(row, column):

    ''' Return the 'value' entry of the dictionary held at row[column], looked up with pydash.get. '''

    cell = row[column]

    return pydash.get(cell, 'value')

def sparql_query(query, service):

    ''' Send sparql request, and formulate results into a dataframe.

    query: sparql query text, sent as the "query" url parameter.
    service: url of the sparql endpoint.

    Returns a dataframe built from the "results.bindings" entry of the json
    response, with every cell reduced to its 'value' entry by value_extract.
    Raises requests.HTTPError when the endpoint answers with an error status.
    '''

    response = requests.get(service, params={'format': 'json', 'query': query}, timeout=120)
    # surface the real http status (e.g. throttling) instead of an opaque json decoding error.
    response.raise_for_status()
    results = pydash.get(response.json(), 'results.bindings')
    data_frame = pandas.DataFrame.from_dict(results)
    for column in data_frame.columns:
        data_frame[column] = data_frame.apply(value_extract, column=column, axis=1)

    return data_frame

def wikidata_connections(wikidata_subject_id):

    ''' Query wikidata for works pointing at the subject, and the other entities ("colleagues") the same work points at via the same property. '''

    query = ''' 
        select ?subject ?subjectLabel ?work ?workLabel ?colleague ?colleagueLabel where { 
            values ?subject {wd:'''+wikidata_subject_id+'''}
            ?work ?prop ?subject .
            ?work ?prop ?colleague .
            service wikibase:label { bd:serviceParam wikibase:language "[AUTO_LANGUAGE],en". }
            } '''

    connections = sparql_query(query, 'https://query.wikidata.org/sparql')
    if not len(connections):
        # nothing came back, hand over an empty frame.
        return pandas.DataFrame()

    # the subject is not its own colleague.
    not_self = connections['subject'] != connections['colleague']
    connections = connections.loc[not_self, ['work', 'workLabel', 'colleague', 'colleagueLabel']]
    connections = connections.rename(columns={'work': 'wikidata_work', 'colleague': 'wikidata_colleague'})

    # keep only the last '/'-separated segment of each identifier.
    for id_column in ['wikidata_work', 'wikidata_colleague']:
        connections[id_column] = connections[id_column].str.split('/').str[-1]

    return connections

def acmi_connections(acmi_subject_id):

    ''' Build the work/colleague table for one acmi creator from the local acmi-api tsv exports.

    acmi_subject_id: creator identifier, either bare ('73769') or prefixed
        ('creators/73769'); only the segment after the last '/' is used.

    Returns a dataframe with columns acmi_work, workLabel, acmi_colleague and
    colleagueLabel: one row per other creator listed (primary or other) on a
    work which also lists the subject.
    '''

    tsv_path = pathlib.Path.cwd().parents[0] / 'acmi-api' / 'app' / 'tsv'

    # creator columns are read as text, so a column holding only single ids is not parsed as numbers.
    acmi_works = pandas.read_csv(
        tsv_path / 'works.tsv', delimiter='\t', low_memory=False,
        dtype={'creators_primary': str, 'creators_other': str})
    acmi_works = pandas.concat([
        acmi_works[['id', 'title', 'creators_primary']].rename(columns={'creators_primary':'creators'}),
        acmi_works[['id', 'title', 'creators_other']].rename(columns={'creators_other':'creators'})])

    # one row per (work, creator id): split the comma separated id lists.
    acmi_works['creators'] = acmi_works['creators'].str.split(',')
    acmi_works = acmi_works.explode('creators')
    acmi_works['creators'] = acmi_works['creators'].str.strip()
    acmi_works = acmi_works.dropna(subset=['creators'])

    acmi_creators = pandas.read_csv(tsv_path / 'creators.tsv', delimiter='\t', low_memory=False)
    acmi_creators = acmi_creators[['id', 'name']].drop_duplicates().astype(str)

    # select the requested creator (this was previously hard-coded to a single id).
    subject_key = str(acmi_subject_id).split('/')[-1]
    acmi_df = acmi_creators.loc[acmi_creators.id.isin([subject_key])].rename(columns={'id':'subject', 'name':'subjectLabel'})

    # subject -> works listing the subject -> every creator listed on those works -> creator names.
    acmi_df = pandas.merge(acmi_df, acmi_works.rename(columns={'creators':'subject', 'id':'work', 'title':'workLabel'}), on='subject', how='left')
    acmi_df = pandas.merge(acmi_df, acmi_works.rename(columns={'creators':'colleague', 'id':'work', 'title':'workLabel'}), on=['work', 'workLabel'], how='left')
    acmi_df = pandas.merge(acmi_df, acmi_creators.rename(columns={'id':'colleague', 'name':'colleagueLabel'}), on=['colleague'], how='left')

    # the subject is not its own colleague.
    acmi_df = acmi_df.loc[acmi_df.subject != acmi_df.colleague]
    acmi_df = acmi_df[['work', 'workLabel', 'colleague', 'colleagueLabel']]
    acmi_df = acmi_df.rename(columns={'work': 'acmi_work', 'colleague': 'acmi_colleague'})

    return acmi_df

def detect_connections(wikidata_creator_id, acmi_creator_id, extant):

    ''' Propose wikidata/acmi identifier pairs for works and colleagues of an already linked creator.

    Connections from both sources are joined on identical work label and
    colleague label; pairs already present in extant are dropped.
    '''

    wikidata_data = wikidata_connections(wikidata_creator_id)
    if not len(wikidata_data):
        # no wikidata connections, so nothing to compare against.
        return pandas.DataFrame()

    acmi_data = acmi_connections(acmi_creator_id)

    mashup = acmi_data.merge(wikidata_data, on=['workLabel', 'colleagueLabel'], how='inner')
    mashup['acmi_work'] = 'works/'+mashup['acmi_work'].astype(str)
    mashup['acmi_colleague'] = 'creators/'+mashup['acmi_colleague'].astype(str)

    # stack colleague pairs and work pairs under shared column names.
    colleague_links = mashup[['wikidata_colleague', 'acmi_colleague']].rename(
        columns={'wikidata_colleague': 'wikidata_id', 'acmi_colleague': 'acmi_id'})
    work_links = mashup[['wikidata_work', 'acmi_work']].rename(
        columns={'wikidata_work': 'wikidata_id', 'acmi_work': 'acmi_id'})
    links = pandas.concat([colleague_links, work_links]).drop_duplicates()

    # anti-join: keep only the pairs which have no counterpart in extant.
    flagged = links.merge(extant, indicator=True, how='left')
    links = flagged.loc[flagged['_merge'] == 'left_only'].drop(columns='_merge')

    return links

def extant_connections():

    ''' Fetch every wikidata item / P7003 value pair from the wikidata query service (P7003 presumably being the acmi identifier property, confirm). '''

    extant_query = ''' select distinct ?wikidata_id ?acmi_id where { ?wikidata_id wdt:P7003 ?acmi_id } '''
    pairs = sparql_query(extant_query, 'https://query.wikidata.org/sparql')

    # keep only the last '/'-separated segment of the item identifier.
    item_ids = pairs['wikidata_id'].str.split('/').str[-1]
    pairs['wikidata_id'] = item_ids

    return pairs

# existing wikidata <-> acmi links; those whose acmi identifier mentions 'creators' seed the search.
extant_df = extant_connections()
extant_creators = extant_df.loc[extant_df.acmi_id.str.contains('creators', na=False)].copy()

# every summary lands in the same directory, so resolve and create it once.
summary_dir = pathlib.Path.cwd().parents[0] / 'data' / 'creator_iterator'
summary_dir.mkdir(exist_ok=True, parents=True)

for x in tqdm.tqdm(extant_creators.to_dict('records')):
    link_hash = hashlib.md5(x['wikidata_id'].encode()).hexdigest()
    summary_path = summary_dir / f'{link_hash}.csv'

    # an existing summary marks this creator as done, which lets an interrupted run resume.
    if summary_path.exists():
        continue

    res = detect_connections(x['wikidata_id'], x['acmi_id'], extant_df)

    # write under a scratch name and rename afterwards, so an interruption
    # mid-write cannot leave a partial file which would be skipped on resume.
    scratch_path = summary_path.with_name(summary_path.name + '.tmp')
    res.to_csv(scratch_path, index=False)
    scratch_path.replace(summary_path)

  2%|▏         | 117/4901 [08:10<5:34:18,  4.19s/it]


KeyboardInterrupt: 