In [1]:
import json
import numpy
import pandas
import pathlib
import pydash
import requests
import time
import tqdm
import unidecode

def value_extract(row, column):

    ''' Return the 'value' entry of the dictionary stored at row[column].

    row: a mapping or pandas row indexable by column.
    column: key of the cell to unpack.

    The lookup goes through pydash.get, so a cell without a resolvable
    'value' path gives pydash's default (None) rather than raising.
    '''

    cell = row[column]
    return pydash.get(cell, 'value')

def sparql_query(query, service):

    ''' Send sparql request, and formulate results into a dataframe.

    query: SPARQL query text.
    service: URL of the SPARQL endpoint; the query is sent as a GET request
        asking for JSON results.

    Returns a dataframe with one column per bound variable and one row per
    result binding, each cell reduced to the binding's 'value'. When the
    query matches nothing, an empty dataframe is returned whose columns are
    the variables listed in the response's 'head.vars', so callers can still
    select columns by name.

    Raises requests.HTTPError when the endpoint answers with an error status.
    '''

    response = requests.get(service, params={'format': 'json', 'query': query}, timeout=120)

    # Fail on 4xx/5xx here instead of attempting to parse an error page as JSON.
    response.raise_for_status()

    payload = response.json()
    bindings = pydash.get(payload, 'results.bindings') or []
    if not bindings:
        # No rows: keep the projected variable names as (empty) columns.
        return pandas.DataFrame(columns=pydash.get(payload, 'head.vars') or [])

    data_frame = pandas.DataFrame.from_dict(bindings)
    for column in data_frame.columns:
        # Each cell is a binding dictionary (or missing); keep only its 'value'.
        data_frame[column] = data_frame[column].map(lambda cell: pydash.get(cell, 'value'))

    return data_frame

def wikidata_connections(wikidata_subject_id):

    ''' Find entities sharing a wdt:P161 statement on a work with the subject.

    wikidata_subject_id: Wikidata identifier of the subject, e.g. 'Q241897'.

    Queries the Wikidata SPARQL endpoint for every work linked to the subject
    through wdt:P161, together with everything else linked to that work by
    the same property. The subject itself is removed from the colleagues.

    Returns a dataframe with columns wikidata_work, workLabel,
    wikidata_colleague and colleagueLabel, where the two wikidata_* columns
    hold the last path segment of the entity URI.
    '''

    wikidata_query = ''' 
        select ?subject ?subjectLabel ?work ?workLabel ?colleague ?colleagueLabel where { 
            values ?subject {wd:'''+wikidata_subject_id+'''}
            ?work wdt:P161 ?subject .
            ?work wdt:P161 ?colleague .
            service wikibase:label { bd:serviceParam wikibase:language "[AUTO_LANGUAGE],en". }
            } '''

    connections = sparql_query(wikidata_query, 'https://query.wikidata.org/sparql')

    # Drop the rows in which the subject is paired with itself.
    is_other_entity = connections.subject != connections.colleague
    connections = connections.loc[is_other_entity, ['work', 'workLabel', 'colleague', 'colleagueLabel']]
    connections = connections.rename(columns={'work': 'wikidata_work', 'colleague': 'wikidata_colleague'})

    # Reduce full entity URIs to their trailing identifier.
    for id_column in ('wikidata_work', 'wikidata_colleague'):
        connections[id_column] = connections[id_column].str.split('/').str[-1]

    return connections

def acmi_connections(acmi_subject_id):

    ''' Find creators credited on the same works as the subject in the ACMI tsv export.

    acmi_subject_id: ACMI creator id of the subject; compared as text, so
        both '73769' and 73769 are accepted.

    Reads works.tsv and creators.tsv from ../acmi-api/app/tsv relative to the
    current working directory. Each work's creators_primary and
    creators_other cells are treated as comma-separated creator ids.

    Returns a dataframe with columns acmi_work, workLabel, acmi_colleague and
    colleagueLabel, one row per (work, other creator) pair; the subject is
    not listed as its own colleague.
    '''

    tsv_directory = pathlib.Path.cwd().parents[0] / 'acmi-api' / 'app' / 'tsv'

    # Read the creator columns as text: a column holding only bare ids would
    # otherwise be parsed as numbers and could not be split on commas below.
    acmi_works = pandas.read_csv(
        tsv_directory / 'works.tsv', delimiter='\t', low_memory=False,
        dtype={'creators_primary': str, 'creators_other': str})

    # Stack primary and other creators into a single 'creators' column.
    acmi_works = pandas.concat([
        acmi_works[['id', 'title', 'creators_primary']].rename(columns={'creators_primary':'creators'}),
        acmi_works[['id', 'title', 'creators_other']].rename(columns={'creators_other':'creators'})])

    # One row per (work, creator id); works without creators are dropped.
    acmi_works['creators'] = acmi_works['creators'].str.split(',')
    acmi_works = acmi_works.explode('creators')
    acmi_works['creators'] = acmi_works['creators'].str.strip()
    acmi_works = acmi_works.dropna(subset=['creators'])

    acmi_creators = pandas.read_csv(tsv_directory / 'creators.tsv', delimiter='\t', low_memory=False)
    acmi_creators = acmi_creators[['id', 'name']].drop_duplicates().astype(str)

    # Select the requested subject (ids were cast to text above).
    subject_id = str(acmi_subject_id)
    acmi_df = acmi_creators.loc[acmi_creators.id.isin([subject_id])].rename(columns={'id':'subject', 'name':'subjectLabel'})

    # subject -> works the subject is credited on -> every creator of those works.
    acmi_df = pandas.merge(acmi_df, acmi_works.rename(columns={'creators':'subject', 'id':'work', 'title':'workLabel'}), on='subject', how='left')
    acmi_df = pandas.merge(acmi_df, acmi_works.rename(columns={'creators':'colleague', 'id':'work', 'title':'workLabel'}), on=['work', 'workLabel'], how='left')

    # Attach the colleague's display name.
    acmi_df = pandas.merge(acmi_df, acmi_creators.rename(columns={'id':'colleague', 'name':'colleagueLabel'}), on=['colleague'], how='left')

    acmi_df = acmi_df.loc[acmi_df.subject != acmi_df.colleague]
    acmi_df = acmi_df[['work', 'workLabel', 'colleague', 'colleagueLabel']]
    acmi_df = acmi_df.rename(columns={'work': 'acmi_work', 'colleague': 'acmi_colleague'})

    return acmi_df

# Identifiers of the starting entity in each source
# (presumably the same person on Wikidata and in ACMI; TODO confirm).
start_node = {'wikidata':'Q241897', 'acmi':'73769'}

# Collect the (work, colleague) connections of the start node from each source.
wikidata_data = wikidata_connections(start_node['wikidata'])
acmi_data = acmi_connections(start_node['acmi'])

# Keep only connections found in both sources. Rows are matched on the exact
# text of the work and colleague labels, so differently spelled labels will
# not be paired.
mashup = pandas.merge(acmi_data, wikidata_data, on=['workLabel', 'colleagueLabel'], how='inner')
print(len(mashup))
mashup.head()

7


Unnamed: 0,acmi_work,workLabel,acmi_colleague,colleagueLabel,wikidata_work,wikidata_colleague
0,77912,Caddie,72924,Jack Thompson,Q5016297,Q356383
1,77912,Caddie,76125,Takis Emmanuel,Q5016297,Q25409285
2,77912,Caddie,76124,Helen Morse,Q5016297,Q1610584
3,80950,Petersen,76435,Wendy Hughes,Q7178200,Q276365
4,80950,Petersen,72924,Jack Thompson,Q7178200,Q356383
