In [1]:
# look for name matches where works are linked, but credits are not.

import numpy
import pandas
import pathlib
import pydash
import requests

def value_extract(row, column):

    ''' Return the 'value' entry of the cell found at row[column]. '''

    # Lookup is delegated to pydash.get, exactly as before, so whatever it
    # returns for a cell without a 'value' entry is passed straight through.
    cell = row[column]
    return pydash.get(cell, 'value')

def sparql_query(query, service):

    '''
    Send sparql request, and formulate results into a dataframe.

    query: SPARQL query text, sent as the 'query' GET parameter.
    service: URL of the SPARQL endpoint.

    Returns a dataframe with one column per key found in the bindings and one
    row per binding, each cell reduced to its 'value' entry. A response with
    no 'results.bindings' gives an empty dataframe.

    Raises requests.HTTPError when the endpoint answers with a 4xx/5xx status.
    '''

    response = requests.get(service, params={'format': 'json', 'query': query}, timeout=120)

    # Fail on the HTTP status itself: an error page is not JSON, so without this
    # check the failure would only show up as a confusing decode error below.
    response.raise_for_status()

    # 'results.bindings' may be absent from the payload; fall back to an empty
    # list so the frame is built the same way in both cases.
    results = pydash.get(response.json(), 'results.bindings')
    data_frame = pandas.DataFrame(results or [])

    # Every binding cell is a dictionary; keep only its 'value' entry.
    for column in data_frame.columns:
        data_frame[column] = data_frame.apply(value_extract, column=column, axis=1)

    return data_frame

# Every wikidata item carrying a P7003 statement, with the statement value as acmi_id.
# The acmi_id values are compared below against 'works/<id>' and 'creators/<id>' strings.
wikidata_query = ''' select ?wikidata_id ?acmi_id where { ?wikidata_id wdt:P7003 ?acmi_id } '''
wikidata_links = sparql_query(wikidata_query, 'https://query.wikidata.org/sparql')

# Both tsv files are read from the acmi-api checkout next to the working directory.
acmi_tsv_directory = pathlib.Path.cwd().parents[0] / 'acmi-api' / 'app' / 'tsv'

# Stack primary and other creators into a single 'creators' column per work.
acmi_works = pandas.read_csv(acmi_tsv_directory / 'works.tsv', delimiter='\t', low_memory=False)
acmi_works = pandas.concat([
    acmi_works[['id', 'creators_primary']].rename(columns={'creators_primary':'creators'}),
    acmi_works[['id', 'creators_other']].rename(columns={'creators_other':'creators'})])

# One row per (work, creator): split the comma separated ids, then drop works without creators.
acmi_works['creators'] = acmi_works['creators'].str.split(',')
acmi_works = acmi_works.explode('creators')
acmi_works['creators'] = acmi_works['creators'].str.strip()
acmi_works = acmi_works.dropna(subset=['creators'])
acmi_works = acmi_works.rename(columns={'id':'work_id', 'creators':'creator_id'})

# Only the id is cast to text so it can join against the split creator ids above.
# Casting the whole frame would turn a missing name into the literal text 'nan',
# which would then be treated as a real name when matching on names.
acmi_creators = pandas.read_csv(acmi_tsv_directory / 'creators.tsv', delimiter='\t', low_memory=False)
acmi_creators = acmi_creators[['id', 'name']].astype({'id': str}).rename(columns={'id':'creator_id'})

dataframe = pandas.merge(acmi_works, acmi_creators, on='creator_id', how='left')
dataframe['work_id'] = 'works/'+dataframe['work_id'].astype(str)
dataframe['creator_id'] = 'creators/'+dataframe['creator_id'].astype(str)

# Keep credits whose work is linked to wikidata, but whose creator is not.
dataframe = dataframe.loc[dataframe.work_id.isin(list(wikidata_links.acmi_id))]
dataframe = dataframe.loc[~dataframe.creator_id.isin(list(wikidata_links.acmi_id))]
dataframe = pandas.merge(dataframe, wikidata_links.rename(columns={'acmi_id':'work_id', 'wikidata_id':'work_wikidata_id'}), on='work_id', how='left')

# Reduce the entity uri to its final path segment.
dataframe['work_wikidata_id'] = dataframe['work_wikidata_id'].str.split('/').str[-1]

print(len(dataframe))
dataframe.head(20)

13040


Unnamed: 0,work_id,creator_id,name,work_wikidata_id
0,works/87393,creators/29313,Applecross Productions,Q844883
1,works/87393,creators/29312,Reperage and Vanguard Films,Q844883
2,works/94621,creators/36702,,Q842442
3,works/94621,creators/31288,,Q842442
4,works/94621,creators/80920,Doris Yang,Q842442
5,works/92302,creators/24585,Alan Kleinberg,Q192409
6,works/92302,creators/24584,Grokenberger,Q192409
7,works/92302,creators/24583,Black Snake,Q192409
8,works/85612,creators/78000,John Kilik,Q1130705
9,works/85612,creators/48618,,Q1130705


In [2]:

# Fetch every statement value attached to the linked works, ten batches of ids at a time.
# Frames are collected in a list and concatenated once, rather than regrowing a frame per batch.
agent_frames = []
for chunk in numpy.array_split(dataframe.work_wikidata_id.unique(), 10):
    if len(chunk) == 0:
        # array_split yields empty chunks when there are fewer than ten ids;
        # there is nothing to ask wikidata about for those.
        continue
    chunk_ids = ' '.join(['wd:'+x for x in chunk])
    query = ''' 
        select ?work ?agent ?agentLabel where {
        values ?work {'''+chunk_ids+'''}
        ?work ?prop ?agent .
        service wikibase:label { bd:serviceParam wikibase:language "en". }
        }'''
    agent_frames.append(sparql_query(query, 'https://query.wikidata.org/sparql'))

if agent_frames:
    agent_dataframe = pandas.concat(agent_frames)
else:
    # No linked works at all: keep the expected columns so the steps below still run.
    agent_dataframe = pandas.DataFrame(columns=['work', 'agent', 'agentLabel'])

# Keep values whose text contains 'wikidata'. The explicit copy makes the filtered
# frame independent, so the column assignments below do not write into a slice.
agent_dataframe = agent_dataframe.loc[agent_dataframe.agent.str.contains('wikidata', na=False)].copy()

# Reduce both uris to their final path segment.
agent_dataframe['work'] = agent_dataframe['work'].str.split('/').str[-1]
agent_dataframe['agent'] = agent_dataframe['agent'].str.split('/').str[-1]
agent_dataframe = agent_dataframe.rename(columns={'work': 'work_wikidata_id', 'agent': 'agent_wikidata_id', 'agentLabel':'name'})

print(len(agent_dataframe))
agent_dataframe.head(20)

578375


Unnamed: 0,work_wikidata_id,agent_wikidata_id,name
8,Q18402,Q11424,film
9,Q18402,Q7371,Federico Fellini
10,Q18402,Q7371,Federico Fellini
11,Q18402,Q222791,Ennio Flaiano
12,Q18402,Q545378,Tullio Pinelli
13,Q18402,Q214665,Nino Rota
14,Q18402,Q130232,drama film
15,Q18402,Q83484,Anthony Quinn
16,Q18402,Q106907,Giulietta Masina
17,Q18402,Q426820,Marcella Rovena


In [3]:
# Match unlinked creators to wikidata agents on the same work by exact name.
# Credits without a name are dropped first: pandas joins null keys to each other,
# so a nameless credit could otherwise pair with a nameless agent on the same work.
merge = pandas.merge(dataframe.dropna(subset=['name']), agent_dataframe, on=['work_wikidata_id', 'name'], how='inner')

# The same creator and agent can pair up through several works; keep each pairing once.
merge = merge[['name', 'creator_id', 'agent_wikidata_id']].drop_duplicates()
merge.to_csv(pathlib.Path.home() / 'Desktop' / 'acmi_creator_matches.csv')

print(len(merge))
merge.head()

10


Unnamed: 0,name,creator_id,agent_wikidata_id
0,Sogetel,creators/33880,Q3488910
1,Sam Jaffe,creators/76515,Q719247
2,Sam Taylor,creators/31199,Q719427
3,Gerald Green,creators/76874,Q722629
4,Gerald Green,creators/76874,Q1374400
