In [38]:
# Look for name matches where works are linked, but credits are not.

import numpy
import pandas
import pathlib
import pydash
import requests

def value_extract(row, column):

    ''' Return the 'value' entry of the dictionary held in row[column], looked up with pydash.get. '''

    cell = row[column]
    return pydash.get(cell, 'value')

def sparql_query(query, service, headers=None):

    ''' Send sparql request, and formulate results into a dataframe.

    query:   SPARQL query text, sent as the `query` URL parameter.
    service: URL of the SPARQL endpoint.
    headers: optional dict of extra HTTP headers handed to requests.get
             (e.g. a descriptive User-Agent). None keeps requests' defaults.

    Returns a pandas DataFrame built from the response's 'results.bindings',
    with every cell reduced to its 'value' entry by value_extract.

    Raises whatever response.raise_for_status() raises when the endpoint
    answers with an HTTP error status.
    '''

    response = requests.get(service, params={'format': 'json', 'query': query}, headers=headers, timeout=120)

    # Fail here, with the HTTP status, rather than letting an error page
    # reach response.json() and surface as a JSON decoding error.
    response.raise_for_status()

    results = pydash.get(response.json(), 'results.bindings')
    data_frame = pandas.DataFrame.from_dict(results)

    # Each cell is a binding dictionary; keep only its 'value' entry.
    for column in data_frame.columns:
        data_frame[column] = data_frame.apply(value_extract, column=column, axis=1)

    return data_frame

# Wikidata items and the identifier bound to ?acmi_id through wdt:P7003.
wikidata_query = ''' select ?wikidata_id ?acmi_id where { ?wikidata_id wdt:P7003 ?acmi_id } '''
wikidata_links = sparql_query(wikidata_query, 'https://query.wikidata.org/sparql')

# Both TSV exports are read from the acmi-api directory beside the working directory.
tsv_directory = pathlib.Path.cwd().parents[0] / 'acmi-api' / 'app' / 'tsv'

# Stack the primary and other creator columns into a single 'creators' column.
acmi_works = pandas.read_csv(tsv_directory / 'works.tsv', delimiter='\t', low_memory=False)
acmi_works = pandas.concat([
    acmi_works[['id', source]].rename(columns={source: 'creators'})
    for source in ['creators_primary', 'creators_other']])

# One row per (work, creator id): split on commas, explode, trim whitespace, drop missing.
acmi_works = acmi_works.assign(creators=acmi_works['creators'].str.split(',')).explode('creators')
acmi_works = acmi_works.assign(creators=acmi_works['creators'].str.strip())
acmi_works = acmi_works.dropna(subset='creators').rename(columns={'id': 'work_id', 'creators': 'creator_id'})

# Creator ids are cast to strings so they compare equal to the ids split out of works.tsv.
acmi_creators = pandas.read_csv(tsv_directory / 'creators.tsv', delimiter='\t', low_memory=False)
acmi_creators = acmi_creators.astype(str)[['id', 'name']].rename(columns={'id': 'creator_id'})

# Attach creator names, then prefix the ids so they have the same shape as the acmi_id values.
dataframe = acmi_works.merge(acmi_creators, on='creator_id', how='left')
dataframe['work_id'] = 'works/' + dataframe['work_id'].astype(str)
dataframe['creator_id'] = 'creators/' + dataframe['creator_id'].astype(str)

# Keep credits whose work is linked on Wikidata while the creator itself is not.
linked_ids = list(wikidata_links.acmi_id)
work_is_linked = dataframe.work_id.isin(linked_ids)
creator_is_linked = dataframe.creator_id.isin(linked_ids)
dataframe = dataframe.loc[work_is_linked & ~creator_is_linked]

# Bring in the work's Wikidata id, reduced to the last path segment of its URI.
work_links = wikidata_links.rename(columns={'acmi_id': 'work_id', 'wikidata_id': 'work_wikidata_id'})
dataframe = dataframe.merge(work_links, on='work_id', how='left')
dataframe['work_wikidata_id'] = dataframe['work_wikidata_id'].str.split('/').str[-1]

print(len(dataframe))
dataframe.head(20)

13973


Unnamed: 0,work_id,creator_id,name,work_wikidata_id
0,works/100501,creators/35703,,Q767461
1,works/100501,creators/35704,Arta Film,Q767461
2,works/100502,creators/71975,Mohammad Nikbin,Q5814772
3,works/100502,creators/35706,Iranian Film Society,Q5814772
4,works/100502,creators/35704,Arta Film,Q5814772
5,works/100503,creators/36214,Casablanca,Q845235
6,works/100503,creators/72754,,Q845235
7,works/100503,creators/31195,Noe Productions,Q845235
8,works/100503,creators/36212,Man's Films,Q845235
9,works/100503,creators/36211,Fabrica Films,Q845235


In [39]:

# For every linked Wikidata work, fetch each statement value (?agent) with its English label.
# The unique work ids are split into 10 batches, one query per batch.
agent_frames = []
for chunk in numpy.array_split(dataframe.work_wikidata_id.unique(), 10):
    chunk_ids = ' '.join(['wd:'+x for x in chunk])
    query = ''' 
        select ?work ?agent ?agentLabel where {
        values ?work {'''+chunk_ids+'''}
        ?work ?prop ?agent .
        service wikibase:label { bd:serviceParam wikibase:language "en". }
        }'''
    agent_frames.append(sparql_query(query, 'https://query.wikidata.org/sparql'))

# Concatenate once, instead of re-copying the accumulated frame on every iteration.
agent_dataframe = pandas.concat(agent_frames)

# Keep only rows whose ?agent value contains 'wikidata' (i.e. points at an entity URI).
# .copy() makes the filtered result an independent frame, so the column assignments
# below do not write into a slice of the concatenated frame (SettingWithCopyWarning).
agent_dataframe = agent_dataframe.loc[agent_dataframe.agent.str.contains('wikidata', na=False)].copy()

# Reduce both URIs to their last path segment.
agent_dataframe['work'] = agent_dataframe['work'].str.split('/').str[-1]
agent_dataframe['agent'] = agent_dataframe['agent'].str.split('/').str[-1]
agent_dataframe = agent_dataframe.rename(columns={'work': 'work_wikidata_id', 'agent': 'agent_wikidata_id', 'agentLabel':'name'})

print(len(agent_dataframe))
agent_dataframe.head(20)

584567


Unnamed: 0,work_wikidata_id,agent_wikidata_id,name
46,Q25513,Q11424,film
47,Q25513,Q167475,Jacques Rivette
48,Q25513,Q167475,Jacques Rivette
49,Q25513,Q1059668,Jean Gruault
50,Q25513,Q3379247,Philippe Arthuys
51,Q25513,Q130232,drama film
52,Q25513,Q53001,Jean-Luc Godard
53,Q25513,Q55375,Claude Chabrol
54,Q25513,Q106326,Jean-Claude Brialy
55,Q25513,Q167475,Jacques Rivette


In [40]:
# Match ACMI credits to Wikidata agents on the same linked work with an identical name / label.
# pandas.merge pairs null keys with each other, so rows lacking a name are removed from both
# sides first: a credit with no ACMI name must never match an agent with no label.
named_credits = dataframe.dropna(subset=['name'])
named_agents = agent_dataframe.dropna(subset=['name'])
merge = pandas.merge(named_credits, named_agents, on=['work_wikidata_id', 'name'], how='inner')

# One row per distinct (name, ACMI creator, Wikidata agent) candidate match.
merge = merge[['name', 'creator_id', 'agent_wikidata_id']].drop_duplicates()
merge.to_csv(pathlib.Path.home() / 'Desktop' / 'acmi_creator_matches.csv')

print(len(merge))
merge.head()

366


Unnamed: 0,name,creator_id,agent_wikidata_id
0,National Film Board of Canada,creators/86050,Q1530721
2,Paramount Pictures,creators/11230,Q159846
6,United Artists Corporation,creators/20365,Q219400
8,Shochiku,creators/30856,Q122549
10,Takeshi Yamamoto,creators/82206,Q2741558
