In [1]:
import numpy
import pandas
import pathlib
import pydash
import requests

def value_extract(row, column):

    ''' Return the 'value' entry of the dictionary stored under `column` in `row`. '''

    # Look the cell up first, then let pydash.get do the lookup of the
    # 'value' key inside it.
    cell = row[column]
    return pydash.get(cell, 'value')

def sparql_query(query, service, headers=None):

    ''' Send sparql request, and formulate results into a dataframe.

    query   -- SPARQL query text, sent as the `query` URL parameter.
    service -- URL of the SPARQL endpoint.
    headers -- optional dict of extra HTTP headers handed to requests.get
               (for example a descriptive User-Agent); None sends the
               requests defaults.

    Returns a pandas DataFrame with one column per key found in the
    response's `results.bindings` entries and one row per binding. Each
    cell holds the 'value' entry of the corresponding binding, or a
    missing value where a binding has no entry for that column.

    Raises requests.HTTPError when the endpoint answers with an error status.
    '''

    response = requests.get(service, params={'format': 'json', 'query': query}, headers=headers, timeout=120)

    # Stop on a 4xx/5xx reply here, rather than failing later while trying
    # to decode an error page as JSON.
    response.raise_for_status()

    results = pydash.get(response.json(), 'results.bindings')
    df = pandas.DataFrame.from_dict(results)

    # Each cell is a binding dictionary; reduce it to its 'value' entry one
    # column at a time instead of building a full row Series for every cell.
    for column in df.columns:
        df[column] = df[column].map(lambda cell: pydash.get(cell, 'value'))

    return df

def format_creators(row):

    ''' Merge a row's creator columns into a list of 'creators/<id>' strings.

    Reads row['creators_primary'] and row['creators_other'], each expected
    to be a comma-separated string or a missing value. Identifiers are
    stripped of surrounding whitespace, empty pieces are dropped, and
    duplicates are removed while keeping first-seen order.

    Returns the list of prefixed identifiers, or None when neither column
    contributes any identifier.
    '''

    creators = []

    for field in ('creators_primary', 'creators_other'):
        value = row[field]
        # pandas.isna recognises None, pandas.NA and every float NaN; an
        # identity test against numpy.nan only matches that single object
        # and lets other missing markers fall through to .split().
        if pandas.isna(value):
            continue
        creators += [piece.strip() for piece in value.split(',')]

    # Drop empty pieces left behind by stray or trailing commas.
    creators = [x for x in creators if x]

    if creators:
        # dict.fromkeys de-duplicates while preserving first-seen order.
        return list(dict.fromkeys(f'creators/{x}' for x in creators))

    return None

def creator_link_check(row, extant):

    ''' Tell whether every entry of row['creator_id'] is contained in `extant`.

    Returns True when all entries are present (also when there are no
    entries at all), False as soon as one is missing.
    '''

    for creator in row['creator_id']:
        if creator not in extant:
            return False

    return True

# Ask Wikidata for every item that has a P7003 statement, returning the item
# together with the statement's value.
query = '''
    select ?acmi_id ?wikidata_id
    where {
        ?wikidata_id wdt:P7003 ?acmi_id .
        } '''

# Run the query, then discard repeated rows in a separate step.
extant_links = sparql_query(query, 'https://query.wikidata.org/sparql')
extant_links = extant_links.drop_duplicates()

print(extant_links.shape[0])
extant_links.head()

14662


Unnamed: 0,wikidata_id,acmi_id
0,http://www.wikidata.org/entity/Q953520,creators/80957
1,http://www.wikidata.org/entity/Q1117163,creators/78375
2,http://www.wikidata.org/entity/Q446427,creators/54792
3,http://www.wikidata.org/entity/Q51114,creators/62487
4,http://www.wikidata.org/entity/Q7964856,creators/32701


In [2]:


# Load the works export that sits in the sibling 'acmi-api' checkout,
# resolved relative to the parent of the current working directory.
works_path = pathlib.Path.cwd().parents[0] / 'acmi-api' / 'app' / 'tsv' / 'works.tsv'
acmi_works = pandas.read_csv(works_path, delimiter='\t', low_memory=False)

# Keep only rows whose record_type is 'work', and give them prefixed identifiers.
acmi_works = acmi_works.loc[acmi_works.record_type.isin(['work'])]
acmi_works = acmi_works.rename(columns={'id':'work_id'})
acmi_works['creator_id'] = acmi_works.apply(format_creators, axis=1)
acmi_works['work_id'] = 'works/'+acmi_works['work_id'].astype(str)

# dropna removes rows missing a title or a creator list
# (format_creators returns None when a row has no creators).
acmi_works = acmi_works[['work_id', 'title', 'creator_id']].dropna()

# Build the lookup once as a set: membership tests are then constant time,
# instead of scanning a list for every creator of every work.
linked_ids = set(extant_links.acmi_id)

# Discard works that are already linked; .copy() gives an independent frame
# so the column assignment below does not write into a slice.
acmi_works = acmi_works.loc[~acmi_works.work_id.isin(linked_ids)].copy()

# Keep only works whose creators are all already linked.
acmi_works['creator_linked'] = acmi_works.apply(creator_link_check, extant=linked_ids, axis=1)
acmi_works = acmi_works.loc[acmi_works.creator_linked]

# TODO: work through the remaining works one by one, pulling the concatenated
# filmographies of their creators and finding any approximate matches to the title,
# then return two columns: the number of matches and the highest score.
#
# So the first step is to pull in all linked titles from the creators,
# then run rapidfuzz to return results over a certain threshold.

print(len(acmi_works))
acmi_works.head()

1944


Unnamed: 0,work_id,title,creator_id,creator_linked
0,works/119934,The Dame Was Loaded German advertisement,[creators/41813],True
29,works/99275,The road to total war,"[creators/76290, creators/86050]",True
50,works/64547,Adventure playground: London,"[creators/11442, creators/11443]",True
52,works/78485,Trapeze,"[creators/20569, creators/12143]",True
58,works/77749,Tennis club,[creators/12329],True
