In [8]:
import numpy
import pandas
import pathlib
import pydash
import requests

def value_extract(row, column):

    ''' Return the 'value' entry of the dictionary stored under `column` in `row`.

    The lookup is delegated to pydash.get, so a cell for which the 'value'
    path cannot be resolved yields pydash's default (None) instead of raising.
    '''

    cell = row[column]
    return pydash.get(cell, 'value')

def sparql_query(query, service):

    ''' Send sparql request, and formulate results into a dataframe.

    Parameters:
        query:   SPARQL query text, sent as the `query` URL parameter.
        service: URL of the SPARQL endpoint to call with HTTP GET.

    Returns:
        A pandas DataFrame built from the `results.bindings` entry of the
        JSON response, with every cell replaced by its 'value' entry
        (see value_extract).

    Raises:
        requests.HTTPError: if the endpoint answers with a 4xx/5xx status.
        Other requests exceptions (timeout, connection, invalid JSON)
        propagate unchanged.
    '''

    response = requests.get(service, params={'format': 'json', 'query': query}, timeout=120)
    # Fail loudly on HTTP errors (e.g. rate limiting or a server-side query
    # timeout) rather than trying to parse an error page as JSON.
    response.raise_for_status()
    results = pydash.get(response.json(), 'results.bindings')
    df = pandas.DataFrame.from_dict(results)
    # Each cell is presumably a SPARQL-JSON binding dict; keep only its 'value'.
    for column in df.columns:
        df[column] = df.apply(value_extract, column=column, axis=1)

    return df

# Select every Wikidata item that has a P7003 statement whose value contains
# "creators", returning the item together with that value.
query = '''
    select ?acmi_id ?wikidata_id
    where {
        ?wikidata_id wdt:P7003 ?acmi_id .
        filter(regex(str(?acmi_id), "creators")) .
        } '''

# Run the query against the Wikidata endpoint, then collapse repeated rows.
extant_links = sparql_query(query, 'https://query.wikidata.org/sparql')
extant_links = extant_links.drop_duplicates()

print(len(extant_links))
extant_links.head()

8893


Unnamed: 0,wikidata_id,acmi_id
0,http://www.wikidata.org/entity/Q953520,creators/80957
1,http://www.wikidata.org/entity/Q1117163,creators/78375
2,http://www.wikidata.org/entity/Q446427,creators/54792
3,http://www.wikidata.org/entity/Q51114,creators/62487
4,http://www.wikidata.org/entity/Q7964856,creators/32701


In [24]:
import numpy

def format_creators(row):

    ''' Build the de-duplicated list of 'creators/<id>' strings for one row.

    Reads the comma-separated id lists in row['creators_primary'] and
    row['creators_other'] (primary first), and returns them as a list of
    'creators/<id>' strings with duplicates removed in first-seen order.
    Returns None when neither column yields an id.

    Missing cells are detected with pandas.isna rather than an identity
    test against numpy.nan, so None, pandas.NA and NaN objects other than
    the numpy.nan singleton are all treated as empty instead of crashing
    on `.split`. Whitespace around ids is stripped and empty tokens
    (e.g. from a trailing comma) are ignored.
    '''

    creators = []

    for column in ('creators_primary', 'creators_other'):
        value = row[column]
        if isinstance(value, str):
            tokens = value.split(',')
        elif value is None or pandas.isna(value):
            continue
        else:
            # A cell parsed as a number holds a single id; render 123.0 as '123'.
            if isinstance(value, float) and value.is_integer():
                value = int(value)
            tokens = [str(value)]
        creators += [token.strip() for token in tokens if token.strip()]

    if creators:
        # dict.fromkeys keeps the first occurrence of each id, in order.
        return list(dict.fromkeys(f'creators/{x}' for x in creators))
    return None


# Load the works export from the 'acmi-api' directory that sits beside the
# current working directory.
acmi_works = pandas.read_csv(
    pathlib.Path.cwd().parent / 'acmi-api' / 'app' / 'tsv' / 'works.tsv',
    sep='\t',
    low_memory=False,
)

# Keep only 'work' records, expose the identifier as work_id, attach the
# per-row creator list, and drop rows lacking a work_id, title or creator.
# NOTE: an earlier, commented-out variant of this step (one row per creator
# via explode, plus stripping bracketed tags such as '[DVD]' and splitting
# titles on '=') has been removed from this cell as dead code.
acmi_works = (
    acmi_works[acmi_works['record_type'] == 'work']
    .rename(columns={'id': 'work_id'})
    .assign(creator_id=lambda frame: frame.apply(format_creators, axis=1))
    .loc[:, ['work_id', 'title', 'creator_id']]
    .dropna()
)

print(len(acmi_works))
acmi_works.head()

39309


Unnamed: 0,work_id,title,creator_id
0,119934,The Dame Was Loaded German advertisement,[creators/41813]
2,90799,Wing Chun,"[creators/32508, creators/77967]"
3,90495,The Flying doctor,"[creators/11967, creators/12786, creators/32223]"
4,95860,Sugar from Queensland,"[creators/74432, creators/12760, creators/12759]"
5,87768,"Empty harbours, empty dreams","[creators/77370, creators/76328, creators/7476..."
