In [1]:
from rapidfuzz import process, fuzz
import numpy
import pandas
import pathlib
import pydash
import requests

def value_extract(row, column):

    ''' Return the 'value' entry of the cell found at row[column], looked up with pydash.get. '''

    cell = row[column]
    return pydash.get(cell, 'value')

def sparql_query(query, service):

    ''' Send a SPARQL request and return the result bindings as a dataframe.

    query   -- SPARQL query text, sent as the 'query' request parameter.
    service -- URL of the SPARQL endpoint.

    Returns a dataframe with one column per key found in the bindings; every
    cell is reduced to the 'value' entry of its binding (None when absent).
    Raises requests.HTTPError when the endpoint answers with an error status.
    '''

    response = requests.get(service, params={'format': 'json', 'query': query}, timeout=120)
    # Fail loudly on throttling or server errors instead of trying to parse
    # an error page as JSON.
    response.raise_for_status()
    results = pydash.get(response.json(), 'results.bindings')
    df = pandas.DataFrame.from_dict(results)
    for column in df.columns:
        # Column-wise map does the same extraction as a row-wise apply without
        # building a Series object for every row.
        df[column] = df[column].map(lambda cell: pydash.get(cell, 'value'))

    return df

def format_creators(row):

    ''' Build the unique 'creators/<id>' identifiers for one work.

    Reads the comma-separated 'creators_primary' and 'creators_other' fields
    of row. Missing values (None, any NaN object, pandas.NA) are skipped,
    ids are stripped of surrounding whitespace and empty tokens are dropped.

    Returns a list of identifiers in first-seen order, or None when the row
    holds no creator at all.
    '''

    creators = []

    for field in ('creators_primary', 'creators_other'):
        value = row[field]
        # An identity test against numpy.nan only recognises that one object;
        # pandas.isna also covers None, float('nan'), numpy.float64 NaN and pandas.NA.
        if pandas.isna(value):
            continue
        # A lone id may have been parsed as a number (e.g. 21267.0); turn it
        # back into its integer text form before splitting.
        if isinstance(value, float) and value.is_integer():
            value = int(value)
        creators += [token.strip() for token in str(value).split(',') if token.strip()]

    if not creators:
        return None

    # dict.fromkeys removes duplicates while keeping the first-seen order.
    return list(dict.fromkeys(f'creators/{x}' for x in creators))

def creator_link_check(row, extant):

    ''' Tell whether every id listed in row['creator_id'] is contained in extant. '''

    for creator in row['creator_id']:
        if creator not in extant:
            return False
    return True


def related_entities(row):

    ''' Collect the labels of Wikidata entities that point at the creators of a work.

    Looks up the Wikidata items whose P7003 value is one of row['creator_id'],
    then every element that has a statement pointing at such an item and that
    has a P31 statement, and gathers the element labels.

    Returns a list of unique labels, an empty list when nothing is found, or
    the string 'error' when the lookup fails.
    '''

    creator_array = ' '.join([f'"{x}"' for x in row['creator_id']])
    if not creator_array:
        # With no creator ids the values clause is empty and nothing can
        # match, so skip the network round trip.
        return []

    try:
        query = '''
            select distinct ?creators ?wikidata ?element ?elementLabel where {
                values ?creators {'''+creator_array+'''}
                ?wikidata wdt:P7003 ?creators .
                ?element ?p ?wikidata .
                ?element wdt:P31 ?elementType .
                service wikibase:label { bd:serviceParam wikibase:language "[AUTO_LANGUAGE],en" }
            } '''

        entities = sparql_query(query, 'https://query.wikidata.org/sparql').drop_duplicates()
        if len(entities):
            return list(entities.elementLabel.unique())
        return []
    except Exception:
        # 'error' is the sentinel value returned on failure. Only Exception is
        # caught so that KeyboardInterrupt and SystemExit still stop a long
        # row-by-row run instead of being recorded as a failed row.
        return 'error'

def title_matching(row, threshold=85, limit=50):

    ''' Fuzzy-match row['title'] against the strings in row['entity_array'].

    threshold -- a match is kept only when its score is strictly greater than
                 this value (default 85).
    limit     -- maximum number of matches requested from process.extract
                 before the threshold is applied (default 50).

    Returns the list of results from process.extract (scored with fuzz.WRatio)
    whose second element, the score, exceeds the threshold.
    '''

    matches = process.extract(row['title'], row['entity_array'], scorer=fuzz.WRatio, limit=limit)
    return [match for match in matches if match[1] > threshold]


def count_array(row):

    ''' Count the entries held in row['title_matches']. '''

    matches = row['title_matches']
    return len(matches)


# Ask Wikidata for every item that has a P7003 statement, paired with the
# value of that statement (bound to ?acmi_id).
query = '''
    select ?acmi_id ?wikidata_id
    where {
        ?wikidata_id wdt:P7003 ?acmi_id .
        } '''

wikidata_links = sparql_query(query, 'https://query.wikidata.org/sparql')
extant_links = wikidata_links.drop_duplicates()

# Show how many distinct links came back, then preview the first rows.
print(len(extant_links))
extant_links.head()

14663


Unnamed: 0,wikidata_id,acmi_id
0,http://www.wikidata.org/entity/Q6255888,creators/21267
1,http://www.wikidata.org/entity/Q63069,creators/32979
2,http://www.wikidata.org/entity/Q2158449,creators/80769
3,http://www.wikidata.org/entity/Q3195038,creators/15007
4,http://www.wikidata.org/entity/Q3672555,creators/22745


In [2]:
# Load the works export that sits next to this project and keep only the
# records typed as 'work'.
works_path = pathlib.Path.cwd().parents[0] / 'acmi-api' / 'app' / 'tsv' / 'works.tsv'
acmi_works = pandas.read_csv(works_path, delimiter='\t', low_memory=False)
# .copy() so the column assignments below act on an independent frame.
acmi_works = acmi_works.loc[acmi_works.record_type.isin(['work'])].copy()

# Remove format/version tags from titles. regex=False is explicit because the
# tags contain '[' and ']': pandas releases before 2.0 default to regex=True,
# which reads each tag as a character class and deletes single letters instead.
for tag in ['[DVD]', '[Widescreen]', '[NTSC]', '[B&W]', '[Italian version]',
    '[Edited version]', '[Greek version]', '[study extract]', '[Dubbed]',
    '[Turkish version]', '[game trailer]', '[a discussion]']:
    acmi_works['title'] = acmi_works['title'].str.replace(tag, '', regex=False)

acmi_works = acmi_works.rename(columns={'id':'work_id'})
acmi_works['creator_id'] = acmi_works.apply(format_creators, axis=1)
acmi_works['work_id'] = 'works/'+acmi_works['work_id'].astype(str)
# dropna also discards works for which format_creators returned None.
acmi_works = acmi_works[['work_id', 'title', 'type', 'creator_id']].dropna()

# A set gives constant-time membership tests; the same ids are probed once
# per work and once per creator below.
linked_ids = set(extant_links.acmi_id)

# Keep works that are not linked yet but whose creators all are.
acmi_works = acmi_works.loc[~acmi_works.work_id.isin(linked_ids)].copy()
acmi_works['creator_linked'] = acmi_works.apply(creator_link_check, extant=linked_ids, axis=1)
acmi_works = acmi_works.loc[acmi_works.creator_linked]

print(len(acmi_works))
# Cap the batch: every remaining work triggers one Wikidata query below.
SAMPLE_SIZE = 500
acmi_works = acmi_works[:SAMPLE_SIZE].copy()
print(len(acmi_works))

# Work through the batch one by one: pull the labels of the entities related
# to the work's creators, then look for fuzzy matches of the work title.
acmi_works['entity_array'] = acmi_works.apply(related_entities, axis=1)
# related_entities returns the string 'error' when a lookup fails; drop those
# rows with an element-wise comparison so list-valued cells never need hashing.
acmi_works = acmi_works.loc[acmi_works.entity_array.map(lambda entities: entities != 'error')].copy()
acmi_works['title_matches'] = acmi_works.apply(title_matching, axis=1)
acmi_works['candidate_count'] = acmi_works.apply(count_array, axis=1)
# Keep only the works for which no candidate title match was found.
acmi_works = acmi_works.loc[acmi_works.candidate_count == 0]

print(len(acmi_works))
acmi_works.head()

KeyboardInterrupt: 

In [None]:
# The works left in acmi_works have no candidate title match, so presumably they can be written as new entries - TODO confirm.