In [1]:
# create a list of wikidata creators
# then, using the acmi creator list, find instances in which there are no reasonable matches

import csv
import hashlib
import pathlib
import time

import numpy
import pandas
import pydash
import requests
import tqdm
import unidecode
from rapidfuzz import process, fuzz

def value_extract(row, column):

    ''' Return the 'value' entry of the dictionary held in the given column of a row. '''

    cell = row[column]

    return pydash.get(cell, 'value')

def sparql_query(query, service):

    ''' Send sparql request, and formulate results into a dataframe.

    query is the sparql text and service is the endpoint url it is sent to.
    Returns a dataframe with one column per bound variable, each cell reduced
    to the 'value' entry of its binding. A response without bindings gives an
    empty dataframe, so callers should check its length before using columns.
    Raises requests.HTTPError when the endpoint answers with an error status.
    '''

    response = requests.get(service, params={'format': 'json', 'query': query}, timeout=120)

    # Fail loudly on throttling or server errors here, rather than letting the
    # error page surface a line later as an opaque json decoding failure.
    response.raise_for_status()

    results = pydash.get(response.json(), 'results.bindings')
    df = pandas.DataFrame.from_dict(results)
    for column in df.columns:
        df[column] = df.apply(value_extract, column=column, axis=1)

    return df

def normalise_string(input_text):

    ''' Lower-case, unidecode and trim the text so that it can be compared when matching. '''

    lowered = str(input_text).lower()
    decoded = unidecode.unidecode(lowered)

    return decoded.strip()

# Build (or reload from cache) a dataframe of wikidata creators and their labels.
# The expensive sparql harvest only runs when the parquet cache is missing.

wikidata_creator_data = pathlib.Path.cwd().parents[0] / 'data' / 'creator_match' / 'wikidata_creator.parquet'
if not wikidata_creator_data.exists():

    # Create the cache directory up front, so a missing folder fails before
    # the long harvest rather than at the final write.
    wikidata_creator_data.parent.mkdir(parents=True, exist_ok=True)

    # every entity which is the object of one of the credit properties listed below
    query = '''
    select distinct ?creator 
        where {
            {?work wdt:P57 ?creator . } union
            {?work wdt:P58 ?creator . } union
            {?work wdt:P161 ?creator . } union
            {?work wdt:P272 ?creator . } union
            {?work wdt:P344 ?creator . } union
            {?work wdt:P1040 ?creator . } union
            {?work wdt:P2515 ?creator . } union
            {?work wdt:P2554 ?creator . } 
        } '''

    wikidata_creators = sparql_query(query, 'https://query.wikidata.org/sparql').drop_duplicates()
    wikidata_creators['creator'] = wikidata_creators['creator'].str.split('/').str[-1]

    # Labels are requested in chunks; the frames are collected in a list and
    # concatenated once, instead of re-copying a growing dataframe every pass.
    label_frames = []
    for chunk in tqdm.tqdm(numpy.array_split(wikidata_creators.creator.unique(), 2000)):
        if not len(chunk):
            # array_split pads with empty chunks when there are fewer ids than chunks
            continue
        time.sleep(4)
        query = '''
            select distinct ?creator ?creatorLabel
            where {
                values ?creator {'''+' '.join([f'wd:{x}' for x in chunk])+'''}
                service wikibase:label { bd:serviceParam wikibase:language "[AUTO_LANGUAGE],en" }
            } '''
        label_frames.append(sparql_query(query, 'https://query.wikidata.org/sparql'))

    if label_frames:
        wikidata_dataframe = pandas.concat(label_frames, ignore_index=True)
    else:
        wikidata_dataframe = pandas.DataFrame(columns=['creator', 'creatorLabel'])
    wikidata_dataframe.to_parquet(wikidata_creator_data)
else:
    wikidata_dataframe = pandas.read_parquet(wikidata_creator_data)

# labels are lower-cased for matching purposes
wikidata_dataframe['creatorLabel'] = wikidata_dataframe['creatorLabel'].str.lower()

print(len(wikidata_dataframe))
wikidata_dataframe.head()

338757


Unnamed: 0,creator,creatorLabel
0,http://www.wikidata.org/entity/Q261,linkin park
1,http://www.wikidata.org/entity/Q272,paul morand
2,http://www.wikidata.org/entity/Q1225,bruce springsteen
3,http://www.wikidata.org/entity/Q11319,david decoteau
4,http://www.wikidata.org/entity/Q181,jimmy wales


In [2]:
# imdb dataframe

# what is needed: each name identifier (nconst) connected to the titles of films, e.g. try jacki weaver


# Build (or reload from cache) a dataframe pairing each imdb name identifier
# (nconst) with the primary and original titles of the works it is attached to.

imdb_data_path = pathlib.Path.cwd().parents[0] / 'data' / 'creator_match' / 'imdb_data.parquet'
if not imdb_data_path.exists():

    imdb_directory = pathlib.Path.home() / 'imdb'

    # Quoting is disabled because titles can contain bare double quotes, which
    # the default parser would treat as the start of a quoted field and then
    # swallow the following rows. NOTE(review): assumes the imdb tsv dumps
    # never use quotes as field delimiters - confirm against the dataset docs.
    imdb_read_options = {'delimiter': '\t', 'low_memory': False, 'quoting': csv.QUOTE_NONE}

    # only the columns used below are loaded, which keeps these large tables manageable
    imdb_data = pandas.read_csv(
        imdb_directory / 'title.principals.tsv', usecols=['tconst', 'nconst'], **imdb_read_options)
    imdb_titles = pandas.read_csv(
        imdb_directory / 'title.basics.tsv', usecols=['tconst', 'primaryTitle', 'originalTitle'], **imdb_read_options)
    imdb_data = pandas.merge(imdb_data, imdb_titles, on='tconst', how='left')
    del imdb_titles

    # stack primary and original titles into a single title column
    imdb_data = pandas.concat([
        imdb_data[['nconst', 'primaryTitle']].rename(columns={'primaryTitle':'title'}),
        imdb_data[['nconst', 'originalTitle']].rename(columns={'originalTitle':'title'})]).drop_duplicates()

    # make sure the cache directory exists before writing
    imdb_data_path.parent.mkdir(parents=True, exist_ok=True)
    imdb_data.to_parquet(imdb_data_path, index=False)
else:
    imdb_data = pandas.read_parquet(imdb_data_path)

print(len(imdb_data))
imdb_data.head()

52400147


Unnamed: 0,nconst,title
0,nm1588970,Carmencita
1,nm0005690,Carmencita
2,nm0374658,Carmencita
3,nm0721526,Le clown et ses chiens
4,nm1335271,Le clown et ses chiens


In [3]:
# Load acmi works and reshape them to one row per (work, title, creator) combination.

works_path = pathlib.Path.cwd().parents[0] / 'acmi-api' / 'app' / 'tsv' / 'works.tsv'
works_table = pandas.read_csv(works_path, delimiter='\t', low_memory=False)

# primary and other creators are stacked into a single creator_id column
acmi_works = pandas.concat([
    works_table[['id', 'title', creator_column]].rename(columns={creator_column: 'creator_id'})
    for creator_column in ['creators_primary', 'creators_other']
])

# comma separated creator ids are split out to a row each, then both
# identifier columns are given their 'works/' and 'creators/' prefixes
acmi_works = (
    acmi_works
    .assign(creator_id=lambda frame: frame['creator_id'].str.split(','))
    .explode('creator_id')
    .assign(creator_id=lambda frame: frame['creator_id'].str.strip())
    .drop_duplicates()
    .fillna('')
    .assign(
        id=lambda frame: 'works/' + frame['id'].astype(str),
        creator_id=lambda frame: 'creators/' + frame['creator_id'].astype(str),
    )
)

print(len(acmi_works))
acmi_works.head()

144784


Unnamed: 0,id,title,creator_id
0,works/119934,The Dame Was Loaded German advertisement,creators/41813
1,works/115143,World Is Ours,creators/
2,works/90799,Wing Chun,creators/32508
3,works/90495,The Flying doctor,creators/11967
3,works/90495,The Flying doctor,creators/12786


In [6]:
import hashlib

# wikidata entities whose P7003 value mentions "creators", i.e. creators which are already linked

query = '''
    select ?acmi_id ?wikidata_id
    where {
        ?wikidata_id wdt:P7003 ?acmi_id .
        filter(regex(str(?acmi_id), "creators")) .
        } '''

extant_links = sparql_query(query, 'https://query.wikidata.org/sparql').drop_duplicates()

# acmi creators, reduced to identifier and label, with the 'creators/' prefix applied
creators_path = pathlib.Path.cwd().parents[0] / 'acmi-api' / 'app' / 'tsv' / 'creators.tsv'
acmi_creators = (
    pandas.read_csv(creators_path, delimiter='\t', low_memory=False)
    [['id', 'name']]
    .rename(columns={'id':'acmi_id', 'name':'acmi_label'})
    .assign(acmi_id=lambda frame: 'creators/' + frame['acmi_id'].astype(str))
)

# only creators without an existing wikidata link are carried forward
already_linked = acmi_creators.acmi_id.isin(list(extant_links.acmi_id))
acmi_creators = acmi_creators.loc[~already_linked]

# ideally, add increasingly strict checks here

def wikidata_titles(wikidata):

    ''' Fetch the distinct labels of works which credit the given wikidata entity.

    wikidata is a bare entity identifier, spliced into the query after the wd: prefix.
    Returns an array of unique work labels, or None if the query finds nothing.
    '''

    query = '''
    select distinct ?workLabel 
    where {
        values ?creator {wd:'''+wikidata+'''}
        {?work wdt:P57 ?creator . } union
        {?work wdt:P58 ?creator . } union
        {?work wdt:P161 ?creator . } union
        {?work wdt:P272 ?creator . } union
        {?work wdt:P344 ?creator . } union
        {?work wdt:P1040 ?creator . } union
        {?work wdt:P2515 ?creator . } union
        {?work wdt:P2554 ?creator . } 
        service wikibase:label { bd:serviceParam wikibase:language "[AUTO_LANGUAGE],en" }
    } '''

    works = sparql_query(query, 'https://query.wikidata.org/sparql')
    if not len(works):
        return None

    return works.workLabel.unique()

def wikipedia_page(wikidata):

    ''' Fetch the English Wikipedia article url(s) about the given wikidata entity.

    wikidata is a bare entity identifier, spliced into the query after the wd: prefix.
    Returns an array of unique article urls, or None if the query finds nothing.
    '''

    query = '''
    select distinct ?article
    where { 
        values ?creator {wd:'''+wikidata+'''} .
        ?article schema:about ?creator .
        ?article schema:isPartOf <https://en.wikipedia.org/>  .
    } '''

    articles = sparql_query(query, 'https://query.wikidata.org/sparql')
    if not len(articles):
        return None

    return articles.article.unique()

def imdb_page(wikidata):

    ''' Fetch the P345 value(s) recorded against the given wikidata entity.

    wikidata is a bare entity identifier, spliced into the query after the wd: prefix.
    Returns an array of unique values, or None if the query finds nothing.
    '''

    query = '''
    select distinct ?imdb
    where { 
        values ?creator {wd:'''+wikidata+'''} .
        ?creator wdt:P345 ?imdb .
    } '''

    identifiers = sparql_query(query, 'https://query.wikidata.org/sparql')
    if not len(identifiers):
        return None

    return identifiers.imdb.unique()


# Match each unlinked acmi creator against wikidata. Candidates are found by
# fuzzy label comparison, then accepted if one of the creator's acmi titles
# appears on the candidate's English Wikipedia page or among its imdb titles.
# One text file is written per creator, so an interrupted run can be resumed.

# NOTE(review): this dataframe is not populated in this cell.
creator_result = pandas.DataFrame(columns=['acmi', 'wikidata'])

def _sole_result(values):

    ''' Return the only element of a lookup result, or None unless there is exactly one. '''

    # explicit None/len checks avoid relying on the truth value of a numpy array
    if values is None or len(values) != 1:
        return None

    return values[0]

def _wikipedia_mentions_title(wikidata_id, titles):

    ''' True if the candidate's single English Wikipedia page contains any of the titles. '''

    article = _sole_result(wikipedia_page(wikidata_id))
    if article is None:
        return False

    time.sleep(2)
    page = requests.get(article, timeout=120)
    if not page.ok:
        # an error page says nothing about the candidate's filmography
        return False

    page_text = page.text

    return any(title in page_text for title in titles)

def _imdb_credits_title(wikidata_id, titles):

    ''' True if the candidate's single imdb identifier is attached to any of the titles. '''

    imdb_id = _sole_result(imdb_page(wikidata_id))
    if imdb_id is None:
        return False

    credited_titles = set(imdb_data.loc[imdb_data.nconst == imdb_id, 'title'])

    return any(title in credited_titles for title in titles)

match_directory = pathlib.Path.cwd().parents[0] / 'data' / 'creator_match'
match_directory.mkdir(parents=True, exist_ok=True)

# Loop-invariant lookups, built once rather than for every creator.

# Normalised label -> original wikidata labels. Both sides of the fuzzy
# comparison go through normalise_string, so case and accents cannot lower
# the score of an otherwise identical name.
label_lookup = {}
for label in wikidata_dataframe.creatorLabel.dropna().unique():
    label_lookup.setdefault(normalise_string(label), []).append(label)
normalised_labels = list(label_lookup)

# Acmi creator id -> titles. Blank titles are left out, as an empty string is
# "contained" in every page and would produce false matches.
titled_works = acmi_works.loc[acmi_works.title.astype(str).str.strip() != '']
filmography_lookup = titled_works.groupby('creator_id')['title'].apply(list).to_dict()

for creator in tqdm.tqdm(acmi_creators.to_dict('records')):

    hash_id = hashlib.md5(creator['acmi_id'].encode()).hexdigest()
    hash_path = match_directory / f'{hash_id}.txt'

    # creators processed in an earlier run are skipped
    if hash_path.exists():
        continue

    match = ''
    acmi_label = creator['acmi_label']
    acmi_filmography = filmography_lookup.get(creator['acmi_id'], [])

    # without a usable name or any titles no candidate can be confirmed, so the lookups are skipped
    if isinstance(acmi_label, str) and acmi_label.strip() and acmi_filmography:

        scored = process.extract(normalise_string(acmi_label), normalised_labels, scorer=fuzz.WRatio, limit=20)
        candidates = [label for hit in scored if hit[1] > 75 for label in label_lookup[hit[0]]]
        candidate_ids = wikidata_dataframe.loc[wikidata_dataframe.creatorLabel.isin(candidates), 'creator'].unique()

        # NOTE(review): fuzzy title scores against wikidata_titles() used to be
        # computed here but were never used, and indexed the second character
        # of each title. That step is removed, saving one sparql request per
        # candidate. Reinstate it as an explicit check if it is wanted.
        for wikidata_candidate in candidate_ids:

            wikidata_id = wikidata_candidate.split('/')[-1]

            # NOTE(review): if several candidates pass, the last one wins.
            if _wikipedia_mentions_title(wikidata_id, acmi_filmography) or _imdb_credits_title(wikidata_id, acmi_filmography):
                match = wikidata_candidate

    with open(hash_path, 'w') as export:
        export.write(f"{creator['acmi_id']} {match}")

  3%|▎         | 563/17616 [55:34<28:03:26,  5.92s/it]  


KeyboardInterrupt: 