In [1]:
# create a list of wikidata creators,
# then use the acmi creator list to find instances in which there are no reasonable matches

from rapidfuzz import process, fuzz
import hashlib
import numpy
import pandas
import pathlib
import pydash
import requests
import time
import tqdm
import unidecode

def value_extract(row, column):

    ''' Return the "value" entry of the binding held at row[column]. '''

    binding = row[column]
    return pydash.get(binding, 'value')

def sparql_query(query, service):

    ''' Send sparql request, and formulate results into a dataframe.

    query is the sparql text and service is the endpoint url. Every binding
    in the response becomes one row, and each cell is reduced to the "value"
    entry of its binding.

    Raises requests.HTTPError if the endpoint answers with an error status
    (for example when the request is throttled), instead of failing later
    while trying to decode an error page as JSON. '''

    response = requests.get(service, params={'format': 'json', 'query': query}, timeout=120)

    # fail loudly on 4xx/5xx so a rejected request is not mistaken for bad JSON.
    response.raise_for_status()

    results = pydash.get(response.json(), 'results.bindings')
    df = pandas.DataFrame.from_dict(results)
    for column in df.columns:
        df[column] = df.apply(value_extract, column=column, axis=1)

    return df

def normalise_string(input_text):

    ''' Lower-case, transliterate and trim text so it can be compared. '''

    lowered = str(input_text).lower()
    transliterated = unidecode.unidecode(lowered)

    return transliterated.strip()

# build (or load from cache) a dataframe of wikidata creators and their labels.
wikidata_creator_data = pathlib.Path.cwd().parents[0] / 'data' / 'creator_match' / 'wikidata_creator.parquet'
if not wikidata_creator_data.exists():

    # first pass: every entity credited on a work via any of the listed properties.
    query = '''
    select distinct ?creator 
        where {
            {?work wdt:P57 ?creator . } union
            {?work wdt:P58 ?creator . } union
            {?work wdt:P161 ?creator . } union
            {?work wdt:P272 ?creator . } union
            {?work wdt:P344 ?creator . } union
            {?work wdt:P1040 ?creator . } union
            {?work wdt:P2515 ?creator . } union
            {?work wdt:P2554 ?creator . } 
        } '''

    wikidata_creators = sparql_query(query, 'https://query.wikidata.org/sparql').drop_duplicates()
    wikidata_creators['creator'] = wikidata_creators['creator'].str.split('/').str[-1]

    # second pass: fetch labels in batches, pausing between requests.
    label_batches = []
    for chunk in tqdm.tqdm(numpy.array_split(wikidata_creators.creator.unique(), 2000)):
        if len(chunk) == 0:
            # array_split yields empty chunks when there are fewer than 2000 ids;
            # there is nothing to ask for, so skip the sleep and the request.
            continue
        time.sleep(4)
        query = '''
            select distinct ?creator ?creatorLabel
            where {
                values ?creator {'''+' '.join([f'wd:{x}' for x in chunk])+'''}
                service wikibase:label { bd:serviceParam wikibase:language "[AUTO_LANGUAGE],en" }
            } '''
        label_batches.append(sparql_query(query, 'https://query.wikidata.org/sparql'))

    # concatenate once: concatenating inside the loop recopies the whole
    # accumulated frame on every iteration.
    if label_batches:
        wikidata_dataframe = pandas.concat(label_batches, ignore_index=True)
    else:
        wikidata_dataframe = pandas.DataFrame(columns=['creator', 'creatorLabel'])

    # make sure the cache directory exists before writing into it.
    wikidata_creator_data.parent.mkdir(parents=True, exist_ok=True)
    wikidata_dataframe.to_parquet(wikidata_creator_data)
else:
    wikidata_dataframe = pandas.read_parquet(wikidata_creator_data)

# labels are upper-cased so they compare against the upper-cased acmi labels.
wikidata_dataframe['creatorLabel'] = wikidata_dataframe['creatorLabel'].str.upper()

print(len(wikidata_dataframe))
wikidata_dataframe.head()



338842


Unnamed: 0,creator,creatorLabel
0,http://www.wikidata.org/entity/Q261,LINKIN PARK
1,http://www.wikidata.org/entity/Q272,PAUL MORAND
2,http://www.wikidata.org/entity/Q1225,BRUCE SPRINGSTEEN
3,http://www.wikidata.org/entity/Q11319,DAVID DECOTEAU
4,http://www.wikidata.org/entity/Q181,JIMMY WALES


In [2]:
# imdb dataframe: one row per (person, title) pair, built from the local imdb
# tsv dumps and cached as parquet.

imdb_data_path = pathlib.Path.cwd().parents[0] / 'data' / 'creator_match' / 'imdb_data.parquet'
if not imdb_data_path.exists():

    imdb_directory = pathlib.Path.home() / 'imdb'

    # the imdb dumps are plain tab-separated text: fields are not quoted and
    # missing values are written as \N. quoting=3 is csv.QUOTE_NONE, so a title
    # containing a double quote cannot swallow the following rows, and default
    # NA parsing is disabled so that titles such as "NULL" or "NA" stay strings.
    tsv_options = {
        'delimiter': '\t',
        'low_memory': False,
        'quoting': 3,
        'keep_default_na': False,
        'na_values': ['\\N'],
    }

    # only the columns used below are loaded.
    imdb_principals = pandas.read_csv(imdb_directory / 'title.principals.tsv', usecols=['tconst', 'nconst'], **tsv_options)
    imdb_basics = pandas.read_csv(imdb_directory / 'title.basics.tsv', usecols=['tconst', 'primaryTitle', 'originalTitle'], **tsv_options)

    imdb_data = pandas.merge(imdb_principals, imdb_basics, on='tconst', how='left')

    # stack primary and original titles into a single title column.
    imdb_data = pandas.concat([
        imdb_data[['nconst', 'primaryTitle']].rename(columns={'primaryTitle':'title'}),
        imdb_data[['nconst', 'originalTitle']].rename(columns={'originalTitle':'title'})]).drop_duplicates()

    # make sure the cache directory exists before writing into it.
    imdb_data_path.parent.mkdir(parents=True, exist_ok=True)
    imdb_data.to_parquet(imdb_data_path, index=False)
else:
    imdb_data = pandas.read_parquet(imdb_data_path)

# titles are upper-cased so they compare against the upper-cased acmi titles.
imdb_data['title'] = imdb_data['title'].str.upper()

print(len(imdb_data))
imdb_data.head()

52400147


Unnamed: 0,nconst,title
0,nm1588970,CARMENCITA
1,nm0005690,CARMENCITA
2,nm0374658,CARMENCITA
3,nm0721526,LE CLOWN ET SES CHIENS
4,nm1335271,LE CLOWN ET SES CHIENS


In [3]:
# acmi works: one row per (work, title variant, creator) combination.
acmi_works = pandas.read_csv(pathlib.Path.cwd().parents[0] / 'acmi-api' / 'app' / 'tsv' / 'works.tsv', delimiter='\t', low_memory=False)

# primary and other creators are stacked into a single creator_id column.
acmi_works = pandas.concat([
    acmi_works[['id', 'title', 'creators_primary']].rename(columns={'creators_primary':'creator_id'}),
    acmi_works[['id', 'title', 'creators_other']].rename(columns={'creators_other':'creator_id'})
])

# creator ids arrive comma-separated; split them out to one per row.
acmi_works['creator_id'] = acmi_works['creator_id'].str.split(',')
acmi_works = acmi_works.explode('creator_id')
acmi_works['creator_id'] = acmi_works['creator_id'].str.strip()
acmi_works = acmi_works.drop_duplicates().fillna('')
acmi_works = acmi_works.loc[~acmi_works.creator_id.isin([''])]

acmi_works['id'] = 'works/'+acmi_works['id'].astype(str)
acmi_works['creator_id'] = 'creators/'+acmi_works['creator_id'].astype(str)

# strip format/version suffixes from titles. regex=False is required: read as
# a regular expression '[DVD]' is a character class that would delete every
# "D" and "V" in the title, and the default for regex differs between pandas
# versions.
for x in ['[DVD]', '[Widescreen]', '[NTSC]', '[B&W]', '[Italian version]',
    '[Edited version]', '[Greek version]', '[study extract]', '[Dubbed]',
    '[Turkish version]', '[game trailer]', '[a discussion]']:
    acmi_works['title'] = acmi_works['title'].str.replace(x, '', regex=False)

# "=" separates alternative titles; each alternative gets its own row.
acmi_works['title'] = acmi_works['title'].str.split('=')
acmi_works = acmi_works.explode('title')
acmi_works['title'] = acmi_works['title'].str.strip()
acmi_works['title'] = acmi_works['title'].str.upper()

print(len(acmi_works))
acmi_works.head()

122473


Unnamed: 0,id,title,creator_id
0,works/119934,THE DAME WAS LOADED GERMAN ADVERTISEMENT,creators/41813
2,works/90799,WING CHUN,creators/32508
3,works/90495,THE FLYING DOCTOR,creators/11967
3,works/90495,THE FLYING DOCTOR,creators/12786
3,works/90495,THE FLYING DOCTOR,creators/32223


In [29]:
import hashlib

# acmi creator identifiers which wikidata already links to (P7003).
query = '''
    select ?acmi_id ?wikidata_id
    where {
        ?wikidata_id wdt:P7003 ?acmi_id .
        filter(regex(str(?acmi_id), "creators")) .
        } '''

extant_links = sparql_query(query, 'https://query.wikidata.org/sparql').drop_duplicates()

# acmi creators, keyed in the same "creators/<id>" form wikidata uses.
acmi_creators = (
    pandas.read_csv(
        pathlib.Path.cwd().parents[0] / 'acmi-api' / 'app' / 'tsv' / 'creators.tsv',
        delimiter='\t',
        low_memory=False,
    )[['id', 'name']]
    .rename(columns={'id':'acmi_id', 'name':'acmi_label'})
    .assign(acmi_id=lambda frame: 'creators/'+frame['acmi_id'].astype(str))
)

# keep only creators without an existing wikidata link, with upper-cased labels.
acmi_creators = acmi_creators.loc[~acmi_creators['acmi_id'].isin(list(extant_links.acmi_id))]
acmi_creators = acmi_creators.assign(acmi_label=acmi_creators['acmi_label'].str.upper())

# the ideal here is to add progressively more checks

def wikidata_titles(wikidata):

    ''' Return the unique labels of works crediting a wikidata entity, or None when there are none. '''

    query_head = '''
    select distinct ?workLabel 
    where {
        values ?creator {wd:'''
    query_tail = '''}
        {?work wdt:P57 ?creator . } union
        {?work wdt:P58 ?creator . } union
        {?work wdt:P161 ?creator . } union
        {?work wdt:P272 ?creator . } union
        {?work wdt:P344 ?creator . } union
        {?work wdt:P1040 ?creator . } union
        {?work wdt:P2515 ?creator . } union
        {?work wdt:P2554 ?creator . } 
        service wikibase:label { bd:serviceParam wikibase:language "[AUTO_LANGUAGE],en" }
    } '''

    works = sparql_query(query_head+wikidata+query_tail, 'https://query.wikidata.org/sparql')
    if not len(works):
        return None

    return works.workLabel.unique()

def wikipedia_page(wikidata):

    ''' Return the unique english wikipedia article urls about a wikidata entity, or None when there are none. '''

    query_head = '''
    select distinct ?article
    where { 
        values ?creator {wd:'''
    query_tail = '''} .
        ?article schema:about ?creator .
        ?article schema:isPartOf <https://en.wikipedia.org/>  .
    } '''

    articles = sparql_query(query_head+wikidata+query_tail, 'https://query.wikidata.org/sparql')
    if not len(articles):
        return None

    return articles.article.unique()

def imdb_page(wikidata):

    ''' Return the unique imdb identifiers (P345) recorded for a wikidata entity, or None when there are none. '''

    query_head = '''
    select distinct ?imdb
    where { 
        values ?creator {wd:'''
    query_tail = '''} .
        ?creator wdt:P345 ?imdb .
    } '''

    identifiers = sparql_query(query_head+wikidata+query_tail, 'https://query.wikidata.org/sparql')
    if not len(identifiers):
        return None

    return identifiers.imdb.unique()


def _candidate_shares_title(wikidata_id, acmi_titles):

    ''' Test whether a wikidata candidate shares a title with an acmi filmography.

    wikidata_id is a bare wikidata identifier and acmi_titles is a set of
    non-empty, upper-cased acmi titles. Three sources are tried in turn and
    the first hit wins:

      1. labels of works crediting the candidate on wikidata,
      2. imdb titles for the candidate, when exactly one imdb id is recorded,
      3. the text of the candidate's english wikipedia article, when exactly
         one article is recorded.

    Returns True on the first shared title, otherwise False. '''

    wiki_titles = wikidata_titles(wikidata_id)
    if wiki_titles is not None:
        if acmi_titles & {a.upper() for a in wiki_titles}:
            return True

    imdb_page_result = imdb_page(wikidata_id)
    if imdb_page_result is not None and len(imdb_page_result) == 1:
        imdb_data_candidate = imdb_data.loc[imdb_data.nconst.isin([imdb_page_result[0]])]
        if acmi_titles & set(imdb_data_candidate.title.unique()):
            return True

    # "is not None" rather than truthiness: the result is a numpy array, and
    # the truth value of an array with several elements raises ValueError.
    wikipedia_page_result = wikipedia_page(wikidata_id)
    if wikipedia_page_result is not None and len(wikipedia_page_result) == 1:
        # a timeout stops one stalled connection from hanging the whole run.
        r = requests.get(wikipedia_page_result[0], timeout=120)
        if r.status_code == 200:
            # NOTE(review): this is a substring test on the page, so very
            # short titles can match by accident.
            page_text = r.text.upper()
            if any(title in page_text for title in acmi_titles):
                return True
        else:
            print('connection error')

    return False


# loop-invariant lookups, built once instead of once per creator.
wikidata_labels = wikidata_dataframe.creatorLabel.dropna().unique()
titles_by_creator = acmi_works.groupby('creator_id')['title'].apply(set).to_dict()

for creator in tqdm.tqdm(acmi_creators.to_dict('records')):

    #creator = {'acmi_id': 'creators/72103', 'acmi_label':'Steven Spielberg'}

    # one result file per creator, so an interrupted run resumes where it stopped.
    hash_id = hashlib.md5(creator['acmi_id'].encode()).hexdigest()
    hash_path = pathlib.Path.cwd().parents[0] / 'data' / 'creator_match' / f'{hash_id}.txt'

    if hash_path.exists():
        continue

    # empty titles are dropped: '' is a substring of every wikipedia page and
    # would otherwise match the first candidate that has an article.
    acmi_filmography = {
        title for title in titles_by_creator.get(creator['acmi_id'], set())
        if isinstance(title, str) and title
    }

    # a missing name is read as NaN, which has no .upper(); treat it as having
    # no candidates.
    candidates = []
    acmi_label = creator['acmi_label']
    if isinstance(acmi_label, str) and acmi_label.strip():
        c = process.extract(acmi_label.upper(), wikidata_labels, scorer=fuzz.WRatio, limit=10)
        candidates = [x[0] for x in c if x[1] > 80]

    match = ''

    # without any titles no check can succeed, so the network calls are skipped.
    if candidates and acmi_filmography:

        candidate_dataframe = wikidata_dataframe.loc[wikidata_dataframe.creatorLabel.isin(candidates)]

        for wikidata_candidate in candidate_dataframe.creator.unique():
            wikidata_id = wikidata_candidate.split('/')[-1]
            if _candidate_shares_title(wikidata_id, acmi_filmography):
                match = wikidata_candidate
                break

    # written even when nothing matched, so the creator is not retried.
    with open(hash_path, 'w') as export:
        export.write(f"{creator['acmi_id']} {match}")

 17%|█▋        | 2374/13658 [1:17:56<178:37:05, 56.99s/it]

In [28]:

from rapidfuzz import process, fuzz
import hashlib
import numpy
import pandas
import pathlib
import pydash
import requests
import time
import tqdm
import unidecode
from wikibaseintegrator import WikibaseIntegrator, wbi_login, datatypes
from wikibaseintegrator.models import Claims, Qualifiers, References, Reference
from wikibaseintegrator.wbi_config import config
from wikibaseintegrator.wbi_enums import ActionIfExists
import json


def value_extract(row, column):

    ''' Pull the "value" entry out of the binding stored in one cell of a row. '''

    cell = row[column]
    extracted = pydash.get(cell, 'value')

    return extracted

def sparql_query(query, service):

    ''' Send sparql request, and formulate results into a dataframe.

    query is the sparql text and service is the endpoint url. Every binding
    in the response becomes one row, and each cell is reduced to the "value"
    entry of its binding.

    Raises requests.HTTPError if the endpoint answers with an error status
    (for example when the request is throttled), instead of failing later
    while trying to decode an error page as JSON. '''

    response = requests.get(service, params={'format': 'json', 'query': query}, timeout=120)

    # fail loudly on 4xx/5xx so a rejected request is not mistaken for bad JSON.
    response.raise_for_status()

    results = pydash.get(response.json(), 'results.bindings')
    df = pandas.DataFrame.from_dict(results)
    for column in df.columns:
        df[column] = df.apply(value_extract, column=column, axis=1)

    return df

# gather the per-creator result fragments written by the matching loop.
# the texts are collected in a list and the frame is built once (appending
# with report.loc[len(report)] reallocates on every row), and the files are
# read in sorted order so repeated runs give the same row order.
frag_path = pathlib.Path.cwd().parents[0] / 'data' / 'creator_match' 
fragment_texts = []
for x in sorted(x for x in frag_path.iterdir() if x.suffix == '.txt'):
    with open(x) as fragment:
        fragment_texts.append(fragment.read())

report = pandas.DataFrame({'text': fragment_texts}, dtype='object')

# fragments without a wikidata url are the creators for which no match was found.
# .copy() makes the filtered frame independent before columns are assigned to it.
report = report.loc[~report.text.str.contains('wiki', na=False)].copy()
report['text'] = report['text'].str.split(' ').str[0]
report = report.rename(columns={'text':'acmi_link'})
report['acmi_id'] = report['acmi_link'].str.split('/').str[1].str.strip()

acmi_roles = pandas.read_csv(pathlib.Path.cwd().parents[0] / 'data' / 'creator_roles' / 'creator_roles.csv', low_memory=False)
acmi_roles = acmi_roles[['acmi_creator_id', 'acmi_creator_name', 'wikidata_type']].copy()
acmi_roles['acmi_creator_id'] = acmi_roles['acmi_creator_id'].astype(str)
acmi_roles = acmi_roles.rename(columns={'acmi_creator_id':'acmi_id'})

report = pandas.merge(report, acmi_roles, on='acmi_id', how='left')

# creators lacking a name or type are dropped; one row is kept per creator.
report = report.dropna().drop_duplicates(subset='acmi_link', keep='first')

# exclude creators which have gained a wikidata link (P7003) in the meantime.
query = '''
    select distinct ?wd ?acmi
    where {?wd wdt:P7003 ?acmi} '''

extant_acmi = sparql_query(query, 'https://query.wikidata.org/sparql').drop_duplicates()
report = report.loc[~report.acmi_link.isin(list(extant_acmi.acmi))]


# with open(pathlib.Path.home() / 'wikidata_login.json') as wd_cred:
#     wd_cred = json.load(wd_cred)

# config['USER_AGENT'] = 'acmi-notebooks (https://github.com/paulduchesne/acmi-notebooks)'
# login_wikidata = wbi_login.Login(user=wd_cred['username'], password=wd_cred['password'], mediawiki_api_url='https://www.wikidata.org/w/api.php')
# wbi = WikibaseIntegrator(login=login_wikidata)


# for x in tqdm.tqdm(report.to_dict('records')):


#     acmi_ref = References()
#     ref = Reference()
#     ref.add(datatypes.URL(prop_nr='P854', value=f"https://www.acmi.net.au/{str(x['acmi_link'])}"))
#     acmi_ref.add(ref)

#     new_creator = wbi.item.new()
#     new_creator.labels.set('en', x['acmi_creator_name'])

#     claim = datatypes.Item(prop_nr='P31', value=str(x['wikidata_type']), references=acmi_ref)    
#     new_creator.claims.add(claim, action_if_exists=ActionIfExists.APPEND_OR_REPLACE)

#     claim = datatypes.ExternalID(prop_nr='P7003', value=str(x['acmi_link']), references=acmi_ref)    
#     new_creator.claims.add(claim, action_if_exists=ActionIfExists.APPEND_OR_REPLACE)




#     # this is the command which actually makes the write to wikidata.
#     r = new_creator.write()

#    # print(new_creator)
#    # print(r)


print(len(report))
report.head()

* main: Subscribe to the mediawiki-api-announce mailing list at <https://lists.wikimedia.org/postorius/lists/mediawiki-api-announce.lists.wikimedia.org/> for notice of API deprecations and breaking changes. Use [[Special:ApiFeatureUsage]] to see usage of deprecated features by your application.
100%|██████████| 266/266 [03:56<00:00,  1.12it/s]

266





Unnamed: 0,acmi_link,acmi_id,acmi_creator_name,wikidata_type
122,creators/79205,79205,Allan Martel,Q5
123,creators/80479,80479,Virginia Moncrieff,Q5
169,creators/85174,85174,Gwen McCrorey,Q5
328,creators/31546,31546,Heus-Stept Productions,Q11396960
331,creators/78448,78448,Shaun Farrington,Q11396960


In [25]:
# gather the per-creator result fragments written by the matching loop.
# the texts are collected in a list and the frame is built once (appending
# with .loc[len(...)] reallocates on every row), and the files are read in
# sorted order so repeated runs give the same row order.
frag_path = pathlib.Path.cwd().parents[0] / 'data' / 'creator_match' 
fragment_texts = []
for x in sorted(x for x in frag_path.iterdir() if x.suffix == '.txt'):
    with open(x) as fragment:
        fragment_texts.append(fragment.read())

match_candidates = pandas.DataFrame({'text': fragment_texts}, dtype='object')

# fragments containing a wikidata url are the proposed matches.
# .copy() makes the filtered frame independent before columns are assigned to it.
match_candidates = match_candidates.loc[match_candidates.text.str.contains('wiki', na=False)].copy()

# each fragment reads "<acmi id> <wikidata url>".
fragment_parts = match_candidates['text'].str.split(' ')
match_candidates['acmi'] = fragment_parts.str[0]
match_candidates['wikidata'] = fragment_parts.str[1]
match_candidates['wikidata'] = match_candidates['wikidata'].str.split('/').str[-1]
match_candidates['acmi_link'] = 'https://www.acmi.net.au/'+match_candidates['acmi']

# exclude creators which have gained a wikidata link (P7003) in the meantime.
query = '''
    select distinct ?wd ?acmi
    where {?wd wdt:P7003 ?acmi} '''

extant_acmi = sparql_query(query, 'https://query.wikidata.org/sparql').drop_duplicates()
match_candidates = match_candidates.loc[~match_candidates.acmi.isin(list(extant_acmi.acmi))]

# exported for manual review.
match_candidates.to_csv(pathlib.Path.home() / 'Desktop' / 'acmi_manual.csv', index=False)
print(len(match_candidates))
match_candidates.head()

582


Unnamed: 0,text,acmi,wikidata,acmi_link
72,creators/83370 http://www.wikidata.org/entity/...,creators/83370,Q279413,https://www.acmi.net.au/creators/83370
76,creators/78769 http://www.wikidata.org/entity/...,creators/78769,Q152239,https://www.acmi.net.au/creators/78769
78,creators/73620 http://www.wikidata.org/entity/...,creators/73620,Q189022,https://www.acmi.net.au/creators/73620
97,creators/82457 http://www.wikidata.org/entity/...,creators/82457,Q3566046,https://www.acmi.net.au/creators/82457
98,creators/82730 http://www.wikidata.org/entity/...,creators/82730,Q24288349,https://www.acmi.net.au/creators/82730
