In [1]:
import pandas
import pathlib

acmi_data = pandas.read_parquet(pathlib.Path.cwd().parents[0] / 'data' / 'acmi_data.parquet')
acmi_data = acmi_data[['acmi_work_id', 'acmi_creator_id', 'acmi_creator_name']].astype('str')
acmi_data = acmi_data.rename(columns={
    'acmi_work_id':'work_id', 'acmi_creator_id':'creator_id', 'acmi_creator_name':'creator_name'})

acmi_data['work_id'] = acmi_data['work_id'].astype(str)
acmi_data['creator_id'] = acmi_data['creator_id'].astype(str)

print(len(acmi_data))
acmi_data.head()

100182


Unnamed: 0,work_id,creator_id,creator_name
0,116936,83676,Jessica Hobbs
1,116936,33376,Jacquelin Perske
2,116936,76678,Claudia Karvan
3,116936,13073,John Edwards
4,116936,7906,New South Wales Film and Television Office


In [2]:
# replace all wikidata/acmi links available from wikidata.

link_data = pandas.read_parquet(pathlib.Path.cwd().parents[0] / 'data' / 'wikidata_link_data.parquet')

link_data_work = link_data.copy()
link_data_work = link_data_work.loc[link_data_work.wikidata_acmi_id.str.contains('work', na=False)]
link_data_work['work_id'] = link_data_work['wikidata_acmi_id'].str.split('/').str[-1]
work_replace = {x['work_id']:x['wikidata_id'] for x in link_data_work.to_dict('records')}

link_data_creator = link_data.copy()
link_data_creator = link_data_creator.loc[link_data_creator.wikidata_acmi_id.str.contains('creator', na=False)]
link_data_creator['creator_id'] = link_data_creator['wikidata_acmi_id'].str.split('/').str[-1]
creator_replace = {x['creator_id']:x['wikidata_id'] for x in link_data_creator.to_dict('records')}

acmi_data = acmi_data.replace({'work_id':work_replace})
acmi_data = acmi_data.replace({'creator_id':creator_replace})

# find instances where the work is linked, but the creator is not.

acmi_data = acmi_data.loc[acmi_data.work_id.str.contains('Q', na=False)]
acmi_data = acmi_data.loc[~acmi_data.creator_id.str.contains('Q', na=False)]

print(len(acmi_data))
acmi_data.head()

14120


Unnamed: 0,work_id,creator_id,creator_name
124,Q1803756,77685,Eric Heumann
126,Q1803756,35083,Paradise Television
127,Q1803756,35082,French Film Centre
128,Q1803756,35081,ETI
129,Q1803756,35080,Basic Cinematografica


In [3]:
# merge in existing wikidata ids where creator with the same name has worked on linked work.

creator_data = pandas.read_parquet(pathlib.Path.cwd().parents[0] / 'data' / 'wikidata_creator_data.parquet')
creator_data = creator_data[['wikidata_work_id', 'wikidata_creator_id', 'wikidata_creator_name']] 
creator_data = creator_data.rename(columns={'wikidata_work_id':'work_id', 'wikidata_creator_name':'creator_name'})
dataframe = pandas.merge(acmi_data, creator_data, on=['work_id', 'creator_name'], how='inner')
dataframe = dataframe.drop_duplicates()

dataframe = dataframe[['wikidata_creator_id', 'creator_id']].rename(columns={'creator_id':'acmi_creator_id'})
dataframe['acmi_creator_id'] = 'creators/'+dataframe['acmi_creator_id']

creator_match_path = pathlib.Path.cwd().parents[0] / 'data' / 'creator_matching.csv'
dataframe.to_csv(creator_match_path, index=False)

# issue here around duplicate wiki creator_ids

print(len(dataframe))
dataframe.head()

3120


Unnamed: 0,wikidata_creator_id,acmi_creator_id
0,Q446960,creators/66844
1,Q16104830,creators/8745
2,Q507519,creators/55523
3,Q5294199,creators/12797
4,Q4864769,creators/77922
