In [1]:
# find ACMI director creators and attempt matching titles with Wikipedia full-text.

import json
import numpy
import pandas
import pathlib
import pydash
import requests
import tqdm

acmi_path = pathlib.Path.cwd().parents[0] / 'acmi-api' / 'app' / 'json' / 'works'
acmi_files = [filename for filename in acmi_path.iterdir() if filename.suffix == '.json' and 'index' not in filename.name]

directors = list()

for acmi_file in tqdm.tqdm(acmi_files):

    with open(acmi_file, encoding='utf-8') as acmi_data:
        acmi_data = json.load(acmi_data)

    if 'creators_primary' in acmi_data:
        for y in acmi_data['creators_primary']:
            if 'role' in y:
                if y['role'] == 'director':
                    directors.append(y['creator_id'])

print(len(pydash.uniq(directors)))

acmi_creators = pandas.read_csv(pathlib.Path.cwd().parents[0] / 'acmi-api' / 'app' / 'tsv' / 'creators.tsv', delimiter='\t', low_memory=False)
acmi_creators = acmi_creators.loc[acmi_creators.id.isin(directors)]

print(len(acmi_creators))
acmi_creators.sample(10)

100%|██████████| 42921/42921 [00:51<00:00, 826.36it/s] 


7547
3499


Unnamed: 0,id,name,also_known_as,date_of_birth,date_of_death,places_of_operation,biography,biography_author,date_of_biography,external_links,uuid,source,source_identifier,external_references,date_modified
5935,16047,James Robertson,,,,,,,2023-05-09T14:40:41.515382+10:00,,d3808856-97bd-4381-84ca-ba0e835368c7,Vernon,4105.0,,2023-06-14T16:33:30.576250+10:00
8237,75455,Vincent Vaitiekunas,,,,,,,2023-05-09T16:23:49.717747+10:00,,2ccabd47-6abc-4566-b05a-5da3d95d368a,Vernon,30148.0,,2023-06-14T16:33:30.576250+10:00
4912,80774,Walerian Borowczyk,,,,,,,2023-05-09T19:24:35.173541+10:00,,75c96e22-91b2-4d7b-9539-8be276c9cce2,Vernon,30220.0,,2023-06-14T16:33:30.576250+10:00
12238,75201,Joe Bas,,,,,,,2023-05-09T15:39:56.084941+10:00,,20644e07-c642-412a-b29b-45a74696f79f,Vernon,22022.0,,2023-06-14T16:33:30.576250+10:00
12745,78693,David Geddes,,,,,,,2023-05-09T14:24:28.013373+10:00,,f245ea1f-86d5-47aa-ac48-a9e0c6f34d4f,Vernon,17588.0,,2023-06-14T16:33:30.576250+10:00
12519,82739,Liu Jialiang,,,,,,,2023-05-09T18:45:35.393181+10:00,,e83ad570-8e98-4c7c-bbe2-2758fc1bc158,Vernon,36173.0,,2023-06-14T16:33:30.576250+10:00
1417,23846,Del Jack,,,,,,,2023-05-09T19:13:51.605014+10:00,,2e79604e-b66e-482a-8328-9a3009306da3,Vernon,17936.0,,2023-06-14T16:33:30.576250+10:00
4498,6795,Scott Millwood,,,,,,,2023-05-09T17:38:52.233857+10:00,,b522a921-ff2a-49a3-bd90-671a15cef483,Vernon,110684.0,,2023-06-14T16:33:30.576250+10:00
19892,74676,John King,,,,,,,2023-05-09T19:53:57.945153+10:00,,72615c31-7d7f-429b-8f85-a0bd76085754,Vernon,22352.0,,2023-06-14T16:33:30.576250+10:00
4478,74532,Don Arioli,,,,,,,2023-05-09T14:59:11.990374+10:00,,fd5eab05-225a-4093-9add-a39423447228,Vernon,18185.0,,2023-06-14T16:33:30.576250+10:00


In [5]:
acmi_works = pandas.read_csv(pathlib.Path.cwd().parents[0] / 'acmi-api' / 'app' / 'tsv' / 'works.tsv', delimiter='\t', low_memory=False)
# acmi_creators = acmi_creators.loc[acmi_creators.id.isin(directors)]

acmi_works = pandas.concat([
    acmi_works[['id', 'title', 'creators_primary']].rename(columns={'creators_primary':'creator_id'}),
    acmi_works[['id', 'title', 'creators_other']].rename(columns={'creators_other':'creator_id'})

])

acmi_works['creator_id'] = acmi_works['creator_id'].str.split(',')
acmi_works = acmi_works.explode('creator_id')

acmi_works['creator_id'] = acmi_works['creator_id'].str.strip()

print(acmi_works.columns.values)

print(len(acmi_works))
acmi_works.sample(10)


['id' 'title' 'creator_id']
152179


Unnamed: 0,id,title,creator_id
40720,108054,The Great Australian cultural exchange programme,6606.0
6365,115319,Columbus and the age of discovery,76286.0
37848,94031,A State funeral for Alec Campbell: the last Anzac,11940.0
26546,108296,Postcard from Tunis,73372.0
20569,110267,Time for Rita,10590.0
15801,69063,Introduction to reaction kinetics,
40765,70941,Nobody's victim,16058.0
21559,69032,Introducing the atom,12726.0
17122,94085,Toy story 2 [DVD],75186.0
25987,94052,Finders keepers,11940.0
