In [15]:
# Find ACMI creators credited as directors that have no Wikidata match yet, and export them to CSV.

import json
import numpy
import pandas
import pathlib
import pydash
import requests
import tqdm

def value_extract(row, column):

    ''' Return the 'value' entry of the binding held in one cell of a row. '''

    # Each cell is expected to hold a SPARQL binding mapping; pydash.get
    # looks up the 'value' key on whatever the cell contains.
    cell = row[column]
    extracted = pydash.get(cell, 'value')

    return extracted

def sparql_query(query, service):

    ''' Send sparql request, and formulate results into a dataframe.

    Parameters:
        query: SPARQL query text, sent as the 'query' request parameter.
        service: URL of the SPARQL endpoint.

    Returns:
        A pandas DataFrame with one column per bound variable, each cell
        holding the 'value' of the corresponding binding. When the query
        matches nothing, an empty DataFrame is returned whose columns are
        the variables listed in the response header (if any).

    Raises:
        requests.HTTPError: if the endpoint answers with an error status.
    '''

    response = requests.get(service, params={'format': 'json', 'query': query}, timeout=120)
    # Surface HTTP failures (rate limiting, server-side timeouts) directly
    # instead of letting them show up as a confusing JSON decoding error.
    response.raise_for_status()

    payload = response.json()
    results = pydash.get(payload, 'results.bindings')
    if not results:
        # Keep the selected variables as columns so callers can still
        # reference them by name on an empty result.
        return pandas.DataFrame(columns=pydash.get(payload, 'head.vars') or [])

    data_frame = pandas.DataFrame.from_dict(results)
    for column in data_frame.columns:
        # Each cell is a binding mapping; reduce it to its plain value.
        data_frame[column] = data_frame.apply(value_extract, column=column, axis=1)

    return data_frame

# ACMI API work records live in a sibling checkout of the 'acmi-api' repository.
acmi_path = pathlib.Path.cwd().parents[0] / 'acmi-api' / 'app' / 'json' / 'works'
# iterdir() yields entries in arbitrary order; sort so the truncated sample
# below is the same on every run.
acmi_files = sorted(filename for filename in acmi_path.iterdir() if filename.suffix == '.json' and 'index' not in filename.name)

# Number of work files to scan (kept small for quick iteration).
max_files = 100

# Collect one record per director credit, then build the frame once instead
# of growing it row by row.
director_rows = []

for acmi_file in tqdm.tqdm(acmi_files[:max_files]):

    with open(acmi_file, encoding='utf-8') as acmi_handle:
        acmi_data = json.load(acmi_handle)

    for creator in acmi_data.get('creators_primary') or []:
        if creator.get('role') == 'director':
            director_rows.append({'acmi_id': creator['creator_id'], 'acmi_name': creator['name']})

# A director credited on several works would otherwise be listed several times.
directors = pandas.DataFrame(director_rows, columns=['acmi_id', 'acmi_name']).drop_duplicates()

# Every Wikidata item carrying a P7003 value that mentions "creators".
query = '''
    select ?acmi_id ?wikidata_id
    where {
        ?wikidata_id wdt:P7003 ?acmi_id . 
        filter(regex(str(?acmi_id), "creators")) .
    } '''

wikidata_creators = sparql_query(query, 'https://query.wikidata.org/sparql').drop_duplicates()

# Wikidata stores the identifier with a 'creators/' prefix, so match that form.
directors['acmi_id'] = 'creators/' + directors['acmi_id'].astype(str)
# Keep only directors Wikidata does not know about yet; copy so the column
# assignment below works on an independent frame rather than a slice.
directors = directors.loc[~directors['acmi_id'].isin(list(wikidata_creators['acmi_id']))].copy()

# Constant statement attached to every exported row
# (presumably occupation = film director on Wikidata; verify before upload).
directors['P106'] = 'Q2526255'

directors.to_csv(pathlib.Path.home() / 'acmi_directors.csv', index=False)

print(len(directors))
# sample() raises if asked for more rows than exist, so cap the request.
directors.sample(min(10, len(directors)))

100%|██████████| 100/100 [00:00<00:00, 606.30it/s]


26


Unnamed: 0,acmi_id,acmi_name,P106
39,creators/22687,Harry Ratner,Q2526255
8,creators/75131,Bruce Mackay,Q2526255
33,creators/14917,Manu Simon,Q2526255
2,creators/34223,Janice Sutherland,Q2526255
10,creators/15810,Don Saunders,Q2526255
30,creators/12539,Nandor Jenes,Q2526255
7,creators/74639,Karl McPhee,Q2526255
6,creators/83924,Kanlaya Padungsin,Q2526255
0,creators/83676,Jessica Hobbs,Q2526255
29,creators/14798,Robert Churchill,Q2526255


In [8]:
# # find ACMI creators and attempt matching titles with Wikipedia full-text.

# import hashlib
# import json
# import numpy
# import pandas
# import pathlib
# import pydash
# import requests
# import time
# import tqdm

# def value_extract(row, column):

#     ''' Extract dictionary values. '''
    
#     return pydash.get(row[column], 'value')

# def sparql_query(query, service):

#     ''' Send sparql request, and formulate results into a dataframe. '''

#     response = requests.get(service, params={'format': 'json', 'query': query}, timeout=120)
#     results = pydash.get(response.json(), 'results.bindings')
#     data_frame = pandas.DataFrame.from_dict(results)
#     for column in data_frame.columns:
#         data_frame[column] = data_frame.apply(value_extract, column=column, axis=1)
    
#     return data_frame


# # okay, what you want to do is focus on one creator and make a call on whether they are already matched,
# # assess the middle-ground cases manually, and otherwise auto-create


# query = '''
#     select ?acmi_id ?wikidata_id
#     where {
#         ?wikidata_id wdt:P7003 ?acmi_id . 
#         filter(regex(str(?acmi_id), "creators")) .
#     } '''

# wikidata_creators = sparql_query(query, 'https://query.wikidata.org/sparql').drop_duplicates()

# acmi_creators = pandas.read_csv(pathlib.Path.cwd().parents[0] / 'acmi-api' / 'app' / 'tsv' / 'creators.tsv', delimiter='\t', low_memory=False)
# acmi_creators = acmi_creators[['id', 'name']].rename(columns={'id':'acmi_id', 'name':'acmi_label'})
# acmi_creators['acmi_id'] = 'creators/'+acmi_creators['acmi_id'].astype(str)
# acmi_creators = acmi_creators.loc[~acmi_creators.acmi_id.isin(list(wikidata_creators.acmi_id))]







# # acmi_creators = pandas.read_csv(pathlib.Path.cwd().parents[0] / 'acmi-api' / 'app' / 'tsv' / 'creators.tsv', delimiter='\t', low_memory=False)
# # acmi_creators = acmi_creators.loc[acmi_creators.id.isin(directors)]
# # acmi_creators['id'] = 'creators/'+acmi_creators['id'].astype(str)

# # query = '''
# #     select ?id ?wikidata
# #     where {
# #         ?wikidata wdt:P7003 ?id .
# #         filter(regex(str(?id), "creators")) .
# #         } '''

# # wikidata = sparql_query(query, 'https://query.wikidata.org/sparql').drop_duplicates()
# # acmi_creators = pandas.merge(acmi_creators, wikidata, on='id', how='left').fillna('')
# # acmi_creators['wikidata'] = acmi_creators['wikidata'].str.split('/').str[-1]
# # acmi_creators = acmi_creators[['id', 'name', 'wikidata']]



# acmi_creators['P27'] = 'Q408'
# acmi_creators['P106'] = 'Q2526255'




# acmi_creators.to_csv(pathlib.Path.home() / 'acmicreators.csv', index=False)
# print(len(acmi_creators))
# acmi_creators.sample(10)

17616


Unnamed: 0,acmi_id,acmi_label,P27,P106
1265,creators/80398,Mary Montiforte,Q408,Q2526255
4734,creators/76901,Stanley S. Canter,Q408,Q2526255
16082,creators/79115,Michael Petin,Q408,Q2526255
13971,creators/86071,Sega,Q408,Q2526255
16050,creators/83901,Jeremy Somerville,Q408,Q2526255
3174,creators/80953,Jeff Canin,Q408,Q2526255
14315,creators/30226,Trilogy Entertainment Group,Q408,Q2526255
16872,creators/24047,Impressa Visual Productions,Q408,Q2526255
15945,creators/73309,Cinevex Film Laboratories,Q408,Q2526255
13441,creators/19779,Louise Jonas,Q408,Q2526255


In [20]:
# test = acmi_creators.copy()
# test = test.loc[test.wikidata.str.contains('Q', na=False)]
# print(len(test))
# test.head()

1346


Unnamed: 0,id,name,wikidata
2,creators/37128,Jeffrey Blitz,Q6175864
5,creators/30169,Mick Molloy,Q6838341
8,creators/79562,Andrew Fleming,Q504706
10,creators/83073,Gerard Blain,Q710654
15,creators/10227,George Whaley,Q25183124


In [8]:
# acmi_works = pandas.read_csv(pathlib.Path.cwd().parents[0] / 'acmi-api' / 'app' / 'tsv' / 'works.tsv', delimiter='\t', low_memory=False)
# acmi_works = pandas.concat([
#     acmi_works[['id', 'title', 'creators_primary']].rename(columns={'creators_primary':'creator_id'}),
#     acmi_works[['id', 'title', 'creators_other']].rename(columns={'creators_other':'creator_id'})
# ])

# acmi_works['creator_id'] = acmi_works['creator_id'].str.split(',')
# acmi_works = acmi_works.explode('creator_id')
# acmi_works['creator_id'] = acmi_works['creator_id'].str.strip()

# acmi_works = acmi_works.loc[~acmi_works.creator_id.isin([numpy.nan, None])]
# acmi_works['id'] = 'works/'+acmi_works['id'].astype(str)
# acmi_works['creator_id'] = 'creators/'+acmi_works['creator_id'].astype(str)

# query = '''
#     select ?id ?wikidata
#     where {
#         ?wikidata wdt:P7003 ?id .
#         filter(regex(str(?id), "works")) .
#         } '''

# wikidata = sparql_query(query, 'https://query.wikidata.org/sparql').drop_duplicates()
# acmi_works = pandas.merge(acmi_works, wikidata, on='id', how='left').fillna('')
# acmi_works['wikidata'] = acmi_works['wikidata'].str.split('/').str[-1]

# print(acmi_works.columns.values)



# print(len(acmi_works)) # 152179

# acmi_works.sample(10)


['id' 'title' 'creator_id' 'wikidata']
119891


Unnamed: 0,id,title,creator_id,wikidata
61561,works/80360,"Ambush at Masai Mara and Leopard, a Darkness i...",creators/74303,
34199,works/94202,Calle 54,creators/31624,Q2630071
7648,works/94121,Mortal thoughts,creators/36021,Q389791
102847,works/116996,The Lorne Theatre,creators/73899,
53159,works/77040,Italy: a Venetian menu,creators/19478,
40364,works/88629,Illegal abortion,creators/66965,
76098,works/67356,Faces in the sun,creators/12209,
83838,works/115822,Those unforgettable school days = 难忘中学时光,creators/82798,
28358,works/116797,The Whistling Eagle,creators/83550,
14050,works/69437,Knowing to learn,creators/74567,


In [17]:
# acmi_data = acmi_creators.copy()

# acmi_object = list()

# for x in acmi_data.to_dict('records'):
#     filmography = acmi_works.copy()
#     filmography = filmography.loc[filmography.creator_id.isin([x['id']])]
#     if len(filmography):
#         x['filmography'] = filmography.to_dict('records')
#         acmi_object.append(x)

# print(len(acmi_object))

# # dataframe.he\ad()

3503


In [37]:
# # # okay directors with no wikidata

# # # print(acmi_data[:4])




# # query = '''
# #     select distinct ?director ?directorLabel ?film ?filmLabel
# #     where {
# #         ?film wdt:P57 ?director .
# #         ?director wdt:P27 wd:Q408 .
# #     service wikibase:label { bd:serviceParam wikibase:language "[AUTO_LANGUAGE],en". }
# #         } '''

# # wikidata_director = sparql_query(query, 'https://query.wikidata.org/sparql').drop_duplicates()




# creators_without_wikidata = [x for x in acmi_object if 'Q' not in x['wikidata']]

# print(len(creators_without_wikidata))

# formatted = list()

# for x in creators_without_wikidata:
#     # print(x)


#     filmography = [{'film_id':f['id'],'film_label':f['title'] } for f in x['filmography']]
#     formatted.append({'agent_id':x['id'], 'agent_label':x['name'], 'filmography':filmography})


# # print(json.dumps(formatted, indent=4, ensure_ascii=False))

# with open(pathlib.Path.home() / 'acmi_filmcreators.json', 'w') as thing:
#     json.dump(formatted, thing, indent=4, ensure_ascii=False)


# # [
# #     {
# #         "agent_id": "agent_001",
# #         "agent_label": "Jackie Weaver",
# #         "filmography": [
# #             {
# #                 "film_id": "film_001",
# #                 "film_label": "Picnic at Hanging Rock"
# #             },
# #             {
# #                 "film_id": "film_002",
# #                 "film_label": "The Removalists"
# #             },
# #             {
# #                 "film_id": "film_003",
# #                 "film_label": "Animal Kingdom"
# #             }
# #         ]
# #     }
# # ]

    
# #     if x['name'] in wikidata_director.directorLabel.unique():
        
# #         for y in wikidata_director


# #         print(x)


# # print(len(wikidata_director))
# # wikidata_director.head()


2157


In [4]:
# make a JSON with director, id and filmography of works as json / title, ids

# you are then looking for candidates to try and make match on creator primarily

# second level, if connection is made can you find likely or unlikely works
# eg Kay Roberts


# what you are doing is basic filmography matching, but wound down to one