In [2]:
import hashlib
import json
import numpy
import pandas
import pathlib
import pydash
import requests
import time
import tqdm

def value_extract(row, column):

    ''' Return the 'value' entry of whatever is stored at row[column]. '''

    cell = row[column]
    return pydash.get(cell, 'value')

def sparql_query(query, service):

    ''' Send sparql request, and formulate results into a dataframe.

    query -- SPARQL query text, sent as the 'query' URL parameter.
    service -- URL of the SPARQL endpoint that receives the GET request.

    Returns a pandas DataFrame built from the 'results.bindings' list of the
    JSON response, with every cell reduced to its 'value' entry.

    Raises requests.HTTPError when the endpoint answers with an error status.
    '''

    response = requests.get(service, params={'format': 'json', 'query': query}, timeout=120)
    # Fail here on throttling or server errors instead of letting them surface
    # later as an opaque JSON decode error or an unexpectedly empty dataframe.
    response.raise_for_status()
    results = pydash.get(response.json(), 'results.bindings')
    data_frame = pandas.DataFrame.from_dict(results)
    for column in data_frame.columns:
        # Replace each cell of the column with its 'value' entry.
        data_frame[column] = data_frame.apply(value_extract, column=column, axis=1)

    return data_frame

# Select items whose wdt:P7003 value (bound as ?acmi) matches "creator",
# together with the English Wikipedia article that is about each item.
query = '''
    select ?acmi ?wikidata ?wikipedia
    where {
        ?wikidata wdt:P7003 ?acmi .
        filter(regex(str(?acmi), "creator")) .
        ?wikipedia schema:about ?wikidata .
        ?wikipedia schema:isPartOf <https://en.wikipedia.org/> .
        } '''

# One row per result, with columns acmi / wikidata / wikipedia.
dataframe = sparql_query(query=query, service='https://query.wikidata.org/sparql')

# Download the Wikipedia summary extract for every record that is not cached yet.
# The cache directory is the same for all records, so build and create it once.
summary_directory = pathlib.Path.cwd().parents[0] / 'data' / 'creator_summary_extract'
summary_directory.mkdir(exist_ok=True, parents=True)

for x in tqdm.tqdm(dataframe.to_dict('records')):
    # Cache files are named after the MD5 hex digest of the article URL.
    link_hash = hashlib.md5(x['wikipedia'].encode()).hexdigest()
    summary_path = summary_directory / f'{link_hash}.txt'
    if summary_path.exists():
        continue

    # Pause before each request so the API is not hit in rapid succession.
    time.sleep(2)

    # Use everything after '/wiki/' as the title: taking only the last path
    # segment would truncate titles that contain '/'. The summary endpoint
    # expects the title as one path segment, so slashes are sent as %2F.
    page_title = x['wikipedia'].split('/wiki/', 1)[-1].replace('/', '%2F')
    wikipedia_query = requests.get(
        f"https://en.wikipedia.org/api/rest_v1/page/summary/{page_title}", timeout=120)
    if not wikipedia_query.ok:
        # Leave the record uncached; later steps only use cache files that exist.
        tqdm.tqdm.write(f"no summary for {x['wikipedia']}: HTTP {wikipedia_query.status_code}")
        continue

    wikipedia_query = json.loads(wikipedia_query.text)
    extract = wikipedia_query.get('extract')
    if extract is None:
        tqdm.tqdm.write(f"no summary for {x['wikipedia']}: response has no 'extract'")
        continue

    with open(summary_path, 'w') as export:
        export.write(extract)

# Collect every cached summary as a [wikipedia URL, summary text] row.
# Rows are gathered in a list and the dataframe is built once at the end,
# because enlarging a dataframe one .loc row at a time gets slower as it grows.
summary_directory = pathlib.Path.cwd().parents[0] / 'data' / 'creator_summary_extract'
summary_rows = []
for x in tqdm.tqdm(dataframe.to_dict('records')):
    # Same naming scheme as the cache files: MD5 hex digest of the article URL.
    link_hash = hashlib.md5(x['wikipedia'].encode()).hexdigest()
    summary_path = summary_directory / f'{link_hash}.txt'
    if summary_path.exists():
        with open(summary_path) as summary_file:
            summary_rows.append([x['wikipedia'], summary_file.read()])

summary_dataframe = pandas.DataFrame(summary_rows, columns=['wikipedia', 'wikipedia_summary'])

# Attach the summaries to the query results. The left join keeps records
# that have no summary, and exact duplicate rows are dropped afterwards.
dataframe = pandas.merge(dataframe, summary_dataframe, on='wikipedia', how='left').drop_duplicates()

# ensure_ascii=False writes non-ASCII characters as-is, so open the file as
# UTF-8 explicitly instead of relying on the platform's default encoding.
export_path = pathlib.Path.home() / 'Desktop' / 'creator_summary_extract.json'
with open(export_path, 'w', encoding='utf-8') as export:
    json.dump(dataframe.to_dict('records'), export, indent=4, ensure_ascii=False)

print(len(dataframe))
dataframe.head()

100%|██████████| 3991/3991 [00:00<00:00, 11595.13it/s]
100%|██████████| 3991/3991 [00:03<00:00, 1069.43it/s]


3991


Unnamed: 0,wikidata,acmi,wikipedia,wikipedia_summary
0,http://www.wikidata.org/entity/Q2071,creators/4092,https://en.wikipedia.org/wiki/David_Lynch,"David Keith Lynch is an American filmmaker, vi..."
1,http://www.wikidata.org/entity/Q5603,creators/63905,https://en.wikipedia.org/wiki/Andy_Warhol,"Andy Warhol was an American visual artist, fil..."
2,http://www.wikidata.org/entity/Q41148,creators/3485,https://en.wikipedia.org/wiki/Martin_Scorsese,Martin Charles Scorsese is an American and Ita...
3,http://www.wikidata.org/entity/Q2001,creators/66943,https://en.wikipedia.org/wiki/Stanley_Kubrick,"Stanley Kubrick was an American film director,..."
4,http://www.wikidata.org/entity/Q19504,creators/62561,https://en.wikipedia.org/wiki/Fritz_Lang,"Friedrich Christian Anton Lang, better known a..."
