In [1]:
# validate acmi data statements against wikidata, starting with birth date.

import json
import pandas
import pathlib
import pydash
import requests
import tqdm

def value_extract(row, column):

    ''' Return the 'value' entry of the binding held in one cell of a row.

    row -- a dataframe row (or any mapping) indexable by column name.
    column -- name of the cell to read from the row. '''

    binding = row[column]
    return pydash.get(binding, 'value')

def sparql_query(query, service):

    ''' Send sparql request, and formulate results into a dataframe.

    query -- sparql query text, sent as the `query` url parameter.
    service -- url of the sparql endpoint to call.

    Returns a dataframe built from the `results.bindings` list of the json
    response, with every cell reduced to its binding's 'value' entry.

    Raises requests.HTTPError when the endpoint answers with an error status. '''

    response = requests.get(service, params={'format': 'json', 'query': query}, timeout=120)

    # fail loudly on 4xx/5xx: otherwise an error body is either parsed as json
    # with no bindings (silently giving an empty dataframe) or dies in .json().
    response.raise_for_status()

    results = pydash.get(response.json(), 'results.bindings')
    df = pandas.DataFrame.from_dict(results)

    # each cell starts out as a binding dictionary; keep only its 'value'.
    for column in df.columns:
        df[column] = df.apply(value_extract, column=column, axis=1)

    return df

# birthdates for synced creators from wikidata.
# the query binds P7003 values to ?acmi_creator, keeps only those whose
# string form contains "creator", and binds P569 values to ?wikidata_birthdate.

query = ''' 
    select ?acmi_creator ?wikidata_birthdate where { 
        ?wd wdt:P7003 ?acmi_creator .
        filter(regex(str(?acmi_creator), "creator"))
        ?wd wdt:P569 ?wikidata_birthdate .
    } '''

# run the query, drop repeated rows, and keep the first ten characters of each date.
wikidata_birth = (
    sparql_query(query, 'https://query.wikidata.org/sparql')
    .drop_duplicates()
    .assign(wikidata_birthdate=lambda frame: frame['wikidata_birthdate'].str[:10])
)

print(len(wikidata_birth))
wikidata_birth.head()

4935


Unnamed: 0,acmi_creator,wikidata_birthdate
0,creators/66943,1928-07-26
1,creators/72561,1899-08-13
2,creators/66876,1915-05-06
3,creators/25121,1920-03-21
4,creators/3886,1934-07-01


In [2]:
# load the local creators export (tab separated) from the sibling acmi-api checkout.
acmi_birth = pandas.read_csv(
    pathlib.Path.cwd().parents[0] / 'acmi-api' / 'app' / 'tsv' / 'creators.tsv',
    delimiter='\t',
)

# prefix ids with 'creators/' so they line up with the acmi_creator values from wikidata.
acmi_birth = acmi_birth.assign(id='creators/' + acmi_birth['id'].astype(str))

# keep identifier and birthdate only, under the names used for the comparison,
# and discard rows where either is missing.
acmi_birth = (
    acmi_birth[['id', 'date_of_birth']]
    .rename(columns={'id': 'acmi_creator', 'date_of_birth': 'acmi_birthdate'})
    .dropna()
)

# only birthdates whose text is exactly ten characters long are retained.
acmi_birth = acmi_birth.loc[acmi_birth['acmi_birthdate'].apply(lambda value: len(str(value)) == 10)]

print(len(acmi_birth))
acmi_birth.head()

1600


Unnamed: 0,acmi_creator,acmi_birthdate
12,creators/76441,1944-03-06
22,creators/78485,1953-06-17
66,creators/79562,1965-12-30
84,creators/83073,1930-10-23
99,creators/26997,1941-01-01


In [3]:
# pair every acmi birthdate with the wikidata birthdate(s) recorded for the same
# creator, then keep only the pairs where the two strings differ.
# NOTE(review): the output includes wikidata values such as 1900-01-01 and
# 1932-01-01, and one creator with two wikidata dates; presumably some of these
# are imprecise or multi-valued statements rather than true conflicts - confirm.
compare = acmi_birth.merge(wikidata_birth, on='acmi_creator', how='inner')
compare = compare[compare['acmi_birthdate'].ne(compare['wikidata_birthdate'])]
print(len(compare))
compare.head()

107


Unnamed: 0,acmi_creator,acmi_birthdate,wikidata_birthdate
2,creators/79562,1965-12-30,1963-03-14
16,creators/24342,1897-04-23,1900-01-01
17,creators/24342,1897-04-23,1900-04-23
19,creators/83123,1953-09-01,1959-09-01
24,creators/76458,1932-08-11,1932-01-01
