In [1]:
from collections import deque, namedtuple
from IPython.core.debugger import set_trace
from IPython.display import display
import pandas as pd
import json
import os

In [2]:
# JSON index that maps each SPARQL query name to the path of its saved result file.
CONFIG = 'last_save.json'
# Keys of that index for the two result sets parsed further down in this notebook.
MAIN = 'philosopher.sparql'
BIRTH = 'birthPlace.sparql'

In [3]:
# Read the index of saved query results (query name -> result file path).
# The bare name on the last line echoes the mapping as the cell output.
with open(CONFIG) as f:
    files = json.loads(f.read())
files

{'work.sparql': '_data/work.sparql_Sun Mar 15 21:35:07 2020.json',
 'school.sparql': '_data/school.sparql_Sun Mar 15 21:35:10 2020.json',
 'philosopher.sparql': '_data/main_philosophers.sparql_Sun Mar 15 21:35:22 2020.json',
 'era.sparql': '_data/era.sparql_Sun Mar 15 21:35:42 2020.json',
 'birthPlace.sparql': '_data/birthPlace.sparql_Sun Mar 15 21:35:49 2020.json',
 'notableIdea.sparql': '_data/notableIdea.sparql_Sun Mar 15 21:35:59 2020.json',
 'mainInterest.sparql': '_data/mainInterest.sparql_Sun Mar 15 21:36:05 2020.json',
 'influenced.sparql': '_data/influenced.sparql_Sun Mar 15 21:36:26 2020.json'}

In [4]:
# Number of result rows kept from each saved query result.
SAMPLE_SIZE = 50

# A truncated query result: `bindings` holds the first SAMPLE_SIZE result rows,
# `head` holds the list of variable names declared by the query.
Sample = namedtuple('Sample', ['bindings', 'head'])
SAMPLES = {}

# `file` is the query name (the key in `files`); `name` is the path of its saved result.
for file, name in files.items():
    # The saved results contain non-ASCII text, so the encoding is named explicitly
    # instead of relying on the platform default.
    # NOTE(review): assumes the result files were written as UTF-8 - confirm.
    with open(name, 'r', encoding='utf-8') as f:
        data = json.load(f)
    SAMPLES[file] = Sample(
        data["results"]["bindings"][:SAMPLE_SIZE],
        data["head"]["vars"],
    )
    # Release the full result set right away; only the slice above is kept.
    del data

In [5]:
def parse_bindings(bindings, head, name):
    """Turn a list of result bindings into a deque of namedtuple rows.

    Parameters
    ----------
    bindings : iterable of dict
        Each dict maps a variable name to a dict that has a ``"value"`` entry.
    head : sequence of str
        Field names of the row type; a field missing from a binding is None.
    name : str
        The text before the first ``.`` becomes the row type's name.

    Returns
    -------
    collections.deque
        One namedtuple per binding, in input order.
    """
    row_type = namedtuple(
        name.partition('.')[0],
        head,
        defaults=[None for _ in head],
    )
    return deque(
        row_type(**{var: cell["value"] for var, cell in binding.items()})
        for binding in bindings
    )

def parse_sample(sample):
    """Parse a sample's bindings into namedtuple rows via ``parse_bindings``.

    The row type is named after the class of ``sample`` itself, not after
    the query the sample came from.
    """
    row_type_name = type(sample).__name__
    return parse_bindings(sample.bindings, sample.head, row_type_name)

# Parse the philosopher sample once; it is the input to parse_main further down.
sample_main = parse_sample(SAMPLES[MAIN])

In [6]:
def pick_not_none(data):
    """Collapse the rows of ``data`` into a single value per column.

    Rows are scanned top to bottom. For every column the value kept so far
    is replaced by the current row's value unless the current value is None
    or the kept value is strictly longer. Scanning stops as soon as every
    column holds a non-None value. Only ``None`` is treated as missing.

    Parameters
    ----------
    data : pandas.DataFrame
        Frame whose rows are merged.

    Returns
    -------
    pandas.Series
        One entry per column of ``data``, indexed by the column names; a
        column with no non-None value stays None.
    """
    def pick(c, n):
        """Choose between the current row's value ``c`` and the kept value ``n``."""
        if c is None:
            return n
        try:
            if len(n) > len(c):
                return n
        except TypeError:
            # Nothing kept yet (n is None) or a value has no length:
            # fall through and take the current value.
            pass
        # Without this return, ties and shorter kept values yielded None and
        # erased a value that had already been picked.
        return c

    picked = [None] * len(data.columns)
    for datum in data.itertuples(index=False):
        picked = [pick(datum[i], picked[i]) for i in range(len(picked))]
        if all(val is not None for val in picked):
            # Every column is filled; later rows are not consulted.
            break
    return pd.Series(picked, index=data.columns)

# Smoke test: each column has exactly one non-None value, so both should be picked.
pick_not_none(pd.DataFrame([['a', None], [None, 'b']], columns=['q', 'w']))

q    a
w    b
dtype: object

In [7]:
def parse_main(parsed):
    """Split the parsed philosopher rows into four tables.

    Parameters
    ----------
    parsed : iterable of namedtuple
        Rows exposing the fields wikiPageID, name, name2, birthName,
        birthDate, deathDate, nationality, abstract and gender.

    Returns
    -------
    tuple of pandas.DataFrame
        ``(df_phil, df_names, df_birthday, df_nationality)``:

        * df_phil - distinct (wikiPageID, abstract, gender) rows.
        * df_names - distinct (wikiPageID, name) pairs gathered from the
          three name columns, keeping names longer than two characters.
        * df_birthday - one row per wikiPageID with birthDate and deathDate
          merged by ``pick_not_none``; rows with neither date are dropped.
        * df_nationality - distinct (wikiPageID, nationality) pairs with
          missing values removed.
    """
    NAME_COLS = ['name', 'name2', 'birthName']
    DATE_COLS = ['birthDate', 'deathDate']
    ID = 'wikiPageID'
    NR_COLS = [ID, 'abstract', 'gender']

    df = pd.DataFrame(parsed)

    # One (id, name) pair per usable name. The isinstance check also skips
    # missing values that pandas may hand back as NaN rather than None.
    names = [
        (datum[0], name)
        for datum in df[[ID, *NAME_COLS]].itertuples(index=False)
        for name in datum[1:]
        if isinstance(name, str) and len(name) > 2
    ]
    df_names = pd.DataFrame(names, columns=[ID, 'name']).drop_duplicates()

    # Select the date columns on the groupby itself, so the result does not
    # depend on whether pandas passes the grouping column to ``apply``.
    df_birthday = (
        df.groupby(ID)[DATE_COLS]
        .apply(pick_not_none)
        .dropna(how='all')
        .reset_index()
    )
    df_nationality = df[[ID, 'nationality']].dropna().drop_duplicates()
    df_phil = df[NR_COLS].drop_duplicates()

    return df_phil, df_names, df_birthday, df_nationality

# Preview the first rows of every table. display() takes several objects at
# once, so no throwaway list of None values is built and echoed as output.
display(*(table.head(5) for table in parse_main(sample_main)))

Unnamed: 0,wikiPageID,abstract,gender
0,1731084,Aaron David Gordon (Hebrew: אהרן דוד גורדון‎‎;...,male
8,2018,"Sir Alfred Jules ""Freddie"" Ayer (/ɛər/; 29 Oct...",male
12,36243692,"A Satyanarayana Shastri (June 2, 1925 – Januar...",male
16,41780272,Aaron Ben-Ze'ev (born 30 July 1949) is an Isra...,male
17,2280663,Abas (Greek: Ἄβας) was an ancient Greek sophis...,male


Unnamed: 0,wikiPageID,name
0,1731084,A. D. Gordon
1,1731084,Aaron David Gordon
8,2018,A. J. Ayer
9,2018,Alfred Jules Ayer
10,2018,Sir A. J. Ayer


Unnamed: 0,wikiPageID,birthDate,deathDate
0,11340731,1855-1-1,1902-1-1
1,1731084,1856-06-09,1922-02-22
2,2018,1910-10-29,1989-06-27
3,20844942,1931-01-09,2012-10-03
4,20990958,1935-09-19,2015-12-08


Unnamed: 0,wikiPageID,nationality
12,36243692,Indian
48,20990958,Indian


[None, None, None, None]

In [8]:
def parse_object_name(object_name):
    """Return the text after the last ``/`` with underscores turned into spaces."""
    tail = object_name.rsplit('/', 1)[-1]
    return tail.replace("_", ' ')

def parse_birth_death(parsed):
    """Build separate birth-place and death-place tables from parsed rows.

    Each table holds wikiPageID, the place column and its companion
    ``*_wikiPageID`` column. Duplicate rows are removed first, then rows
    with any missing value, and finally the place column is run through
    ``parse_object_name``.

    Returns
    -------
    tuple of pandas.DataFrame
        ``(birth_df, death_df)``.
    """
    id_col = 'wikiPageID'
    birth_cols = ['birthPlace', 'birthPlace_wikiPageID']
    death_cols = ['deathPlace', 'deathPlace_wikiPageID']
    frame = pd.DataFrame(parsed)

    def place_table(place_cols):
        # The first entry of place_cols is the column holding the place itself.
        place = place_cols[0]
        table = frame[[id_col] + place_cols].drop_duplicates().dropna()
        return table.assign(**{place: table[place].apply(parse_object_name)})

    return place_table(birth_cols), place_table(death_cols)

# Preview both place tables. display() takes several objects at once, so no
# throwaway list of None values is built and echoed as output.
display(*(table.head() for table in parse_birth_death(parse_sample(SAMPLES[BIRTH]))))

Unnamed: 0,wikiPageID,birthPlace,birthPlace_wikiPageID
0,1731084,Zhytomyr,1025103
2,1731084,Russian Empire,20611504
4,2018,England,9316
6,2018,London,17867
17,9008105,Kadhimiya,695173


Unnamed: 0,wikiPageID,deathPlace,deathPlace_wikiPageID
0,1731084,Degania Alef,1601743
1,1731084,British Mandate of Palestine,27276911
4,2018,England,9316
5,2018,London,17867
8,36243692,Karnataka,16880


[None, None]