# Mapping Person with Wikidata

Load the street names from the spaCy output and drop all unnecessary columns.

In [54]:
import pandas as pd
import numpy as np

streetnames = pd.read_csv('../Datapreparation/spacy_out.csv', encoding='UTF-8-SIG', sep=';')
streetnames.head()

Unnamed: 0.1,Unnamed: 0,STR_ESID,STN_LABEL,STN_LABEL_FINAL,SPACY_DE,SPACY_DE_LEMMA,SPACY_DE_ENT,SPACY_FR,SPACY_FR_LEMMA,SPACY_FR_ENT,STN_LANG
0,0,10023770,Wiedenweg,Wieden,Wieden,Wieden,,Wieden,Wieden,,de
1,1,10179192,Wuhrbärgli,Wuhrbärgli,Wuhrbärgli,Wuhrbärgli,,Wuhrbärgli,Wuhrbärgli,,de
2,2,10140563,Emanuelenweg,Emanuelen,Emanuelen,Emanuel,,Emanuelen,emanuelen,,de
3,3,10069457,Löhrweg,Löhr,Löhr,Löhr,,Löhr,Löhr,PER,de
4,4,10096235,Brunngasse,Brunn,Brunn,Brunn,,Brunn,Brunn,,de


In [55]:
streetnamesDe = streetnames[streetnames["STN_LANG"] == "de"]

In [56]:
#streetnamesDe[streetnamesDe["SPACY_DE"] != streetnamesDe["STN_LABEL_FINAL"]]

In [57]:
streetnamesDe = streetnamesDe[["STR_ESID", "STN_LABEL_FINAL"]]

In [58]:
streetnamesDe.head()

Unnamed: 0,STR_ESID,STN_LABEL_FINAL
0,10023770,Wieden
1,10179192,Wuhrbärgli
2,10140563,Emanuelen
3,10069457,Löhr
4,10096235,Brunn


Read the working file in which already queried values are stored, because it is not possible to query all values at once.

In [59]:
# Columns that will receive the values fetched from Wikidata for each street name.
personLabels = [
    'wikiQLabel', 'wikiQ', 'sex', 'birth',
    "death", "placebirth", "placedeath", "image",
]

# Template frame: the existing street-name columns followed by the (still empty) person columns.
templateColumns = [*streetnamesDe.columns, *personLabels]
streetnamesDeTempl = streetnamesDe.reindex(columns=templateColumns)

In [60]:
from os.path import exists
workfile = "mapping wiki person_out.csv"
if not exists(workfile):
    # First run: seed the work file from the empty template and continue with a copy of it.
    streetnamesDeTempl.to_csv(workfile, encoding='UTF-8-SIG', sep=';')
    streetnamesDeWork = streetnamesDeTempl.copy()
else:
    # Later runs: resume from the saved state. The index written by to_csv comes
    # back as the column "Unnamed: 0" and is dropped again.
    savedWork = pd.read_csv(workfile, encoding='UTF-8-SIG', sep=';')
    streetnamesDeWork = savedWork.drop(columns="Unnamed: 0")


In [61]:
streetnamesDeWork.head()

Unnamed: 0,STR_ESID,STN_LABEL_FINAL,wikiQLabel,wikiQ,sex,birth,death,placebirth,placedeath,image
0,10000000,Dorf,,,,,,,,
1,10000001,Ebmatt,,,,,,,,
2,10000002,Erspel,,,,,,,,
3,10000005,Hüebli,,,,,,,,
4,10000006,Kapf,,,,,,,,


In [62]:
streetnamesDeWork = streetnamesDeWork.sort_values(["STR_ESID"])

In [63]:
findLastChecked = streetnamesDeWork.dropna(subset=["wikiQLabel"])

In [64]:
findLastChecked.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 6430 entries, 5 to 124872
Data columns (total 10 columns):
 #   Column           Non-Null Count  Dtype  
---  ------           --------------  -----  
 0   STR_ESID         6430 non-null   int64  
 1   STN_LABEL_FINAL  6430 non-null   object 
 2   wikiQLabel       6430 non-null   object 
 3   wikiQ            6430 non-null   object 
 4   sex              5234 non-null   object 
 5   birth            2729 non-null   object 
 6   death            1656 non-null   object 
 7   placebirth       1956 non-null   object 
 8   placedeath       1337 non-null   object 
 9   image            0 non-null      float64
dtypes: float64(1), int64(1), object(8)
memory usage: 552.6+ KB


In [65]:
findLastChecked = findLastChecked.sort_values(["STR_ESID"], ascending = False)

In [66]:
# 0 means no row has a Wikidata match yet.
lastCheckedESID = 0

# Take STR_ESID from the first row of findLastChecked (the rows that already have a wikiQLabel).
if findLastChecked.shape[0] > 0:
    firstRowLabel = findLastChecked.index[0]
    lastCheckedESID = findLastChecked.loc[firstRowLabel, 'STR_ESID']

print("Last Check ESDI:", lastCheckedESID)

Last Check ESDI: 10258652


---
## Initialising the Wikidata query

In [67]:
from SPARQLWrapper import SPARQLWrapper, JSON

wdUrl = "https://query.wikidata.org/sparql"
sparql = SPARQLWrapper(wdUrl)

In [68]:
import time, sys
from IPython.display import clear_output

def update_progress(progress):
    """Redraw a 20-character text progress bar in the cell output.

    ``progress`` is treated as a fraction: an int is converted to float, any
    value that is neither int nor float counts as 0, and the result is clamped
    to the range 0..1. The current cell output is cleared before the bar is
    printed, so repeated calls overwrite each other.
    """
    bar_length = 20

    if isinstance(progress, int):
        fraction = float(progress)
    elif isinstance(progress, float):
        fraction = progress
    else:
        fraction = 0
    # Clamp to [0, 1].
    fraction = min(max(fraction, 0), 1)

    filled = int(round(bar_length * fraction))
    bar = "#" * filled + "-" * (bar_length - filled)

    clear_output(wait = True)
    print("Progress: [{0}] {1:.1f}%".format(bar, fraction * 100))

In [69]:
def queryPeopleName(sparql, subject):
    """Configure ``sparql`` with a Wikidata query for humans labelled ``subject``.

    The query matches items that are an instance of human (``wdt:P31 wd:Q5``)
    and whose German ``rdfs:label`` equals ``subject`` exactly. Sex, date and
    place of birth/death and an image are requested as optional values.

    Parameters
    ----------
    sparql : object
        Query wrapper providing ``setQuery`` and ``setReturnFormat``.
    subject : object
        Label to look up; it is converted with ``str()``.

    Returns
    -------
    object
        The same ``sparql`` object, with the query and the JSON return format set.
    """
    # Escape the characters that would end or corrupt the SPARQL string literal,
    # so labels containing quotes or backslashes still give a valid query.
    literal = (
        str(subject)
        .replace("\\", "\\\\")
        .replace('"', '\\"')
        .replace("\n", "\\n")
        .replace("\r", "\\r")
    )

    # ?image is selected next to ?imageLabel so that an ``image.value`` field
    # is present in the JSON bindings; ?imageLabel is kept for existing readers.
    query = """
    SELECT ?subject ?subjectLabel ?sexLabel ?birth ?death ?placebirth ?placedeath ?image ?imageLabel
    WHERE {
      ?subject rdfs:label "%s"@de;
               wdt:P31 wd:Q5.
      OPTIONAL {?subject wdt:P21 ?sex.}
      OPTIONAL {?subject wdt:P569 ?birth.}
      OPTIONAL {?subject wdt:P570 ?death.}
      OPTIONAL {?subject wdt:P19 ?placebirth.}
      OPTIONAL {?subject wdt:P20 ?placedeath.}
      OPTIONAL {?subject wdt:P18 ?image.}
      SERVICE wikibase:label { bd:serviceParam wikibase:language "de" . }
    }
    """ % (literal)

    sparql.setQuery(query)
    sparql.setReturnFormat(JSON)
    return sparql

Let's query

In [70]:
from time import sleep
from urllib.error import HTTPError
import os

i = 0  # rows queried in this run
z = 0  # rows skipped because their STR_ESID is not above lastCheckedESID
total = int(len(streetnamesDeWork.index))

maxresults = 100000
maxRetries = 5          # consecutive 429 responses tolerated for one row before giving up
defaultRetryAfter = 30  # seconds to wait when a 429 carries no numeric Retry-After header

# The Wikidata columns receive strings. A column that is still completely empty
# is read back from the CSV as float64, so switch them to object dtype first.
streetnamesDeWork[personLabels] = streetnamesDeWork[personLabels].astype(object)


def firstBinding(bindings, column):
    """Return the value of ``column`` in the first result row, or NaN if the column is absent."""
    return bindings[column].iloc[0] if column in bindings else np.nan


for x in streetnamesDeWork.index:
    esid = streetnamesDeWork.loc[x, 'STR_ESID']
    if int(esid) <= lastCheckedESID:
        # Already covered by an earlier run.
        z += 1
        continue

    subject = streetnamesDeWork.loc[x, 'STN_LABEL_FINAL']
    if pd.isna(subject):
        # No label to look up; querying the text "nan" would be meaningless.
        continue
    i += 1

    sparql = queryPeopleName(sparql, subject)
    update_progress(i / maxresults)

    # Retry the SAME row after a 429 (Too Many Requests) instead of moving on,
    # otherwise the throttled street name would never be queried.
    attempts = 0
    while True:
        try:
            results = sparql.query()
            break
        except HTTPError as e:
            print(f"{i} Anfragen ausgeführt")
            attempts += 1
            if e.code != 429 or attempts > maxRetries:
                raise
            retryAfter = e.headers.get("retry-after")
            # The header may be missing or not a plain number of seconds.
            waitSeconds = int(retryAfter) if str(retryAfter).isdigit() else defaultRetryAfter
            print(f'Statuscode 429 aufgetreten: Anfrage geht in {waitSeconds}sec weiter')
            sleep(waitSeconds + 2)

    result = results.convert()
    results_df = pd.json_normalize(result['results']['bindings'])

    if not results_df.empty:
        # The query may expose the image as ?image or only as ?imageLabel.
        image = firstBinding(results_df, 'image.value')
        if pd.isna(image):
            image = firstBinding(results_df, 'imageLabel.value')

        # Values are assigned by column NAME. The previous positional list put
        # wikiQ before wikiQLabel although the columns are ordered the other
        # way round, so rows stored by that version may hold the two swapped.
        record = {
            'wikiQLabel': firstBinding(results_df, 'subjectLabel.value'),
            'wikiQ': firstBinding(results_df, 'subject.value'),
            'sex': firstBinding(results_df, 'sexLabel.value'),
            'birth': firstBinding(results_df, 'birth.value'),
            'death': firstBinding(results_df, 'death.value'),
            'placebirth': firstBinding(results_df, 'placebirth.value'),
            'placedeath': firstBinding(results_df, 'placedeath.value'),
            'image': image,
        }
        streetnamesDeWork.loc[x, list(record)] = list(record.values())

        # Checkpoint after every match so an interrupted run can be resumed.
        streetnamesDeWork.to_csv(workfile, encoding='UTF-8-SIG', sep=';')

    if i > maxresults:
        break
    

Progress: [--------------------] 0.0%


In [18]:
streetnamesDeWork.head()

Unnamed: 0,STR_ESID,STN_LABEL_FINAL,wikiQLabel,wikiQ,sex,birth,death,placebirth,placedeath,image
0,10000000,Dorf,,,,,,,,
1,10000001,Ebmatt,,,,,,,,
2,10000002,Erspel,,,,,,,,
3,10000005,Hüebli,,,,,,,,
4,10000006,Kapf,,,,,,,,


---

In [None]:
def queryPeople(sparql, subject):
    """Configure ``sparql`` with a Wikidata full-text search for humans matching ``subject``.

    The query calls the MediaWiki search API through ``wikibase:mwapi`` with
    the search text ``"<subject> haswbstatement:P31=Q5"`` and optionally
    returns date of birth (P569) and sex (P21) for every hit.

    Parameters
    ----------
    sparql : object
        Query wrapper providing ``setQuery`` and ``setReturnFormat``.
    subject : object
        Search text; it is converted with ``str()``.

    Returns
    -------
    object
        The same ``sparql`` object, with the query and the JSON return format set.
    """
    # Escape the characters that would end or corrupt the SPARQL string literal,
    # so search texts containing quotes or backslashes still give a valid query.
    literal = (
        str(subject)
        .replace("\\", "\\\\")
        .replace('"', '\\"')
        .replace("\n", "\\n")
        .replace("\r", "\\r")
    )

    query = """
    SELECT DISTINCT ?item ?itemLabel ?dateOfBirth ?sex
    WHERE {
      hint:Query hint:optimizer "None".
      SERVICE wikibase:mwapi {
        bd:serviceParam wikibase:api "Search";
                        wikibase:endpoint "www.wikidata.org";
                        mwapi:srsearch "%s haswbstatement:P31=Q5".
        ?item wikibase:apiOutputItem mwapi:title .
      }
      OPTIONAL {?item wdt:P569 ?dateOfBirth  . }
      OPTIONAL {?item wdt:P21 ?sex  . }
      SERVICE wikibase:label { bd:serviceParam wikibase:language "[AUTO_LANGUAGE]". }
    }
    """ % (literal)

    sparql.setQuery(query)
    sparql.setReturnFormat(JSON)
    return sparql

In [None]:
import requests
  
def get_retry_after():
    """Request the SPARQL endpoint ``wdUrl`` and report its ``Retry-After`` header.

    Returns
    -------
    str or None
        The header value, which is also printed, or ``None`` when the response
        carries no ``Retry-After`` header.
    """
    # A timeout keeps the cell from hanging if the endpoint does not answer.
    response = requests.get(wdUrl, timeout=30)
    # requests exposes response headers through the ``headers`` mapping;
    # ``getheader`` belongs to http.client responses and does not exist here.
    retry_after = response.headers.get("Retry-After")

    if retry_after is None:
        return None

    print(retry_after)
    return retry_after

In [None]:
# Looks up every multi-word street name on Wikidata and collects label, URI and instance type.
# NOTE: `only_more_2_words` is created in a later cell of this notebook; run that cell first.
# NOTE(review): `queryWd` is not defined in the visible part of this notebook — confirm where it comes from.
i = 0
dfrColumns = ["Name", "wikiQLabel", "wikiQ", "instance"]
# Rows are collected in a plain list and turned into a frame once; calling
# pd.concat for every row copies the whole frame on each iteration.
records = []

maxresults = 10

try:
    for x in only_more_2_words.index:
        i += 1
        subject = only_more_2_words.loc[x, 'STN_LABEL_FINAL']
        subjectStr = only_more_2_words.loc[x, 'STN_LABEL']
        sparql = queryWd(sparql, subject)
        try:
            results = sparql.query()
        except Exception as e:
            # Intended as a check for status code 429 (Too Many Requests);
            # the status code is not inspected here, the loop simply stops.
            print("Sollte ein Statuscode 429 auftreten: Anfrage in ca. 30sec wiederholen")
            print(e)
            get_retry_after()
            break

        result = results.convert()
        results_df = pd.json_normalize(result['results']['bindings'])

        if not results_df.empty:
            wikiQ = results_df['subject.value'][0]
            wikiQLabel = results_df['subjectLabel.value'][0]
            instance = results_df['instanceLabel.value'][0]
            records.append([subjectStr, wikiQLabel, wikiQ, instance])
        else:
            # No Wikidata entry found: keep the searched label and leave the rest empty.
            records.append([subjectStr, subject, np.nan, np.nan])

        if i > maxresults:
            break
finally:
    # Build the frame even if the loop stops early, so collected rows are not
    # lost. Rows get a running 0..n-1 index.
    dfr = pd.DataFrame(records, columns=dfrColumns)

In [None]:
# Searches Wikidata for a person matching every multi-word street name and
# collects label, URI, date of birth and sex of the first hit.
# NOTE: `only_more_2_words` is created in a later cell of this notebook; run that cell first.
i = 0
dfrColumns = ["Name", "wikiQLabel", "wikiQ", "dateBirth", "sex"]
# Rows are collected in a plain list and turned into a frame once; calling
# pd.concat for every row copies the whole frame on each iteration.
records = []

maxresults = 50000

try:
    for x in only_more_2_words.index:
        i += 1
        subject = only_more_2_words.loc[x, 'STN_LABEL_FINAL']
        subjectStr = only_more_2_words.loc[x, 'STN_LABEL']
        sparql = queryPeople(sparql, subject)
        try:
            results = sparql.query()
        except Exception as e:
            # Intended as a check for status code 429 (Too Many Requests);
            # the status code is not inspected here, the loop simply stops.
            print("Sollte ein Statuscode 429 auftreten: Anfrage in ca. 30sec wiederholen")
            print(e)
            get_retry_after()
            break

        result = results.convert()
        results_df = pd.json_normalize(result['results']['bindings'])

        if not results_df.empty:
            wikiQ = results_df['item.value'][0]
            wikiQLabel = results_df['itemLabel.value'][0]
            # Missing optional values stay NaN instead of being replaced by a
            # made-up date or the literal 'uri', which would look like real data.
            if 'dateOfBirth.value' in results_df:
                dateBirth = results_df['dateOfBirth.value'].iloc[0]
            else:
                dateBirth = np.nan
            # Read the bound value; the 'sex.type' field only holds the kind
            # of binding (e.g. 'uri'), not the sex itself.
            if 'sex.value' in results_df:
                sex = results_df['sex.value'].iloc[0]
            else:
                sex = np.nan

            # `instance` is not produced by this query, so it is no longer printed.
            print(f"{subjectStr} | {subject}: {wikiQ} / {dateBirth} ")
            records.append([subjectStr, wikiQLabel, wikiQ, dateBirth, sex])
        else:
            # No Wikidata entry found: keep the searched label and leave the rest empty.
            records.append([subjectStr, subject, np.nan, np.nan, np.nan])

        if i > maxresults:
            break
finally:
    # Build the frame even if the loop stops early, so collected rows are not
    # lost. Rows get a running 0..n-1 index.
    dfr = pd.DataFrame(records, columns=dfrColumns)

In [None]:
dfr.head(10)

In [None]:
# Keep the rows that have a final label, then select the labels that contain
# at least one space, i.e. consist of more than one word.
hasLabel = streetnames['STN_LABEL_FINAL'].notna()
labelna = streetnames[hasLabel]
containsSpace = labelna['STN_LABEL_FINAL'].str.contains(" ")
only_more_2_words = labelna[containsSpace]
only_more_2_words


In [None]:
# Raw string: the backslashes of the Windows path are taken literally, so the
# path is unchanged but Python no longer warns about invalid escape sequences.
# NOTE(review): this is an absolute, machine-specific path; a path relative to
# the notebook (as used for reading spacy_out.csv) would be portable — confirm the target folder.
dfr.to_csv(r'C:\CAS_Arbeit\cassda-zertifikatsarbeit\Datapreparation\wiki.csv', encoding='UTF-8-SIG', sep=';')