# Mapping Person with Wikidata

## Preparation
### Load Data

Load the street names from the spaCy output and remove all unnecessary columns.

In [19]:
import pandas as pd
import numpy as np

# Read the semicolon-separated spaCy output; the 'UTF-8-SIG' codec also accepts a leading BOM.
streetnames = pd.read_csv('../Datapreparation/spacy_out.csv', encoding='UTF-8-SIG', sep=';')
# Preview the first rows to check that the file was parsed as expected.
streetnames.head()

Unnamed: 0.1,Unnamed: 0,STR_ESID,STN_LABEL,STN_LABEL_FINAL,SPACY_DE,SPACY_DE_LEMMA,SPACY_DE_ENT,SPACY_FR,SPACY_FR_LEMMA,SPACY_FR_ENT,STN_LANG
0,0,10023770,Wiedenweg,Wieden,Wieden,Wieden,,Wieden,Wieden,,de
1,1,10179192,Wuhrbärgli,Wuhrbärgli,Wuhrbärgli,Wuhrbärgli,,Wuhrbärgli,Wuhrbärgli,,de
2,2,10140563,Emanuelenweg,Emanuelen,Emanuelen,Emanuel,,Emanuelen,emanuelen,,de
3,3,10069457,Löhrweg,Löhr,Löhr,Löhr,,Löhr,Löhr,PER,de
4,4,10096235,Brunngasse,Brunn,Brunn,Brunn,,Brunn,Brunn,,de


In [20]:
# Keep only the three columns used in the rest of the notebook.
streetnames = streetnames[["STR_ESID", "STN_LABEL_FINAL", "STN_LANG"]]

In [21]:
# Preview the reduced frame.
streetnames.head()

Unnamed: 0,STR_ESID,STN_LABEL_FINAL,STN_LANG
0,10023770,Wieden,de
1,10179192,Wuhrbärgli,de
2,10140563,Emanuelen,de
3,10069457,Löhr,de
4,10096235,Brunn,de


---
### Define some language-independent functions

In [22]:
# Person attribute columns that createDeTemplDf appends (initially empty) to the street-name frame.
personLabels = ['wikiQLabel', 'wikiQ', 'sex', 'birth', "death", "placebirth", "placedeath", "image"]

In [23]:
# values for lang = de, fr, it or rm
def createDeTemplDf(lang, df):
    """Build the empty work template for one language.

    Selects the rows of ``df`` whose ``STN_LANG`` equals ``lang``, keeps the
    ``STR_ESID`` and ``STN_LABEL_FINAL`` columns, appends one empty column per
    entry of the module-level ``personLabels`` list and returns the result
    sorted by ``STR_ESID``.
    """
    langRows = df.loc[df["STN_LANG"] == lang, ["STR_ESID", "STN_LABEL_FINAL"]]
    # reindex() creates the not-yet-existing person columns filled with NaN
    templateColumns = list(langRows.columns) + personLabels
    return langRows.reindex(columns=templateColumns).sort_values(["STR_ESID"])

In [24]:
def findeLastCheckedEsid(df):
    """Return the highest ``STR_ESID`` among rows that have a ``wikiQLabel``.

    Rows whose ``wikiQLabel`` is missing are ignored. Returns ``0`` when no
    row has a ``wikiQLabel`` value.
    """
    checkedRows = df.dropna(subset=["wikiQLabel"]).sort_values(["STR_ESID"], ascending=False)

    # Nothing has been matched so far
    if len(checkedRows.index) == 0:
        return 0

    # After the descending sort the first row carries the highest ESID
    topLabel = checkedRows.index[0]
    return checkedRows.loc[topLabel, "STR_ESID"]

---
### Initialising the Wikidata query

In [25]:
import time, sys
from IPython.display import clear_output

def update_progress(progress):
    """Redraw a 20-character text progress bar in the cell output.

    ``progress`` is a fraction; ints are converted to float, any other
    non-float value is treated as 0, values below 0 are shown as 0 and
    values of 1 or more are shown as 1. The previous cell output is cleared
    before the bar is printed.
    """
    bar_length = 20

    # Normalise the input type first
    if isinstance(progress, int):
        fraction = float(progress)
    elif isinstance(progress, float):
        fraction = progress
    else:
        fraction = 0

    # Clamp to the displayable range
    if fraction < 0:
        fraction = 0
    elif fraction >= 1:
        fraction = 1

    filled = int(round(bar_length * fraction))
    bar = "#" * filled + "-" * (bar_length - filled)

    # wait=True delays clearing until the new output is available
    clear_output(wait = True)
    print("Progress: [{0}] {1:.1f}%".format(bar, fraction * 100))

In [76]:
from SPARQLWrapper import SPARQLWrapper, JSON

def createQueryPeopleName(subject, lang):
    """Prepare a Wikidata SPARQL query that looks up a human by exact label.

    The query matches items whose ``rdfs:label`` in language ``lang`` equals
    ``subject`` and that are an instance of human (``wdt:P31 wd:Q5``). Sex,
    birth/death date, birth/death place and image are requested as optional
    values; at most one row is returned (``LIMIT 1``).

    Parameters
    ----------
    subject : value converted with ``str()``; the label to search for.
    lang : language tag used for the label match and the label service.

    Returns
    -------
    A configured ``SPARQLWrapper`` instance (query set, JSON return format)
    that has not been executed yet.
    """
    wdUrl = "https://query.wikidata.org/sparql"
    user_agent = 'Streetnamequery/1.0 (https://github.com/CaptainInler/cassda-zertifikatsarbeit)'
    sparql = SPARQLWrapper(wdUrl, agent=user_agent)

    # Escape characters that would otherwise terminate or corrupt the quoted
    # SPARQL string literal (e.g. a street name containing a double quote).
    # The backslash has to be handled first so later escapes are not doubled.
    escapedSubject = (
        str(subject)
        .replace("\\", "\\\\")
        .replace('"', '\\"')
        .replace("\n", "\\n")
        .replace("\r", "\\r")
    )

    # ?image is projected in addition to ?imageLabel so that result consumers
    # reading the 'image.value' column actually receive the image value.
    query = """
    SELECT ?subject ?subjectLabel ?sexLabel ?birth ?death ?placebirth ?placedeath ?image ?imageLabel
    WHERE {
      ?subject rdfs:label "%s"@%s;
               wdt:P31 wd:Q5;
      OPTIONAL {?subject wdt:P21 ?sex;}
      OPTIONAL {?subject wdt:P569 ?birth;}
      OPTIONAL {?subject wdt:P570 ?death;}
      OPTIONAL {?subject wdt:P19 ?placebirth;}
      OPTIONAL {?subject wdt:P20 ?placedeath.}
      OPTIONAL {?subject wdt:P18 ?image.}
      SERVICE wikibase:label { bd:serviceParam wikibase:language "%s" . }
    }
    LIMIT 1
    """ % (escapedSubject, lang, lang)

    sparql.setQuery(query)
    sparql.setReturnFormat(JSON)
    return sparql

In [77]:
def runQuery(subject, lang):
    """Run the person lookup for ``subject`` and return the result as a DataFrame.

    The query is attempted up to five times. When the endpoint answers with
    HTTP 429 (Too Many Requests) the function waits for the number of seconds
    given in the ``retry-after`` header (plus two seconds) and tries again.

    Parameters
    ----------
    subject : label passed on to ``createQueryPeopleName``.
    lang : language tag passed on to ``createQueryPeopleName``.

    Returns
    -------
    pandas.DataFrame built with ``pd.json_normalize`` from the result
    bindings; empty when the query produced no rows.

    Raises
    ------
    urllib.error.HTTPError
        Immediately for any status other than 429, or the last 429 error
        when all attempts were rejected.
    """
    # Imported locally: the names were used here without being imported anywhere,
    # so the retry path failed with a NameError.
    from time import sleep
    from urllib.error import HTTPError

    maxAttempts = 5
    # Fallback wait used when the header is missing or not a plain number of seconds
    defaultWaitSeconds = 60

    sparql = createQueryPeopleName(subject, lang)
    results = None
    lastError = None

    for i in range(0, maxAttempts):
        try:
            results = sparql.query()
            break
        except HTTPError as e:
            print(f"{i} Anfragen ausgeführt")
            # Only status code 429 (Too Many Requests) is worth retrying
            if e.code != 429:
                raise

            lastError = e
            retryAfter = e.headers.get("retry-after") if e.headers is not None else None
            try:
                waitSeconds = int(retryAfter)
            except (TypeError, ValueError):
                waitSeconds = defaultWaitSeconds

            print(f'Statuscode 429 aufgetreten: Anfrage geht in {waitSeconds}sec weiter')
            sleep(waitSeconds + 2)

    # All attempts were throttled: surface the real cause instead of failing
    # later on a placeholder value.
    if results is None:
        raise lastError

    result = results.convert()
    results_df = pd.json_normalize(result['results']['bindings'])
    return results_df

## Query streetnames in CH-DE

In [78]:
# NOTE(review): the first assignment is overwritten by the next line, so "mut.csv" is the
# file that is actually used — confirm which work file is intended and drop the other line.
workfile = "mapping wiki person_de_out.csv"
workfile = "mut.csv"
# Language tag used to filter the street names and to query the Wikidata labels.
lang = "de"

Read the working file in which the already queried values are stored, or create an empty one. The working file is necessary because it is not possible to query all values at once ⏳😴


In [79]:
from os.path import exists

# Resume from the stored work file when there is one; otherwise start from an
# empty template and persist it right away.
if not exists(workfile):
    print("no")
    streetnamesDeWork = createDeTemplDf(lang, streetnames)
    streetnamesDeWork.to_csv(workfile, encoding='UTF-8-SIG', sep=';')
else:
    # The index written by to_csv() comes back as an "Unnamed: 0" column
    storedWork = pd.read_csv(workfile, encoding='UTF-8-SIG', sep=';')
    streetnamesDeWork = storedWork.drop("Unnamed: 0", axis = 1)

  streetnamesDeWork = pd.read_csv(workfile, encoding='UTF-8-SIG', sep=';').drop("Unnamed: 0", axis = 1)


In [80]:
# Preview the first ten rows of the work frame.
streetnamesDeWork.head(10)

Unnamed: 0,STR_ESID,STN_LABEL_FINAL,wikiQLabel,wikiQ,sex,birth,death,placebirth,placedeath,image
0,10000000,Dorf,,,,,,,,
1,10000001,Ebmatt,,,,,,,,
2,10000002,Erspel,,,,,,,,
3,10000005,Hüebli,,,,,,,,
4,10000006,Kapf,,,,,,,,
5,10000007,Kirch,http://www.wikidata.org/entity/Q102281568,Kirch,männlich,,,,,
6,10000008,Kämmoos,,,,,,,,
7,10000009,Mittlistberg,,,,,,,,
8,10000010,Pommern,,,,,,,,
9,10000011,Rotenstein,,,,,,,,


Get the highest ESID that has already been checked, in order to continue from there.

In [81]:
# Highest STR_ESID that already has a wikiQLabel value (0 if there is none);
# the query loop skips every row with a smaller ESID.
lastCheckedESID = findeLastCheckedEsid(streetnamesDeWork)
print("Last Check ESDI:", lastCheckedESID)

Last Check ESDI: 10000480


Let's go 🚀

In [None]:
# Query Wikidata for every street name that has not been checked yet and store
# each hit in the work frame and the work file.
total = int(len(streetnamesDeWork.index))

# Work-frame column -> result columns (as named by pd.json_normalize) to read it from.
# The first candidate that is present wins. 'image' is looked up under both names
# because the query projects ?imageLabel, not ?image.
resultColumns = {
    'wikiQ': ['subject.value'],
    'wikiQLabel': ['subjectLabel.value'],
    'sex': ['sexLabel.value'],
    'birth': ['birth.value'],
    'death': ['death.value'],
    'placebirth': ['placebirth.value'],
    'placedeath': ['placedeath.value'],
    'image': ['image.value', 'imageLabel.value'],
}

for i, rowNr in enumerate(streetnamesDeWork.index, start=1):

    esid = streetnamesDeWork['STR_ESID'][rowNr]
    # Rows below the resume point were handled in an earlier run
    if int(esid) < lastCheckedESID:
        continue

    subject = streetnamesDeWork['STN_LABEL_FINAL'][rowNr]
    results_df = runQuery(subject, lang)

    if not results_df.empty:
        record = {'STR_ESID': esid, 'STN_LABEL_FINAL': subject}
        for targetColumn, candidates in resultColumns.items():
            # Optional values that Wikidata did not return stay empty
            record[targetColumn] = np.nan
            for candidate in candidates:
                if candidate in results_df:
                    record[targetColumn] = results_df[candidate][0]
                    break

        # Write the values by column name. The previous positional list had
        # wikiQ and wikiQLabel in the opposite order of the frame's columns,
        # so the entity URI ended up under 'wikiQLabel' and the label under 'wikiQ'.
        # NOTE(review): rows already stored in the work file by earlier runs
        # still carry the swapped values and need to be corrected or re-queried.
        streetnamesDeWork.loc[rowNr] = [record[column] for column in streetnamesDeWork.columns]

        # Persist after every hit so an interrupted run can be resumed
        streetnamesDeWork.to_csv(workfile, encoding='UTF-8-SIG', sep=';')

        update_progress(i / total)


---

## And now the same for the Romandie 