# Classifying streets using dictionaries

In [1]:
#Load all needed libraries
import pandas as pd
from time import process_time_ns
import ety
import spacy #Our NLP tools
import de_core_news_md #!python -m spacy download de_core_news_md
import fr_core_news_md #!python -m spacy download fr_core_news_md
import numpy as np
from functools import reduce  

## Load all input data

In [2]:
streetnames = pd.read_csv('../Datapreparation/streetnames.csv', encoding='UTF-8-SIG', sep=';')
streetnames.head()  

Unnamed: 0.1,Unnamed: 0,STR_ESID,STN_LABEL,COM_FOSNR,COM_NAME,COM_CANTON,STR_OFFICIAL,STR_EASTING,STR_NORTHING,STN_LABEL_NO_BI,STN_LABEL_NO_TERMS,STR_TERMS,STR_PREPS,STN_LABEL_FINAL
0,1,10023770,Wiedenweg,2786,Grellingen,BL,True,2610733.0,1254311.0,Wiedenweg,Wieden,weg,,Wieden
1,2,10179192,Wuhrbärgli,2788,Liesberg,BL,True,2598709.0,1249640.0,Wuhrbärgli,Wuhrbärgli,,,Wuhrbärgli
2,9,10140563,Emanuelenweg,2829,Liestal,BL,True,2623078.0,1257558.0,Emanuelenweg,Emanuelen,weg,,Emanuelen
3,13,10069457,Löhrweg,2850,Känerkinden,BL,True,2630229.0,1251411.0,Löhrweg,Löhr,weg,,Löhr
4,15,10096235,Brunngasse,2833,Seltisberg,BL,True,2621406.0,1256852.0,Brunngasse,Brunn,gasse,,Brunn


In [3]:
wictionary = pd.read_csv('wiktionary.csv',encoding='UTF-8-SIG', sep=';')
wictionary.head() 

Unnamed: 0,DIC_UID,DIC_PLANTS,DIC_PLA_LANG,DIC_ANIMALS,DIC_ANI_LANG,DIC_PROF,DIC_PRO_LANG,DIC_GEO,DIC_GEO_LANG
0,1,Ackergauchheil,DE,Aal,DE,"Abbrucharbeiter, Abbrucharbeiterin",DE,Aalenium,DE
1,2,Ackerröte,DE,Aalmutter,DE,"Abbruchmeister, Abbruchmeisterin",DE,Abbau,DE
2,3,Ackerschotendotter,DE,Aalrutte,DE,"Abbruchtechniker, Abbruchtechnikerin",DE,Abbaugerechtigkeit,DE
3,4,Adelgras,DE,Aaskäfer,DE,Abdecker,DE,Abbauwürdigkeit,DE
4,5,Adlerfarn,DE,Aberdeenrind,DE,"Abfallbeseitiger, Abfallbeseitigerin",DE,Abbild,DE


## Create classification table

In [4]:
classif = streetnames

In [5]:
classif.columns

Index(['Unnamed: 0', 'STR_ESID', 'STN_LABEL', 'COM_FOSNR', 'COM_NAME',
       'COM_CANTON', 'STR_OFFICIAL', 'STR_EASTING', 'STR_NORTHING',
       'STN_LABEL_NO_BI', 'STN_LABEL_NO_TERMS', 'STR_TERMS', 'STR_PREPS',
       'STN_LABEL_FINAL'],
      dtype='object')

In [6]:
classif = classif.drop(['Unnamed: 0', 'COM_NAME', 'COM_CANTON','STR_EASTING', 'STR_NORTHING','COM_FOSNR',
                        'STR_OFFICIAL','STR_TERMS', 'STR_PREPS', 'STN_LABEL_NO_BI', 'STN_LABEL_NO_TERMS'], axis=1)
classif

Unnamed: 0,STR_ESID,STN_LABEL,STN_LABEL_FINAL
0,10023770,Wiedenweg,Wieden
1,10179192,Wuhrbärgli,Wuhrbärgli
2,10140563,Emanuelenweg,Emanuelen
3,10069457,Löhrweg,Löhr
4,10096235,Brunngasse,Brunn
...,...,...,...
171867,10107114,Tannenstrasse,Tannen
171868,10070806,Rebenstrasse,Reben
171869,10142291,Fritz-Gegauf-Strasse,Fritz Gegauf
171870,10093237,Obere Bleichewiese,Bleichewiese


In [7]:
missing_data = pd.DataFrame(
    classif.isnull().sum(),
    columns=['Missing Values'])

print(missing_data)

classif = classif.dropna()

                 Missing Values
STR_ESID                      0
STN_LABEL                     0
STN_LABEL_FINAL             415


## Mapping wictionary lists
https://de.wiktionary.org/wiki/Verzeichnis:Deutsch/Tiere

https://de.wiktionary.org/wiki/Verzeichnis:Deutsch/Pflanzen

https://de.wiktionary.org/wiki/Verzeichnis:Deutsch/Berufe

https://de.wiktionary.org/wiki/Verzeichnis:Deutsch/Geowissenschaften

### Plants

In [8]:
plants = wictionary.DIC_PLANTS.dropna()
plants

0           Ackergauchheil
1                Ackerröte
2       Ackerschotendotter
3                 Adelgras
4                Adlerfarn
               ...        
1543           Sureau nain
1544               Prunier
1545                Oignon
1546         Deux feuilles
1547            cymbalaire
Name: DIC_PLANTS, Length: 1548, dtype: object

Do the mapping

In [9]:
# Start the stopwatch / counter
t1_start = process_time_ns()

for i in plants:
    classif.loc[classif.STN_LABEL_FINAL.str.match(i,case=False), 'PLANTS'] = i
    
# Stop the stopwatch / counter
t1_stop = process_time_ns()
   
print("Elapsed time in seconds:", (t1_stop-t1_start)/1000000000)

print("Number of matches: ", classif['PLANTS'].notna().sum())

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  classif.loc[classif.STN_LABEL_FINAL.str.match(i,case=False), 'PLANTS'] = i


Elapsed time in seconds: 161.25
Number of matches:  6880


In [10]:
most_common = classif.groupby("PLANTS").count()
most_common.STR_ESID.nlargest(10)

PLANTS
Lin        788
Rose       673
Moos       602
Birke      440
Buche      303
Ahorn      243
Eiche      217
Föhre      197
Esche      171
Flieder    152
Name: STR_ESID, dtype: int64

### Animals

In [11]:
animals = wictionary.DIC_ANIMALS.dropna()
animals

0                            Aal
1                      Aalmutter
2                       Aalrutte
3                       Aaskäfer
4                   Aberdeenrind
                  ...           
1658    Zwerg-Langzungenflughund
1659                   Zwergmaus
1660                Zwergpinguin
1661          Zwerg-Seepferdchen
1662              Zwerg-Pinscher
Name: DIC_ANIMALS, Length: 1663, dtype: object

In [12]:
# Start the stopwatch / counter
t1_start = process_time_ns()

for i in animals:
    classif.loc[classif.STN_LABEL_FINAL.str.match(i,case=False), 'ANIMALS'] = i
    
# Stop the stopwatch / counter
t1_stop = process_time_ns()
   
print("Elapsed time in seconds:", (t1_stop-t1_start)/1000000000)

print("Number of matches: ", classif['ANIMALS'].notna().sum())

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  classif.loc[classif.STN_LABEL_FINAL.str.match(i,case=False), 'ANIMALS'] = i


Elapsed time in seconds: 183.640625
Number of matches:  5691


In [13]:
most_common = classif.groupby("ANIMALS").count()
most_common.STR_ESID.nlargest(10)

ANIMALS
Wal       832
Hase      373
Wolf      238
Bär       220
Fuchs     201
Sau       191
Schwan    177
Ziege     169
Vogel     148
Amsel     125
Name: STR_ESID, dtype: int64

### Professions

In [14]:
professions = wictionary.DIC_PROF.dropna()
professions

0         Abbrucharbeiter, Abbrucharbeiterin
1           Abbruchmeister, Abbruchmeisterin
2       Abbruchtechniker, Abbruchtechnikerin
3                                   Abdecker
4       Abfallbeseitiger, Abfallbeseitigerin
                        ...                 
998             Zugbegleiter, Zugbegleiterin
999                                Zugeherin
1000                               Zugehfrau
1001                              Zugehhilfe
1002      Zwangsverwalter, Zwangsverwalterin
Name: DIC_PROF, Length: 1003, dtype: object

In [15]:
professions = professions.str.replace("(,).*","", regex=True)
professions = professions.str.replace("(\().*","", regex=True)
professions = professions.str.replace("(\)).*","", regex=True)
professions

0        Abbrucharbeiter
1         Abbruchmeister
2       Abbruchtechniker
3               Abdecker
4       Abfallbeseitiger
              ...       
998         Zugbegleiter
999            Zugeherin
1000           Zugehfrau
1001          Zugehhilfe
1002     Zwangsverwalter
Name: DIC_PROF, Length: 1003, dtype: object

In [16]:
# Start the stopwatch / counter
t1_start = process_time_ns()

for i in professions:
    classif.loc[classif.STN_LABEL_FINAL.str.match(i,case=False), 'PROF'] = i
    
# Stop the stopwatch / counter
t1_stop = process_time_ns()
   
print("Elapsed time in seconds:", (t1_stop-t1_start)/1000000000)

print("Number of matches: ", classif['PROF'].notna().sum())

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  classif.loc[classif.STN_LABEL_FINAL.str.match(i,case=False), 'PROF'] = i


Elapsed time in seconds: 105.859375
Number of matches:  3081


In [17]:
most_common = classif.groupby("PROF").count()
most_common.STR_ESID.nlargest(10)

PROF
Ober       1827
Schmied     128
Fischer      81
Weber        73
Jäger        72
General      61
Gärtner      46
Metzger      46
Müller       40
Bäcker       36
Name: STR_ESID, dtype: int64

### GEO

In [18]:
geo = wictionary.DIC_GEO.dropna()
geo

0                Aalenium
1                   Abbau
2      Abbaugerechtigkeit
3         Abbauwürdigkeit
4                  Abbild
              ...        
983            Zeugenberg
984               Zufluss
985        Zwillingsfluss
986               Äquator
987           Äquinoktium
Name: DIC_GEO, Length: 988, dtype: object

In [19]:
# Start the stopwatch / counter
t1_start = process_time_ns()

for i in geo:
    classif.loc[classif.STN_LABEL_FINAL.str.match(i,case=False), 'GEO'] = i
    
# Stop the stopwatch / counter
t1_stop = process_time_ns()
   
print("Elapsed time in seconds:", (t1_stop-t1_start)/1000000000)

print("Number of matches: ", classif['PLANTS'].notna().sum())

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  classif.loc[classif.STN_LABEL_FINAL.str.match(i,case=False), 'GEO'] = i


Elapsed time in seconds: 129.359375
Number of matches:  6880


In [20]:
most_common = classif.groupby("GEO").count()
most_common.STR_ESID.nlargest(10)

GEO
Dorf    1080
Bahn     974
Berg     808
Bach     789
See      723
Feld     644
Wald     628
Au       453
Höhe     373
Land     360
Name: STR_ESID, dtype: int64

In [21]:
classif.head(10)

Unnamed: 0,STR_ESID,STN_LABEL,STN_LABEL_FINAL,PLANTS,ANIMALS,PROF,GEO
0,10023770,Wiedenweg,Wieden,,,,
1,10179192,Wuhrbärgli,Wuhrbärgli,,,,
2,10140563,Emanuelenweg,Emanuelen,,,,
3,10069457,Löhrweg,Löhr,,,,
4,10096235,Brunngasse,Brunn,,,,
5,10118344,Beckengässli,Becken,,,,Becken
6,10103841,Poststrasse,Post,,,,
7,10103030,Bromhübelweg,Bromhübel,,,,
8,10108763,Zurlindenstrasse,Zurlinden,,,,
9,10209066,Bergli,Bergli,,,,Berg


## Self Classifier

In [22]:
dictionary_GEO = ('dorf', 'bahnhof', 'schul', 'haupt', 'halden', 'matt', 'bach', 'bärgli')

In [23]:
dictionary_Religion = ('kirch', 'eglise')

In [24]:
# Start the stopwatch / counter
t1_start = process_time_ns()

for i in dictionary_GEO:
    classif.loc[classif.STN_LABEL_FINAL.str.contains(i,case=False), 'GEO_self'] = i
    
# Stop the stopwatch / counter
t1_stop = process_time_ns()
   
print("Elapsed time in seconds:", (t1_stop-t1_start)/1000000000)

print("Number of matches: ", classif['GEO_self'].notna().sum())

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  classif.loc[classif.STN_LABEL_FINAL.str.contains(i,case=False), 'GEO_self'] = i


Elapsed time in seconds: 1.78125
Number of matches:  14489


In [25]:
most_common = classif.groupby("GEO_self").count()
most_common.STR_ESID.nlargest(10)

GEO_self
matt       5353
bach       2752
dorf       2375
halden     1598
schul      1114
bahnhof     741
haupt       541
bärgli       15
Name: STR_ESID, dtype: int64

## Testing similarity with spacy

In [26]:
#Load a German language model to do NLP - the models we use will influence our results a lot
nlp = spacy.load('de_core_news_md') #Change fr/de

In [27]:
#Load a German language model to do NLP - the models we use will influence our results a lot
nlp = spacy.load('de_core_news_md') #Change fr/de

In [28]:
#e.g.
doc1 = nlp("Zürich")
doc2 = nlp("Bern,")
print(doc1.similarity(doc2))

0.4916154409620511


In [29]:
#Create subset since spacy is slow for larger datasets
classifSubset = classif.iloc[:100,:]

In [30]:
# Start the stopwatch / counter
t1_start = process_time_ns()

for i in classifSubset.STN_LABEL_FINAL.items():
    doc1 = nlp(i[1])
    print(i[1])
    for j in animals:
        doc2 = nlp(j)
        sim = doc1.similarity(doc2)
        classifSubset.loc[i[0], 'ANIMALS_SIM'] = sim
        phrase = 'similarity of ' + str(i) + ' with ' + str(j)+ ' is: '+ str(sim)
        if sim > 0.7:
            print(phrase)
        
        
# Stop the stopwatch / counter
t1_stop = process_time_ns()
   
print("Elapsed time in seconds:", (t1_stop-t1_start)/1000000000) 

Wieden


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  classifSubset.loc[i[0], 'ANIMALS_SIM'] = sim
  sim = doc1.similarity(doc2)


Wuhrbärgli
Emanuelen
Löhr
similarity of (3, 'Löhr') with Arni is: 0.7897494117519839
similarity of (3, 'Löhr') with Assel is: 0.8029482266815434
similarity of (3, 'Löhr') with Engerling is: 0.8327858771692043
similarity of (3, 'Löhr') with Groppe is: 0.785466379673133
similarity of (3, 'Löhr') with Olm is: 1.0000000832229166
similarity of (3, 'Löhr') with Sanderling is: 0.7039424310963767
similarity of (3, 'Löhr') with Schäfer is: 0.8478411866090583
similarity of (3, 'Löhr') with Schneider is: 0.7680610881070149
similarity of (3, 'Löhr') with Schupp is: 0.7185942204549829
similarity of (3, 'Löhr') with Schuster is: 0.7185942204549829
similarity of (3, 'Löhr') with Schwalm is: 0.8478411866090583
similarity of (3, 'Löhr') with Sichler is: 0.7680610881070149
similarity of (3, 'Löhr') with Sperling is: 0.8029482266815434
similarity of (3, 'Löhr') with Stöcker is: 0.8029482266815434
Brunn
Becken
Post
Bromhübel
Zurlinden
Bergli
Weihermatt
Schmalzgruben
Kelten
Biel
Wald
similarity of (14, 'Wa

Titterter
Buttertal
Gewerbe
Höhen
Sissacher
Linden
Freidorf
Kirch
Greifel
Badacher
Heiden
Geissmatt
Elapsed time in seconds: 1316.3125


## Classifying special street designations

In [31]:
spacy = pd.read_csv('../Datapreparation/spacy.csv', encoding='UTF-8-SIG', sep=';')

In [32]:
wiki = pd.read_csv('wiki.csv', encoding='UTF-8-SIG', sep=';')

In [33]:
mapping = pd.merge(classif, spacy, on='STR_ESID')

In [34]:
mapping = reduce(lambda left, right:     # Merge three pandas DataFrames
                     pd.merge(left , right,
                              on = ["STR_ESID"],
                              how = "outer"),
                     [streetnames, spacy, classif])

In [35]:
mapping

Unnamed: 0,Unnamed: 0_x,STR_ESID,STN_LABEL_x,COM_FOSNR,COM_NAME,COM_CANTON,STR_OFFICIAL,STR_EASTING,STR_NORTHING,STN_LABEL_NO_BI,...,SPACY_FR,SPACY_FR_LEMMA,SPACY_FR_ENT,STN_LABEL,STN_LABEL_FINAL,PLANTS,ANIMALS,PROF,GEO,GEO_self
0,1,10023770,Wiedenweg,2786,Grellingen,BL,True,2610733.0,1254311.0,Wiedenweg,...,Wieden,Wieden,,Wiedenweg,Wieden,,,,,
1,2,10179192,Wuhrbärgli,2788,Liesberg,BL,True,2598709.0,1249640.0,Wuhrbärgli,...,Wuhrbärgli,Wuhrbärgli,,Wuhrbärgli,Wuhrbärgli,,,,,bärgli
2,9,10140563,Emanuelenweg,2829,Liestal,BL,True,2623078.0,1257558.0,Emanuelenweg,...,Emanuelen,emanuelen,,Emanuelenweg,Emanuelen,,,,,
3,13,10069457,Löhrweg,2850,Känerkinden,BL,True,2630229.0,1251411.0,Löhrweg,...,Löhr,Löhr,PER,Löhrweg,Löhr,,,,,
4,15,10096235,Brunngasse,2833,Seltisberg,BL,True,2621406.0,1256852.0,Brunngasse,...,Brunn,Brunn,,Brunngasse,Brunn,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
171867,221269,10107114,Tannenstrasse,4471,Bischofszell,TG,True,2736565.0,1261881.0,Tannenstrasse,...,Tannen,Tannen,,Tannenstrasse,Tannen,,,,,
171868,221270,10070806,Rebenstrasse,4724,Eschlikon,TG,True,2716120.0,1258548.0,Rebenstrasse,...,Reben,Reben,,Rebenstrasse,Reben,,,,,
171869,221272,10142291,Fritz-Gegauf-Strasse,4864,Steckborn,TG,True,2715726.0,1280124.0,Fritz-Gegauf-Strasse,...,Fritz Gegauf,Gegauf,PER,Fritz-Gegauf-Strasse,Fritz Gegauf,,,,,
171870,221274,10093237,Obere Bleichewiese,4571,Gachnang,TG,True,2705606.0,1266777.0,Obere Bleichewiese,...,Bleichewiese,Bleichewiese,,Obere Bleichewiese,Bleichewiese,,,,,


In [36]:
mapping.loc[mapping['SPACY_FR_ENT'].notna() , 'classif'] = mapping.SPACY_FR_ENT
mapping.loc[mapping['GEO'].notna() , 'classif'] = 'GEO'
mapping.loc[mapping['GEO_self'].notna() , 'classif'] = 'GEO'

In [37]:
mapping

Unnamed: 0,Unnamed: 0_x,STR_ESID,STN_LABEL_x,COM_FOSNR,COM_NAME,COM_CANTON,STR_OFFICIAL,STR_EASTING,STR_NORTHING,STN_LABEL_NO_BI,...,SPACY_FR_LEMMA,SPACY_FR_ENT,STN_LABEL,STN_LABEL_FINAL,PLANTS,ANIMALS,PROF,GEO,GEO_self,classif
0,1,10023770,Wiedenweg,2786,Grellingen,BL,True,2610733.0,1254311.0,Wiedenweg,...,Wieden,,Wiedenweg,Wieden,,,,,,
1,2,10179192,Wuhrbärgli,2788,Liesberg,BL,True,2598709.0,1249640.0,Wuhrbärgli,...,Wuhrbärgli,,Wuhrbärgli,Wuhrbärgli,,,,,bärgli,GEO
2,9,10140563,Emanuelenweg,2829,Liestal,BL,True,2623078.0,1257558.0,Emanuelenweg,...,emanuelen,,Emanuelenweg,Emanuelen,,,,,,
3,13,10069457,Löhrweg,2850,Känerkinden,BL,True,2630229.0,1251411.0,Löhrweg,...,Löhr,PER,Löhrweg,Löhr,,,,,,PER
4,15,10096235,Brunngasse,2833,Seltisberg,BL,True,2621406.0,1256852.0,Brunngasse,...,Brunn,,Brunngasse,Brunn,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
171867,221269,10107114,Tannenstrasse,4471,Bischofszell,TG,True,2736565.0,1261881.0,Tannenstrasse,...,Tannen,,Tannenstrasse,Tannen,,,,,,
171868,221270,10070806,Rebenstrasse,4724,Eschlikon,TG,True,2716120.0,1258548.0,Rebenstrasse,...,Reben,,Rebenstrasse,Reben,,,,,,
171869,221272,10142291,Fritz-Gegauf-Strasse,4864,Steckborn,TG,True,2715726.0,1280124.0,Fritz-Gegauf-Strasse,...,Gegauf,PER,Fritz-Gegauf-Strasse,Fritz Gegauf,,,,,,PER
171870,221274,10093237,Obere Bleichewiese,4571,Gachnang,TG,True,2705606.0,1266777.0,Obere Bleichewiese,...,Bleichewiese,,Obere Bleichewiese,Bleichewiese,,,,,,


city_dict = {
    'Paris': 'France', 
    'Toronto': 'Canada', 
    'Atlanta': 'USA'
}

df['Country'] = df['Birth City'].map(city_dict)

df['Country'] = df['Birth City'].map(city_dict).fillna('Other')

print(df)

In [38]:
classif.to_csv('C:\CAS_Arbeit\cassda-zertifikatsarbeit\Modeling\mapping.csv', encoding='UTF-8-SIG', sep=';')