# Classifying streets using dictionaries

In [34]:
#Load all needed libraries
import pandas as pd
import time
import ety
import spacy #Our NLP tools
import de_core_news_md #!python -m spacy download de_core_news_md
import fr_core_news_md #!python -m spacy download fr_core_news_md
import numpy as np
from functools import reduce  
import warnings
import dask.dataframe as dd
warnings.filterwarnings("ignore")

## Load all input data

In [2]:
streetnames = pd.read_csv('../Datapreparation/streetnames.csv', encoding='UTF-8-SIG', sep=';')
streetnames.head()  

Unnamed: 0.1,Unnamed: 0,STR_ESID,STN_LABEL,COM_FOSNR,COM_NAME,COM_CANTON,STR_OFFICIAL,STR_EASTING,STR_NORTHING,STN_LABEL_NO_BI,STN_LABEL_NO_TERMS,STR_TERMS,STR_PREPS,STN_LABEL_FINAL
0,1,10023770,Wiedenweg,2786,Grellingen,BL,True,2610733.0,1254311.0,Wiedenweg,Wieden,weg,,Wieden
1,2,10179192,Wuhrbärgli,2788,Liesberg,BL,True,2598709.0,1249640.0,Wuhrbärgli,Wuhrbärgli,,,Wuhrbärgli
2,9,10140563,Emanuelenweg,2829,Liestal,BL,True,2623078.0,1257558.0,Emanuelenweg,Emanuelen,weg,,Emanuelen
3,13,10069457,Löhrweg,2850,Känerkinden,BL,True,2630229.0,1251411.0,Löhrweg,Löhr,weg,,Löhr
4,15,10096235,Brunngasse,2833,Seltisberg,BL,True,2621406.0,1256852.0,Brunngasse,Brunn,gasse,,Brunn


In [3]:
wictionary = pd.read_csv('in_dictionary.csv',encoding='UTF-8-SIG', sep=';')
wictionary.head() 

Unnamed: 0,DIC_UID,DIC_Berufe,DIC_Lebewesen,DIC_Ortschaft,DIC_Gewässer,DIC_Berge
0,1.0,"Abbrucharbeiter, Abbrucharbeiterin",Ackergauchheil,Aeugst am Albis,Rhein,Albrun
1,2.0,"Abbruchmeister, Abbruchmeisterin",Ackerröte,Aeugstertal,Aare,Antrona
2,3.0,"Abbruchtechniker, Abbruchtechnikerin",Ackerschotendotter,Zwillikon,Rhone,Augstbord
3,4.0,Abdecker,Adelgras,Affoltern am Albis,Reuss,Balme
4,5.0,"Abfallbeseitiger, Abfallbeseitigerin",Adlerfarn,Bonstetten,Thur,Barna


## Create classification table

In [4]:
classif = streetnames

In [5]:
classif.columns

Index(['Unnamed: 0', 'STR_ESID', 'STN_LABEL', 'COM_FOSNR', 'COM_NAME',
       'COM_CANTON', 'STR_OFFICIAL', 'STR_EASTING', 'STR_NORTHING',
       'STN_LABEL_NO_BI', 'STN_LABEL_NO_TERMS', 'STR_TERMS', 'STR_PREPS',
       'STN_LABEL_FINAL'],
      dtype='object')

In [6]:
classif = classif.drop(['Unnamed: 0', 'COM_NAME', 'COM_CANTON','STR_EASTING', 'STR_NORTHING','COM_FOSNR',
                        'STR_OFFICIAL','STR_TERMS', 'STR_PREPS', 'STN_LABEL_NO_BI', 'STN_LABEL_NO_TERMS'], axis=1)
classif

Unnamed: 0,STR_ESID,STN_LABEL,STN_LABEL_FINAL
0,10023770,Wiedenweg,Wieden
1,10179192,Wuhrbärgli,Wuhrbärgli
2,10140563,Emanuelenweg,Emanuelen
3,10069457,Löhrweg,Löhr
4,10096235,Brunngasse,Brunn
...,...,...,...
171881,10107114,Tannenstrasse,Tannen
171882,10070806,Rebenstrasse,Reben
171883,10142291,Fritz-Gegauf-Strasse,Fritz Gegauf
171884,10093237,Obere Bleichewiese,Bleichewiese


In [7]:
missing_data = pd.DataFrame(
    classif.isnull().sum(),
    columns=['Missing Values'])

print(missing_data)

classif = classif.dropna()

                 Missing Values
STR_ESID                      0
STN_LABEL                     0
STN_LABEL_FINAL             443


## Mapping wictionary lists
https://de.wiktionary.org/wiki/Verzeichnis:Deutsch/Tiere

https://de.wiktionary.org/wiki/Verzeichnis:Deutsch/Pflanzen

https://de.wiktionary.org/wiki/Verzeichnis:Deutsch/Berufe

https://www.bafu.admin.ch/bafu/de/home/themen/wasser/zustand/karten/karten-und-abgeleitete-daten/gewaessernetz-der-schweiz.html

https://de.wikipedia.org/wiki/Liste_der_P%C3%A4sse_in_der_Schweiz
https://de.wikipedia.org/wiki/Liste_von_Bergen_in_der_Schweiz

### Lebewesen

In [8]:
dic_lebewesen = wictionary.DIC_Lebewesen.dropna()
dic_lebewesen

0           Ackergauchheil
1                Ackerröte
2       Ackerschotendotter
3                 Adelgras
4                Adlerfarn
               ...        
3192    Zwerg-Seepferdchen
3193        Zwerg-Pinscher
3194                Cedres
3195              Cericier
3196                 Reben
Name: DIC_Lebewesen, Length: 3197, dtype: object

Do the mapping

In [9]:
%%time 

warnings.filterwarnings("ignore")

for i in dic_lebewesen:
    classif.loc[classif.STN_LABEL_FINAL.str.match(i,case=False), 'DIC_Lebewesen'] = i
  
print("Number of matches: ", classif['DIC_Lebewesen'].notna().sum())

most_common = classif.groupby("DIC_Lebewesen").count()
most_common.STR_ESID.nlargest(10)

Number of matches:  10335
CPU times: total: 4min 26s
Wall time: 4min 32s


DIC_Lebewesen
Rose     673
Linde    641
Birke    440
Hase     373
Buche    303
Ahorn    244
Wolf     238
Bär      220
Eiche    216
Reben    210
Name: STR_ESID, dtype: int64

In [10]:
most_common = classif.groupby("DIC_Lebewesen").count()
pd.set_option('display.max_rows', 100)
most_common.STR_ESID.nlargest(10)
# Following names were deleted from dic_Lebewesen: 

DIC_Lebewesen
Rose     673
Linde    641
Birke    440
Hase     373
Buche    303
Ahorn    244
Wolf     238
Bär      220
Eiche    216
Reben    210
Name: STR_ESID, dtype: int64

### Municipalities

In [11]:
dic_Ortschaft = wictionary.DIC_Ortschaft.dropna()
dic_Ortschaft = dic_Ortschaft.str.replace("(\().*","", regex=True)
dic_Ortschaft = dic_Ortschaft.str.replace("(/).*","", regex=True)
dic_Ortschaft = dic_Ortschaft.unique()
pd.set_option('display.max_rows', 5000)
dic_Ortschaft
#dic_Ortschaft[:300]


array(['Aeugst am Albis', 'Aeugstertal', 'Zwillikon', ...,
       'Gamprin-Bendern', 'Ruggell', 'Schellenberg'], dtype=object)

In [12]:
%%time 

warnings.filterwarnings("ignore")

for i in dic_Ortschaft:
    classif.loc[classif.STN_LABEL_FINAL.str.match(i,case=False), 'DIC_Ortschaft'] = i
  
print("Number of matches: ", classif['DIC_Ortschaft'].notna().sum())

most_common = classif.groupby("DIC_Ortschaft").count()
most_common.STR_ESID.nlargest(10)

# Following names were deleted from dic_Lebewesen: 

Number of matches:  19635
CPU times: total: 6min 4s
Wall time: 6min 15s


DIC_Ortschaft
Dorf      1080
Eich       759
Cham       655
Linden     619
Halden     613
Matt       531
Mon        522
Rain       449
Tann       448
Gross      388
Name: STR_ESID, dtype: int64

### Professions

In [13]:
dic_Berufe = wictionary.DIC_Berufe.dropna()
dic_Berufe

0          Abbrucharbeiter, Abbrucharbeiterin
1            Abbruchmeister, Abbruchmeisterin
2        Abbruchtechniker, Abbruchtechnikerin
3                                    Abdecker
4        Abfallbeseitiger, Abfallbeseitigerin
                         ...                 
25053                                 Zwirner
25054                                Zytologe
25055                       Zytologielaborant
25056                           Zytotechniker
25057               Zytotechnischer Assistent
Name: DIC_Berufe, Length: 25058, dtype: object

In [14]:
dic_Berufe = dic_Berufe.str.replace("(\().*","", regex=True)
dic_Berufe = dic_Berufe.str.replace("(\)).*","", regex=True)
dic_Berufe = dic_Berufe.str.replace(",.*","", regex=True)
dic_Berufe = dic_Berufe.unique()
pd.set_option('display.max_rows', 5000)
dic_Berufe
#dic_Berufe[:200]

array(['Abbrucharbeiter', 'Abbruchmeister', 'Abbruchtechniker', ...,
       'Zytologielaborant', 'Zytotechniker', 'Zytotechnischer Assistent'],
      dtype=object)

In [15]:
%%time 

warnings.filterwarnings("ignore")

for i in dic_Berufe:
    classif.loc[classif.STN_LABEL_FINAL.str.contains(i,case=False), 'DIC_Berufe'] = i
  
print("Number of matches: ", classif['DIC_Berufe'].notna().sum())

most_common = classif.groupby("DIC_Berufe").count()
most_common.STR_ESID.nlargest(10)

# Following names were deleted from dic_Lebewesen: ober, oste

Number of matches:  2831
CPU times: total: 25min 57s
Wall time: 26min 19s


DIC_Berufe
Berger     163
Senn       157
Linger     153
Käser      144
Schmied    141
Fischer     84
Weber       80
Jäger       75
Müller      64
General     61
Name: STR_ESID, dtype: int64

### Water bodies

In [16]:
dic_gewässer = wictionary.DIC_Gewässer.dropna()
dic_gewässer.head()

0    Rhein
1     Aare
2    Rhone
3    Reuss
4     Thur
Name: DIC_Gewässer, dtype: object

In [17]:
dic_gewässer_self = ('^see','see$','seeli', '^lac$', 'lago', '^lai$ ', 'bach$', 'weiher', 'wasser','^leman', 
                     'dranse' ,'^rives', 'ufer$','ruisseau', 'gorges', 'bächli')

In [18]:
%%time 

warnings.filterwarnings("ignore")

for i in dic_gewässer_self:
    classif.loc[classif.STN_LABEL_FINAL.str.contains(i,case=False), 'DIC_Gewässer'] = i

for i in dic_gewässer:
    classif.loc[classif.STN_LABEL_FINAL.str.match(i,case=False), 'DIC_Gewässer'] = i
  
print("Number of matches: ", classif['DIC_Gewässer'].notna().sum())

most_common = classif.groupby("DIC_Gewässer").count()
most_common.STR_ESID.nlargest(10)

Number of matches:  4907
CPU times: total: 14.9 s
Wall time: 15.2 s


DIC_Gewässer
bach$     2024
^see       491
weiher     374
see$       350
Rhein      206
bächli     156
wasser     129
Thur        94
Aare        88
^lac$       79
Name: STR_ESID, dtype: int64

### Mountains

In [19]:
dic_berg = wictionary.DIC_Berge.dropna()
dic_berg

# Following names were deleted from dic_Lebewesen: Cou, Dom, Lys, Croix

0                       Albrun
1                      Antrona
2                    Augstbord
3                        Balme
4                        Barna
5      Bocchetta di Val Maggia
6                        Boeuf
7                Bunderchrinde
8                     Carmenna
9                      Carnusa
10                     Casanna
11                  Chaschauna
12                     Chésery
13                    Cheville
14               Chinzig Chulm
15                   Chräzeren
16                     Chrüzli
17                       Corno
18                 Cristallina
19                     Diesrut
20                      Durand
21                     Duranna
22                       Encel
23                    Euschels
24                Ferret Grand
25                Ferret Petit
26                         Foo
27                     Fórcola
28              Fuorcla Surlej
29          Fuorcla da Patnaul
30                   Furcletta
31                    Futschöl
32      

In [20]:
dic_berg_self = ('berg$', 'piz ', ' mont', 'pass$', 'höhe', 'monte ', 'gletscher', 'glacier', 'alpen')

In [21]:
%%time 

warnings.filterwarnings("ignore")

for i in dic_berg_self:
    classif.loc[classif.STN_LABEL_FINAL.str.contains(i,case=False), 'DIC_Berg'] = i

for i in dic_berg:
    classif.loc[classif.STN_LABEL_FINAL.str.match(i,case=False), 'DIC_Berg'] = i
  
print("Number of matches: ", classif['DIC_Berg'].notna().sum())

most_common = classif.groupby("DIC_Berg").count()
most_common.STR_ESID.nlargest(10)

Number of matches:  7086
CPU times: total: 38.6 s
Wall time: 39.4 s


DIC_Berg
berg$       3745
höhe         502
alpen        225
Rigi         139
Säntis       122
 mont        105
Pilatus       89
Bachtel       80
Planches      67
Speer         66
Name: STR_ESID, dtype: int64

## Self Classifier

### Settlement names

In [22]:
dic_Siedlung = ('dorf', 'bahnhof', 'schul', 'haupt',  'antenne', 'bahn', 'dörfli', 'hof$',
                'flughafen',  'ort', 'quartier', 'tunnel', 'brücke', 'wehr','garten',
                'mühle', 'post', 'industrie',  'hof ',  'schloss', 'spital', 'alp ', 'kanal',
                'burg', 'damm', 'reservoir','gewerbe', 'park', 'auto', 'treppe', 'unterführ',
                'haus','turm', 'mauer', 'stadion', 'sport', 'fabrik', 'scheibenstand', 'schleuse',
                'turbine', 'antenne', 'camping', '^golf','schwimmbad', '^badi$', 'zoo', '^flug', 'werk',
                'village', 'gare', 'ecole', 'lieu', 'gare',
                'aero', '^pont', 'pont$', 'jardin', 'chateau','industrie', 'parc',
                'moulin', 'college', '^digue', 'barrage', 'escalier', 'passerelle','^tour','tir ',
                '^stade$', 'ciblerie', '^port$', 'ecluse','^perron', 'piscine', 'centrale', 'hopital',
                'ville$', 'ferme', '^bourg', 'stand', 'canal',
                'villaggio', 'stazione', 'scola', 'ferrovia')

In [23]:
%%time 

warnings.filterwarnings("ignore")

for i in dic_Siedlung:
    classif.loc[classif.STN_LABEL_FINAL.str.contains(i,case=False), 'DIC_Siedlung'] = i
  
print("Number of matches: ", classif['DIC_Siedlung'].notna().sum())

most_common = classif.groupby("DIC_Siedlung").count()
most_common.STR_ESID.nlargest(10)

Number of matches:  19580
CPU times: total: 9.45 s
Wall time: 9.71 s


DIC_Siedlung
hof$      2865
dorf      2450
haus      1500
garten    1443
mühle     1034
burg       893
schul      723
ort        550
haupt      536
post       523
Name: STR_ESID, dtype: int64

### Religious names

In [24]:
dic_Religion = ('kirch', 'eglise', 'kapelle$', 'sacre', 'kloster', 'friedhof', 'kreuz'
               ,'chapelle', 'croix', 'chilch', 'pfarr','himmel', 'jerusalem', 'couvent')

In [25]:
%%time 

warnings.filterwarnings("ignore")

for i in dic_Religion:
    classif.loc[classif.STN_LABEL_FINAL.str.contains(i,case=False), 'DIC_Religion'] = i
  
print("Number of matches: ", classif['DIC_Religion'].notna().sum())

most_common = classif.groupby("DIC_Religion").count()
most_common.STR_ESID.nlargest(10)

Number of matches:  2788
CPU times: total: 1.91 s
Wall time: 1.97 s


DIC_Religion
kirch       1237
kreuz        382
eglise       252
friedhof     207
croix        168
chapelle     144
kloster      137
pfarr         96
chilch        94
himmel        52
Name: STR_ESID, dtype: int64

### Names describing natural places

In [26]:
dic_natur =( 'grabe','halden', 'matt', 'bärgli','tal$', 'tobel', 'loch$', 'egga', 'egg$', 'egge$,','^eggen', 'wald', 
            'weid', 'rain', 'grund', 'büel', 'täli', 'tälli', 'grat$', 'laui','^furt$', 'boden', 'klus', 'holz', 'feld',
            'zelgli', 'allmend', 'land$','ried$', 'bühl','riet', 'acker','wiese', 'moos', 'rüti',
            'plateau', 'combe', 'vallon','valle$', 'creux', 'cresta', 'crete','^bois',
            '^val ', 'vallone','^val$')

In [27]:
%%time 

warnings.filterwarnings("ignore")

for i in dic_natur:
    classif.loc[classif.STN_LABEL_FINAL.str.contains(i,case=False), 'DIC_Natur'] = i
  
print("Number of matches: ", classif['DIC_Natur'].notna().sum())

most_common = classif.groupby("DIC_Natur").count()
most_common.STR_ESID.nlargest(10)

Number of matches:  31243
CPU times: total: 5.06 s
Wall time: 5.13 s


DIC_Natur
matt      5138
acker     2994
rain      2393
feld      1834
holz      1612
weid      1511
halden    1504
wald      1485
moos      1226
rüti      1098
Name: STR_ESID, dtype: int64

## Testing similarity with spacy

In [28]:
#Load a German language model to do NLP - the models we use will influence our results a lot
nlp = spacy.load('de_core_news_md') #Change fr/de

In [29]:
#Load a German language model to do NLP - the models we use will influence our results a lot
nlp = spacy.load('de_core_news_md') #Change fr/de

In [30]:
#e.g.
doc1 = nlp("Zürich")
doc2 = nlp("Bern,")
print(doc1.similarity(doc2))

0.4916154409620511


In [31]:
#Create subset since spacy is slow for larger datasets
classifSubset = classif.iloc[:500,:]

In [32]:
%%time 

# Start the stopwatch / counter

for i in classifSubset.STN_LABEL_FINAL.items():
    doc1 = nlp(i[1])
    print(i[1])
    for j in dic_Religion:
        doc2 = nlp(j)
        sim = doc1.similarity(doc2)
        classifSubset.loc[i[0], 'ANIMALS_SIM'] = sim
        phrase = 'similarity of ' + str(i) + ' with ' + str(j)+ ' is: '+ str(sim)
        #if sim > 0.7:
        #    print(phrase)

Wieden
Wuhrbärgli
Emanuelen
Löhr
Brunn
Becken
Post
Bromhübel
Zurlinden
Bergli
Weihermatt
Schmalzgruben
Kelten
Biel
Wald
Neu
Vogesen
Wolfgalgenstutz
Anger
Chilch
Anton von Blarer
Golchen
Madlenjäger
Strengelersacker
Rütirain
Oris
Fuchsacker
Stockbrunnenrain
Langenhag
Weid
Meisen
Holder
Ringlichen
Fuchsmatt
Elbis
Buech
Chilchhöfli
Bruderholz
Gempen
Laimatt
Tal
Haupt
Etzmatten
Graben
Aesch Bannacker
Reben
Bündtenhag
Bieglin
Bromberg
Hohle
Zunftacker
Batterie
Buchen
Bahnhof
Lärchen Häuli
Bolzenried
Gehren
Seeber
Bärenbrunnen
Gempen
Paradies
Spittellache
Herrenmatt
Prediger
Beunden
Schul
Chläberen
Vogtsmatten
Langgarben
Oberkleinfeld
Neu
Bois Pouche
Stallen
Fehlmann
Waldenburger
Gärten
Leimgruben
Schibenstand
Baumgarten
Industrie
Nenzlinger
Matten
Bogen
Bodenacker
Hohl
Chrummischleifi
Römer
General Guisan
Titterter
Buttertal
Gewerbe
Höhen
Sissacher
Linden
Freidorf
Kirch
Greifel
Badacher
Heiden
Geissmatt
Schul
Rain
Lee
Häfelfinger
Garten
Zündhollen
Schwirten
Rebacker
Gewerbe
Bünnen
Mooshag
B

In [33]:
classif.to_csv('C:\CAS_Arbeit\cassda-zertifikatsarbeit\Modeling\out_dictionary.csv', encoding='UTF-8-SIG', sep=';')