# Classifying streets using dictionaries

In [1]:
#Load all needed libraries
import pandas as pd
import time
import ety
import spacy #Our NLP tools
import de_core_news_md #!python -m spacy download de_core_news_md
import fr_core_news_md #!python -m spacy download fr_core_news_md
import numpy as np
from functools import reduce  
import warnings
import dask.dataframe as dd
warnings.filterwarnings("ignore")

## Load all input data

In [2]:
streetnames = pd.read_csv('../Datapreparation/streetnames.csv', encoding='UTF-8-SIG', sep=';')
streetnames.head()  

Unnamed: 0.1,Unnamed: 0,STR_ESID,STN_LABEL,COM_FOSNR,COM_NAME,COM_CANTON,STR_OFFICIAL,STR_EASTING,STR_NORTHING,STN_LABEL_NO_BI,STN_LABEL_NO_TERMS,STR_TERMS,STR_PREPS,STN_LABEL_FINAL
0,1,10023770,Wiedenweg,2786,Grellingen,BL,True,2610733.0,1254311.0,Wiedenweg,Wieden,weg,,Wieden
1,2,10179192,Wuhrbärgli,2788,Liesberg,BL,True,2598709.0,1249640.0,Wuhrbärgli,Wuhrbärgli,,,Wuhrbärgli
2,9,10140563,Emanuelenweg,2829,Liestal,BL,True,2623078.0,1257558.0,Emanuelenweg,Emanuelen,weg,,Emanuelen
3,13,10069457,Löhrweg,2850,Känerkinden,BL,True,2630229.0,1251411.0,Löhrweg,Löhr,weg,,Löhr
4,15,10096235,Brunngasse,2833,Seltisberg,BL,True,2621406.0,1256852.0,Brunngasse,Brunn,gasse,,Brunn


In [3]:
wictionary = pd.read_csv('in_dictionary.csv',encoding='UTF-8-SIG', sep=';')
wictionary.head() 

Unnamed: 0,DIC_UID,DIC_Berufe,DIC_Lebewesen,DIC_Ortschaft,DIC_Gewässer,DIC_Berge
0,1.0,"Abbrucharbeiter, Abbrucharbeiterin",Ackergauchheil,Aeugst am Albis,Rhein,Albrun
1,2.0,"Abbruchmeister, Abbruchmeisterin",Ackerröte,Aeugstertal,Aare,Antrona
2,3.0,"Abbruchtechniker, Abbruchtechnikerin",Ackerschotendotter,Zwillikon,Rhone,Augstbord
3,4.0,Abdecker,Adelgras,Affoltern am Albis,Reuss,Balme
4,5.0,"Abfallbeseitiger, Abfallbeseitigerin",Adlerfarn,Bonstetten,Thur,Barna


## Create classification table

In [4]:
classif = streetnames

In [5]:
classif.columns

Index(['Unnamed: 0', 'STR_ESID', 'STN_LABEL', 'COM_FOSNR', 'COM_NAME',
       'COM_CANTON', 'STR_OFFICIAL', 'STR_EASTING', 'STR_NORTHING',
       'STN_LABEL_NO_BI', 'STN_LABEL_NO_TERMS', 'STR_TERMS', 'STR_PREPS',
       'STN_LABEL_FINAL'],
      dtype='object')

In [6]:
classif = classif.drop(['Unnamed: 0', 'COM_NAME', 'COM_CANTON','STR_EASTING', 'STR_NORTHING','COM_FOSNR',
                        'STR_OFFICIAL','STR_TERMS', 'STR_PREPS', 'STN_LABEL_NO_BI', 'STN_LABEL_NO_TERMS'], axis=1)
classif

Unnamed: 0,STR_ESID,STN_LABEL,STN_LABEL_FINAL
0,10023770,Wiedenweg,Wieden
1,10179192,Wuhrbärgli,Wuhrbärgli
2,10140563,Emanuelenweg,Emanuelen
3,10069457,Löhrweg,Löhr
4,10096235,Brunngasse,Brunn
...,...,...,...
172062,10107114,Tannenstrasse,Tannen
172063,10070806,Rebenstrasse,Reben
172064,10142291,Fritz-Gegauf-Strasse,Fritz Gegauf
172065,10093237,Obere Bleichewiese,Bleichewiese


In [7]:
missing_data = pd.DataFrame(
    classif.isnull().sum(),
    columns=['Missing Values'])

print(missing_data)

classif = classif.dropna()

                 Missing Values
STR_ESID                      0
STN_LABEL                     0
STN_LABEL_FINAL             591


## Mapping wictionary lists
https://de.wiktionary.org/wiki/Verzeichnis:Deutsch/Tiere

https://de.wiktionary.org/wiki/Verzeichnis:Deutsch/Pflanzen

https://de.wiktionary.org/wiki/Verzeichnis:Deutsch/Berufe

https://www.bafu.admin.ch/bafu/de/home/themen/wasser/zustand/karten/karten-und-abgeleitete-daten/gewaessernetz-der-schweiz.html

https://de.wikipedia.org/wiki/Liste_der_P%C3%A4sse_in_der_Schweiz
https://de.wikipedia.org/wiki/Liste_von_Bergen_in_der_Schweiz

### Lebewesen

In [8]:
dic_lebewesen = wictionary.DIC_Lebewesen.dropna()

special = (['.', ' '],[',', ' '],[';', ' '],[':', ' '],['-', ' '],['\'', ' '],[',', ' '],[' ', ' '],
['è', 'e'],['é', 'e'],['ê', 'e'],['ë', 'e'],['à', 'a'],['á', 'a'],['â', 'a'],['ô', 'o'],['ò', 'o'],['ó', 'o'],
['û', 'u'],['ù', 'u'],['ú', 'u'],['ï', 'i'],['í', 'i'],['î', 'i'],['ç', 'c'])
# Not useful for matching with wikidata,['ä', 'ae'],['ü', 'ue'],['ö', 'oe']        )

for i in special:
    dic_lebewesen = dic_lebewesen.str.replace(i[0],i[1],case=False, regex=True)
    
#Delete spaces at the beginning and end of the string using function 'strip()'
dic_lebewesen = dic_lebewesen.str.strip()

dic_lebewesen = dic_lebewesen.str.replace("\ .*$","", regex=True)

In [9]:
sorted_items = sorted(dic_lebewesen.items(), key = lambda item : len(item[1]))

In [10]:
#Delete most words with three or less characters
to_del = [419, 1190,304, 781, 832, 863, 980, 1076,1124,1155,1168,1170,1233,1321,1323,1347,1427,1473,1492,1544,
          1648,1708,1709,1817,1999,2001,2331,2408,2622,2668,3150]

for i in to_del:
    del dic_lebewesen[i]

In [11]:
dic_lebewesen[:10]

0        Ackergauchheil
1             Ackerröte
2    Ackerschotendotter
3              Adelgras
4             Adlerfarn
5         Adonisröschen
6                  Agar
7                 Agave
8            Ährenlilie
9                 Ahorn
Name: DIC_Lebewesen, dtype: object

Do the mapping

In [12]:
%%time 

warnings.filterwarnings("ignore")

for i in dic_lebewesen:
    classif.loc[classif.STN_LABEL_FINAL.str.match(i,case=False), 'DIC_Lebewesen'] = i
  
print("Number of matches: ", classif['DIC_Lebewesen'].notna().sum())

most_common = classif.groupby("DIC_Lebewesen").count()
most_common.STR_ESID.nlargest(10)
# Following names were deleted from dic_Lebewesen: 

Number of matches:  14100
CPU times: total: 6min 11s
Wall time: 6min 29s


DIC_Lebewesen
Stein    906
Berg     814
Rose     674
Linde    641
Birke    440
Hase     373
Buche    303
Ahorn    244
Wolf     237
Bär      220
Name: STR_ESID, dtype: int64

### Municipalities

In [13]:
dic_Ortschaft = wictionary.DIC_Ortschaft.dropna()
dic_Ortschaft = dic_Ortschaft.str.replace("(\().*","", regex=True)
dic_Ortschaft = dic_Ortschaft.str.replace("(/).*","", regex=True)
dic_Ortschaft = dic_Ortschaft.str.replace("Le ","", regex=True)
dic_Ortschaft = dic_Ortschaft.str.replace("La ","", regex=True)
dic_Ortschaft = dic_Ortschaft.str.replace("Les ","", regex=True)
dic_Ortschaft = dic_Ortschaft.str.replace("L'","", regex=True)
dic_Ortschaft = dic_Ortschaft.str.replace("St ","", regex=True)
dic_Ortschaft = dic_Ortschaft.str.replace("St. ","", regex=True)
dic_Ortschaft = dic_Ortschaft.str.replace("Ste ","", regex=True)
dic_Ortschaft = dic_Ortschaft.str.replace("S. ","", regex=True)
dic_Ortschaft = dic_Ortschaft.str.replace("St-","", regex=True)


special = (['.', ' '],[',', ' '],[';', ' '],[':', ' '],['-', ' '],['\'', ' '],[',', ' '],[' ', ' '],
['è', 'e'],['é', 'e'],['ê', 'e'],['ë', 'e'],['à', 'a'],['á', 'a'],['â', 'a'],['ô', 'o'],['ò', 'o'],['ó', 'o'],
['û', 'u'],['ù', 'u'],['ú', 'u'],['ï', 'i'],['í', 'i'],['î', 'i'],['ç', 'c'])
# Not useful for matching with wikidata,['ä', 'ae'],['ü', 'ue'],['ö', 'oe']        )


for i in special:
    dic_Ortschaft = dic_Ortschaft.str.replace(i[0],i[1],case=False, regex=True)
    
#Delete spaces at the beginning and end of the string using function 'strip()'
dic_Ortschaft = dic_Ortschaft.str.strip()

dic_Ortschaft = dic_Ortschaft.str.replace("\ .*","", regex=True)

pd.set_option('display.max_rows', 5000)
#dic_Ortschaft[:300]


In [14]:
sorted_items = sorted(dic_Ortschaft.items(), key = lambda item : len(item[1]))

In [15]:
#Delete most words with three or less characters
to_del = [2178,272, 1325,1830,2051,2224,2706,3989,73, 171, 481, 681, 1056,1162,1201,1269,1307,1441,1861,1871,1932,1938,
          1956,1975,1998,2011,2018,2027,2045,2122,2165,2182,2203,2204,2226,2328,2466,2496,2574,2950,3110,3169,3190,3212,3224,3246,
          3596,3649,3663,3688,3694,3745,3785,3807,3815,3924,22,24,26,107,109,119,121,198,444,449,456,458,519,656,751,915,
         1161,1170,1436]

for i in to_del:
    del dic_Ortschaft[i]

In [16]:
dic_Ortschaft = dic_Ortschaft.unique()
dic_Ortschaft

array(['Aeugst', 'Aeugstertal', 'Zwillikon', ..., 'Gamprin', 'Ruggell',
       'Schellenberg'], dtype=object)

In [17]:
%%time 

warnings.filterwarnings("ignore")

for i in dic_Ortschaft:
    classif.loc[classif.STN_LABEL_FINAL.str.match(i,case=False), 'DIC_Ortschaft'] = i
  
print("Number of matches: ", classif['DIC_Ortschaft'].notna().sum())

most_common = classif.groupby("DIC_Ortschaft").count()
most_common.STR_ESID.nlargest(10)

# Following names were deleted from dic_Lebewesen: 

Number of matches:  32449
CPU times: total: 7min 11s
Wall time: 7min 54s


DIC_Ortschaft
Stein       856
Berg        814
Eich        760
Linden      619
Halden      613
Buch        595
Champ       589
Wald        495
Oberdorf    494
Burg        493
Name: STR_ESID, dtype: int64

### Professions

In [18]:
dic_Berufe = wictionary.DIC_Berufe.dropna()
dic_Berufe

0          Abbrucharbeiter, Abbrucharbeiterin
1            Abbruchmeister, Abbruchmeisterin
2        Abbruchtechniker, Abbruchtechnikerin
3                                    Abdecker
4        Abfallbeseitiger, Abfallbeseitigerin
                         ...                 
25053                                 Zwirner
25054                                Zytologe
25055                       Zytologielaborant
25056                           Zytotechniker
25057               Zytotechnischer Assistent
Name: DIC_Berufe, Length: 25058, dtype: object

In [19]:
dic_Berufe = dic_Berufe.str.replace("(\().*","", regex=True)
dic_Berufe = dic_Berufe.str.replace("(\)).*","", regex=True)
dic_Berufe = dic_Berufe.str.replace(",.*","", regex=True)
#dic_Berufe = dic_Berufe.unique()
pd.set_option('display.max_rows', 5000)
dic_Berufe[:10]

0     Abbrucharbeiter
1      Abbruchmeister
2    Abbruchtechniker
3            Abdecker
4    Abfallbeseitiger
5        Abgeordneter
6           ABM-Kraft
7                 Abt
8    Abteilungsleiter
9      Achatschleifer
Name: DIC_Berufe, dtype: object

In [20]:
%%time 

warnings.filterwarnings("ignore")

for i in dic_Berufe:
    classif.loc[classif.STN_LABEL_FINAL.str.contains(i,case=False), 'DIC_Berufe'] = i
  
print("Number of matches: ", classif['DIC_Berufe'].notna().sum())

most_common = classif.groupby("DIC_Berufe").count()
most_common.STR_ESID.nlargest(10)

# Following names were deleted from dic_Lebewesen: ober, oste

Number of matches:  2830
CPU times: total: 42min 6s
Wall time: 44min 22s


DIC_Berufe
Berger     160
Senn       157
Linger     153
Käser      144
Schmied    143
Fischer     84
Weber       80
Jäger       75
Müller      64
General     61
Name: STR_ESID, dtype: int64

### Water bodies

In [21]:
dic_gewässer = wictionary.DIC_Gewässer.dropna()
dic_gewässer.head()

0    Rhein
1     Aare
2    Rhone
3    Reuss
4     Thur
Name: DIC_Gewässer, dtype: object

In [22]:
dic_gewässer_self = ('^see','see$','seeli', '^lac$', 'lago', '^lai$ ', 'bach$', 'weiher', 'wasser','^leman', 
                     'dranse' ,'^rives', 'ufer$','ruisseau', 'gorges', 'bächli')

In [23]:
%%time 

warnings.filterwarnings("ignore")

for i in dic_gewässer_self:
    classif.loc[classif.STN_LABEL_FINAL.str.contains(i,case=False), 'DIC_Gewässer'] = i

for i in dic_gewässer:
    classif.loc[classif.STN_LABEL_FINAL.str.match(i,case=False), 'DIC_Gewässer'] = i
  
print("Number of matches: ", classif['DIC_Gewässer'].notna().sum())

most_common = classif.groupby("DIC_Gewässer").count()
most_common.STR_ESID.nlargest(10)

Number of matches:  4916
CPU times: total: 24.5 s
Wall time: 30.3 s


DIC_Gewässer
bach$     2030
^see       488
weiher     374
see$       353
Rhein      206
bächli     156
wasser     129
Thur        94
Aare        88
^lac$       79
Name: STR_ESID, dtype: int64

### Mountains

In [24]:
dic_berg = wictionary.DIC_Berge.dropna()
dic_berg = dic_berg.str.replace("Piz ","", regex=True)
dic_berg = dic_berg.str.replace("Pizzo ","", regex=True)
# Following names were deleted from dic_Lebewesen: Cou, Dom, Lys, Croix

In [25]:
sorted_items = sorted(dic_berg.items(), key = lambda item : len(item[1]))

In [26]:
#Delete most words with three or less characters
to_del = [26,166,366,49,64]

for i in to_del:
    del dic_berg[i]
    
dic_berg[:10]

0                     Albrun
1                    Antrona
2                  Augstbord
3                      Balme
4                      Barna
5    Bocchetta di Val Maggia
6                      Boeuf
7              Bunderchrinde
8                   Carmenna
9                    Carnusa
Name: DIC_Berge, dtype: object

In [27]:
dic_berg_self = ('berg$', 'piz ', ' mont', 'pass$', 'höhe', 'monte ', 'gletscher', 'glacier', 'alpen', 'jura',
                'alpes')

In [28]:
%%time 

warnings.filterwarnings("ignore")

for i in dic_berg_self:
    classif.loc[classif.STN_LABEL_FINAL.str.contains(i,case=False), 'DIC_Berg'] = i

for i in dic_berg:
    classif.loc[classif.STN_LABEL_FINAL.str.match(i,case=False), 'DIC_Berg'] = i
  
print("Number of matches: ", classif['DIC_Berg'].notna().sum())

most_common = classif.groupby("DIC_Berg").count()
most_common.STR_ESID.nlargest(10)

Number of matches:  7433
CPU times: total: 55.4 s
Wall time: 59.1 s


DIC_Berg
berg$       3747
höhe         502
jura         274
alpen        225
Rigi         139
Säntis       122
 mont        105
Pilatus       89
Bachtel       80
Planches      67
Name: STR_ESID, dtype: int64

## Self Classifier

### Settlement names

In [29]:
dic_Siedlung = ('dorf', 'bahnhof', 'schul', 'haupt',  'antenne', 'bahn', 'dörfli', 'hof$',
                'flughafen',  'ort', 'quartier', 'tunnel', 'brücke', 'wehr','garten', 'kaserne',
                'mühle', 'post', 'industrie',  'hof ',  'schloss', 'spital', 'alp ', 'kanal', 'casino',
                'burg', 'damm', 'reservoir','gewerbe', 'park', 'auto', 'treppe', 'unterführ', 'museum',
                'haus','turm', 'mauer', 'stadion', 'sport', 'fabrik', 'scheibenstand', 'schleuse',
                'turbine', 'antenne', 'camping', '^golf','schwimmbad', '^badi$', 'zoo', '^flug', 'werk', 'markt',
                'village', 'gare', 'ecole', 'lieu', 'gare', 'musee',
                'aero', '^pont', 'pont$', 'jardin', 'chateau','industrie', 'parc',
                'moulin', 'college', '^digue', 'barrage', 'escalier', 'passerelle','^tour','tir ',
                '^stade$', 'ciblerie', '^port$', 'ecluse','^perron', 'piscine', 'centrale', 'hopital',
                'ville$', 'ferme', '^bourg', 'stand', 'canal',
                'villaggio', 'stazione', 'scola', 'ferrovia')

In [30]:
%%time 

warnings.filterwarnings("ignore")

for i in dic_Siedlung:
    classif.loc[classif.STN_LABEL_FINAL.str.contains(i,case=False), 'DIC_Siedlung'] = i
  
print("Number of matches: ", classif['DIC_Siedlung'].notna().sum())

most_common = classif.groupby("DIC_Siedlung").count()
most_common.STR_ESID.nlargest(10)

Number of matches:  19780
CPU times: total: 11.9 s
Wall time: 12.5 s


DIC_Siedlung
hof$      2875
dorf      2450
haus      1501
garten    1443
mühle     1034
burg       893
schul      723
ort        548
haupt      536
post       524
Name: STR_ESID, dtype: int64

### Religious names

In [31]:
dic_Religion = ('kirch', 'eglise', 'kapelle$', 'sacre', 'kloster', 'friedhof', 'kreuz'
               ,'chapelle', 'croix', 'chilch', 'pfarr','himmel', 'jerusalem', 'couvent')

In [32]:
%%time 

warnings.filterwarnings("ignore")

for i in dic_Religion:
    classif.loc[classif.STN_LABEL_FINAL.str.contains(i,case=False), 'DIC_Religion'] = i
  
print("Number of matches: ", classif['DIC_Religion'].notna().sum())

most_common = classif.groupby("DIC_Religion").count()
most_common.STR_ESID.nlargest(10)

Number of matches:  2788
CPU times: total: 1.8 s
Wall time: 1.82 s


DIC_Religion
kirch       1237
kreuz        382
eglise       252
friedhof     207
croix        168
chapelle     144
kloster      137
pfarr         96
chilch        94
himmel        52
Name: STR_ESID, dtype: int64

### Names describing natural places

In [33]:
dic_natur =( 'grabe','halden', 'matt', 'bärgli','tal$', 'tobel', 'loch$', 'egga', 'egg$', 'egge$,','^eggen', 'wald', 
            'weid', 'rain', 'grund', 'büel', 'täli', 'tälli', 'grat$', 'laui','^furt$', 'boden', 'klus', 'holz', 'feld',
            'zelgli', 'allmend', 'land$','ried$', 'bühl','riet', 'acker','wiese', 'moos', 'rüti',
            'plateau', 'combe', 'vallon','valle$', 'creux', 'cresta', 'crete','^bois',
            '^val ', 'vallone','^val$')

In [34]:
%%time 

warnings.filterwarnings("ignore")

for i in dic_natur:
    classif.loc[classif.STN_LABEL_FINAL.str.contains(i,case=False), 'DIC_Natur'] = i
  
print("Number of matches: ", classif['DIC_Natur'].notna().sum())

most_common = classif.groupby("DIC_Natur").count()
most_common.STR_ESID.nlargest(10)

Number of matches:  31245
CPU times: total: 6.03 s
Wall time: 6.2 s


DIC_Natur
matt      5138
acker     2994
rain      2393
feld      1834
holz      1612
weid      1509
halden    1504
wald      1485
moos      1226
rüti      1098
Name: STR_ESID, dtype: int64

## Testing similarity with spacy

In [35]:
#Load a German language model to do NLP - the models we use will influence our results a lot
nlp = spacy.load('de_core_news_md') #Change fr/de

In [36]:
#Load a German language model to do NLP - the models we use will influence our results a lot
nlp = spacy.load('de_core_news_md') #Change fr/de

In [37]:
#e.g.
doc1 = nlp("Zürich")
doc2 = nlp("Bern,")
print(doc1.similarity(doc2))

0.4916154409620511


In [38]:
#Create subset since spacy is slow for larger datasets
classifSubset = classif.iloc[:30,:]

In [39]:
%%time 


for i in classifSubset.STN_LABEL_FINAL.items():
    doc1 = nlp(i[1])
    print(i[1])
    for j in dic_Siedlung:
        doc2 = nlp(j)
        sim = doc1.similarity(doc2)
        classifSubset.loc[i[0], 'SIEDLUNG_SIM'] = sim
        phrase = 'similarity of ' + str(i) + ' with ' + str(j)+ ' is: '+ str(sim)
        if sim > 0.7:
            print(phrase)

Wieden
Wuhrbärgli
Emanuelen
Löhr
Brunn
Becken
Post
Bromhübel
Zurlinden
Bergli
Weihermatt
Schmalzgruben
Kelten
Biel
Wald
Neu
Vogesen
Wolfgalgenstutz
Anger
Chilch
Anton von Blarer
Golchen
Madlenjäger
Strengelersacker
Rütirain
Oris
Fuchsacker
Stockbrunnenrain
Langenhag
Weid
CPU times: total: 19.5 s
Wall time: 20.6 s


In [40]:
classif.to_csv('C:\CAS_Arbeit\cassda-zertifikatsarbeit\Modeling\out_dictionary.csv', encoding='UTF-8-SIG', sep=';')