# Classifying streets using dictionaries

In [1]:
#Load all needed libraries
import pandas as pd
from time import process_time_ns
import ety
import spacy #Our NLP tools
import de_core_news_md #!python -m spacy download de_core_news_md
import fr_core_news_md #!python -m spacy download fr_core_news_md
import numpy as np
from functools import reduce  

  from .autonotebook import tqdm as notebook_tqdm


## Load all input data

In [2]:
streetnames = pd.read_csv('../Datapreparation/streetnames.csv', encoding='UTF-8-SIG', sep=';')
streetnames.head()  

Unnamed: 0.1,Unnamed: 0,STR_ESID,STN_LABEL,COM_FOSNR,COM_NAME,COM_CANTON,STR_OFFICIAL,STR_EASTING,STR_NORTHING,STN_LABEL_NO_BI,STN_LABEL_NO_TERMS,STR_TERMS,STR_PREPS,STN_LABEL_FINAL
0,1,10023770,Wiedenweg,2786,Grellingen,BL,True,2610733.0,1254311.0,Wiedenweg,Wieden,weg,,Wieden
1,2,10179192,Wuhrbärgli,2788,Liesberg,BL,True,2598709.0,1249640.0,Wuhrbärgli,Wuhrbärgli,,,Wuhrbärgli
2,9,10140563,Emanuelenweg,2829,Liestal,BL,True,2623078.0,1257558.0,Emanuelenweg,Emanuelen,weg,,Emanuelen
3,13,10069457,Löhrweg,2850,Känerkinden,BL,True,2630229.0,1251411.0,Löhrweg,Löhr,weg,,Löhr
4,15,10096235,Brunngasse,2833,Seltisberg,BL,True,2621406.0,1256852.0,Brunngasse,Brunn,gasse,,Brunn


In [3]:
wictionary = pd.read_csv('in_dictionary.csv',encoding='UTF-8-SIG', sep=';')
wictionary.head() 

Unnamed: 0,DIC_UID,DIC_Berufe,DIC_Lebewesen,DIC_Ortschaft
0,1.0,"Abbrucharbeiter, Abbrucharbeiterin",Ackergauchheil,Aeugst am Albis
1,2.0,"Abbruchmeister, Abbruchmeisterin",Ackerröte,Aeugstertal
2,3.0,"Abbruchtechniker, Abbruchtechnikerin",Ackerschotendotter,Zwillikon
3,4.0,Abdecker,Adelgras,Affoltern am Albis
4,5.0,"Abfallbeseitiger, Abfallbeseitigerin",Adlerfarn,Bonstetten


## Create classification table

In [4]:
classif = streetnames

In [5]:
classif.columns

Index(['Unnamed: 0', 'STR_ESID', 'STN_LABEL', 'COM_FOSNR', 'COM_NAME',
       'COM_CANTON', 'STR_OFFICIAL', 'STR_EASTING', 'STR_NORTHING',
       'STN_LABEL_NO_BI', 'STN_LABEL_NO_TERMS', 'STR_TERMS', 'STR_PREPS',
       'STN_LABEL_FINAL'],
      dtype='object')

In [6]:
classif = classif.drop(['Unnamed: 0', 'COM_NAME', 'COM_CANTON','STR_EASTING', 'STR_NORTHING','COM_FOSNR',
                        'STR_OFFICIAL','STR_TERMS', 'STR_PREPS', 'STN_LABEL_NO_BI', 'STN_LABEL_NO_TERMS'], axis=1)
classif

Unnamed: 0,STR_ESID,STN_LABEL,STN_LABEL_FINAL
0,10023770,Wiedenweg,Wieden
1,10179192,Wuhrbärgli,Wuhrbärgli
2,10140563,Emanuelenweg,Emanuelen
3,10069457,Löhrweg,Löhr
4,10096235,Brunngasse,Brunn
...,...,...,...
171867,10107114,Tannenstrasse,Tannen
171868,10070806,Rebenstrasse,Reben
171869,10142291,Fritz-Gegauf-Strasse,Fritz Gegauf
171870,10093237,Obere Bleichewiese,Bleichewiese


In [7]:
missing_data = pd.DataFrame(
    classif.isnull().sum(),
    columns=['Missing Values'])

print(missing_data)

classif = classif.dropna()

                 Missing Values
STR_ESID                      0
STN_LABEL                     0
STN_LABEL_FINAL             415


## Mapping wictionary lists
https://de.wiktionary.org/wiki/Verzeichnis:Deutsch/Tiere

https://de.wiktionary.org/wiki/Verzeichnis:Deutsch/Pflanzen

https://de.wiktionary.org/wiki/Verzeichnis:Deutsch/Berufe

https://de.wiktionary.org/wiki/Verzeichnis:Deutsch/Geowissenschaften

### Lebewesen

In [8]:
dic_lebewesen = wictionary.DIC_Lebewesen.dropna()
dic_lebewesen

0                 Ackergauchheil
1                      Ackerröte
2             Ackerschotendotter
3                       Adelgras
4                      Adlerfarn
                  ...           
3190    Zwerg-Langzungenflughund
3191                   Zwergmaus
3192                Zwergpinguin
3193          Zwerg-Seepferdchen
3194              Zwerg-Pinscher
Name: DIC_Lebewesen, Length: 3195, dtype: object

Do the mapping

In [9]:
# Start the stopwatch / counter
t1_start = process_time_ns()

for i in dic_lebewesen:
    classif.loc[classif.STN_LABEL_FINAL.str.match(i,case=False), 'DIC_Lebewesen'] = i
    
# Stop the stopwatch / counter
t1_stop = process_time_ns()
   
print("Elapsed time in seconds:", (t1_stop-t1_start)/1000000000)

print("Number of matches: ", classif['DIC_Lebewesen'].notna().sum())

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  classif.loc[classif.STN_LABEL_FINAL.str.match(i,case=False), 'DIC_Lebewesen'] = i


Elapsed time in seconds: 291.21875
Number of matches:  10696


In [10]:
most_common = classif.groupby("DIC_Lebewesen").count()
pd.set_option('display.max_rows', 100)
most_common.STR_ESID.nlargest(50)
# Following names were deleted from dic_Lebewesen: Jonc, Hai, Chin, Wal, Ur, Ren, 
#Jak, Hai, Ara, Muli, Lin, Ai, Lar, Gui, Sau, Laus

DIC_Lebewesen
Rose        673
Linde       641
Moos        602
Birke       440
Hase        373
Buche       303
Ahorn       243
Wolf        238
Bär         220
Eiche       216
Fuchs       201
Föhre       197
Schwan      177
Esche       171
Ziege       169
Flieder     152
Lärche      149
Vogel       148
Nelke       137
Tulpe       132
Amsel       125
Hirsch      124
Meise       108
Schaf       107
Löwe         92
Reh          91
Fichte       90
Meier        86
Tilleul      85
Weide        75
Nuss         74
Mais         72
Farn         71
Ulme         70
Ochse        67
Roggen       67
Adler        65
Dachs        65
Dahlie       65
Lilie        64
Pappel       61
Sauge        60
Katze        59
Storch       56
Rind         55
Stier        53
Drossel      52
Schwalbe     51
Biber        47
Winde        46
Name: STR_ESID, dtype: int64

### Municipalities

In [11]:
dic_Ortschaft = wictionary.DIC_Ortschaft.dropna()
dic_Ortschaft

0          Aeugst am Albis
1              Aeugstertal
2                Zwillikon
3       Affoltern am Albis
4               Bonstetten
               ...        
4118            Schaanwald
4119             Mauren FL
4120       Gamprin-Bendern
4121               Ruggell
4122          Schellenberg
Name: DIC_Ortschaft, Length: 4123, dtype: object

In [12]:
# Start the stopwatch / counter
t1_start = process_time_ns()

for i in dic_Ortschaft:
    classif.loc[classif.STN_LABEL_FINAL.str.match(i,case=False), 'DIC_Ortschaft'] = i
    
# Stop the stopwatch / counter
t1_stop = process_time_ns()
   
print("Elapsed time in seconds:", (t1_stop-t1_start)/1000000000)

print("Number of matches: ", classif['DIC_Ortschaft'].notna().sum())

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  classif.loc[classif.STN_LABEL_FINAL.str.match(i,case=False), 'DIC_Ortschaft'] = i


Elapsed time in seconds: 357.0
Number of matches:  19091


In [13]:
most_common = classif.groupby("DIC_Ortschaft").count()
most_common.STR_ESID.nlargest(50)

DIC_Ortschaft
Dorf        1080
Eich         759
Cham         656
Linden       619
Halden       613
Matt         531
Mon          522
Rain         449
Tann         448
Gross        388
Winkel       365
Erlen        359
Sent         304
Font         292
Alle         284
Loc          257
Geiss        253
Brunnen      251
Rue          245
Bern         242
Tobel        240
Graben       209
Eschen       159
Strada       142
Port         135
Sur          129
Gruben       126
Lü           121
Bellevue     117
Monte        114
Hütten       109
Mettlen      106
Ebnet        105
Horn         105
Boll         102
Widen         99
Asp           97
Steinen       88
Halten        80
Contra        78
Binz          74
Court         69
Basel         67
Hofstatt      64
Luzern        64
Scheid        63
Heiden        57
Travers       57
Lausanne      48
Alten         47
Name: STR_ESID, dtype: int64

### Professions

In [14]:
dic_Berufe = wictionary.DIC_Berufe.dropna()
dic_Berufe

0          Abbrucharbeiter, Abbrucharbeiterin
1            Abbruchmeister, Abbruchmeisterin
2        Abbruchtechniker, Abbruchtechnikerin
3                                    Abdecker
4        Abfallbeseitiger, Abfallbeseitigerin
                         ...                 
25056                                 Zwirner
25057                                Zytologe
25058                       Zytologielaborant
25059                           Zytotechniker
25060               Zytotechnischer Assistent
Name: DIC_Berufe, Length: 25061, dtype: object

In [15]:
dic_Berufe = dic_Berufe.str.replace("(\().*","", regex=True)
dic_Berufe = dic_Berufe.str.replace("(\)).*","", regex=True)
dic_Berufe = dic_Berufe.str.replace(",.*","", regex=True)
dic_Berufe = dic_Berufe.unique()
pd.set_option('display.max_rows', 5000)
dic_Berufe

array(['Abbrucharbeiter', 'Abbruchmeister', 'Abbruchtechniker', ...,
       'Zytologielaborant', 'Zytotechniker', 'Zytotechnischer Assistent'],
      dtype=object)

In [16]:
dic_Berufe[:200]

array(['Abbrucharbeiter', 'Abbruchmeister', 'Abbruchtechniker',
       'Abdecker', 'Abfallbeseitiger', 'Abgeordneter', 'ABM-Kraft', 'Abt',
       'Abteilungsleiter', 'Achatschleifer', 'Adjutant', 'Administrator',
       'Admiral', 'Änderungsschneider', 'Agent', 'Akrobat',
       'Altenpflegehelfer', 'Altenpfleger',
       'Anästhesietechnischer Assistent', 'Angiologe', 'Angler',
       'Animateur', 'Animierdame', 'Anlageberater', 'Anlagenberater',
       'Anlagenmechaniker', 'Anlagenmechaniker für Sanitär-', 'Ansager',
       'Anstreicher', 'Anwalt', 'Anwaltsgehilfe', 'Apothekenhelfer',
       'Apparatebauer', 'Arbeiter', 'Architekt', 'Archivar',
       'Archivassistent', 'Arrangeur', 'Artist', 'Arzt', 'Arzthelfer',
       'Asphaltbauer', 'Assistent',
       'Assistent für Automatisierungs- und Computertechnik',
       'Assistent für den Gesundheitstourismus',
       'Assistent für Freizeitwirtschaft',
       'Assistent für Geovisualisierung', 'Assistent für Hotelmanagement',
       'A

In [17]:
# Start the stopwatch / counter
t1_start = process_time_ns()

for i in dic_Berufe:
    classif.loc[classif.STN_LABEL_FINAL.str.match(i,case=False), 'DIC_Berufe'] = i
    
# Stop the stopwatch / counter
t1_stop = process_time_ns()
   
print("Elapsed time in seconds:", (t1_stop-t1_start)/1000000000)

print("Number of matches: ", classif['DIC_Berufe'].notna().sum())

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  classif.loc[classif.STN_LABEL_FINAL.str.match(i,case=False), 'DIC_Berufe'] = i


Elapsed time in seconds: 2194.453125
Number of matches:  3933


In [18]:
most_common = classif.groupby("DIC_Berufe").count()
most_common.STR_ESID.nlargest(30)

DIC_Berufe
Ober         1776
Senn          146
Käser         140
Schmied       128
Fischer        81
Weber          73
Jäger          72
General        61
Oberst         51
Seiler         49
Gärtner        46
Metzger        46
Berger         44
Müller         40
Bäcker         36
Brauer         35
Carrier        35
Gerber         34
Tuilier        30
Küfer          27
Winzer         27
Mönch          26
Schlosser      25
Spinner        25
Devin          23
Hafner         22
Wagner         22
Hirt           20
Oste           20
Schäfer        19
Name: STR_ESID, dtype: int64

## Self Classifier

In [19]:
dic_Siedlung = ('dorf', 'bahnhof', 'schul', 'haupt',  'antenne', 'bahn', 
                'flughafen',  'ort', 'quartier', 'tunnel', 'brücke', 'wehr','garten',
                'mühle', 'post', 'industrie',  'hof ',  'schloss',
                'burg', 'damm', 'reservoir','gewerbe', 'park',
                'village', 'gare', 'ecole', 'lieu', 'gare',
                'aéroport', 'pont', 'jardin', 'chateau','industrie', 'parc',
                'moulin', 'college',
                'villaggio', 'stazione', 'scola', 'ferrovia')

In [20]:
dic_Religion = ('kirch', 'eglise', 'kapelle', 'sakral', 'kloster', 'friedhof', 'kreuz'
               ,'chapelle', 'croix', 'chilch')

In [21]:
dic_Natur =( 'grabe','halden', 'matt', 'bach', 'bärgli','tal', 'tobel', 'gorges', 
           'combe', 'loch', 'egg', 'wald', 'weid', 'rain', 'sonne', 'grund', 'büel'
           'zelgli', 'allmend', 'land ','ried', 'bühl','riet', 'acker')

In [22]:
# Start the stopwatch / counter
t1_start = process_time_ns()

for i in dic_Siedlung:
    classif.loc[classif.STN_LABEL_FINAL.str.contains(i,case=False), 'DIC_Siedlung'] = i
    
# Stop the stopwatch / counter
t1_stop = process_time_ns()
   
print("Elapsed time in seconds:", (t1_stop-t1_start)/1000000000)

print("Number of matches: ", classif['DIC_Siedlung'].notna().sum())

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  classif.loc[classif.STN_LABEL_FINAL.str.contains(i,case=False), 'DIC_Siedlung'] = i


Elapsed time in seconds: 3.921875
Number of matches:  14097


In [23]:
most_common = classif.groupby("DIC_Siedlung").count()
most_common.STR_ESID.nlargest(10)

DIC_Siedlung
dorf       2454
garten     1444
schul      1110
bahn       1088
mühle      1040
burg        897
ort         787
haupt       539
post        523
schloss     423
Name: STR_ESID, dtype: int64

In [24]:
# Start the stopwatch / counter
t1_start = process_time_ns()

for i in dic_Religion:
    classif.loc[classif.STN_LABEL_FINAL.str.contains(i,case=False), 'DIC_Religion'] = i
    
# Stop the stopwatch / counter
t1_stop = process_time_ns()
   
print("Elapsed time in seconds:", (t1_stop-t1_start)/1000000000)

print("Number of matches: ", classif['DIC_Religion'].notna().sum())

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  classif.loc[classif.STN_LABEL_FINAL.str.contains(i,case=False), 'DIC_Religion'] = i


Elapsed time in seconds: 0.90625
Number of matches:  2726


In [25]:
most_common = classif.groupby("DIC_Religion").count()
most_common.STR_ESID.nlargest(10)

DIC_Religion
kirch       1238
kreuz        382
eglise       252
friedhof     207
croix        168
chapelle     144
kloster      137
kapelle      104
chilch        94
Name: STR_ESID, dtype: int64

In [26]:
# Start the stopwatch / counter
t1_start = process_time_ns()

for i in dic_Natur:
    classif.loc[classif.STN_LABEL_FINAL.str.contains(i,case=False), 'DIC_Natur'] = i
    
# Stop the stopwatch / counter
t1_stop = process_time_ns()
   
print("Elapsed time in seconds:", (t1_stop-t1_start)/1000000000)

print("Number of matches: ", classif['DIC_Natur'].notna().sum())

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  classif.loc[classif.STN_LABEL_FINAL.str.contains(i,case=False), 'DIC_Natur'] = i


Elapsed time in seconds: 2.5
Number of matches:  27183


In [27]:
most_common = classif.groupby("DIC_Natur").count()
most_common.STR_ESID.nlargest(10)

DIC_Natur
matt      5104
acker     3036
bach      2543
rain      2341
egg       1721
tal       1604
weid      1576
wald      1549
halden    1488
ried      1148
Name: STR_ESID, dtype: int64

## Testing similarity with spacy

In [28]:
#Load a German language model to do NLP - the models we use will influence our results a lot
nlp = spacy.load('de_core_news_md') #Change fr/de

In [29]:
#Load a German language model to do NLP - the models we use will influence our results a lot
nlp = spacy.load('de_core_news_md') #Change fr/de

In [30]:
#e.g.
doc1 = nlp("Zürich")
doc2 = nlp("Bern,")
print(doc1.similarity(doc2))

0.4916154409620511


In [31]:
#Create subset since spacy is slow for larger datasets
classifSubset = classif.iloc[:500,:]

In [32]:
# Start the stopwatch / counter
t1_start = process_time_ns()

for i in classifSubset.STN_LABEL_FINAL.items():
    doc1 = nlp(i[1])
    print(i[1])
    for j in dic_Religion:
        doc2 = nlp(j)
        sim = doc1.similarity(doc2)
        classifSubset.loc[i[0], 'ANIMALS_SIM'] = sim
        phrase = 'similarity of ' + str(i) + ' with ' + str(j)+ ' is: '+ str(sim)
        if sim > 0.7:
            print(phrase)
        
        
# Stop the stopwatch / counter
t1_stop = process_time_ns()

print("Elapsed time in seconds:", (t1_stop-t1_start)/1000000000) 

Wieden
Wuhrbärgli
Emanuelen


  sim = doc1.similarity(doc2)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  classifSubset.loc[i[0], 'ANIMALS_SIM'] = sim
  sim = doc1.similarity(doc2)


Löhr
Brunn
Becken
Post
Bromhübel
Zurlinden
Bergli
Weihermatt
Schmalzgruben
Kelten
Biel
Wald
Neu
Vogesen
Wolfgalgenstutz
Anger
Chilch
Anton von Blarer
Golchen
Madlenjäger
Strengelersacker
Rütirain
Oris
Fuchsacker
Stockbrunnenrain
Langenhag
Weid
Meisen
Holder
Ringlichen
Fuchsmatt
Elbis
Buech
Chilchhöfli
Bruderholz
Gempen
Laimatt
Tal
Haupt
Etzmatten
Graben
Aesch Bannacker
Reben
Bündtenhag
Bieglin
Bromberg
Hohle
Zunftacker
Batterie
Buchen
Bahnhof
Lärchen Häuli
Bolzenried
Gehren
Seeber
Bärenbrunnen
Gempen
Paradies
Spittellache
Herrenmatt
Prediger
Beunden
Schul
Chläberen
Vogtsmatten
Langgarben
Oberkleinfeld
Neu
Bois Pouche
Stallen
Fehlmann
Waldenburger
Gärten
Leimgruben
Schibenstand
Baumgarten
Industrie
Nenzlinger
Matten
Bogen
Bodenacker
Hohl
Chrummischleifi
Römer
General Guisan
Titterter
Buttertal
Gewerbe
Höhen
Sissacher
Linden
Freidorf
Kirch
similarity of (95, 'Kirch') with kapelle is: 0.7206696098771028
Greifel
Badacher
Heiden
Geissmatt
Schul
Rain
Lee
Häfelfinger
Garten
Zündhollen
Schwirt

## Classifying special street designations

In [33]:
spacy = pd.read_csv('../Datapreparation/spacy.csv', encoding='UTF-8-SIG', sep=';')

In [34]:
#wiki = pd.read_csv('wiki.csv', encoding='UTF-8-SIG', sep=';')

In [35]:
mapping = pd.merge(classif, spacy, on='STR_ESID')

In [36]:
mapping = reduce(lambda left, right:     # Merge three pandas DataFrames
                     pd.merge(left , right,
                              on = ["STR_ESID"],
                              how = "outer"),
                     [streetnames, spacy, classif])

In [37]:
mapping.head(30)

Unnamed: 0,Unnamed: 0_x,STR_ESID,STN_LABEL_x,COM_FOSNR,COM_NAME,COM_CANTON,STR_OFFICIAL,STR_EASTING,STR_NORTHING,STN_LABEL_NO_BI,...,SPACY_FR_LEMMA,SPACY_FR_ENT,STN_LABEL,STN_LABEL_FINAL,DIC_Lebewesen,DIC_Ortschaft,DIC_Berufe,DIC_Siedlung,DIC_Religion,DIC_Natur
0,1,10023770,Wiedenweg,2786,Grellingen,BL,True,2610733.0,1254311.0,Wiedenweg,...,Wieden,,Wiedenweg,Wieden,,,,,,
1,2,10179192,Wuhrbärgli,2788,Liesberg,BL,True,2598709.0,1249640.0,Wuhrbärgli,...,Wuhrbärgli,,Wuhrbärgli,Wuhrbärgli,,,,,,bärgli
2,9,10140563,Emanuelenweg,2829,Liestal,BL,True,2623078.0,1257558.0,Emanuelenweg,...,emanuelen,,Emanuelenweg,Emanuelen,,,,,,
3,13,10069457,Löhrweg,2850,Känerkinden,BL,True,2630229.0,1251411.0,Löhrweg,...,Löhr,PER,Löhrweg,Löhr,,,,,,
4,15,10096235,Brunngasse,2833,Seltisberg,BL,True,2621406.0,1256852.0,Brunngasse,...,Brunn,,Brunngasse,Brunn,,,,,,
5,17,10118344,Beckengässli,2882,Bennwil,BL,True,2625714.0,1250261.0,Beckengässli,...,Becken,,Beckengässli,Becken,,,,,,
6,19,10103841,Poststrasse,2831,Pratteln,BL,True,2620726.0,1264894.0,Poststrasse,...,Post,,Poststrasse,Post,,,,post,,
7,20,10103030,Bromhübelweg,2763,Arlesheim,BL,True,2613595.0,1260018.0,Bromhübelweg,...,Bromhübel,,Bromhübelweg,Bromhübel,,,,,,
8,21,10108763,Zurlindenstrasse,2831,Pratteln,BL,True,2619392.0,1264175.0,Zurlindenstrasse,...,Zurlinden,,Zurlindenstrasse,Zurlinden,,,,,,
9,22,10209066,Bergli,1323,Wollerau,SZ,True,2694144.0,1226051.0,Bergli,...,Bergli,,Bergli,Bergli,,,,,,


In [38]:
mapping.loc[mapping['DIC_Ortschaft'].notna() , 'classif'] = 'Ortschaft'
mapping.loc[mapping['SPACY_FR_ENT'].notna() , 'classif'] = mapping.SPACY_FR_ENT
mapping.loc[mapping['DIC_Berufe'].notna() , 'classif'] = 'Beruf'
mapping.loc[mapping['DIC_Lebewesen'].notna() , 'classif'] = 'Lebewesen'
mapping.loc[mapping['DIC_Siedlung'].notna() , 'classif'] = 'Siedlung'
mapping.loc[mapping['DIC_Religion'].notna() , 'classif'] = 'Religion'
mapping.loc[mapping['DIC_Natur'].notna() , 'classif'] = 'Natur'

In [39]:
mapping

Unnamed: 0,Unnamed: 0_x,STR_ESID,STN_LABEL_x,COM_FOSNR,COM_NAME,COM_CANTON,STR_OFFICIAL,STR_EASTING,STR_NORTHING,STN_LABEL_NO_BI,...,SPACY_FR_ENT,STN_LABEL,STN_LABEL_FINAL,DIC_Lebewesen,DIC_Ortschaft,DIC_Berufe,DIC_Siedlung,DIC_Religion,DIC_Natur,classif
0,1,10023770,Wiedenweg,2786,Grellingen,BL,True,2610733.0,1254311.0,Wiedenweg,...,,Wiedenweg,Wieden,,,,,,,
1,2,10179192,Wuhrbärgli,2788,Liesberg,BL,True,2598709.0,1249640.0,Wuhrbärgli,...,,Wuhrbärgli,Wuhrbärgli,,,,,,bärgli,DIC_Natur
2,9,10140563,Emanuelenweg,2829,Liestal,BL,True,2623078.0,1257558.0,Emanuelenweg,...,,Emanuelenweg,Emanuelen,,,,,,,
3,13,10069457,Löhrweg,2850,Känerkinden,BL,True,2630229.0,1251411.0,Löhrweg,...,PER,Löhrweg,Löhr,,,,,,,PER
4,15,10096235,Brunngasse,2833,Seltisberg,BL,True,2621406.0,1256852.0,Brunngasse,...,,Brunngasse,Brunn,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
171867,221269,10107114,Tannenstrasse,4471,Bischofszell,TG,True,2736565.0,1261881.0,Tannenstrasse,...,,Tannenstrasse,Tannen,,Tann,,,,,DIC_Ortschaft
171868,221270,10070806,Rebenstrasse,4724,Eschlikon,TG,True,2716120.0,1258548.0,Rebenstrasse,...,,Rebenstrasse,Reben,,,,,,,
171869,221272,10142291,Fritz-Gegauf-Strasse,4864,Steckborn,TG,True,2715726.0,1280124.0,Fritz-Gegauf-Strasse,...,PER,Fritz-Gegauf-Strasse,Fritz Gegauf,,,,,,,PER
171870,221274,10093237,Obere Bleichewiese,4571,Gachnang,TG,True,2705606.0,1266777.0,Obere Bleichewiese,...,,Obere Bleichewiese,Bleichewiese,,,,,,,


In [40]:
mapping.to_csv('C:\CAS_Arbeit\cassda-zertifikatsarbeit\Modeling\out_mapping.csv', encoding='UTF-8-SIG', sep=';')