# Using spacy to remove stopwords, add lemma and entity


## Load Dataset
Load dataset created in ../Datapreparation/Extract_street_terms_from_street_names.ipynb

In [1]:
#Load all needed libraries
import pandas as pd
import spacy #Our NLP tools
import de_core_news_md #!python -m spacy download de_core_news_md
import fr_core_news_md #!python -m spacy download fr_core_news_md
from IPython.display import Javascript
from pandarallel import pandarallel
import numpy as np

In [2]:
streetnames = pd.read_csv('streetnames.csv', encoding='UTF-8-SIG', sep=';')
streetnames.columns

Index(['Unnamed: 0', 'STR_ESID', 'STN_LABEL', 'COM_FOSNR', 'COM_NAME',
       'COM_CANTON', 'STR_OFFICIAL', 'STR_EASTING', 'STR_NORTHING',
       'STN_LABEL_NO_BI', 'STN_LABEL_NO_TERMS', 'STR_TERMS', 'STR_PREPS',
       'STN_LABEL_FINAL'],
      dtype='object')

In [3]:
#Remove unused columns
streetnames = streetnames.drop(['Unnamed: 0', 'COM_NAME', 'COM_CANTON','STR_EASTING', 'STR_NORTHING','COM_FOSNR',
                        'STR_OFFICIAL','STR_TERMS', 'STR_PREPS', 'STN_LABEL_NO_BI', 'STN_LABEL_NO_TERMS'], axis=1)

In [4]:
#Drop nan
missing_data = pd.DataFrame(
    streetnames.isnull().sum(),
    columns=['Missing Values'])

print(missing_data)

streetnames = streetnames.dropna()

                 Missing Values
STR_ESID                      0
STN_LABEL                     0
STN_LABEL_FINAL             415


### Initialize System

Create subset for e.g. for testing, since spacy is slow for larger datasets

In [5]:
#streetnames = streetnames[:100].copy()

In [6]:
streetnames.head()

Unnamed: 0,STR_ESID,STN_LABEL,STN_LABEL_FINAL
0,10023770,Wiedenweg,Wieden
1,10179192,Wuhrbärgli,Wuhrbärgli
2,10140563,Emanuelenweg,Emanuelen
3,10069457,Löhrweg,Löhr
4,10096235,Brunngasse,Brunn


Use of `pandarallel` to run query in parallel. This needs more CPU but is much faster.

In [7]:
pandarallel.initialize()
#pandarallel.initialize(progress_bar=True)

INFO: Pandarallel will run on 4 workers.
INFO: Pandarallel will use Memory file system to transfer data between the main process and workers.


## Using spaCy Deutsch

In [8]:
#Load a German language model to do NLP - the models we use will influence our results a lot
nlp = spacy.load('de_core_news_md') #Change fr/de

In [9]:
stopwords = nlp.Defaults.stop_words

#print(len(stopwords))
#print(stopwords)

### Remove stopwords

In [10]:
streetnames['SPACY_DE'] = streetnames['STN_LABEL_FINAL'].str.split(' ')

streetnames['SPACY_DE'] = streetnames['SPACY_DE'].apply(lambda x: [item for item in x if item not in stopwords])

streetnames['SPACY_DE'] = streetnames['SPACY_DE'].str.join(' ')

#Delete spaces at the beginning and end of the string using function 'strip()'
streetnames['SPACY_DE'] = streetnames['SPACY_DE'].str.strip()

### Lemmatization
Assigning the base forms of words. For example, the lemma of “was” is “be”, and the lemma of “rats” is “rat”.

In [11]:
def checkForLemma(num):
    doc = nlp(num)
    #print(len(doc))
    if len(doc)>0:
        return doc[0].lemma_

⚠ Attention: Following code may take a while (>5min..)  

In [12]:
import ipywidgets
import datetime
#pandarallel.initialize(progress_bar=True)
starttime = datetime.datetime.now()

streetnames["SPACY_DE_LEMMA"] = streetnames["SPACY_DE"].parallel_apply(checkForLemma)
#streetnames["SPACY_DE_LEMMA"] = streetnames["SPACY_DE"].apply(checkForLemma)

print ("Duration: ", datetime.datetime.now()-starttime)

  yield data[chunk_]


Duration:  0:05:42.254822


In [13]:
streetnames.head()

Unnamed: 0,STR_ESID,STN_LABEL,STN_LABEL_FINAL,SPACY_DE,SPACY_DE_LEMMA
0,10023770,Wiedenweg,Wieden,Wieden,Wieden
1,10179192,Wuhrbärgli,Wuhrbärgli,Wuhrbärgli,Wuhrbärgli
2,10140563,Emanuelenweg,Emanuelen,Emanuelen,Emanuel
3,10069457,Löhrweg,Löhr,Löhr,Löhr
4,10096235,Brunngasse,Brunn,Brunn,Brunn


### Named Entity Recognition (NER)
Labelling named “real-world” objects, like persons, companies or locations.

In [14]:
def checkForEntity(num):
    doc = nlp(num)
    #print(len(doc))
    if doc.ents:
        #print(num, "ENT")
        for ent in doc.ents:
            if len(num) == ent.end_char: #will be not equal if nlp did not recoginse a two word term as one entity. e.g.: General Guisan
                return ent.label_

⚠ Attention: Following code may take a while (>5min..)  

In [15]:
starttime = datetime.datetime.now()

streetnames["SPACY_DE_ENT"] = streetnames["SPACY_DE"].parallel_apply(checkForEntity)
#temp = streetnames["SPACY_DE"].apply(checkForEntity)

print ("Duration: ", datetime.datetime.now()-starttime)

  yield data[chunk_]


Duration:  0:05:16.167742


In [16]:
streetnames.head(20)

Unnamed: 0,STR_ESID,STN_LABEL,STN_LABEL_FINAL,SPACY_DE,SPACY_DE_LEMMA,SPACY_DE_ENT
0,10023770,Wiedenweg,Wieden,Wieden,Wieden,
1,10179192,Wuhrbärgli,Wuhrbärgli,Wuhrbärgli,Wuhrbärgli,
2,10140563,Emanuelenweg,Emanuelen,Emanuelen,Emanuel,
3,10069457,Löhrweg,Löhr,Löhr,Löhr,
4,10096235,Brunngasse,Brunn,Brunn,Brunn,
5,10118344,Beckengässli,Becken,Becken,Becken,
6,10103841,Poststrasse,Post,Post,Post,
7,10103030,Bromhübelweg,Bromhübel,Bromhübel,Bromhübel,
8,10108763,Zurlindenstrasse,Zurlinden,Zurlinden,zurlinden,
9,10209066,Bergli,Bergli,Bergli,Bergli,


## Using spaCy Français

In [17]:
#Load a French language model to do NLP - the models we use will influence our results a lot
nlp = spacy.load('fr_core_news_md') #Change fr/de

In [18]:
stopwords = nlp.Defaults.stop_words

#print(len(stopwords))
#print(stopwords)

### Remove stopwords

In [19]:
streetnames['SPACY_FR'] = streetnames['STN_LABEL_FINAL'].str.split(' ')

streetnames['SPACY_FR'] = streetnames['SPACY_FR'].apply(lambda x: [item for item in x if item not in stopwords])

streetnames['SPACY_FR'] = streetnames['SPACY_FR'].str.join(' ')

#Delete spaces at the beginning and end of the string using function 'strip()'
streetnames['SPACY_FR'] = streetnames['SPACY_FR'].str.strip()

In [20]:
streetnames.head()

Unnamed: 0,STR_ESID,STN_LABEL,STN_LABEL_FINAL,SPACY_DE,SPACY_DE_LEMMA,SPACY_DE_ENT,SPACY_FR
0,10023770,Wiedenweg,Wieden,Wieden,Wieden,,Wieden
1,10179192,Wuhrbärgli,Wuhrbärgli,Wuhrbärgli,Wuhrbärgli,,Wuhrbärgli
2,10140563,Emanuelenweg,Emanuelen,Emanuelen,Emanuel,,Emanuelen
3,10069457,Löhrweg,Löhr,Löhr,Löhr,,Löhr
4,10096235,Brunngasse,Brunn,Brunn,Brunn,,Brunn


### Lemmatization
Assigning the base forms of words. For example, the lemma of “was” is “be”, and the lemma of “rats” is “rat”.

⚠ Attention: Following code may take a while (>5min..)  

In [28]:
starttime = datetime.datetime.now()

streetnames["SPACY_FR_LEMMA"] = streetnames["SPACY_FR"].parallel_apply(checkForLemma)
#temp = streetnames["SPACY_DE"].apply(checkForEntity)

print ("Duration: ", datetime.datetime.now()-starttime)

  yield data[chunk_]


Duration:  0:05:22.379907


### Named Entity Recognition (NER)
Labelling named “real-world” objects, like persons, companies or locations.

⚠ Attention: Following code may take a while (>5min..)  

In [22]:
starttime = datetime.datetime.now()

streetnames["SPACY_FR_ENT"] = streetnames["SPACY_FR"].parallel_apply(checkForEntity)
#temp = streetnames["SPACY_DE"].apply(checkForEntity)

print ("Duration: ", datetime.datetime.now()-starttime)

  yield data[chunk_]


Duration:  0:05:10.791635


In [23]:
streetnames.head(20)

Unnamed: 0,STR_ESID,STN_LABEL,STN_LABEL_FINAL,SPACY_DE,SPACY_DE_LEMMA,SPACY_DE_ENT,SPACY_FR,SPACY_FR_LEMMA,SPACY_FR_ENT
0,10023770,Wiedenweg,Wieden,Wieden,Wieden,,Wieden,,
1,10179192,Wuhrbärgli,Wuhrbärgli,Wuhrbärgli,Wuhrbärgli,,Wuhrbärgli,,
2,10140563,Emanuelenweg,Emanuelen,Emanuelen,Emanuel,,Emanuelen,,
3,10069457,Löhrweg,Löhr,Löhr,Löhr,,Löhr,PER,PER
4,10096235,Brunngasse,Brunn,Brunn,Brunn,,Brunn,,
5,10118344,Beckengässli,Becken,Becken,Becken,,Becken,,
6,10103841,Poststrasse,Post,Post,Post,,Post,,
7,10103030,Bromhübelweg,Bromhübel,Bromhübel,Bromhübel,,Bromhübel,,
8,10108763,Zurlindenstrasse,Zurlinden,Zurlinden,zurlinden,,Zurlinden,,
9,10209066,Bergli,Bergli,Bergli,Bergli,,Bergli,,


## Check results

In [24]:
most_common = streetnames.groupby("SPACY_FR_ENT").count()
most_common.STR_ESID

SPACY_FR_ENT
LOC      7458
MISC     2417
ORG      1425
PER     13298
Name: STR_ESID, dtype: int64

In [25]:
most_common = streetnames.groupby("SPACY_DE_ENT").count()
most_common.STR_ESID

SPACY_DE_ENT
LOC     9831
MISC    1302
ORG      774
PER     6465
Name: STR_ESID, dtype: int64

In [30]:
streetnames.to_csv('spacy_out.csv', encoding='UTF-8-SIG', sep=';')