# Using spaCy to remove stopwords and add lemmas and named entities


## Load Dataset
Load the dataset created in `../Datapreparation/Extract_street_terms_from_street_names.ipynb`.

In [None]:
#Load all needed libraries
import pandas as pd
import spacy #Our NLP tools
import de_core_news_md #!python -m spacy download de_core_news_md
import fr_core_news_md #!python -m spacy download fr_core_news_md
from IPython.display import Javascript
from pandarallel import pandarallel
import numpy as np

In [None]:
#Read the semicolon-separated street name table and show which columns it has
streetnames = pd.read_csv(
    'streetnames.csv',
    sep=';',
    encoding='UTF-8-SIG',
)
streetnames.columns

In [None]:
#Drop the columns that this notebook does not use
streetnames = streetnames.drop(
    columns=[
        'Unnamed: 0', 'COM_NAME', 'COM_CANTON', 'STR_EASTING', 'STR_NORTHING', 'COM_FOSNR',
        'STR_OFFICIAL', 'STR_TERMS', 'STR_PREPS', 'STN_LABEL_NO_BI', 'STN_LABEL_NO_TERMS',
    ]
)

In [None]:
#Report the number of missing values per column, then drop every row containing one
missing_data = streetnames.isnull().sum().to_frame('Missing Values')
print(missing_data)

streetnames = streetnames.dropna()

## Using spaCy Deutsch

In [None]:
#Load the German language model to do NLP - the models we use will influence our results a lot
#`nlp` is the pipeline used by checkForLemma and checkForEntity below
nlp = spacy.load('de_core_news_md') #Change fr/de

In [None]:
#Default stop words of the currently loaded language model
stopwords = nlp.Defaults.stop_words

#Uncomment to inspect the stop word list:
#print(len(stopwords))
#print(stopwords)

### Remove stopwords

In [None]:
#Build SPACY_DE from STN_LABEL_FINAL: split on single spaces, drop every word that is
#in `stopwords`, re-join with spaces and strip leading/trailing whitespace
streetnames['SPACY_DE'] = (
    streetnames['STN_LABEL_FINAL']
    .str.split(' ')
    .apply(lambda words: [word for word in words if word not in stopwords])
    .str.join(' ')
    .str.strip()
)

In [None]:
#Keep a copy of the first 100 rows as a small subset, since spaCy is slow for larger datasets
streetnames100 = streetnames.iloc[:100].copy()

#Make it faster: initialise pandarallel so that `parallel_apply` can be used below
#(pass progress_bar=True to display a progress bar)
pandarallel.initialize()

### Lemmatization
Assigning the base forms of words. For example, the lemma of “was” is “be”, and the lemma of “rats” is “rat”.

In [None]:
def checkForLemma(num):
    """Return the lemma of the first token of a text.

    The text is parsed with the notebook-level ``nlp`` pipeline, so the result
    depends on which language model is currently loaded.

    Args:
        num: Text to analyse (despite the name, a string such as a street label).

    Returns:
        The ``lemma_`` of the first token, or ``None`` when the parsed
        document contains no tokens.
    """
    tokens = nlp(num)
    if len(tokens) == 0:
        return None
    return tokens[0].lemma_

⚠ Attention: The following code may take a while (> 5 min).  
Use `parallel_apply` from `pandarallel` to apply the function to all rows in parallel. This needs more CPU but is much faster.

In [None]:
import ipywidgets
import datetime

#Time the lemmatization of every SPACY_DE value. checkForLemma is applied in parallel;
#swap parallel_apply for apply to run it sequentially, or call
#pandarallel.initialize(progress_bar=True) beforehand to see a progress bar.
starttime = datetime.datetime.now()
temp = streetnames["SPACY_DE"].parallel_apply(checkForLemma)
elapsed = datetime.datetime.now() - starttime

print("Duration: ", elapsed)

In [None]:
#Name the lemma series after the column it becomes in `streetnames`
temp2 = temp.rename("SPACY_DE_LEMMA")

In [None]:
#Attach the lemmas as column SPACY_DE_LEMMA (aligned on the index).
#`assign` keeps this cell idempotent: re-running it overwrites the column instead of
#appending a second column with the same name, as pd.concat(..., axis=1) would.
streetnames = streetnames.assign(SPACY_DE_LEMMA=temp2)

In [None]:
#Preview the first rows to check the lemma column
streetnames.head()

### Named Entity Recognition (NER)
Labelling named “real-world” objects, like persons, companies or locations.

In [None]:
def checkForEntity(num):
    """Return the label of the first named entity that ends at the end of a text.

    The text is parsed with the notebook-level ``nlp`` pipeline. An entity only
    counts when its ``end_char`` equals the length of the text; this is not the
    case if the model did not recognise a two-word term as one entity and only
    tagged its first word (e.g. "General Guisan").

    Args:
        num: Text to analyse (despite the name, a string such as a street label).

    Returns:
        The ``label_`` of the first matching entity, or ``None`` when no
        entity ends at the end of the text.
    """
    doc = nlp(num)
    labels_at_text_end = (
        ent.label_ for ent in doc.ents if ent.end_char == len(num)
    )
    return next(labels_at_text_end, None)

In [None]:
#Time the entity recognition of every SPACY_DE value. checkForEntity is applied in
#parallel; swap parallel_apply for apply to run it sequentially.
starttime = datetime.datetime.now()
temp = streetnames["SPACY_DE"].parallel_apply(checkForEntity)
elapsed = datetime.datetime.now() - starttime

print("Duration: ", elapsed)

In [None]:
#Name the entity series after the column it becomes in `streetnames`
temp2 = temp.rename("SPACY_DE_ENTITY")

In [None]:
#Attach the entity labels as column SPACY_DE_ENTITY (aligned on the index).
#`assign` keeps this cell idempotent: re-running it overwrites the column instead of
#appending a second column with the same name, as pd.concat(..., axis=1) would.
streetnames = streetnames.assign(SPACY_DE_ENTITY=temp2)

In [None]:
#Preview the first 20 rows to check the entity column
streetnames.head(20)

## Using spaCy Français

In [None]:
#Load the French language model to do NLP - the models we use will influence our results a lot
#This re-binds `nlp`, so every later call that uses `nlp` runs the French pipeline
nlp = spacy.load('fr_core_news_md') #Change fr/de

In [None]:
#Default stop words of the currently loaded language model
stopwords = nlp.Defaults.stop_words

#Uncomment to inspect the stop word list:
#print(len(stopwords))
#print(stopwords)

### Remove stopwords

In [None]:
#Build SPACY_FR from STN_LABEL_FINAL: split on single spaces, drop every word that is
#in `stopwords`, re-join with spaces and strip leading/trailing whitespace
streetnames['SPACY_FR'] = (
    streetnames['STN_LABEL_FINAL']
    .str.split(' ')
    .apply(lambda words: [word for word in words if word not in stopwords])
    .str.join(' ')
    .str.strip()
)

### Lemmatization
Assigning the base forms of words. For example, the lemma of “was” is “be”, and the lemma of “rats” is “rat”.

In [None]:
#Lemmatize the French labels with the same rule as the German section: checkForLemma
#returns the lemma of the first token, or None when the text yields no tokens. It runs
#the French pipeline because `nlp` was re-bound to the French model above.
#A single apply always creates SPACY_FR_LEMMA and avoids writing the frame cell by cell
#with `.loc`, where each further token of a label overwrote the lemma stored before it.
streetnames['SPACY_FR_LEMMA'] = streetnames['SPACY_FR'].apply(checkForLemma)


### Named Entity Recognition (NER)
Labelling named “real-world” objects, like persons, companies or locations.

In [None]:
#Label the French names with the same rule as the German section: checkForEntity
#returns the label of the entity that ends at the end of the text, or None. It runs
#the French pipeline because `nlp` was re-bound to the French model above.
#A single apply always creates SPACY_FR_ENT (even if no entity is found anywhere, so the
#groupby in "Check results" cannot fail on a missing column) and avoids writing the
#frame cell by cell with `.loc`, where each further entity overwrote the previous label.
streetnames['SPACY_FR_ENT'] = streetnames['SPACY_FR'].apply(checkForEntity)

## Check results

In [None]:
#Count the rows per French entity label (non-null STR_ESID values in each group)
most_common = streetnames.groupby("SPACY_FR_ENT").count()
most_common["STR_ESID"]

In [None]:
#Count the rows per German entity label (non-null STR_ESID values in each group).
#The German labels are stored in SPACY_DE_ENTITY; a column named SPACY_DE_ENT is never
#created in this notebook, so grouping by that name raised a KeyError.
most_common = streetnames.groupby("SPACY_DE_ENTITY").count()
most_common.STR_ESID

In [None]:
#Write the enriched table with the same separator and encoding as the input file
streetnames.to_csv('spacy-out.csv', encoding='UTF-8-SIG', sep=';')