# Using spacy to remove stopwords, add lemma and entity


## Load Dataset
Load dataset created in ../Datapreparation/Extract_street_terms_from_street_names.ipynb

In [1]:
#Load all needed libraries
import pandas as pd
import spacy #Our NLP tools
import de_core_news_md #!python -m spacy download de_core_news_md
import fr_core_news_md #!python -m spacy download fr_core_news_md
from IPython.display import Javascript
from pandarallel import pandarallel
import numpy as np
import time
import warnings

In [2]:
# Read the semicolon-separated street-name table (UTF-8 with BOM) and list its columns
streetnames = pd.read_csv(
    'streetnames.csv',
    sep=';',
    encoding='UTF-8-SIG',
)
streetnames.columns

Index(['Unnamed: 0', 'STR_ESID', 'STN_LABEL', 'COM_FOSNR', 'COM_NAME',
       'COM_CANTON', 'STR_OFFICIAL', 'STR_EASTING', 'STR_NORTHING',
       'STN_LABEL_NO_BI', 'STN_LABEL_NO_TERMS', 'STR_TERMS', 'STR_PREPS',
       'STN_LABEL_FINAL'],
      dtype='object')

In [3]:
# Discard every column the NLP steps do not need;
# STR_ESID, STN_LABEL and STN_LABEL_FINAL remain.
streetnames = streetnames.drop(
    columns=[
        'Unnamed: 0',
        'COM_NAME',
        'COM_CANTON',
        'STR_EASTING',
        'STR_NORTHING',
        'COM_FOSNR',
        'STR_OFFICIAL',
        'STR_TERMS',
        'STR_PREPS',
        'STN_LABEL_NO_BI',
        'STN_LABEL_NO_TERMS',
    ]
)

In [4]:
# Report the number of missing values per column, then drop every row that has any
missing_data = streetnames.isna().sum().to_frame(name='Missing Values')

print(missing_data)

streetnames = streetnames.dropna()

                 Missing Values
STR_ESID                      0
STN_LABEL                     0
STN_LABEL_FINAL             591


## Using spaCy Deutsch

In [5]:
# Load the medium German spaCy pipeline and bind it to `nlp`.
# The stop-word list, lemmas and entity labels of the German cells all come from
# this model, so the choice of model strongly influences the results.
nlp = spacy.load('de_core_news_md') # German model; the French section rebinds `nlp`

In [6]:
# Stop-word collection provided by the language defaults of the currently loaded pipeline
stopwords = nlp.Defaults.stop_words

# Uncomment to inspect the stop words:
#print(len(stopwords))
#print(stopwords)

### Remove stopwords

In [7]:
# Build SPACY_DE from STN_LABEL_FINAL in one chain:
#   1. split each label on single spaces,
#   2. keep only the words that are not in the stop-word list (exact, case-sensitive match),
#   3. glue the remaining words back together with spaces,
#   4. trim whitespace at both ends of the result.
streetnames['SPACY_DE'] = (
    streetnames['STN_LABEL_FINAL']
    .str.split(' ')
    .apply(lambda words: [word for word in words if word not in stopwords])
    .str.join(' ')
    .str.strip()
)

In [8]:
# spaCy is slow on the full table, so keep the first 1000 rows as a small sample
streetnames100 = streetnames.head(1000)

# Make it faster: start pandarallel with a progress bar
pandarallel.initialize(progress_bar=True)

INFO: Pandarallel will run on 2 workers.
INFO: Pandarallel will use standard multiprocessing data transfer (pipe) to transfer data between the main process and workers.

https://nalepae.github.io/pandarallel/troubleshooting/


### Lemmatization
Assigning the base forms of words. For example, the lemma of “was” is “be”, and the lemma of “rats” is “rat”.

In [9]:
%%time

#for i in streetnames100.SPACY_DE.items():
for i in streetnames.SPACY_DE.items():
    doc = nlp(i[1])
    for token in doc:
        #print(f"{token.text:<20}\t{token.lemma_:<20}\t{token.pos_:<6}\t{token.is_stop}")
        #streetnames100.loc[i[0], 'SPACY_DE_LEMMA'] = token.lemma_
        streetnames.loc[i[0], 'SPACY_DE_LEMMA'] = token.lemma_


CPU times: total: 48min 3s
Wall time: 48min 53s


### Named Entity Recognition (NER)
Labelling named “real-world” objects, like persons, companies or locations.

In [10]:
%%time

#for i in streetnames100.SPACY_DE.items():
for i in streetnames.SPACY_DE.items():
    doc = nlp(i[1])
    #print(i[0])
    for ent in doc.ents:
        #print(f"{ent.text:<20}\t{ent.label_:<3}")
        #streetnames100.loc[i[0], 'SPACY_DE_ENT'] = ent.label_
        streetnames.loc[i[0], 'SPACY_DE_ENT'] = ent.label_

CPU times: total: 22min 6s
Wall time: 22min 16s


## Using spaCy Français

In [11]:
# Load the medium French spaCy pipeline and rebind `nlp` to it.
# The stop-word list, lemmas and entity labels of the French cells all come from
# this model, so the choice of model strongly influences the results.
nlp = spacy.load('fr_core_news_md') # French model; replaces the pipeline previously bound to `nlp`

In [12]:
# Stop-word collection provided by the language defaults of the currently loaded pipeline
stopwords = nlp.Defaults.stop_words

# Uncomment to inspect the stop words:
#print(len(stopwords))
#print(stopwords)

### Remove stopwords

In [13]:
# Build SPACY_FR from STN_LABEL_FINAL in one chain:
#   1. split each label on single spaces,
#   2. keep only the words that are not in the stop-word list (exact, case-sensitive match),
#   3. glue the remaining words back together with spaces,
#   4. trim whitespace at both ends of the result.
streetnames['SPACY_FR'] = (
    streetnames['STN_LABEL_FINAL']
    .str.split(' ')
    .apply(lambda words: [word for word in words if word not in stopwords])
    .str.join(' ')
    .str.strip()
)

### Lemmatization
Assigning the base forms of words. For example, the lemma of “was” is “be”, and the lemma of “rats” is “rat”.

In [14]:
%%time

#for i in streetnames100.SPACY_FR.items():
for i in streetnames.SPACY_FR.items():
    doc = nlp(i[1])
    for token in doc:
        #print(f"{token.text:<20}\t{token.lemma_:<20}\t{token.pos_:<6}\t{token.is_stop}")
        #streetnames100.loc[i[0], 'SPACY_DE_LEMMA'] = token.lemma_
        streetnames.loc[i[0], 'SPACY_FR_LEMMA'] = token.lemma_


CPU times: total: 44min 30s
Wall time: 44min 37s


### Named Entity Recognition (NER)
Labelling named “real-world” objects, like persons, companies or locations.

In [15]:
%%time

#for i in streetnames100.SPACY_FR.items():
for i in streetnames.SPACY_FR.items():
    doc = nlp(i[1])
    #print(i[0])
    for ent in doc.ents:
        #print(f"{ent.text:<20}\t{ent.label_:<3}")
        #streetnames100.loc[i[0], 'SPACY_DE_ENT'] = ent.label_
        streetnames.loc[i[0], 'SPACY_FR_ENT'] = ent.label_

CPU times: total: 19min 18s
Wall time: 19min 22s


## Check results

In [16]:
# Rows per French entity label, counted on the non-null STR_ESID values of each group
most_common = streetnames.groupby(by="SPACY_FR_ENT").count()
most_common["STR_ESID"]

SPACY_FR_ENT
LOC      8593
MISC     3042
ORG      1571
PER     14109
Name: STR_ESID, dtype: int64

In [17]:
# Rows per German entity label, counted on the non-null STR_ESID values of each group
most_common = streetnames.groupby(by="SPACY_DE_ENT").count()
most_common["STR_ESID"]

SPACY_DE_ENT
LOC     10134
MISC     1490
ORG       840
PER      6746
Name: STR_ESID, dtype: int64

In [18]:
# Write the enriched table as a semicolon-separated CSV (UTF-8 with BOM).
# The path is a raw string: it contains Windows backslashes (\C, \c, \D, \s) that a
# normal string literal would treat as invalid escape sequences. The resulting
# path is unchanged.
streetnames.to_csv(r'C:\CAS_Arbeit\cassda-zertifikatsarbeit\Datapreparation\spacy_out.csv', encoding='UTF-8-SIG', sep=';')