# Lemmatisation with spaCy

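This notebook lemmatises every text file of the corpus with spaCy. **Attention:** the files are overwritten in place, so create a backup of the corpus before running it.

Code taken from Sanja Sarić, see https://github.com/SanjaSaric/HSA-Topics/blob/master/Lemmatisierung.ipynb.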

In [15]:
import warnings
warnings.filterwarnings('ignore')
from cophi_toolbox import preprocessing
import metadata_toolbox.utils as metadata
import pandas as pd
from pathlib import Path

## Load and read data

In [16]:
# Root folder of the final data sets.
data = 'C:/Users/elisa/DH-MA/data/final'
# Corpus folder. Create a backup first since the data will be overwritten.
path_to_corpus = Path(data, 'fwp-clean-spellcheck')
# File name scheme handed to fname2metadata (e.g. "alabama_1").
pattern = '{state}_{vol_no}'

# Gather the metadata of each text file, then stack everything into one table.
per_file_meta = []
for txt_path in path_to_corpus.glob('*.txt'):
    per_file_meta.append(metadata.fname2metadata(str(txt_path), pattern=pattern))
meta = pd.concat(per_file_meta)

In [17]:
# Preview the first ten rows of the metadata table.
meta.head(10)

Unnamed: 0,state,vol_no
C:\Users\elisa\DH-MA\data\final\fwp-clean-spellcheck\alabama_1.txt,alabama,1
C:\Users\elisa\DH-MA\data\final\fwp-clean-spellcheck\arkansas_1.txt,arkansas,1
C:\Users\elisa\DH-MA\data\final\fwp-clean-spellcheck\arkansas_2.txt,arkansas,2
C:\Users\elisa\DH-MA\data\final\fwp-clean-spellcheck\arkansas_3.txt,arkansas,3
C:\Users\elisa\DH-MA\data\final\fwp-clean-spellcheck\arkansas_4.txt,arkansas,4
C:\Users\elisa\DH-MA\data\final\fwp-clean-spellcheck\arkansas_5.txt,arkansas,5
C:\Users\elisa\DH-MA\data\final\fwp-clean-spellcheck\arkansas_6.txt,arkansas,6
C:\Users\elisa\DH-MA\data\final\fwp-clean-spellcheck\arkansas_7.txt,arkansas,7
C:\Users\elisa\DH-MA\data\final\fwp-clean-spellcheck\florida_1.txt,florida,1
C:\Users\elisa\DH-MA\data\final\fwp-clean-spellcheck\georgia_1.txt,georgia,1


In [18]:
# Number of rows in the metadata table (one per corpus file).
meta.shape[0]

33

## Lemmatisation

In [19]:
import spacy
# Load the 'en_core_web_sm' pipeline once; the same `nlp` object is applied
# to every corpus file in the lemmatisation loop below.
nlp = spacy.load('en_core_web_sm')

### Lemmatise and save files

In [20]:
# Replace the content of every corpus file with its lemmatised text.
# Attention: Files will be overwritten
for txt_path in path_to_corpus.glob('*.txt'):
    raw_text = txt_path.read_text(encoding='utf-8')
    # Run the spaCy pipeline and keep the lemma string of each token.
    lemmas = [token.lemma_ for token in nlp(raw_text)]
    # Lemmas are joined with single spaces before the file is rewritten.
    txt_path.write_text(' '.join(lemmas), encoding='utf-8')