# Lemmatisation with spaCy

Code taken from Sanja Sarić, see https://github.com/SanjaSaric/HSA-Topics/blob/master/Lemmatisierung.ipynb.

In [20]:
import warnings
warnings.filterwarnings('ignore')
from cophi_toolbox import preprocessing
import metadata_toolbox.utils as metadata
import pandas as pd
from pathlib import Path

## Load and read data

In [21]:
# Root folder of the project data; adjust to the local machine.
data = 'C:/Users/elisa/DH-MA/data/final'
# Create a backup of this folder first: the lemmatisation step below
# overwrites the files it contains.
path_to_corpus = Path(data) / 'docsouth-clean'
# File names have the form <abbr>-<author>-<author2>.txt
pattern = '{abbr}-{author}-{author2}'

# Parse the name of every text file into its metadata fields and stack the
# per-file results into one table.
text_files = path_to_corpus.glob('*.txt')
meta_per_file = [metadata.fname2metadata(str(path), pattern=pattern) for path in text_files]
meta = pd.concat(meta_per_file)

In [22]:
# Preview the first ten metadata rows
meta.head(10)

Unnamed: 0,abbr,author,author2
C:\Users\elisa\DH-MA\data\final\docsouth-clean\church-hatcher-hatcher.txt,church,hatcher,hatcher
C:\Users\elisa\DH-MA\data\final\docsouth-clean\fpn-ball-ball.txt,fpn,ball,ball
C:\Users\elisa\DH-MA\data\final\docsouth-clean\fpn-brownw-brown.txt,fpn,brownw,brown
C:\Users\elisa\DH-MA\data\final\docsouth-clean\fpn-bruce-bruce.txt,fpn,bruce,bruce
C:\Users\elisa\DH-MA\data\final\docsouth-clean\fpn-burton-burton.txt,fpn,burton,burton
C:\Users\elisa\DH-MA\data\final\docsouth-clean\fpn-burtont-burton.txt,fpn,burtont,burton
C:\Users\elisa\DH-MA\data\final\docsouth-clean\fpn-ferebee-ferebee.txt,fpn,ferebee,ferebee
C:\Users\elisa\DH-MA\data\final\docsouth-clean\fpn-grandy-grandy.txt,fpn,grandy,grandy
C:\Users\elisa\DH-MA\data\final\docsouth-clean\fpn-hortonlife-horton.txt,fpn,hortonlife,horton
C:\Users\elisa\DH-MA\data\final\docsouth-clean\fpn-hortonpoem-hortonpoem.txt,fpn,hortonpoem,hortonpoem


In [23]:
# Number of rows in the metadata table (one per corpus file)
meta.shape[0]

278

## Lemmatisation

In [24]:
import spacy

# Only the lemmas are used in this notebook, so the dependency parser and the
# named-entity recogniser are switched off. Both are the expensive parts of the
# pipeline in time and memory on long texts; the tagger that the lemmatiser
# relies on stays active, so the lemmas are unaffected.
nlp = spacy.load('en_core_web_sm', disable=['parser', 'ner'])

In [25]:
# spaCy rejects texts with more characters than `nlp.max_length`; raise the
# limit to two million characters so that long corpus files are accepted.
nlp.max_length = 2_000_000

### Lemmatise and save files

In [26]:
# Replace the content of every corpus file with its lemmatised version.
# Attention: files will be overwritten -- keep a backup of the corpus folder.
#
# The file list is collected (and sorted) before the loop starts so that the
# directory is not being scanned while its files are rewritten.
for file in sorted(path_to_corpus.glob('*.txt')):
    with open(file, encoding='utf-8') as f:
        original = f.read()

    # One lemma per token, joined by single spaces.
    lemmatized_object = nlp(original)
    lemma_list = [token.lemma_ for token in lemmatized_object]
    lemma_doc = ' '.join(lemma_list)

    # Write to a temporary sibling and swap it in afterwards: opening the
    # source file itself with 'w' would truncate it immediately, so an
    # interrupted or failed write would destroy the text.
    temp_file = file.with_name(file.name + '.tmp')
    with open(temp_file, 'w', encoding='utf-8') as f:
        f.write(lemma_doc)
    temp_file.replace(file)