# Text cleaning

Wir müssen unsere Texte noch säubern:
- Stopwords entfernen
- Texte lemmatisieren
- Namen entfernen

## Steps

- Lade Texte und Metadaten
- Lemmatisiere Texte
    - speichere diese Texte in einem Ordner ab
- Bereinige die lemmatisierten Texte von Namen
    - speichere die Texte in einem weiteren Ordner ab
    
### Lade Texte und Metadaten

In [1]:
import pandas as pd

# Metadata table of the corpus; the columns shown below include
# id, author, title, genre, date, file and period.
# NOTE(review): the preview also shows an "Unnamed: 0" column, which suggests
# the CSV was written together with its index - confirm whether it is needed.
metadata_csv = "metadata_lyrik.csv"
meta = pd.read_csv(metadata_csv, encoding="utf-8")
meta.head()

Unnamed: 0,id,author,title,genre,date,file,period
0,1,"Ebeling, Johann Justus",N.A.,lyrik,1747,dta.poem.1.txt,Aufklaerung
1,2,"Ebeling, Johann Justus",Der Sommer.,lyrik,1747,dta.poem.2.txt,Aufklaerung
2,3,"Ebeling, Johann Justus",Die mannigfaltige \n Weisheit GOttes \n im ...,lyrik,1747,dta.poem.3.txt,Aufklaerung
3,4,"Ebeling, Johann Justus",Die \n angenehme Morgenröthe \n Und \n da...,lyrik,1747,dta.poem.4.txt,Aufklaerung
4,5,"Ebeling, Johann Justus",Anrede an den herrlichen GOtt \n um Abwendung...,lyrik,1747,dta.poem.5.txt,Aufklaerung


In [2]:
# load corpus

def load_corpus(path, filenames=None):
    """Read corpus files into memory and return their contents as a list.

    Parameters
    ----------
    path : str
        Directory that contains the corpus files. A trailing path separator
        is optional.
    filenames : iterable of str, optional
        Names of the files to read, in the order the texts should be
        returned. Defaults to the ``file`` column of the notebook-level
        ``meta`` frame, which is what this function always used before.

    Returns
    -------
    list of str
        The full content of each file, decoded as UTF-8, one entry per
        file name and in the same order.

    Raises
    ------
    OSError
        If one of the files cannot be opened.
    """
    import os

    if filenames is None:
        # Default behaviour: the order of the texts follows the metadata table.
        filenames = meta["file"]

    sentences = []
    for filename in filenames:
        # os.path.join copes with `path` given with or without a trailing
        # separator; the `with` block closes the file on its own.
        with open(os.path.join(path, filename), 'r', encoding="utf8") as f:
            sentences.append(f.read())
    return sentences

In [3]:
# Read every raw text listed in the metadata into a list of strings.
texts = load_corpus("corpora/raw/corpus_lyrik/")

In [4]:
# Sanity check: first 100 characters of the first raw text.
texts[0][:100]

'mein Trieb, der waget warlich\nDa ich mich untersteh der Andacht Saiten-\nDas meine Einfalt rührt, mit'

In [5]:
import spacy

# Load the spaCy pipeline "de_core_news_lg" with the 'ner' and 'parser'
# components disabled; only the lemmas are read from it further down.
nlp = spacy.load("de_core_news_lg", disable=['ner', 'parser']) 
# Set the pipeline's maximum accepted text length to 3,000,000 characters.
nlp.max_length = 3000000

def cleaning(doc):
    """Return the lemmas of all tokens in ``doc`` as one string.

    ``doc`` is iterated token by token, each token's ``lemma_`` attribute is
    read, and the lemmas are joined with a single space.
    """
    return ' '.join(token.lemma_ for token in doc)

In [6]:
import pandas as pd

# One row per text; the texts are held in the single column labelled 0.
df = pd.DataFrame(texts)

In [7]:
import numpy as np
import re

# The corpus is processed in k equally sized slices (plus one remainder
# slice) so that a progress marker can be printed between slices.
k = 10
num_samples = len(df) // k

# Any run of characters other than letters (incl. German umlauts and ß) and
# the punctuation marks ? ! . ' is collapsed into a single space.
# Compiled once instead of once per text.
non_text_pattern = re.compile("[^A-ZÄÜÖa-zäüöß?!.']+")


def _lemmatize_rows(rows):
    """Normalise and lemmatize a sequence of raw texts.

    Each row is converted to ``str``, stripped of unwanted characters with
    ``non_text_pattern``, lower-cased, run through the spaCy pipeline ``nlp``
    and turned into a lemma string by ``cleaning``.

    Returns a list with one lemmatized string per input row, in input order.
    """
    brief_cleaning = (non_text_pattern.sub(' ', str(row)).lower() for row in rows)
    return [cleaning(doc) for doc in nlp.pipe(brief_cleaning, batch_size=5000)]


# Collect results in a plain list: appending is cheap, whereas concatenating
# arrays in every round re-copies all earlier results and depends on an empty
# float array being promoted to a string array.
lemmatized = []

for i in range(k):
    print(i)
    lemmatized.extend(_lemmatize_rows(df[0].iloc[num_samples * i: num_samples * (i + 1)]))

# len(df) is usually not a multiple of k; handle the leftover rows here.
if len(lemmatized) != len(df):
    print("rest")
    lemmatized.extend(_lemmatize_rows(df[0].iloc[len(lemmatized):]))

# Later cells index `txt` like an array, so convert once at the end.
txt = np.array(lemmatized)

0
1
2
3
4
5
6
7
8
9
rest


In [8]:
# Sanity check: first 100 characters of the first lemmatized text.
txt[0][:100]

'mein treiben der Waget Warlich da ich sich Untersteh der Andacht saiten der mein Einfalt rühren mit '

In [9]:
def remove_files(path):
    """Delete every regular file located directly inside ``path``.

    Sub-directories and their contents are left untouched.

    Parameters
    ----------
    path : str
        Directory to empty. A trailing path separator is optional.

    Raises
    ------
    OSError
        If ``path`` cannot be listed or a file cannot be removed.
    """
    import os

    for file_name in os.listdir(path):
        # Build the full path with os.path.join. Plain string concatenation
        # only worked when `path` ended in a separator; without one it
        # pointed at a sibling path such as "<dir>a.txt", so nothing inside
        # the directory was removed.
        file = os.path.join(path, file_name)
        if os.path.isfile(file):
            os.remove(file)

In [10]:
import os

path = "corpora/cleaned/corpus_lyrik/"

# Make sure the output folder exists (fresh checkout), then empty it so no
# stale files from an earlier run survive.
os.makedirs(path, exist_ok=True)
remove_files(path)

# The i-th lemmatized text is written under the i-th file name of the
# metadata table; fail loudly if the two are not the same length instead of
# writing an incomplete or misaligned corpus.
assert len(txt) == len(meta), "number of lemmatized texts does not match number of metadata rows"

# write lemmatized texts to files
for file_name, lemmatized_text in zip(meta["file"], txt):
    # The `with` block closes the file; no explicit close() is needed.
    with open(path + file_name, 'w', encoding="utf8") as f:
        f.write(lemmatized_text)

In [11]:
import shutil

def copy_files(array, src, dest):
    """Copy the named files from directory ``src`` into directory ``dest``.

    Parameters
    ----------
    array : iterable of str
        File names (relative to ``src``) to copy.
    src : str
        Source directory. A trailing path separator is optional.
    dest : str
        Destination directory; each file keeps its name. A trailing path
        separator is optional.

    Raises
    ------
    OSError
        If a source file is missing or the destination cannot be written.
    """
    import os

    for file in array:
        # os.path.join instead of string concatenation, so both directories
        # work with or without a trailing separator.
        shutil.copy(os.path.join(src, file), os.path.join(dest, file))

In [12]:
# Metadata rows whose period is "Kunstepoche", and the names of their files.
# Note: this rebinds `df`, which held the frame of raw texts further up.
period_mask = meta["period"] == "Kunstepoche"
df = meta[period_mask]
files = df["file"].values

In [13]:
# Empty the period folder first, then copy the files named in `files` from
# the folder of cleaned texts into it.
source_dir = "corpora/cleaned/corpus_lyrik/"
target_dir = "corpora/cleaned/corpus_lyrik_kunstepoche/"
remove_files(target_dir)
copy_files(files, source_dir, target_dir)

In [14]:
# Metadata rows whose period is "Aufklaerung", and the names of their files.
# Note: this rebinds `df` and `files` from the previous selection.
period_mask = meta["period"] == "Aufklaerung"
df = meta[period_mask]
files = df["file"].values

In [15]:
# Empty the period folder first, then copy the files named in `files` from
# the folder of cleaned texts into it.
source_dir = "corpora/cleaned/corpus_lyrik/"
target_dir = "corpora/cleaned/corpus_lyrik_aufklaerung/"
remove_files(target_dir)
copy_files(files, source_dir, target_dir)