# Text cleaning

Wir müssen unsere Texte noch säubern:
- Stopwords entfernen
- Texte lemmatisieren
- Namen entfernen

## Steps

- Lade Texte und Metadaten
- Lemmatisiere die Texte
    - speichere diese Texte in einem Ordner ab
- Bereinige die lemmatisierten Texte von Namen
    - speichere die Texte in einem weiteren Ordner ab
    
### Lade Texte und Metadaten

In [1]:
import pandas as pd

# Metadata table with one row per text; the "file" column holds the name of the
# corresponding corpus file and is what the later cells iterate over.
# NOTE(review): the "Unnamed: 0" / "Unnamed: 0.1" columns look like row indices
# that were written to the CSV on earlier saves — confirm whether they are needed.
meta = pd.read_csv("meta_epik.csv", encoding = "utf-8")
# Preview the first rows.
meta.head()

Unnamed: 0.1,Unnamed: 0,id,title,author,author_birth_year,period,type,genre,date,file,source,annotation,tokens_cleaned
0,34,35,"Armut, Reichtum, Schuld und Buße der Gräfin Do...","Arnim, Achim",1781,Kunstepoche,Roman,epik,1810.0,Arnim_GraefinDolores.txt,https://www.projekt-gutenberg.org/arnim/dolore...,,170246
1,93,94,Das Leben der Hochgräfin Gritta von Rattenzuha...,"Arnim, Gisela",1827,Kunstepoche,Roman,epik,1840.0,Arnim_DasLebenDerHochgraefin.txt,,,54832
2,108,109,Evremont,Bernhardi,1775,Kunstepoche,Roman,epik,1836.0,Bernhardi_Evremont.txt,"http://www.zeno.org/Literatur/M/Bernhardi,+Sop...",,230648
3,57,58,Lebensgeschichte und Natürliche Ebenteuer des ...,Braeker,1735,Kunstepoche,Roman,epik,1789.0,Braeker_Tockenburg.txt,,,68601
4,84,85,Godwi,Brentano,1778,Kunstepoche,Roman,epik,1801.0,Brentano_Godwi.txt,https://www.projekt-gutenberg.org/autoren/name...,,143505


In [2]:
# load corpus

def load_corpus(path, filenames=None):
    """Read the corpus files and return their contents as a list of strings.

    Parameters
    ----------
    path : str
        Folder containing the corpus files. It is used as a plain string
        prefix (``path + filename``), so it must end with a path separator.
    filenames : iterable of str, optional
        File names to read, in the order the texts should be returned.
        Defaults to the ``"file"`` column of the notebook-level ``meta`` table.

    Returns
    -------
    list of str
        One entry per file name, holding the complete file content.

    Raises
    ------
    FileNotFoundError
        If one of the files does not exist under ``path``.
    """
    if filenames is None:
        # Fall back to the metadata table loaded earlier in the notebook.
        filenames = meta["file"]

    sentences = []
    for filename in filenames:
        # The raw corpus files are read as UTF-16; the `with` block closes the
        # handle, so no explicit close() is required.
        with open(path + filename, 'r', encoding="utf16") as f:
            sentences.append(f.read())
    return sentences

In [3]:
# Raw texts, in the row order of meta["file"] (that is what load_corpus iterates over).
texts = load_corpus("corpora/raw_normalized/corpus_epik/")

In [4]:
# Sanity check: show the first 100 characters of the first text.
texts[0][:100]

'Zueignung an des Fürsten Radzivil Durchlaucht Dem Schutzgeist bleibt ein treuer Sinn ergeben , Der i'

In [5]:
import spacy

# Only POS tags and lemmas are used downstream, so named-entity recognition is
# switched off. spaCy component names are lower-case ("ner"); the previous
# value "NER" did not match a component name, so presumably nothing was disabled.
nlp = spacy.load("de_core_news_lg", disable=["ner"])
# Raise spaCy's per-document character limit (default 1,000,000) so that long
# texts can be processed in one piece.
nlp.max_length = 3000000

# Smoke test: run the pipeline on the first 250 characters of the first text.
test = nlp(texts[0][:250])

In [6]:
def cleaning(doc):
    """Return the lemmas of all nouns, adjectives and verbs in ``doc``.

    ``doc`` is iterated token by token; every token whose ``pos_`` is
    "NOUN", "ADJ" or "VERB" contributes its ``lemma_``. All other tokens are
    dropped. The kept lemmas are joined with single spaces, in document order.
    """
    kept_pos = ("NOUN", "ADJ", "VERB")
    return ' '.join(token.lemma_ for token in doc if token.pos_ in kept_pos)

In [7]:
import pandas as pd

# One row per text; the texts end up in the single column labelled 0.
df = pd.DataFrame(texts)

In [8]:
import numpy as np
import re
k = 10  # number of equally sized batches; batching only serves progress reporting
num_samples = len(df) // k  # texts per batch (0 if the corpus has fewer than k texts)

# Every run of characters other than letters (incl. German umlauts and ß) and
# the marks ? ! . ' is collapsed into a single space.
non_text_pattern = re.compile("[^A-ZÄÜÖa-zäüöß?!.']+")


def _lemmatize_batch(text_batch):
    """Clean and lemmatize one slice of ``df``.

    Each text in column 0 is stripped of non-letter characters and lower-cased,
    run through the spaCy pipeline, and reduced to its noun/adjective/verb
    lemmas by ``cleaning``. Returns a list with one string per input row.
    """
    # NOTE(review): the text is lower-cased *before* tagging. German nouns are
    # capitalised, so this may affect POS tagging — confirm this is intended.
    brief_cleaning = (non_text_pattern.sub(' ', str(row)).lower() for row in text_batch[0])
    return [cleaning(doc) for doc in nlp.pipe(brief_cleaning, batch_size=5000)]


# Collect results in a plain list. Repeated np.concatenate on strings builds a
# fixed-width unicode array (every entry padded to the longest text) and copies
# it again for every batch.
cleaned_texts = []

for i in range(k):
    print(i)
    cleaned_texts.extend(_lemmatize_batch(df[num_samples * i: num_samples * (i + 1)]))

# len(df) is usually not divisible by k; process whatever the k batches left over.
if len(cleaned_texts) != len(df):
    print("rest")
    cleaned_texts.extend(_lemmatize_batch(df[len(cleaned_texts):]))

# Every input text must have produced exactly one cleaned text.
assert len(cleaned_texts) == len(df)

# Keep `txt` a 1-D numpy array as before; dtype=object stores references to the
# Python strings instead of padding each entry to the longest text.
txt = np.array(cleaned_texts, dtype=object)

0
1
2
3
4
5
6
7
8
9
rest


In [10]:
def remove_files(path):
    """Delete every regular file located directly inside ``path``.

    Sub-directories and their contents are left untouched. ``path`` is used
    as a plain string prefix for the directory entries, so it has to end with
    a path separator.
    """
    import os

    # Build the full path of each directory entry, keep only regular files.
    candidates = (path + entry for entry in os.listdir(path))
    for target in filter(os.path.isfile, candidates):
        os.remove(target)

In [11]:
path = "corpora/cleaned_normalized/corpus_epik/"

# Texts and metadata rows are paired purely by position, so a length mismatch
# would store texts under the wrong file names. Check this before the existing
# output files are deleted.
assert len(txt) == len(meta), "number of cleaned texts does not match number of metadata rows"

# Start from an empty output folder so files from earlier runs do not linger.
remove_files(path)

# write cleaned texts to files
for file_name, cleaned_text in zip(meta["file"], txt):
    # Output is written as UTF-8 (the raw corpus is read as UTF-16). The `with`
    # block closes the file, so no explicit close() is needed.
    with open(path + file_name, 'w', encoding="utf8") as f:
        f.write(cleaned_text)