In [145]:
'''
loading config
'''

from config import (
    DATA_DIR, METADATA_PATH,
    OUTPUT_DIR, VISUALS_DIR, CSV_DIR,
    NUM_TOPICS, MAX_DF, MIN_DF
)

In [146]:
'''
loading text chunks
'''

from pathlib import Path

# Resolve the chunk files once, in sorted order, so that texts[i] and
# filenames[i] always describe the same file.
chunk_paths = sorted(DATA_DIR.glob("*.txt"))
filenames = [chunk.name for chunk in chunk_paths]
texts = [chunk.read_text(encoding="utf-8") for chunk in chunk_paths]

#### spaCy
The following script was used to generate the tokens needed for the final topic modeling.
Certain words like "herr" and "hand", or persons' names like "Klamm" (the famous bureaucrat of the novel "The Castle"),
appeared among the topic words but did not help in gaining any insight into the topics, so they are filtered out.
Verbs are also filtered out in the script to avoid repetitions in the topic words.

#### tokenizer
Other models such as "de_core_news_md" and "de_core_news_lg" were also tried, but the one
used in the script below yielded the best results.

In [None]:
import spacy

# Alternative (small) German pipeline, currently disabled:
#nlp = spacy.load("de_core_news_sm")
# Large German spaCy pipeline; `preprocess` below reads this module-level `nlp`.
nlp = spacy.load("de_core_news_lg")

# Corpus-specific words (frequent nouns, names such as "klamm", and a few
# common verbs) that are dropped in addition to spaCy's built-in stop words.
# Entries are lower-case because they are compared against lower-cased lemmas.
CUSTOM_STOPWORDS = {
    "auge",
    "barnabas",
    "delamarch",
    "fragen",
    "frau",
    "gehen",
    "gesicht",
    "hand",
    "herr",
    "klamm",
    "kommen",
    "mann",
    "pollunder",
    "sagen",
    "sehen",
    "stehen",
    "wissen",
    "zimmer",
}

def preprocess(text):
    """Reduce *text* to a space-separated string of lower-cased lemmas.

    A token is kept only if it is tagged as a noun or adjective, is not a
    spaCy stop word, is purely alphabetic, has a lemma longer than three
    characters, and its lower-cased lemma is not in ``CUSTOM_STOPWORDS``.
    """
    kept_lemmas = []
    for token in nlp(text):
        if token.pos_ not in ("NOUN", "ADJ"):
            continue
        if token.is_stop or not token.is_alpha:
            continue
        if len(token.lemma_) <= 3:
            continue
        # The custom stop-word check is done on the lower-cased lemma,
        # which is also the form that ends up in the output.
        lowered = token.lemma_.lower()
        if lowered in CUSTOM_STOPWORDS:
            continue
        kept_lemmas.append(lowered)
    return " ".join(kept_lemmas)

# Run every loaded chunk through `preprocess`, keeping the original order.
texts_cleaned = list(map(preprocess, texts))

In [148]:
'''
save cleaned texts to CSV
'''

import pandas as pd

# One row per chunk: the source file name next to its preprocessed text.
df_cleaned = pd.DataFrame({
    "filename": filenames,
    "text": texts_cleaned
})

# to_csv does not create missing parent directories, so make sure the
# target directory exists before writing (no-op when it is already there).
CSV_DIR.mkdir(parents=True, exist_ok=True)
df_cleaned.to_csv(CSV_DIR / "cleaned_chunks.csv", index=False)

#### A broader preprocessing
The following script performs a broader tokenization of the texts. Verbs, nouns and adjectives are all taken into account,
but this results in many verbs in the topic word lists. Hence, another approach was taken: filtering
out the verbs and common words.
This script can be run to compare its topic words with those of the filtering method used in the paper.

In [None]:
''' 
Preprocessing using spaCy (broader filtering)
'''
import spacy

# Load the German spaCy pipeline used by the broader `preprocess` defined below.
nlp = spacy.load("de_core_news_sm")  # alternatives: "de_core_news_md" or "de_core_news_lg"

def preprocess(text):
    """Return the lower-cased lemmas of *text*, joined by single spaces.

    Keeps every token that is tagged as a noun, verb or adjective, is
    purely alphabetic, and is not a spaCy stop word.
    """
    # Restrict this to ("NOUN",) for a noun-only variant.
    content_tags = ("NOUN", "VERB", "ADJ")
    lemmas = []
    for token in nlp(text):
        keep = (
            token.pos_ in content_tags
            and not token.is_stop
            and token.is_alpha
        )
        if keep:
            lemmas.append(token.lemma_.lower())
    return " ".join(lemmas)

# Apply the broader `preprocess` to every loaded chunk, in order.
texts_cleaned = list(map(preprocess, texts))