## Full Script

In [1]:
from config import DATA_DIR, METADATA_PATH, OUTPUT_DIR, VISUALS_DIR, CSV_DIR, NUM_TOPICS, MAX_DF, MIN_DF
import os
from pathlib import Path
import spacy
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.decomposition import LatentDirichletAllocation

# Make sure every output location exists before anything is written to it;
# directories that are already present are left untouched.
for directory in (OUTPUT_DIR, VISUALS_DIR, CSV_DIR):
    Path(directory).mkdir(parents=True, exist_ok=True)

# Collect the corpus: every *.txt file directly inside DATA_DIR, in sorted
# order so that texts[i] and filenames[i] always refer to the same document.
corpus_files = sorted(DATA_DIR.glob("*.txt"))
texts = [corpus_file.read_text(encoding="utf-8") for corpus_file in corpus_files]
filenames = [corpus_file.name for corpus_file in corpus_files]

# Load the German spaCy pipeline that preprocess() relies on.
# spacy.load raises OSError when the model package cannot be found, so only
# that failure gets the install hint. Anything else (e.g. KeyboardInterrupt)
# propagates without a misleading "download the model" message.
try:
    nlp = spacy.load("de_core_news_sm")
except OSError:
    print("Run: python -m spacy download de_core_news_sm")
    raise

def preprocess(text):
    """Reduce *text* to a space-separated string of lower-cased lemmas.

    The text is run through the module-level ``nlp`` pipeline. A token is
    kept only when it is tagged NOUN, VERB or ADJ, is not flagged as a stop
    word, and is purely alphabetic.

    Args:
        text: Raw document text.

    Returns:
        The surviving lemmas, lower-cased and joined by single spaces
        (an empty string when no token qualifies).
    """
    content_tags = ("NOUN", "VERB", "ADJ")
    lemmas = []
    for tok in nlp(text):
        if tok.pos_ not in content_tags:
            continue
        if tok.is_stop or not tok.is_alpha:
            continue
        lemmas.append(tok.lemma_.lower())
    return " ".join(lemmas)

# Normalise every document with preprocess() before counting terms.
texts_cleaned = list(map(preprocess, texts))

# Bag-of-words document-term matrix; MAX_DF / MIN_DF come from config and
# bound the document frequency of the terms that enter the vocabulary.
vectorizer = CountVectorizer(min_df=MIN_DF, max_df=MAX_DF)
X = vectorizer.fit_transform(texts_cleaned)

# Fit LDA with NUM_TOPICS components; the fixed random_state keeps runs
# repeatable. fit() returns the estimator itself, so `lda` is the fitted model.
lda = LatentDirichletAllocation(
    n_components=NUM_TOPICS,
    random_state=42,
).fit(X)

def print_topics(model, vectorizer, top_n=10):
    """Print the highest-weighted terms of every topic in *model*.

    Args:
        model: Object exposing ``components_``, iterated row by row with one
            row of term weights per topic.
        vectorizer: Object whose ``get_feature_names_out()`` maps a column
            index to its term.
        top_n: Number of terms to print per topic, heaviest first.
    """
    vocabulary = vectorizer.get_feature_names_out()
    for topic_no, weights in enumerate(model.components_):
        print(f"\nTopic #{topic_no}:")
        # argsort is ascending, so walk backwards from the end and stop after
        # top_n indices to get the heaviest terms first.
        ranked_ids = weights.argsort()[:-top_n - 1:-1]
        top_terms = [vocabulary[term_id] for term_id in ranked_ids]
        print(" ".join(top_terms))

# Print each fitted topic's top terms, using print_topics' default top_n of 10.
print_topics(lda, vectorizer)




Topic #0:
kaiser mauer bau sehen dorf wissen leben volk mann heimat

Topic #1:
offizier reisend verurteilter soldat kommandant hand reisende apparat sehen mann

Topic #2:
herr diener tür akte gang bleiben scheinen wirt zimmer wirtin

Topic #3:
klamm wissen wirtin arbeit sehen scheinen herr glauben sagen fragen

Topic #4:
delamarch fragen lehrer hund mann halten scheinen gehilfe rufen frage

Topic #5:
advokat kaufmann fragen block hand theater sprechen prozeß wissen pferd

Topic #6:
welt leben mensch glauben böse herr haus weg alt wissen

Topic #7:
herr zimmer hand tür vater wissen sehen fragen sagen scheinen

Topic #8:
herr hand fragen mann tür frau sehen prozeß gericht glauben

Topic #9:
onkel hand fragen sehen freund delamarch koffer wissen herr scheinen
