In [56]:
'''loading config'''

from config import (
    DATA_DIR, METADATA_PATH,
    OUTPUT_DIR, VISUALS_DIR, CSV_DIR,
    NUM_TOPICS, MAX_DF, MIN_DF)

In [25]:
'''
load cleaned texts from CSV
'''

import pandas as pd
from config import CSV_DIR

# Load the cleaned text chunks and the source filename of each chunk.
df_cleaned = pd.read_csv(CSV_DIR / "cleaned_chunks.csv")

# read_csv parses an empty cell as NaN (a float). A row with empty text would
# therefore reach CountVectorizer as NaN, which it rejects as an invalid
# document. Mapping NaN back to "" keeps the row, so `texts_cleaned` and
# `filenames` stay aligned one-to-one.
texts_cleaned = df_cleaned["text"].fillna("").tolist()
filenames = df_cleaned["filename"].tolist()

In [26]:
# vectorizing and LDA training

from sklearn.feature_extraction.text import CountVectorizer
from sklearn.decomposition import LatentDirichletAllocation

# Bag-of-words counts; the document-frequency bounds come from config.
vectorizer = CountVectorizer(min_df=MIN_DF, max_df=MAX_DF)
X = vectorizer.fit_transform(texts_cleaned)

# Fixed random_state so repeated runs fit the same topics.
lda_params = {"n_components": NUM_TOPICS, "random_state": 42}
lda = LatentDirichletAllocation(**lda_params)
lda.fit(X)

## topics

In [27]:
def print_topics(model, vectorizer, top_n=10):
    """Print the highest-weighted vocabulary terms of every topic.

    Args:
        model: Fitted topic model exposing ``components_``, one row of
            term weights per topic.
        vectorizer: Fitted vectorizer exposing ``get_feature_names_out()``;
            its terms are looked up by the column indices of ``components_``.
        top_n: Number of terms to print per topic, strongest first.

    Returns:
        None. For each topic, a blank line, a ``Topic #<idx>:`` header and
        one line of space-separated terms are written to stdout.
    """
    vocab_terms = vectorizer.get_feature_names_out()
    for topic_no, weights in enumerate(model.components_):
        # argsort is ascending: reverse it, then keep the first `top_n`.
        ranked = weights.argsort()[::-1][:top_n]
        top_terms = [vocab_terms[j] for j in ranked]
        print(f"\nTopic #{topic_no}:")
        print(" ".join(top_terms))

# Print the strongest terms of each fitted topic (default top_n=10).
print_topics(lda, vectorizer)


Topic #0:
kaiser mauer bau sehen dorf wissen leben volk mann heimat

Topic #1:
offizier reisend verurteilter soldat kommandant hand reisende apparat sehen mann

Topic #2:
herr diener tür akte gang bleiben scheinen wirt zimmer wirtin

Topic #3:
klamm wissen wirtin arbeit sehen scheinen herr glauben sagen fragen

Topic #4:
delamarch fragen lehrer hund mann halten scheinen gehilfe rufen frage

Topic #5:
advokat kaufmann fragen block hand theater sprechen prozeß wissen pferd

Topic #6:
welt leben mensch glauben böse herr haus weg alt wissen

Topic #7:
herr zimmer hand tür vater wissen sehen fragen sagen scheinen

Topic #8:
herr hand fragen mann tür frau sehen prozeß gericht glauben

Topic #9:
onkel hand fragen sehen freund delamarch koffer wissen herr scheinen


In [28]:
# Save the top words per topic to CSV (one row per topic, one column per rank).
top_n = 10
vocab = vectorizer.get_feature_names_out()

# The slice below yields at most len(vocab) words per topic. If the vocabulary
# holds fewer than `top_n` terms, passing `top_n` column labels would make
# DataFrame.from_dict raise on the length mismatch, so the labels are sized
# to the number of words each row actually has.
n_words = min(top_n, len(vocab))

topic_keywords = {
    # argsort is ascending; the reversed slice takes the largest weights first.
    f"Topic {i}": [vocab[idx] for idx in topic.argsort()[:-top_n - 1:-1]]
    for i, topic in enumerate(lda.components_)
}

topic_keywords_df = pd.DataFrame.from_dict(
    topic_keywords, orient='index',
    columns=[f"Word {i+1}" for i in range(n_words)]
)

topic_keywords_df.to_csv(CSV_DIR / "topic_keywords.csv")

In [52]:
'''Script for getting the topic distributions'''

import pandas as pd
import matplotlib.pyplot as plt
from config import CSV_DIR, VISUALS_DIR, NUM_TOPICS
import os

# One label per topic, reused as row/column labels in both tables below.
topic_names = [f"Topic {k}" for k in range(NUM_TOPICS)]
vocabulary = vectorizer.get_feature_names_out()

# Topic-word table: one row per topic, one column per vocabulary term.
# NOTE(review): lda.components_ is stored as-is. The scikit-learn docs describe
# these values as unnormalised pseudo-counts, so rows need not sum to 1 despite
# the "distributions" name. Confirm that consumers of the CSV expect raw weights.
topic_word_distributions = pd.DataFrame(
    data=lda.components_, index=topic_names, columns=vocabulary
)

# Document-topic table: one row per entry of `filenames`, one column per topic.
document_topic_matrix = lda.transform(X)
document_topic_distributions = pd.DataFrame(
    data=document_topic_matrix, index=filenames, columns=topic_names
)

# Write both tables to CSV_DIR.
topic_word_distributions.to_csv(CSV_DIR / "topic_word_distributions.csv")
document_topic_distributions.to_csv(CSV_DIR / "document_topic_distributions.csv")

In [48]:
# Add a "dominant_topic" column holding, for each row, the label of the topic
# column with the largest value, then save the extended table.
# `assign` returns a new frame, so document_topic_distributions is unchanged.
doc_topic_with_dominant = document_topic_distributions.assign(
    dominant_topic=document_topic_distributions.idxmax(axis=1)
)

doc_topic_with_dominant.to_csv(CSV_DIR / "document_topic_distributions_with_dominant.csv")