# 💻 Einführung in Topic Modeling mit Python - Step 2: Train the model



In [None]:
# import libraries 

import pandas as pd # load source data

import gensim.corpora as corpora 
from gensim.models import LdaModel, CoherenceModel # evaluation
from gensim.test.utils import datapath # to save model data

from pprint import pprint # "pretty print" additional output method

## Daten einlesen und vorbereiten

In [None]:
# load the preprocessed speeches (UTF-8 encoded CSV) into a DataFrame
source = pd.read_csv(
    filepath_or_buffer="../daten/speeches-bundesregierung_preprocessed.csv",
    encoding="utf-8",
)

In [None]:
# preview the first rows of the loaded table
source.head()

In [None]:
# extract speeches; a missing text (NaN, e.g. an empty cell in the CSV) is
# replaced by an empty string so that every row still yields a document and
# the row order stays aligned with `source`
speeches = source.loc[:, "preprocessed_text"].fillna("").tolist()

# convert strings to word lists (split on whitespace)
speeches_list = [speech.split() for speech in speeches]

# show one tokenized document as an example
print(speeches_list[-3])

## Textdaten quantifizierbar machen

1. Wort-ID-Dictionary erzeugen
2. Auf Basis des Wort-ID-Dictionaries wird jedes Dokument in eine Bag-of-Words-Repräsentation der Form (Wort-ID, Häufigkeit im Dokument) umgewandelt.

In [None]:
# the tokenized speeches serve as the text corpus
text_corpus = speeches_list

# build the dictionary that assigns a unique id to each word in the corpus;
# this dictionary is used as a reference in the modeling process
id2word = corpora.Dictionary(text_corpus)

# save for future reference, update path
#from datetime import datetime
#id2word.save(f"{datetime.now()}-FILENAME.dict")

# load (Dictionary.load returns the loaded dictionary, so assign the result)
#id2word = corpora.Dictionary.load(r"PATH.dict")

# view id2word-content
#print(id2word.token2id)

In [None]:
# bag of words (term document frequency): every tokenized document is
# converted to a sparse vector via the dictionary
corpus = [id2word.doc2bow(text) for text in text_corpus]

# the same as an explicit loop:
#corpus = []
#for text in text_corpus:
#    corpus.append(id2word.doc2bow(text))

# save doc2bow for future use, update path
#corpora.MmCorpus.serialize(f"{datetime.now()}-FILENAME", corpus)

# show the bag of words of the example document
print(f"Die Bag of Words für obiges Dokument sieht für die Topic-Modellierung so aus:\n {corpus[-3]}")

## Exemplarisches Topic-Modell berechnen

In [None]:
# Train the topic model; this may take some time in Binder.
# The original note mentions up to 30 minutes (with 2000 iterations),
# depending on memory usage and internet speed.
ldamodel = LdaModel(
    # input data: bag of words plus the dictionary it was built with
    corpus=corpus,
    id2word=id2word,
    # number of topics to be extracted from the corpus, default=100
    num_topics=10,
    # "symmetric" (default), "asymmetric" or "auto"
    alpha="auto",
    # number of documents used in each training chunk, default=2000
    chunksize=500,
    # how many times the algorithm passes over the whole corpus,
    # default=1, n > 1 slows down the modeling process
    passes=10,
    # default: 50
    iterations=50,
    # fixed seed, useful for reproducibility
    random_state=100,
)

In [None]:
# save model (four files) to temporary files - can be copied and pasted 

#temp_file = datapath("topic-model")
#ldamodel.save(temp_file)

# load model
#ldamodel = LdaModel.load("UPDATE PATH")

### Inspektion des Topic-Modells

In [None]:
# list every topic (num_topics=-1) together with its 30 most dominant words
topics = ldamodel.print_topics(
    num_words=30,
    num_topics=-1,
)
pprint(topics)

In [None]:
import csv
import os

# extract all topics (30 most dominant words each) for inspection
topics = ldamodel.print_topics(num_topics=-1, num_words=30)

topics_path = "../daten/topic-model/topics.csv"

# make sure the output folder exists, otherwise open() fails
os.makedirs(os.path.dirname(topics_path), exist_ok=True)

# one row per topic: "Topic <id>" and the topic's word/weight string;
# the with-statement closes the file on exit, no explicit close() is needed
with open(topics_path, "w", encoding="utf-8", newline="") as file:
    csv_writer = csv.writer(file, delimiter="+")
    for topic_id, topic_words in topics:
        csv_writer.writerow([f"Topic {topic_id}", topic_words])

## Evaluation des Topic-Modells

In [None]:
# Coherence score (measure: c_v) of the trained model.
# The computation takes some time (especially in Binder).
coherence_model_lda = CoherenceModel(
    model=ldamodel,
    texts=text_corpus,
    dictionary=id2word,
    coherence="c_v",
)

coherence_lda = coherence_model_lda.get_coherence()

print(f"Coherence Score: {coherence_lda}")

## Extraktion der Dokument-Topic-Matrix

In [None]:
# For every document, fetch its topics with the corresponding distribution
# values; minimum_probability=0.0 is passed in order to fetch all values
# instead of only the dominant ones.
doc_topics = [
    ldamodel.get_document_topics(bow, minimum_probability=0.0)
    for bow in corpus
]

# NOTE: very small values are presumably printed in scientific notation
# (e.g. 1e-05); they are floats, not complex numbers
print(doc_topics[0])

In [None]:
from collections import defaultdict

# Collect, per topic, one distribution value for every document.
# A topic can be missing from a document's list (gensim drops values below a
# small internal threshold even with minimum_probability=0.0). Missing topics
# are filled with 0.0 so that all columns keep the same length and row i
# always belongs to document i.
topic_ids = sorted({topic_id for doc in doc_topics for topic_id, _ in doc})

# dictionary-like object that starts every new key with an empty list
dt_dict = defaultdict(list)

for doc in doc_topics:
    weights = dict(doc)  # topic id -> distribution value for this document
    for topic_id in topic_ids:
        dt_dict[f"Topic {topic_id}"].append(weights.get(topic_id, 0.0))

# first three values of topic 0 as a quick check
dt_dict["Topic 0"][:3]

In [None]:
# document-topic matrix: the dictionary keys become the column names,
# each row represents one document
dt_matrix = pd.DataFrame(dt_dict)
dt_matrix

In [None]:
# copy the metadata columns from the source table into the matrix
for column in ("date", "year"):
    dt_matrix[column] = source[column]

dt_matrix.head(3)

In [None]:
# store the document-topic matrix on disk for later use
dt_matrix.to_csv(
    path_or_buf="../daten/topic-model/document-topic-matrix.csv",
)

## Visualisierung eines Topics im Verlauf des zeitlich geordneten Korpus

In [None]:
# reload the stored document-topic matrix (UTF-8 encoded CSV)
dt_matrix = pd.read_csv(
    filepath_or_buffer="../daten/topic-model/document-topic-matrix.csv",
    encoding="utf-8",
)

In [None]:
# use the "year" column as index
dt_matrix = dt_matrix.set_index(keys="year")

In [None]:
# Plot the raw values of one topic over the corpus.
# Important: the data is not normalized.
import matplotlib.pyplot as plt

fig, ax = plt.subplots(figsize=(15, 7), dpi=360)

dt_matrix["Topic 5"].plot(ax=ax, label="Rohdaten", color="#2F4F4F", linewidth=3, marker="o")
# alternatives: aggregate per year instead of plotting the raw values
#dt_matrix.groupby("year")["Topic 5"].median().plot(ax=ax, label="Median", color="blue", linewidth=3, marker="o")
#dt_matrix.groupby("year")["Topic 5"].mean().plot(ax=ax, label="Mean", color="magenta", linewidth=3, marker="o")

# title and axis labels
ax.set_title("Topic-Verteilung auf Basis der Rohdaten pro Jahr", fontsize=22, fontstyle="oblique")
ax.set_xlabel("Jahr", fontsize=18, fontstyle="italic")
ax.set_ylabel("Distribution", fontsize=18, fontstyle="italic")

# legend, grid and layout
ax.legend(fontsize=15)
ax.grid(True)
fig.tight_layout()
plt.show()

## Visualisierung des Topic-Modells mit pyLDAvis

In [None]:
# interactive visualization of the topic model
import pyLDAvis
import pyLDAvis.gensim_models

# prepare the visualization data from model, bag-of-words corpus and dictionary
vis = pyLDAvis.gensim_models.prepare(
    topic_model=ldamodel,
    corpus=corpus,
    dictionary=id2word,
)

# show the visualization in the notebook
pyLDAvis.display(vis)

In [None]:
# write the visualization stored in `vis` to an HTML file
pyLDAvis.save_html(vis, "../daten/topic-model/topic-model.html")

## 📝 Abschlussaufgabe: 
Probieren Sie gerne einmal verschiedene Parameter-Einstellungen für die *Topic-Anzahl*, *chunksize* oder *alpha* oben im Codeblock aus, indem Sie einfach die Parameterwerte verändern.
Vergleichen Sie, wie sich das Ergebnis verändert. Setzen Sie ggf. "passes" auf den Default-Wert von 1, um die Berechnungsgeschwindigkeit in Binder zu optimieren.

#### Zusätzliche Parameter können sein:
- eval_every=10 # estimate the log perplexity of the model every n updates - slows down modeling process 
- update_every=1 # update the model after every n chunks (of chunksize documents each), default=1 


**Abschließender Hinweis:** Topic-Modeling ist ein statistisches Verfahren, das heißt, je mehr Daten, desto belastbarer sind die Modellierungsergebnisse. Dieses Korpus aus Reden ist nicht ideal. Es ist relativ klein und die Daten noch nicht optimal aufbereitet. Haben Sie vielleicht ein eigenes Datenkorpus? Dann probieren Sie gerne diesen Workflow anhand Ihrer Daten aus.