In [1]:
import pickle
import re
from pathlib import Path

import nltk
import pandas as pd
import spacy
from gensim import corpora
from gensim.models import LdaModel, LdaMulticore
from nltk.corpus import stopwords

In [8]:
# Read the preprocessed speeches CSV into a DataFrame; the path is relative
# to the directory the notebook is executed from.
df = pd.read_csv(filepath_or_buffer="data/bundestag_wp20_speeches_preprocessed.csv")

In [15]:
# Map every distinct value of the "date" column to the sub-frame of rows
# carrying that value.
# NOTE(review): the keys shown in the notebook output are full dates such as
# '2021-10-26', so despite the name these look like per-day slices rather than
# per-month slices -- confirm the intended granularity.
monthly_slices = dict(iter(df.groupby("date")))

In [16]:
# Echo the full mapping (slice key -> DataFrame) as the cell output for a
# visual sanity check of the grouping.
monthly_slices

{'2021-10-26':     Unnamed: 0        id                title  \
 0        26225  20101400            Petra Pau   
 1        26212  20100100   Gabriele Katzmarek   
 2        26213  20100200        Stefan Müller   
 3        26214  20100300     Britta Haßelmann   
 4        26215  20100400  Dr. Marco Buschmann   
 5        26216  20100500     Stephan Brandner   
 6        26217  20100600            Jan Korte   
 7        26224  20101300     Wolfgang Kubicki   
 8        26219  20100800           Bärbel Bas   
 9        26220  20100900        Fabian Jacobi   
 10       26221  20101000         Aydan Özoğuz   
 11       26222  20101100        Yvonne Magwas   
 12       26223  20101200         Claudia Roth   
 13       26218  20100700   Dr. Rolf Mützenich   
 
                                                  text        date  \
 0   \n\nFrau Präsidentin, ich nehme die Wahl an un...  2021-10-26   
 1   \n\nSehr geehrter Herr Alterspräsident! So mus...  2021-10-26   
 2   \n\nHerr Präsident!

In [None]:
import pickle

def monthly_lda_per_party(num_of_topics=10, passes=50, random_state=1234):
    """Train one LDA model per party for every slice and pickle all topics.

    Iterates over the module-level ``monthly_slices`` mapping
    (slice key -> DataFrame). Within each slice the rows are split by
    ``party_clean``, keeping only rows where ``is_president == False``.
    The whitespace-tokenised ``processed_text`` of each party is turned into
    a gensim dictionary and bag-of-words corpus, an ``LdaMulticore`` model is
    trained on it, and the result of ``print_topics()`` is recorded.

    The collected topics are printed per slice and finally pickled as one
    nested dict ``{slice_key: {party: topics}}`` to
    ``results/monthly_lda_per_party<num_of_topics>``.

    Args:
        num_of_topics: Number of topics requested from each LDA model; also
            used as the suffix of the output file name.
        passes: Number of training passes forwarded to ``LdaMulticore``.
        random_state: Seed forwarded to ``LdaMulticore``.

    Returns:
        None. Results are printed and written to disk.
    """
    monthly_party_analysis = {}

    for date, monthly_df in monthly_slices.items():
        print(f"Processing {date}...")

        # Topics of every party that could be modelled in this slice.
        party_topics = {}

        # Row mask computed once per slice instead of once per party.
        not_president = monthly_df["is_president"] == False  # noqa: E712

        for party in monthly_df["party_clean"].unique():
            party_df = monthly_df[(monthly_df["party_clean"] == party) & not_president]

            # Tokenize the processed speech texts.
            texts = [text.split() for text in party_df["processed_text"].dropna()]
            if not texts:
                # Nothing to train on; report it instead of relying on the
                # model constructor to fail.
                print("Problem with party: " + str(party) + " (no speeches to model)")
                continue

            dictionary = corpora.Dictionary(texts)
            corpus = [dictionary.doc2bow(text) for text in texts]
            try:
                lda_model = LdaMulticore(
                    corpus,
                    num_topics=num_of_topics,
                    id2word=dictionary,
                    passes=passes,
                    random_state=random_state,
                )
                party_topics[party] = lda_model.print_topics()
            except Exception as exc:
                # Best effort: one failing party must not abort the whole run,
                # but Ctrl-C / SystemExit still propagate and the reason is shown.
                print("Problem with party: " + str(party) + f" ({exc!r})")

        # Print topics for each party.
        for party, topics in party_topics.items():
            print(f"\n=== Topics for {party} and month {date} ===")
            for topic in topics:
                print(topic)

        monthly_party_analysis[date] = party_topics

    print("Analysis completed!")

    filename = "results/monthly_lda_per_party" + str(num_of_topics)
    # Make sure the output directory exists before opening the file.
    Path(filename).parent.mkdir(parents=True, exist_ok=True)

    with open(filename, "wb") as fp:
        # Persist the complete analysis, not just the last loop variable.
        pickle.dump(monthly_party_analysis, fp)

Processing 2021-10-26...

=== Topics for Die Linke and month 2021-10-26 ===
(0, '0.004*"müssen" + 0.004*"darüber" + 0.004*"gut" + 0.004*"Vorschlag" + 0.004*"ja" + 0.004*"Kollege" + 0.004*"mal" + 0.004*"wichtig" + 0.004*"FDP" + 0.004*"sagen"')
(1, '0.004*"gut" + 0.004*"müssen" + 0.004*"Vorschlag" + 0.004*"darüber" + 0.004*"FDP" + 0.004*"wichtig" + 0.004*"ja" + 0.004*"mal" + 0.004*"finden" + 0.004*"Kollege"')
(2, '0.037*"Wahl" + 0.037*"Demokratie" + 0.037*"Zusammenarbeit" + 0.037*"freuen" + 0.037*"nehmen" + 0.003*"müssen" + 0.003*"gut" + 0.003*"mal" + 0.003*"Kollege" + 0.003*"wichtig"')
(3, '0.004*"müssen" + 0.004*"gut" + 0.004*"darüber" + 0.004*"Vorschlag" + 0.004*"wichtig" + 0.004*"Fraktion" + 0.004*"sagen" + 0.004*"finden" + 0.004*"FDP" + 0.004*"Kollege"')
(4, '0.004*"müssen" + 0.004*"darüber" + 0.004*"gut" + 0.004*"Vorschlag" + 0.004*"FDP" + 0.004*"diskutieren" + 0.004*"mal" + 0.004*"ja" + 0.004*"Kollege" + 0.004*"finden"')
(5, '0.004*"gut" + 0.004*"darüber" + 0.004*"müssen" + 0.004*

In [None]:
# Run the per-party analysis once for each topic count of interest.
for topic_count in (5, 10):
    monthly_lda_per_party(topic_count)