In [None]:
# Import relevant general libraries.
import re
import datetime
import numpy as np
import pandas as pd

# Stopword list
from stop_words import get_stop_words

# Transformer
from bertopic import BERTopic
from sklearn.feature_extraction.text import CountVectorizer
from bertopic.representation import MaximalMarginalRelevance
from bertopic.vectorizers import ClassTfidfTransformer

In [2]:
# Start from the standard Dutch stopword list. A copy is taken so that re-running
# this cell never mutates a list object the library may hand out again.
stop_words = list(get_stop_words('dutch'))

# Corpus-specific words to treat as stopwords as well.
custom_stop_words = [
    "we", "tweede", "kamer", "aanhangsel", "antwoord", "vraag", "bent",
    "ingezonden", "groet", "verzonden", "emailprocedure", "onderwerp", "verzoek",
]

# extend (not append): append would add the whole list as one nested element,
# so none of the custom words would ever be treated as a stopword.
stop_words.extend(custom_stop_words)

In [2]:
# Load the tagged corpus from the working directory into a DataFrame.
df = pd.read_csv(filepath_or_buffer='kamers_text_tagged.csv')

In [3]:
# Parse the date column as UTC timestamps (unparseable values become NaT)
# and keep only the calendar-date part.
df['date'] = pd.to_datetime(df['date'], errors='coerce', utc=True).dt.date

# Order the rows chronologically.
df = df.sort_values(by='date')

# Keep only rows that have both a date and a text value.
df = df.loc[df['date'].notna()]
df = df.loc[df['text'].notna()]

In [4]:
# Clean text in preparation for BERTopic tagging.
#
# Steps, applied to every document:
#   1. cast to str;
#   2. strip digits;
#   3. collapse every run of spaces / newlines / tabs / carriage returns into a
#      single space.
# Digits are removed *before* whitespace is collapsed so the gaps they leave
# behind do not survive as double spaces.
# regex=True is passed explicitly: without it, recent pandas versions treat the
# pattern as a literal string and would never collapse repeated spaces.
df['text'] = (
    df['text']
    .astype(str)
    .str.replace(r'\d+', '', regex=True)
    .str.replace(r'[ \n\t\r]+', ' ', regex=True)
)



In [7]:
# Class-based TF-IDF transformer handed to BERTopic, created with
# reduce_frequent_words=True.
# NOTE(review): the identical assignment is repeated in the model-definition cell
# further down, so this cell looks redundant — confirm before removing it.
ctfidf_model = ClassTfidfTransformer(reduce_frequent_words=True)

In [8]:
# Pull the cleaned 'text' column out as a plain Python list, in row order,
# which is the document format passed to BERTopic below.
doc_texts = df['text'].tolist()

In [None]:
# Define TFIDF-model which BERTopic uses, reduce frequent words.
ctfidf_model = ClassTfidfTransformer(reduce_frequent_words=True)

# Build the stopword list for the vectorizer. `stop_words` is flattened and
# stripped defensively: if custom words were added with list.append it holds a
# nested list, which CountVectorizer cannot use.
flat_stop_words = set()
for entry in stop_words:
    if isinstance(entry, str):
        flat_stop_words.add(entry.strip())
    else:
        flat_stop_words.update(str(word).strip() for word in entry)
flat_stop_words.discard('')

# The vectorizer only affects how topic representations (keywords) are built;
# passing the stopwords here is what actually keeps them out of the topics.
vectorizer_model = CountVectorizer(stop_words=sorted(flat_stop_words))

# Define the BERTopic model.
topic_model = BERTopic(
    verbose=True,
    calculate_probabilities=True,
    language='multilingual',
    ctfidf_model=ctfidf_model,
    vectorizer_model=vectorizer_model,
)

# Fit model on textual data; `topics` holds one topic id per document, in the
# same order as `doc_texts`.
topics, probs = topic_model.fit_transform(doc_texts)

In [None]:
# Reduce outliers: by default BERTopic puts many documents in the outlier topic -1.
# reduce_outliers re-assigns those documents to another probable, non-negative topic,
# which avoids losing a large share of the data.
# Trade-off: the re-assigned documents may fit their new topic less accurately.
new_topics = topic_model.reduce_outliers(documents=doc_texts, topics=topics)

In [None]:
# Optionally, update the topic_model.
# Useful if we want to use the topic_model further, for applications like drawing topics over time with built-in BERTopic functions.
# We only use BERTopic for tagging as a subprocess here, so the code is commented out.

# topic_model.update_topics(doc_texts, topics=new_topics)

In [None]:
# Report how many documents were outliers (topic -1) before outlier reduction,
# and how many documents received a different topic after it.

# Documents whose original topic was the outlier topic.
outlier_count = sum(1 for topic in topics if topic == -1)

# Positions where the original and the reduced assignment disagree.
change_count = sum(1 for i in range(len(topics)) if topics[i] != new_topics[i])

print("Number of '-1' topics:", outlier_count)
print("Number of elements that differ:", change_count)

In [11]:
# Now assign the updated topics to our original topics variable.
# A copy is taken so `topics` and `new_topics` remain separate objects.
# NOTE(review): once this cell has run, re-running the comparison cell above
# reports 0 differing elements, because `topics` no longer holds the
# pre-reduction assignment.
topics = new_topics.copy()

In [12]:
# Assign the topics to our original DataFrame.
# `topics` has one entry per document in `doc_texts`, and `doc_texts` was taken
# from `df` in row order, so the assignment is positional. This replaces a
# per-row lookup on 'id', which scanned the whole frame for every document and
# overwrote earlier rows whenever an id occurred more than once.
if len(topics) != len(df):
    raise ValueError(
        f"Got {len(topics)} topics for {len(df)} documents; "
        "re-run the cells from 'doc_texts' onwards so they are aligned."
    )
df['topic_number'] = list(topics)

In [None]:
# Make sure the topic_number column is stored as integers rather than floats.
df = df.assign(topic_number=df['topic_number'].astype('int'))

In [None]:
# Look up the representation of a topic in the fitted topic_model.
def get_topic_words(topic_nr):
    """Return the stored representation for ``topic_nr``.

    The value comes straight from ``topic_model.topic_representations_``
    (presumably a list of (word, score) pairs — confirm for the installed
    BERTopic version). An empty string is returned when the topic number is
    not present.
    """
    return topic_model.topic_representations_.get(topic_nr, "")

# Map every row's topic_number to its topic representation and store the
# result in a new 'topic_words' column.
df['topic_words'] = df['topic_number'].map(get_topic_words)

In [None]:
# Write the topic-tagged dataset to a CSV file in the working directory.
# NOTE(review): the DataFrame index is written as an extra column here —
# confirm whether downstream readers expect it.
df.to_csv(path_or_buf="kamers_text_tagged_topics.csv")