# BERTopic Exploration of the Stimmungs- und Lageberichte Database Files
**Author:** Christopher Thomas Goodwin

**Creation Date:** 2024.04.10

**Summary:** Uses BERTopic modelling to explore the data of the NSHWE Stimmungs- und Lageberichte files

In [None]:
import platform
from bertopic import BERTopic
import json
import pandas as pd
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfVectorizer
import torch

# Get stop words
import nltk
from nltk.corpus import stopwords
nltk.download('stopwords')

# Check if GPU acceleration is available and call appropriate libraries
import GPUtil

# UMAP always comes from the CPU package: cuML also ships a GPU UMAP
# (cuml.manifold.UMAP), but the CPU version is used deliberately because it is
# better for noisy or duplicate data.
from umap import UMAP

if GPUtil.getAvailable():
    try:
        from cuml.cluster import HDBSCAN
    except ImportError:
        # A GPU can be present without RAPIDS/cuML being installed; fall back
        # to the CPU implementation instead of aborting the notebook.
        from hdbscan import HDBSCAN
        print("GPU detected, but cuML is not installed; using CPU HDBSCAN.")
    else:
        print("GPU engaged.")

    cuda_available = torch.cuda.is_available()
    print(f"CUDA available: {cuda_available}")
    if cuda_available:
        # Querying the current device raises when torch has no CUDA support,
        # so only do it once availability has been confirmed.
        print(f"Current GPU device index: {torch.cuda.current_device()}")
        print(f"Current GPU device name: {torch.cuda.get_device_name(torch.cuda.current_device())}")
    print(f"Pytorch Cuda version: {torch.version.cuda}")
else:
    from hdbscan import HDBSCAN
    print("No GPU engaged.")

In [None]:
# Report the operating system and pick the matching location of the dataset
current_os = platform.system()
print(f"The Operating System is {current_os}")

# Known per-OS locations; any other system falls back to the Windows path
known_locations = {
    "Linux": "/home/cgoodwin/Documents/Programming/TextMiningNaziIdeology/data/json/stimmungs_data_sentences.json",
    "Darwin": "/Users/cgoodwin/Programming Projects/TextMiningNaziIdeology/data/json/stimmungs_data_sentences.json",
}
fallback_location = "C:\\Users\\Christopher Goodwin\\Documents\\Programming Projects\\TextMiningNaziIdeology\\data\\json\\stimmungs_data_sentences.json"
path = known_locations.get(current_os, fallback_location)

with open(path, "r", encoding="utf-8") as f:
    # The JSON is a dictionary keyed by the strings "0", "1", ... one per entry
    files = json.load(f)

# Only the text matters here: collect the "report" field of every entry,
# walking the keys in numeric order
reports = [files[str(entry_number)]["report"] for entry_number in range(len(files))]

print("File loaded.")

In [None]:
# Build the stop-word list for the vectorizer: NLTK's German stop words
# plus terms specific to this corpus
german_stop_words = stopwords.words('german')
additional_stop_words = [
    "volk", "volksgemeinschaft",
    "1939", "1940", "1941", "1942", "1943", "1944", "1945",
    "deutsch", "bevölkerung",
    "ii", "iii", "iv", "v", "vi",
    "einzelmeldungen", "volksgenossen", "sei", "seien", "worden", "meldungen",
    "deutsche", "deutschen", "wegen", "wurde",
    "gif", "pro", "kg", "minusbox", "images", "rm",
]

# Every integer from 0 through 1945, written as a string, is also a stop word
additional_stop_words += [str(number) for number in range(1946)]

german_stop_words += additional_stop_words

vectorizer_model = CountVectorizer(stop_words=german_stop_words)

In [None]:
 # Adjust UMAP and HDBSCAN parameters
umap_model = UMAP(
    n_neighbors=10,
    n_components=5,
    min_dist=0.2,
)
hdbscan_model = HDBSCAN(
    min_cluster_size=5,
    min_samples=5,
    prediction_data=True,
)

# Assemble BERTopic from the customised components: multilingual sentence
# embeddings, the stop-word-aware vectorizer, and a cap of 15 topics with
# 10 words each
topic_model = BERTopic(
    embedding_model="paraphrase-multilingual-MiniLM-L12-v2",
    language="multilingual",
    umap_model=umap_model,
    hdbscan_model=hdbscan_model,
    vectorizer_model=vectorizer_model,
    nr_topics=15,
    top_n_words=10,
    verbose=True,
)

In [None]:
# Fit the topic model to the loaded reports; the call's two return values
# are kept as `topics` and `probs` for later inspection.
topics, probs = topic_model.fit_transform(reports)

In [None]:
# Display the fitted model's topic overview table (its 'Topic' and 'Name'
# columns are reused by the labelling cell further down).
topic_model.get_topic_info()

In [None]:
# Inspect the keyword entries stored for topic 0.
topic_model.get_topic(0)

In [None]:
# Render BERTopic's bar-chart visualisation of the fitted topics.
topic_model.visualize_barchart()

In [None]:
# Persist the fitted model. BERTopic's `serialization` argument takes the
# format name without a leading dot ("safetensors", "pytorch" or "pickle");
# ".safetensors" is not one of the recognised values.
topic_model.save("my_model", serialization="safetensors")

# Generative Labeling

In [None]:
import requests

def query_ollama(prompt, model="gemma3:12b", temperature=0.3, timeout=300): # options: gemma3:12b; qwen3:30b; deepseek-r1:14b; gpt-oss:20b
    """Send a single prompt to the local Ollama server and return its reply.

    Args:
        prompt: Text passed to the model.
        model: Name of the Ollama model to run.
        temperature: Sampling temperature for the generation.
        timeout: Seconds to wait for the server before giving up.

    Returns:
        The generated text with surrounding whitespace removed.

    Raises:
        requests.RequestException: If the server cannot be reached, does not
            answer within `timeout`, or replies with an HTTP error status.
        KeyError: If the reply contains no 'response' field.
    """
    url = "http://localhost:11434/api/generate"
    payload = {
        "model": model,
        "prompt": prompt,
        # Ollama reads sampling parameters from the nested "options" object;
        # a top-level "temperature" key is not applied.
        "options": {"temperature": temperature},
        "stream": False,
    }
    response = requests.post(url, json=payload, timeout=timeout)
    # Surface HTTP failures (e.g. unknown model) here rather than as a
    # confusing KeyError on the missing 'response' field below.
    response.raise_for_status()

    return response.json()['response'].strip()

# Take the topic ids from the first ten rows of the topic overview
top_topic_ids = topic_model.get_topic_info().head(10)['Topic'].tolist()

# Keyword entries per topic; each entry is unpacked below as (word, score)
topic_keywords = {topic_id: topic_model.get_topic(topic_id) for topic_id in top_topic_ids}

custom_labels = {}

# Ask the language model for one short label per topic
for topic_id, keywords in topic_keywords.items():
    words = ', '.join(word for word, _ in keywords)
    prompt = f"Given these keywords: {words}, generate a short, descriptive topic label that summarizes the theme."
    label = query_ollama(prompt)
    custom_labels[topic_id] = label
    print(f"Topic {topic_id}: {label}")

# Register the labels by topic id. Passing the {topic_id: label} mapping lets
# BERTopic match labels to topics itself and keep the default name for every
# topic that received no new label. Writing into a positional list by topic id
# is wrong whenever the outlier topic -1 exists: every label lands one slot
# off, and id -1 overwrites the last entry through negative indexing.
topic_model.set_topic_labels(custom_labels)

# Ask the plot to display the custom labels instead of the default names
topic_model.visualize_topics(custom_labels=True)

# Apply TF-IDF to Model

In [None]:
# Represent each report as a TF-IDF vector, ignoring German stop words and
# terms that occur in fewer than 5 reports
tfidf_vectorizer = TfidfVectorizer(min_df=5, stop_words=german_stop_words)
embeddings = tfidf_vectorizer.fit_transform(reports)

# Fit a second topic model on the TF-IDF vectors instead of sentence embeddings.
# fit() returns the fitted model itself and cannot be unpacked into two names;
# fit_transform() returns the (topics, probabilities) pair wanted here.
# NOTE(review): unlike `topic_model`, this model is built without
# `vectorizer_model`, so its topic keywords are not filtered by the German
# stop-word list; confirm whether that is intended.
tfidf_model = BERTopic(nr_topics=75)
tfidf_topics, tfidf_probs = tfidf_model.fit_transform(reports, embeddings)

In [None]:
# Display the topic overview table of the TF-IDF-based model.
tfidf_model.get_topic_info()