# BERTopic Exploration of the Stimmungs- und Lageberichte Database Files
**Author:** Christopher Thomas Goodwin

**Creation Date:** 2024.04.10

**Summary:** Uses BERTopic modelling to explore the data of the NSHWE Stimmungs- und Lageberichte files

In [None]:
from sys import platform
from bertopic import BERTopic
import json
import pandas as pd
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfVectorizer

import nltk
from nltk.corpus import stopwords
nltk.download('stopwords')

import GPUtil

# Pick the UMAP/HDBSCAN implementations: the cuML versions when GPUtil reports
# at least one available GPU, otherwise the umap and hdbscan packages.
if len(GPUtil.getAvailable()) > 0:
    from cuml.cluster import HDBSCAN
    from cuml.manifold import UMAP
else:
    from umap import UMAP
    from hdbscan import HDBSCAN

In [None]:
# Locate the sentence-level JSON export for the current operating system.
if platform in ("linux", "linux2"):
    path = "/home/cgoodwin/PycharmProjects/NSHWEDatabaseMining/data/json/stimmungs_data_sentences.json"
elif platform == "darwin":
    path = "/Users/cgoodwin/Documents/Programming/Python/NSHWEDatabaseMining/data/json/stimmungs_data_sentences.json"
else:
    # Raw string: in a normal literal "\U" starts a \UXXXXXXXX escape, which
    # makes this line a SyntaxError and keeps the whole cell from compiling,
    # whatever platform it is run on.
    path = r"C:\Users\Christopher Goodwin\Documents\Programming Projects\TextMiningNaziIdeology\data\json\stimmungs_data_sentences.json"

with open(path, "r", encoding="utf-8") as f:
    # loaded as a dictionary keyed by the strings "0", "1", ... one per entry
    files = json.load(f)

# We want just the textual data: the report of each entry, in index order.
reports = [files[str(i)]["report"] for i in range(len(files))]

In [None]:
# Build the stop-word list used by the vectorizer: NLTK's German stop words
# plus terms specific to this corpus.
german_stop_words = stopwords.words('german')
additional_stop_words = [
    "volk", "volksgemeinschaft",
    "1939", "1940", "1941", "1942", "1943", "1944", "1945",
    "deutsch", "bevölkerung",
    "ii", "iii", "iv", "v", "vi",
    "einzelmeldungen", "volksgenossen",
    "sei", "seien", "worden", "meldungen",
    "deutsche", "deutschen", "wegen", "wurde",
    "gif", "pro", "kg", "minusbox", "images", "rm",
]

# every integer from 0 through 1945, written as a string
additional_stop_words.extend(str(number) for number in range(1946))

german_stop_words += additional_stop_words

vectorizer_model = CountVectorizer(stop_words=german_stop_words)

In [None]:
# Report whether a GPU is available. The UMAP and HDBSCAN names are used with
# the same settings either way, so the models are configured once and only the
# status message depends on the GPU check.
gpu_available = len(GPUtil.getAvailable()) > 0
print("GPU engaged." if gpu_available else "No GPU engaged.")

# dimensionality reduction and clustering steps handed to BERTopic
umap_model = UMAP(n_components=2, n_neighbors=15, min_dist=0.0)
hdbscan_model = HDBSCAN(min_samples=10, gen_min_span_tree=True, prediction_data=True)

# use multilingual model and apply German stopwords vectorizer model
topic_model = BERTopic(
    umap_model=umap_model,
    hdbscan_model=hdbscan_model,
    embedding_model="paraphrase-multilingual-MiniLM-L12-v2",
    vectorizer_model=vectorizer_model,
    language="multilingual",
    nr_topics=75,
    top_n_words=10,
    calculate_probabilities=True,
    verbose=True,
)

In [None]:
# Fit the topic model on the report texts and keep the two values it returns
# (bound here as the topic assignments and their probabilities).
topics, probs = topic_model.fit_transform(reports)

In [None]:
# Show the topic overview of the fitted model (rendered as the cell's output).
topic_model.get_topic_info()

In [None]:
# Inspect the representation of topic 0.
topic_model.get_topic(0)

In [None]:
# Draw the bar-chart visualisation of the fitted topics.
topic_model.visualize_barchart()

In [None]:
# Persist the fitted model under "my_model" using the safetensors format.
# BERTopic recognises "safetensors", "pytorch" or "pickle" here; the previous
# value ".safetensors" (with a leading dot) is none of these, so the model was
# not saved as intended.
topic_model.save("my_model", serialization="safetensors")

# Apply TF-IDF to Model

In [None]:
# Use TF-IDF document vectors as the embeddings instead of a transformer model.
tfidf_vectorizer = TfidfVectorizer(min_df=5, stop_words=german_stop_words)
embeddings = tfidf_vectorizer.fit_transform(reports)

# NOTE(review): this model is created without vectorizer_model, so the German
# stop words are only applied to the TF-IDF vectors above -- presumably the
# topic keywords still come from BERTopic's default vectorizer; confirm whether
# vectorizer_model should be passed here as well.
tfidf_model = BERTopic(nr_topics=75)

# fit() returns the fitted model itself, which cannot be unpacked into two
# names; fit_transform() returns the (topics, probabilities) pair expected here.
tfidf_topics, tfidf_probs = tfidf_model.fit_transform(reports, embeddings)

In [None]:
# Show the topic overview of the TF-IDF based model (rendered as the cell's output).
tfidf_model.get_topic_info()