# BERTopic Exploration of the Stimmungs- und Lageberichte Database Files
**Author:** Christopher Thomas Goodwin

**Creation Date:** 2024.04.10

**Summary:** Uses BERTopic modelling to explore the data of the NSHWE Stimmungs- und Lageberichte files

In [57]:
from bertopic import BERTopic
import json
import pandas as pd
from sklearn.feature_extraction.text import CountVectorizer

import GPUtil
from cuml.cluster import HDBSCAN
from cuml.manifold import UMAP

import nltk
from nltk.corpus import stopwords
# Make sure the NLTK stop-word corpus is available locally before
# `stopwords.words(...)` is used; per the logged output this only reports
# "already up-to-date" when the package has been downloaded before.
nltk.download('stopwords')

[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/cgoodwin/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

In [59]:
# Load the sentence-level report data. The JSON is parsed into a dictionary
# whose keys are the strings "0", "1", ... up to the number of entries.
# Only the parsing happens inside the `with` block so the file handle is
# released as soon as the data is in memory.
with open("/Users/cgoodwin/Documents/Programming/Python/NSHWEDatabaseMining/data/json/stimmungs_data_sentences.json", "r", encoding="utf-8") as f:
    files = json.load(f)

# We want just the textual data, i.e. the "report" field of each entry.
# Entries are looked up by their numeric string key (rather than iterating
# the dictionary directly) so that `reports` is in index order.
reports = [files[str(i)]["report"] for i in range(len(files))]

In [60]:
# German stop words come from NLTK and are handed to the CountVectorizer,
# which is later passed to BERTopic as its vectorizer model.
german_stop_words = stopwords.words("german")
vectorizer_model = CountVectorizer(
    stop_words=german_stop_words,
)

In [63]:
# Build the BERTopic model, using GPU-accelerated UMAP/HDBSCAN when a GPU is
# available and BERTopic's default dimensionality reduction and clustering
# otherwise.

# Arguments shared by both branches: the multilingual sentence-embedding model
# and the German stop-word vectorizer. Keeping them in one place prevents the
# two branches from drifting apart; the GPU branch previously named
# "paraphrase-multilingual-MiniLM-L6-v2", which does not appear to be a
# published Sentence-Transformers model (the multilingual MiniLM paraphrase
# model is the L12 variant).
shared_kwargs = dict(
    embedding_model="paraphrase-multilingual-MiniLM-L12-v2",
    vectorizer_model=vectorizer_model,
    language="multilingual",
)

# check for GPU to run model faster
if GPUtil.getAvailable():
    print("GPU engaged.")

    # create instances of GPU-accelerated UMAP and HDBSCAN
    umap_model = UMAP(n_components=2, n_neighbors=15, min_dist=0.0)
    hdbscan_model = HDBSCAN(min_samples=10, gen_min_span_tree=True, prediction_data=True)
    # NOTE(review): unlike the CPU branch, no nr_topics is set here, so the
    # number of topics may differ between GPU and CPU runs -- confirm whether
    # nr_topics=50 should apply to this branch as well.
    topic_model = BERTopic(umap_model=umap_model, hdbscan_model=hdbscan_model, **shared_kwargs)
else:
    print("No GPU engaged.")

    # use multilingual model and apply German stopwords vectorizer model,
    # asking BERTopic for 50 topics (nr_topics=50)
    topic_model = BERTopic(nr_topics=50, **shared_kwargs)

No GPU engaged.


huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


In [None]:
# Fit the topic model on the list of report texts and keep both values that
# fit_transform returns.
# NOTE(review): per the BERTopic API these should be the per-document topic
# assignments and their probabilities -- confirm against the installed version.
topics, probs = topic_model.fit_transform(reports)

In [None]:
# Display the topic overview returned by get_topic_info() as the cell output.
topic_model.get_topic_info()

In [None]:
# Inspect a single topic by its id. The id 44 is hard-coded, so this presumes
# the fitted model actually produced a topic with that id -- TODO confirm
# after each re-fit, since topic ids can change between runs.
topic_model.get_topic(44)