# BERTopic Exploration of the Stimmungs- und Lageberichte Database Files
**Author:** Christopher Thomas Goodwin

**Creation Date:** 2024.04.10

**Summary:** Uses BERTopic modelling to explore the data of the NSHWE Stimmungs- und Lageberichte files

In [None]:
from bertopic import BERTopic
import json
import pandas as pd
from sklearn.feature_extraction.text import CountVectorizer

import nltk
from nltk.corpus import stopwords
nltk.download('stopwords')

import GPUtil

if len(GPUtil.getAvailable()) > 0:
    from cuml.cluster import HDBSCAN
    from cuml.manifold import UMAP
else:
    from umap import UMAP
    from hdbscan import HDBSCAN

In [None]:
# Read the sentence-level JSON export. It decodes to a dictionary whose keys
# are the strings "0", "1", ... up to the number of entries.
with open("/Users/cgoodwin/Documents/Programming/Python/NSHWEDatabaseMining/data/json/stimmungs_data_sentences.json", "r", encoding="utf-8") as f:
    files = json.load(f)

# Keep only the report text of each entry. Entries are looked up by their
# numeric position so the list follows the numbering, not the key order
# in the file.
reports = [files[str(entry_no)]["report"] for entry_no in range(len(files))]

In [None]:
# Build the CountVectorizer that is later handed to BERTopic as its
# vectorizer_model. It is configured with NLTK's German stop-word list
# (downloaded in the import cell) so those words are excluded from the
# vocabulary it counts.
german_stop_words = stopwords.words('german')
vectorizer_model = CountVectorizer(stop_words=german_stop_words)

In [None]:
# Build the BERTopic pipeline.
#
# UMAP and HDBSCAN are whichever implementations the import cell selected
# (cuML when a GPU was available, umap-learn / hdbscan otherwise). The model
# is therefore configured once, with identical settings on both hardware
# paths; the GPU check here only reports which path is in use.
if len(GPUtil.getAvailable()) > 0:
    print("GPU engaged.")
else:
    print("No GPU engaged.")

# Dimensionality reduction and clustering stages handed to BERTopic.
umap_model = UMAP(n_components=2, n_neighbors=15, min_dist=0.0)
# prediction_data=True is kept from the original; presumably required for the
# per-topic probabilities requested below -- confirm against BERTopic docs.
hdbscan_model = HDBSCAN(min_samples=10, gen_min_span_tree=True, prediction_data=True)

# Use the multilingual sentence-embedding model and the German stop-word
# vectorizer. The GPU path previously named a "...MiniLM-L6-v2" multilingual
# checkpoint and dropped the topic-count / probability settings; both paths
# now share the L12 checkpoint and the same options.
topic_model = BERTopic(
    umap_model=umap_model,
    hdbscan_model=hdbscan_model,
    embedding_model="paraphrase-multilingual-MiniLM-L12-v2",
    vectorizer_model=vectorizer_model,
    language="multilingual",
    nr_topics=50,
    top_n_words=5,
    calculate_probabilities=True,
    verbose=True,
)

In [None]:
# Fit the topic model on all reports. `topics` and `probs` receive the two
# values fit_transform returns (per BERTopic's API: the topic assigned to each
# report and the associated topic probabilities).
topics, probs = topic_model.fit_transform(reports)

In [None]:
# Show the model's topic summary; left as a bare expression so the notebook
# renders the returned value as the cell output.
topic_model.get_topic_info()

In [None]:
# Inspect what the model stored for topic 2 (per BERTopic's API, its top
# words with their scores); bare expression so the notebook displays it.
topic_model.get_topic(2)

In [None]:
# Render BERTopic's bar-chart visualization with its default arguments;
# bare expression so the notebook displays the returned figure.
topic_model.visualize_barchart()