# BERTopic
In diesem Notebook trainieren wir die BERTopic-Modelle.

In [2]:
import pandas as pd
from bertopic import BERTopic
import os
import spacy

  from .autonotebook import tqdm as notebook_tqdm
2023-02-06 06:47:31.676816: I tensorflow/core/platform/cpu_feature_guard.cc:193] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations:  AVX2 FMA
To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags.


# Config

In [8]:
model_base_path = "/models" # base directory for trained models; models are stored externally

# Daten laden

In [5]:
# Load the dataset from a Feather file into a pandas DataFrame
filelocation = 'data/DataText'
data = pd.read_feather(filelocation)
data.head(1)  # show only the first row as a quick preview

Unnamed: 0,participant_id,u_date,year,month,quarter,yearmonth,yearquarter,season,Kommentar,wime_personal,...,ft_zielort,Kommentar_Character,Kommentar_Tokens,Kommentar_Types,Kommentar_TTR,text_preprocessed,tokenized,lemmatized,lemmatized_no_loc,nouns
0,612374,2022-12-31,2022,12,4,2022-12-01,2022/4,winter,Häufigere Verbindungen zw. Bern-Luzern.,75.0,...,Luzern,39,4,4,100.0,häufigere verbindungen zw. bern-luzern.,"[häufigere, verbindungen, zw, ., bern-luzern, .]","[häufig, verbindung, zw, --, bern-luzern, --]","[häufig, verbindung, zw, --, bern-luzern, --]","[verbindungen, bern-luzern]"


# Training 

In [13]:
# Training documents: the "text_preprocessed" column as a plain Python list.
# This global is read by fit_berttopic and by the topic-reduction cell.
docs = data["text_preprocessed"].to_list()

In [14]:
from sklearn.feature_extraction.text import CountVectorizer
from nltk.corpus import stopwords


def fit_berttopic(target_dir:str, embedding_model=None, min_topic_size:int=50, documents=None) -> None:
    """
        Train a BERTopic model and save it to ``target_dir``.
        https://maartengr.github.io/BERTopic/index.html

        :param target_dir: path the fitted model is saved to
        :param embedding_model: name of the embedding model or a model object;
            passed through to BERTopic unchanged
        :param min_topic_size: minimum size of a topic (HDBSCAN cluster)
        :param documents: texts to train on; when ``None`` the notebook-level
            ``docs`` list is used (backward-compatible default)

    """
    if documents is None:
        # Fall back to the notebook global so existing calls keep working.
        documents = docs

    # Create the output directory *before* the expensive fit, so a missing
    # directory cannot make model.save() fail after training has finished.
    parent_dir = os.path.dirname(target_dir)
    if parent_dir:
        os.makedirs(parent_dir, exist_ok=True)

    german_stop_words = stopwords.words('german') # stop words for the keyword computation

    # CountVectorizer is the default vectorizer, but it is defined explicitly
    # here so that the German stop words can be used.
    vectorizer = CountVectorizer(stop_words=german_stop_words)
    model = BERTopic(
        language="german",
        vectorizer_model=vectorizer,
        embedding_model=embedding_model,
        min_topic_size=min_topic_size)

    # The returned topic assignments and probabilities are not needed here;
    # only the fitted model is persisted.
    model.fit_transform(documents)

    model.save(target_dir)

def fit_berttopic_if_not_exists(target_dir, embedding_model=None, min_topic_size:int=50) -> None:
    """
        Wrapper around fit_berttopic that skips training when something
        already exists at ``target_dir``, so trained models are not retrained.

        :param target_dir: path of the saved model; its existence decides
            whether training runs
        :param embedding_model: forwarded to fit_berttopic
        :param min_topic_size: forwarded to fit_berttopic
    """
    already_trained = os.path.exists(target_dir)

    # Guard clause: nothing to do when the model file/directory is present.
    if already_trained:
        print("Model already trained")
        return

    print(f"Fitting Model {embedding_model}...")
    fit_berttopic(
        target_dir=target_dir,
        embedding_model=embedding_model,
        min_topic_size=min_topic_size,
    )

In [16]:
# Build the topic model based on the BERT embedding
# (no embedding_model is passed, so the function's default of None is used)
fit_berttopic_if_not_exists(model_base_path + "/BERTTopic.model", min_topic_size=50)

Fitting Model None...


Downloading: 100%|██████████| 471M/471M [06:53<00:00, 1.14MB/s] 
Downloading: 100%|██████████| 53.0/53.0 [00:00<00:00, 22.8kB/s]
Downloading: 100%|██████████| 5.07M/5.07M [00:04<00:00, 1.18MB/s]
Downloading: 100%|██████████| 239/239 [00:00<00:00, 105kB/s]
Downloading: 100%|██████████| 9.08M/9.08M [00:07<00:00, 1.18MB/s]
Downloading: 100%|██████████| 480/480 [00:00<00:00, 255kB/s]
Downloading: 100%|██████████| 14.8M/14.8M [00:12<00:00, 1.16MB/s]
Downloading: 100%|██████████| 229/229 [00:00<00:00, 99.9kB/s]


: 

: 

In [None]:
# Build the topic model based on the spaCy embedding.
# The listed pipeline components are excluded when loading; presumably only
# the embeddings are needed here — confirm.
# NOTE(review): spaCy "sm" pipelines ship without static word vectors —
# confirm the resulting embeddings are adequate, or consider a larger pipeline.
nlp = spacy.load('de_core_news_sm', exclude=['tagger', 'parser', 'ner', 'attribute_ruler', 'lemmatizer'])

# Train the model (skipped when it already exists on disk)
fit_berttopic_if_not_exists(model_base_path + "/BERTTopic_spacy.model",embedding_model=nlp, min_topic_size=50)

# Aggregierte Topics
Hier reduzieren wir die Anzahl der Topics auf 10, um einen Überblick zu bekommen.
- Dabei wird iterativ das kleinste Topic mit dem ähnlichsten zusammengefügt.

In [None]:
# Only build the aggregated model when it has not been saved yet.
if not os.path.exists(model_base_path + "/BERTTopic_aggregated_10.model"):
    # Start from the previously saved BERT-based model.
    model = BERTopic.load(model_base_path + "/BERTTopic.model")
    # Reduce to 10 topics using the same documents the model was trained on.
    model.reduce_topics(docs=docs, nr_topics=10)
    # Save under a separate name so the original model is kept.
    model.save(model_base_path + "/BERTTopic_aggregated_10.model")