# BertTopic
In diesem Notebook trainieren wir die BERTTopic Modelle

In [15]:
import sys
import os
sys.path.append(os.path.abspath('../')) ## needed to import the function.py file

from functions import *
import pandas as pd
from bertopic import BERTopic
from spacy.lang.de.stop_words import STOP_WORDS

# Config

In [16]:
## Import dataframe
filelocation = '../../data/DataClean'
df = pd.read_feather(filelocation)

# import custom stopwords list
customstopwords = pd.read_excel('../../config/customstopwords.xlsx')
customstopwords = customstopwords['stopword'].tolist()

customstopwords.extend(STOP_WORDS) # also add spacy stopwords

# Also add ortsnamen to the stoplist because we have them in the metadata and dont want them in the comments
orte = [x.lower() for x in set(df.ft_startort.tolist()) if x == x and x.lower() != '']

# Create the list of locations
for location in df.ft_startort.tolist():
    # Check if the value is a string
    if isinstance(location, str):
        # Convert to lowercase and remove 'Zug'
        location = location.lower()
        if location == 'zug':
            continue
        
        # Split the location into tokens if it contains whitespace
        tokens = location.split()
        
        # Add each token to the list individually
        for token in tokens:
            # Skip any token that is in the stoplist
            if token in orte:
                continue
            # Remove any commas from the end of the token
            token = token.rstrip(',')
            orte.append(token)
    
# Remove duplicates from the list
orte = list(set(orte))

orte.remove("zug")

# extend the stopword list with the ortsnamen
customstopwords.extend(orte)
customstopwords = list(dict.fromkeys(customstopwords)) #remove potenzial duplicates

# Daten Laden

In [17]:
## Import dataframe
filelocation = '../../data/DataTextTrain'
data = pd.read_feather(filelocation)

# Training 

In [18]:
docs = data["Kommentar"].to_list()

In [19]:
# Global Parameters for all models
min_topic_size=50
stop_words=customstopwords
modelpath = "../../models/"

In [11]:
# Train Model 1: High Speed / Small Size / Lower Benchmark Scores
modelname = 'paraphrase-MiniLM-L3-v2'
fit_berttopic_if_not_exists(modelpath+"BERTTopic_" + modelname + ".model",docs=docs,embedding_model=modelname, min_topic_size=min_topic_size,stop_words=stop_words)

In [20]:
# Train Model 2: High Speed / Medium Size / Medium Benchmark Scores
modelname = 'all-MiniLM-L6-v2'
fit_berttopic_if_not_exists(modelpath+"BERTTopic_" + modelname + ".model",docs=docs,embedding_model=modelname, min_topic_size=min_topic_size,stop_words=stop_words)

huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Av

  self._set_arrayXarray(i, j, x)


In [13]:
# Train Model 3: Low Speed / High Size / Medium-High Benchmark Scores
modelname = 'all-MiniLM-L12-v2'
fit_berttopic_if_not_exists(modelpath+"BERTTopic_" + modelname + ".model",docs=docs,embedding_model=modelname, min_topic_size=min_topic_size,stop_words=stop_words)

In [14]:
# Train Model 4: Low Speed / High Size / High Benchmark Scores (02.2023)
modelname = 'all-distilroberta-v1'
fit_berttopic_if_not_exists(modelpath+"BERTTopic_" + modelname + ".model",docs=docs,embedding_model=modelname, min_topic_size=min_topic_size,stop_words=stop_words)

In [15]:
# Train Model 5: Low Speed / High Size / High Benchmark Scores (02.2023)
fit_berttopic_if_not_exists(modelpath+"BERTTopic_default.model",docs=docs, min_topic_size=min_topic_size,stop_words=stop_words)

huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Av

  self._set_arrayXarray(i, j, x)
