In [1]:
from bertopic import BERTopic
from flair.embeddings import TransformerDocumentEmbeddings
import os
from os.path import join
import pandas as pd
from sentence_transformers import SentenceTransformer
from sklearn.datasets import fetch_20newsgroups
from sklearn.feature_extraction.text import CountVectorizer
from umap import UMAP
import warnings
warnings.filterwarnings('ignore')

from nltk.corpus import stopwords
# Base list: NLTK's Dutch stop words.
# NOTE(review): presumably requires the NLTK "stopwords" corpus to be downloaded beforehand — confirm.
stopwords_nl = stopwords.words('dutch')

# Extra tokens to ignore in topic representations: year numbers, a bare number
# and a handful of filler words. Each token is listed once ('daardoor' used to
# appear twice).
stopwords_nl = stopwords_nl + [
    '2016', '2017', '2018', '2019', '2020', '2021', '2022',
    '50',
    'we', 'ten', 'opzichte', 'daardoor', 'for', 'verder', 'ervoor', 'per', 'vooral',
]

In [9]:
# Both constants end with the path separator (the trailing '' component of
# join adds it), so file names can be appended with plain string concatenation.
_DATA_ROOT = join('..', 'data')
OUTPUT_DATA_PATH = join(_DATA_ROOT, 'output', '')
MODEL_PATH = join(_DATA_ROOT, 'models', 'bertopic', '')

### Own data

In [10]:
# Load the sentence-level results; the first spreadsheet column becomes the index.
predictions_file = OUTPUT_DATA_PATH + 'df_results_mvb_big_prediction.xlsx'
df = pd.read_excel(predictions_file, index_col=0)

# Keep only the rows whose `prediction` column equals 1.
df = df.loc[df['prediction'] == 1]

In [11]:
# Collect the sentence column into a plain Python list, in row order.
docs = list(df['dnb_nlp:sentence'])

In [None]:
# Multilingual sentence-transformer checkpoint used to embed the documents.
# NOTE(review): the corpus is presumably Dutch (Dutch stop words are applied
# below) — confirm this checkpoint is the intended one for the data.
sentence_model = SentenceTransformer("paraphrase-multilingual-MiniLM-L12-v2")

# A fixed random_state keeps the UMAP projection repeatable between runs.
umap_model = UMAP(n_neighbors=200, n_components=5,
                  min_dist=0.0, metric='cosine', random_state=42)

# Dutch: drop the stop words collected in `stopwords_nl` from the topic terms.
vectorizer_model = CountVectorizer(stop_words=stopwords_nl)
topic_model = BERTopic(
    embedding_model=sentence_model,
    umap_model=umap_model,
    vectorizer_model=vectorizer_model,
    min_topic_size=25,
)

# Make sure the target directory exists *before* the expensive fit, so the
# save below cannot fail on a missing folder after the work is already done.
os.makedirs(MODEL_PATH, exist_ok=True)

topics, probs = topic_model.fit_transform(docs)

topic_model.save(MODEL_PATH + "bertopic_model_mvb")  # save topic model
topic_model.get_topic_info()

In [None]:
# Ask BERTopic to reduce the fitted model to nr_topics=50.
# NOTE(review): the return value is discarded, so the `topics` variable from
# the fit cell is left untouched here — confirm that is intended.
topic_model.reduce_topics(docs, topics, nr_topics=50)

# Same file name as the save in the fit cell, so the earlier file is replaced.
reduced_model_file = MODEL_PATH + "bertopic_model_mvb"
topic_model.save(reduced_model_file)  # save topic model
topic_model.get_topic_info()