In [1]:
# importing packages

# Basic python packages
import pandas as pd
import numpy as np
import bertopic
import os

# BERTopic related
from bertopic import BERTopic
from bertopic.representation import KeyBERTInspired
from umap import UMAP
from hdbscan import HDBSCAN
from sklearn.feature_extraction.text import CountVectorizer

# Transformers packages
from sentence_transformers import SentenceTransformer
import transformers
# Set TOKENIZERS_PARALLELISM to "false" in this process's environment.
# NOTE(review): presumably this turns off the Hugging Face tokenizers'
# internal parallelism to avoid its fork-related warning — confirm.
os.environ["TOKENIZERS_PARALLELISM"] = "false"

# SpaCy
import spacy
from spacy.lang.da import Danish

# viz packages
import matplotlib.pyplot as plt

In [5]:
# Stop-word list: spaCy's default Danish stop words, followed by a handful
# of extra tokens added by hand for this corpus.
nlp = Danish()
stop_words = [
    *nlp.Defaults.stop_words,
    'originalartiklen', 'originalartikel',
    'ea670633', 'Originalartiklen',
    'øh', 'øhm',
    'sådan', 'ehm',
    'æhm', 'Æhm',
    'Ehm', 'Sådan',
]

In [6]:
# Read the collected corpus from CSV and cast the 'chunked' column to str
# in the same expression.
mepsda_df = pd.read_csv(
    '/work/Ccp-MePSDA/output/collected_data/mepsda_df.csv'
).astype({'chunked': str})

# Disabled steps, kept for reference only:
#   - dropping the 'index' column:
#       mepsda_df.drop(columns='index', inplace=True)
#   - joining all texts that share the same title and source:
#       mepsda_df = mepsda_df.groupby(['title', 'source']).agg({'text':'\n'.join})

In [10]:
# Embedding matrix stored on disk as .npy; it is passed to fit_transform
# together with the documents.
# NOTE(review): presumably one row per entry of mepsda_df['chunked'], in the
# same order — confirm against the notebook that produced the file.
embeddings = np.load('/work/Ccp-MePSDA/modelling/embeddings/embeddings.npy')

# Sentence-transformer wrapper around the Danish BERT checkpoint; handed to
# BERTopic as its embedding model.
embedding_model = SentenceTransformer('Maltehb/danish-bert-botxo')

# Dimensionality reduction step: UMAP with cosine distance and a fixed seed
# so the projection is repeatable.
umap_model = UMAP(
    n_components=23,
    n_neighbors=3,
    min_dist=0.04,
    metric='cosine',
    random_state=420,
    low_memory=False,
)

# Clustering step: HDBSCAN on the reduced vectors, using leaf cluster
# selection and keeping prediction data.
hdbscan_model = HDBSCAN(
    min_cluster_size=95,
    metric='euclidean',
    cluster_selection_method='leaf',
    prediction_data=True,
)

# Topic representation: KeyBERT-inspired model with its default settings.
representation_model = KeyBERTInspired()

# Bag-of-words step: unigrams and bigrams, with the custom stop-word list
# and document-frequency bounds applied.
vectorizer_model = CountVectorizer(
    ngram_range=(1, 2),
    min_df=2,
    max_df=0.85,
    stop_words=stop_words,
)

# Assemble the BERTopic pipeline from the components defined above.
topic_model = BERTopic(
    embedding_model=embedding_model,
    umap_model=umap_model,
    hdbscan_model=hdbscan_model,
    vectorizer_model=vectorizer_model,
    representation_model=representation_model,
    top_n_words=20,
    verbose=True,
)

# Fit on the 'chunked' text column, supplying the embeddings loaded from
# disk alongside the documents.
topics, probs = topic_model.fit_transform(
    documents=mepsda_df['chunked'],
    embeddings=embeddings,
)

No sentence-transformers model found with name Maltehb/danish-bert-botxo. Creating a new one with mean pooling.
2024-11-28 15:32:32,870 - BERTopic - Dimensionality - Fitting the dimensionality reduction algorithm
2024-11-28 15:34:23,637 - BERTopic - Dimensionality - Completed ✓
2024-11-28 15:34:23,647 - BERTopic - Cluster - Start clustering the reduced embeddings
2024-11-28 15:35:47,284 - BERTopic - Cluster - Completed ✓
2024-11-28 15:35:47,306 - BERTopic - Representation - Extracting topics from clusters using representation models.
2024-11-28 15:44:35,356 - BERTopic - Representation - Completed ✓


In [12]:
# Persist the fitted topic model as safetensors, including the c-TF-IDF data.
topic_model.save(
    "/work/Ccp-MePSDA/modelling/model/mepsda_bertopic",
    serialization="safetensors",
    save_ctfidf=True,
    # Pass the Hugging Face model name rather than the SentenceTransformer
    # object: for safetensors serialization BERTopic documents this argument
    # as a bool or a model-name string, and the string is what gets stored as
    # the pointer to the embedding model for later loading.
    save_embedding_model="Maltehb/danish-bert-botxo",
)