# BERTopic Model Initial Evaluation

This notebook evaluates baseline BERTopic models across different input granularities (whole documents vs. individual sentences) and different sentence-transformer embedding models, in order to choose the configuration for the final model.

In [1]:
from bertopic import BERTopic
from models.bertopic.utils.data_loader import DataLoader
from sklearn.feature_extraction.text import CountVectorizer

from octis.evaluation_metrics.diversity_metrics import TopicDiversity, KLDivergence
from octis.evaluation_metrics.similarity_metrics import RBO, PairwiseJaccardSimilarity
from models.bertopic.utils.bertopic_evaluator import BERTopicModelEvaluator

from models.bertopic.config.embeddings import st_models
from models.bertopic.config.model import STOPWORDS, NUM_TOPICS, TOP_K

from umap import UMAP

## Data Loading

In [2]:
# Source CSV files handed to the project DataLoader.
speeches_csv = 'data/data_speeches.csv'
statements_csv = 'data/data_statements.csv'

loader = DataLoader(speeches_csv, statements_csv)
loader.process()

# Each split getter returns a (documents, sentences) pair.
train_docs, train_sentences = loader.get_train_data()
test_docs, test_sentences = loader.get_test_data()
val_docs, val_sentences = loader.get_val_data()

## Vectorizer initialization

In [3]:
# Bag-of-words settings used for the topic representations:
# project stop-word list, unigrams and bigrams.
vectorizer_options = {
    'stop_words': STOPWORDS,
    'ngram_range': (1, 2),
}
vectorizer_model = CountVectorizer(**vectorizer_options)

## BERTopic model initialization

In [4]:
def _build_topic_model(embedding_key):
    """Create an unfitted baseline BERTopic model.

    Parameters
    ----------
    embedding_key : str
        Key into ``st_models`` selecting the sentence-transformer to embed with.

    Returns
    -------
    BERTopic
        A model configured with the shared ``vectorizer_model`` and
        ``nr_topics=NUM_TOPICS``; the embedding model is the only setting
        that differs between baselines.

    Notes
    -----
    ``n_gram_range`` is deliberately not passed: BERTopic does not use it when
    a custom ``vectorizer_model`` is supplied, so the n-gram setting lives on
    the CountVectorizer alone.
    """
    return BERTopic(embedding_model=st_models[embedding_key],
                    vectorizer_model=vectorizer_model,
                    nr_topics=NUM_TOPICS)


# NOTE(review): no random seed is configured for these models, so topic
# assignments may differ between runs — consider passing a seeded UMAP model.

# One model per (embedding model, input granularity) combination. The
# "docs"/"sentences" suffix only names the data the model is meant to be
# fitted on; the configuration itself is identical for both.
model_gr_docs = _build_topic_model('gr_stsb')
model_gr_media_docs = _build_topic_model('gr_media')

model_gr_sentences = _build_topic_model('gr_stsb')
model_gr_media_sentences = _build_topic_model('gr_media')

model_multilingual_docs = _build_topic_model('multilingual')
model_multilingual_sentences = _build_topic_model('multilingual')

In [5]:
# Coherence metrics are registered by name only (value None); the remaining
# metrics are supplied as ready-made OCTIS metric objects.
coherence_metric_names = (
    'coherence_c_npmi',
    'coherence_c_v',
    'coherence_u_mass',
    'coherence_c_uci',
)

metrics = dict.fromkeys(coherence_metric_names)
metrics['diversity_topic'] = TopicDiversity(topk=TOP_K)
metrics['similarity_rbo'] = RBO(topk=TOP_K)
metrics['similarity_pjs'] = PairwiseJaccardSimilarity()

In [6]:
# Registry of candidate models, keyed by "<embedding>_<input granularity>".
models = dict(
    gr_docs=model_gr_docs,
    gr_media_docs=model_gr_media_docs,
    gr_sentences=model_gr_sentences,
    gr_media_sentences=model_gr_media_sentences,
    multilingual_docs=model_multilingual_docs,
    multilingual_sentences=model_multilingual_sentences,
)

In [7]:
# Only the document-level training split is evaluated for now.
# To include the sentence-level split, also add: 'sentences': train_sentences
datasets = dict(docs=train_docs)

In [8]:
# Bundle everything the evaluator needs: candidate models, metric registry,
# training data per input granularity, and the target number of topics.
evaluator_settings = dict(
    models=models,
    metrics=metrics,
    datasets=datasets,
    topics=NUM_TOPICS,
)
evaluator = BERTopicModelEvaluator(**evaluator_settings)

In [9]:
# Run the evaluation. Judging by the logged output, this trains each model,
# scores every registered metric and exports the topics; the returned results
# table is displayed as the cell output.
evaluator.evaluate()

Training model:  gr_media_docs
Model trained
Evaluating model:  gr_media_docs
Evaluating metric coherence_c_npmi for model gr_media_docs
Evaluating metric coherence_c_v for model gr_media_docs
Evaluating metric coherence_u_mass for model gr_media_docs
Evaluating metric coherence_c_uci for model gr_media_docs
Evaluating metric diversity_topic for model gr_media_docs
Evaluating metric similarity_rbo for model gr_media_docs
Evaluating metric similarity_pjs for model gr_media_docs
Model gr_media_docs evaluated
Exporting topics for model:  gr_media_docs


Unnamed: 0,model,coherence_c_npmi,coherence_c_v,coherence_u_mass,coherence_c_uci,diversity_topic,similarity_rbo,similarity_pjs,dataset
0,gr_media_docs,0.068853,0.674748,-0.350541,-0.884317,0.689655,0.079584,0.048949,docs
