# Finding the best hyperparameters for the BERTopic model

In this notebook we search for the best hyperparameters for our BERTopic model by trying different configurations of the UMAP and HDBSCAN models. We then evaluate each model using both standard evaluation metrics and manual inspection of the topics it produces.

In [None]:
from copy import deepcopy

from bertopic import BERTopic
from bertopic.vectorizers import ClassTfidfTransformer
from hdbscan import HDBSCAN
from sklearn.feature_extraction.text import CountVectorizer
from umap import UMAP

from models.bertopic.config.model import (STOPWORDS, NUM_TOPICS, TOP_K, EMBEDDING_MODEL, metrics,
                                          umap_configs_dc, hdbscan_configs_dc, umap_configs_sc,
                                          hdbscan_configs_sc, dim_models, cluster_models)
from models.bertopic.utils.bertopic_evaluator import BERTopicModelEvaluator
from models.bertopic.utils.data_loader import DataLoader

In [None]:
# Input CSV files, passed to the loader positionally in this order.
SPEECHES_CSV = 'data/data_speeches.csv'
STATEMENTS_CSV = 'data/data_statements.csv'

loader = DataLoader(SPEECHES_CSV, STATEMENTS_CSV)
loader.process()

# Each getter returns a pair, unpacked here as (docs, sentences).
# The call order is kept as-is in case the loader is stateful.
train_docs, train_sentences = loader.get_train_data()
test_docs, test_sentences = loader.get_test_data()
val_docs, val_sentences = loader.get_val_data()

In [None]:
# Datasets handed to the evaluator, keyed by name. Only the training docs are
# active; to also evaluate on the training sentences, add
# `sentences=train_sentences` to this mapping.
datasets = dict(docs=train_docs)

In [None]:
# Token counter: unigrams and bigrams, with the project's STOPWORDS removed.
vectorizer_model = CountVectorizer(
    ngram_range=(1, 2),
    stop_words=STOPWORDS,
)

# c-TF-IDF transformer with `reduce_frequent_words` switched on.
ctfidf_model = ClassTfidfTransformer(reduce_frequent_words=True)

## Evaluating different UMAP and HDBSCAN configurations

In [None]:
# Build one untrained BERTopic model for every (UMAP config, HDBSCAN config)
# pair and collect them in `models`, keyed by a name derived from the configs.
#
# Each model receives its OWN copy of the vectorizer and c-TF-IDF transformer.
# BERTopic fits the components it is given in place, so handing the same
# instance to every model would let the most recently fitted model overwrite
# the vocabulary / weights that all the other models still point at.
#
# NOTE(review): nr_topics and top_n_words are hard-coded here while NUM_TOPICS
# and TOP_K are imported from the config module — confirm they are meant to match.
# NOTE(review): UMAP is stochastic; unless the entries of umap_configs_dc carry
# a `random_state`, the metrics will vary from run to run — confirm.
models = {}

for umap_config in umap_configs_dc:
    for hdbscan_config in hdbscan_configs_dc:

        # The raw config mappings are embedded in the key so that each distinct
        # combination gets its own self-describing entry.
        model_name = f"model_umap_{umap_config}_hdbscan_{hdbscan_config}_docs"

        # Fresh estimators per combination, built from the config mappings.
        umap_model = UMAP(**umap_config)
        hdbscan_model = HDBSCAN(**hdbscan_config, gen_min_span_tree=True, prediction_data=True)

        model = BERTopic(
            umap_model=umap_model,
            hdbscan_model=hdbscan_model,
            embedding_model=EMBEDDING_MODEL,
            # Private copies: see the note at the top of this cell.
            vectorizer_model=deepcopy(vectorizer_model),
            ctfidf_model=deepcopy(ctfidf_model),
            nr_topics=30,
            top_n_words=10,
            n_gram_range=(1,2)
        )

        models[model_name] = model

In [None]:
# Hand the candidate models, the datasets, the metrics and the topic count
# to the evaluator.
evaluator = BERTopicModelEvaluator(
    models=models,
    datasets=datasets,
    metrics=metrics,
    topics=NUM_TOPICS,
)

In [None]:
# Run the evaluation on the evaluator configured above. What exactly is
# computed is defined in BERTopicModelEvaluator (not visible here) —
# presumably every model is fitted and scored on each dataset; TODO confirm.
evaluator.evaluate()

## Testing different models for dimensionality reduction and clustering

In [None]:
# Build one untrained BERTopic model for every (dimensionality-reduction model,
# clustering model) pair and collect them in `models`, keyed by the two names.
# This rebinds `models`, replacing the mapping built in the previous section.
#
# Every component is deep-copied per model. Each entry of `dim_models` and
# `cluster_models` is paired with several partners, and BERTopic fits the
# components it is given in place — without copies, all models sharing an
# instance would end up holding whatever state the last fit left behind.
# The same applies to the shared vectorizer and c-TF-IDF transformer.
#
# NOTE(review): nr_topics and top_n_words are hard-coded here while NUM_TOPICS
# and TOP_K are imported from the config module — confirm they are meant to match.
models = {}

for dim_model_name, dim_model in dim_models.items():
    for cluster_model_name, cluster_model in cluster_models.items():

        model_name = f"model_dim_{dim_model_name}_cluster_{cluster_model_name}_docs"

        model = BERTopic(
            # Private copies: see the note at the top of this cell.
            umap_model=deepcopy(dim_model),
            hdbscan_model=deepcopy(cluster_model),
            embedding_model=EMBEDDING_MODEL,
            vectorizer_model=deepcopy(vectorizer_model),
            ctfidf_model=deepcopy(ctfidf_model),
            nr_topics=30,
            top_n_words=10,
            n_gram_range=(1,2)
        )

        models[model_name] = model

In [None]:
# Fresh evaluator for the second batch of models; same datasets, metrics and
# topic count as before.
evaluator = BERTopicModelEvaluator(
    models=models,
    datasets=datasets,
    metrics=metrics,
    topics=NUM_TOPICS,
)

evaluator.evaluate()