In [1]:
from bertopic import BERTopic
import statistics
import numpy as np
import json

# First we import the files
our_flyers_path = 'flyers_text.json'

with open(our_flyers_path, 'r') as file:
    json_file = json.load(file)

# A section
#a = json_file["(Langdurig) gebroken vliezen"]['Check uw dossier op MijnZGT']

# The list of sections to analyze is the list of values of each value of the original dictionary (json file)
sections_dict = {} # The dictionary is to avoid repeated elements
saw_sections = []
counter = -1
for document in json_file.values(): 
    counter += 1
    for section in document.values(): 
        if section not in saw_sections:
            sections_dict[section] = counter
            saw_sections.append(section)

'''
SOME OF THE SECTIONS ARE REPEATED, SO THERE'S LITTLE POINT ON KEEPING THEM
'''
sections = list(sections_dict.keys())
document_id_list = list(sections_dict.values())

print("Quantity of sections:", len(sections))

2024-01-06 13:39:13.879993: I tensorflow/core/platform/cpu_feature_guard.cc:193] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations:  AVX2 AVX512F AVX512_VNNI FMA
To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags.
2024-01-06 13:39:13.979683: I tensorflow/core/util/port.cc:104] oneDNN custom operations are on. You may see slightly different numerical results due to floating-point round-off errors from different computation orders. To turn them off, set the environment variable `TF_ENABLE_ONEDNN_OPTS=0`.
2024-01-06 13:39:14.581822: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libnvinfer.so.7'; dlerror: libnvinfer.so.7: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /usr/local/lib:/usr/local/nvidia/lib:/usr/local/nvidia/lib64
2024-01-06 13:39:14.581908

Quantity of sections: 11827


In [None]:
from bertopic.backend import BaseEmbedder
from sentence_transformers import SentenceTransformer
from umap import UMAP

class CustomEmbedder(BaseEmbedder):
    def __init__(self, embedding_model):
        super().__init__()
        self.embedding_model = embedding_model

    def embed(self, documents, verbose=False):
        embeddings = self.embedding_model.encode(documents, show_progress_bar=verbose)
        return embeddings 
    
def build_models(custom_embedder, top_n_words, min_topic_size, dimensions, def_top, def_size, def_dim): # The three inputs will be lists
    
    counter = 1
    for parameter in top_n_words:

        umap_model = UMAP(n_neighbors=15, n_components=def_dim, min_dist=0.0, metric='cosine')

        topic_model = BERTopic(embedding_model=custom_embedder, umap_model=umap_model, top_n_words=parameter, min_topic_size=def_size, language="multilingual", calculate_probabilities=True, verbose=True)
        topics, probs = topic_model.fit_transform(sections)

        topic_model.save("models/top_n_words_model_"+str(counter))
        
        counter += 1
        
    counter = 1
    for parameter in min_topic_size:

        umap_model = UMAP(n_neighbors=15, n_components=def_dim, min_dist=0.0, metric='cosine')

        topic_model = BERTopic(embedding_model=custom_embedder, umap_model=umap_model, top_n_words=def_top, min_topic_size=parameter, language="multilingual", calculate_probabilities=True, verbose=True)
        topics, probs = topic_model.fit_transform(sections)

        topic_model.save("models/min_topic_size_model_"+str(counter))
        
        counter += 1
        
    counter = 1
    for parameter in dimensions:

        umap_model = UMAP(n_neighbors=15, n_components=parameter, min_dist=0.0, metric='cosine')

        topic_model = BERTopic(embedding_model=custom_embedder, umap_model=umap_model, top_n_words=def_top, min_topic_size=def_size, language="multilingual", calculate_probabilities=True, verbose=True)
        topics, probs = topic_model.fit_transform(sections)

        topic_model.save("models/reduction_dim_model_"+str(counter))
        
        counter += 1
    
# Create custom backend
embedding_model = SentenceTransformer("all-MiniLM-L6-v2")
custom_embedder = CustomEmbedder(embedding_model=embedding_model)

top_n_words = [5, 10, 15, 20, 25, 30]
def_top = 10
min_topic_size = [10, 15, 20, 25, 30, 35, 40, 45, 50]
def_size = 10
dimensions = [5, 10, 20, 30, 40, 50, 60, 70, 80]
def_dim = 5
build_models(custom_embedder, top_n_words, min_topic_size, dimensions, def_top, def_size, def_dim)

2024-01-06 13:54:15,144 - BERTopic - Embedding - Transforming documents to embeddings.


Batches:   0%|          | 0/370 [00:00<?, ?it/s]

2024-01-06 13:55:34,756 - BERTopic - Embedding - Completed ✓
2024-01-06 13:55:34,757 - BERTopic - Dimensionality - Fitting the dimensionality reduction algorithm
2024-01-06 13:56:05,173 - BERTopic - Dimensionality - Completed ✓
2024-01-06 13:56:05,177 - BERTopic - Cluster - Start clustering the reduced embeddings
2024-01-06 13:56:21,010 - BERTopic - Cluster - Completed ✓
2024-01-06 13:56:21,020 - BERTopic - Representation - Extracting topics from clusters using representation models.
2024-01-06 13:56:22,108 - BERTopic - Representation - Completed ✓
2024-01-06 13:56:35,525 - BERTopic - Embedding - Transforming documents to embeddings.


Batches:   0%|          | 0/370 [00:00<?, ?it/s]

2024-01-06 13:57:45,222 - BERTopic - Embedding - Completed ✓
2024-01-06 13:57:45,224 - BERTopic - Dimensionality - Fitting the dimensionality reduction algorithm
2024-01-06 13:57:58,611 - BERTopic - Dimensionality - Completed ✓
2024-01-06 13:57:58,614 - BERTopic - Cluster - Start clustering the reduced embeddings
2024-01-06 13:58:15,862 - BERTopic - Cluster - Completed ✓
2024-01-06 13:58:15,870 - BERTopic - Representation - Extracting topics from clusters using representation models.
2024-01-06 13:58:16,976 - BERTopic - Representation - Completed ✓
2024-01-06 13:58:20,234 - BERTopic - Embedding - Transforming documents to embeddings.


Batches:   0%|          | 0/370 [00:00<?, ?it/s]

2024-01-06 13:59:34,417 - BERTopic - Embedding - Completed ✓
2024-01-06 13:59:34,419 - BERTopic - Dimensionality - Fitting the dimensionality reduction algorithm
2024-01-06 13:59:45,574 - BERTopic - Dimensionality - Completed ✓
2024-01-06 13:59:45,578 - BERTopic - Cluster - Start clustering the reduced embeddings
2024-01-06 14:00:02,710 - BERTopic - Cluster - Completed ✓
2024-01-06 14:00:02,716 - BERTopic - Representation - Extracting topics from clusters using representation models.
2024-01-06 14:00:03,841 - BERTopic - Representation - Completed ✓
2024-01-06 14:00:07,339 - BERTopic - Embedding - Transforming documents to embeddings.


Batches:   0%|          | 0/370 [00:00<?, ?it/s]

2024-01-06 14:01:21,578 - BERTopic - Embedding - Completed ✓
2024-01-06 14:01:21,580 - BERTopic - Dimensionality - Fitting the dimensionality reduction algorithm
2024-01-06 14:01:33,393 - BERTopic - Dimensionality - Completed ✓
2024-01-06 14:01:33,396 - BERTopic - Cluster - Start clustering the reduced embeddings
2024-01-06 14:01:47,235 - BERTopic - Cluster - Completed ✓
2024-01-06 14:01:47,243 - BERTopic - Representation - Extracting topics from clusters using representation models.
2024-01-06 14:01:48,326 - BERTopic - Representation - Completed ✓
2024-01-06 14:01:51,547 - BERTopic - Embedding - Transforming documents to embeddings.


Batches:   0%|          | 0/370 [00:00<?, ?it/s]

2024-01-06 14:03:05,686 - BERTopic - Embedding - Completed ✓
2024-01-06 14:03:05,688 - BERTopic - Dimensionality - Fitting the dimensionality reduction algorithm
2024-01-06 14:03:17,339 - BERTopic - Dimensionality - Completed ✓
2024-01-06 14:03:17,342 - BERTopic - Cluster - Start clustering the reduced embeddings
2024-01-06 14:03:33,868 - BERTopic - Cluster - Completed ✓
2024-01-06 14:03:33,874 - BERTopic - Representation - Extracting topics from clusters using representation models.
2024-01-06 14:03:34,960 - BERTopic - Representation - Completed ✓
2024-01-06 14:03:38,269 - BERTopic - Embedding - Transforming documents to embeddings.


Batches:   0%|          | 0/370 [00:00<?, ?it/s]

2024-01-06 14:04:52,455 - BERTopic - Embedding - Completed ✓
2024-01-06 14:04:52,457 - BERTopic - Dimensionality - Fitting the dimensionality reduction algorithm
2024-01-06 14:05:05,144 - BERTopic - Dimensionality - Completed ✓
2024-01-06 14:05:05,147 - BERTopic - Cluster - Start clustering the reduced embeddings
2024-01-06 14:05:21,171 - BERTopic - Cluster - Completed ✓
2024-01-06 14:05:21,178 - BERTopic - Representation - Extracting topics from clusters using representation models.
2024-01-06 14:05:22,264 - BERTopic - Representation - Completed ✓
2024-01-06 14:05:25,844 - BERTopic - Embedding - Transforming documents to embeddings.


Batches:   0%|          | 0/370 [00:00<?, ?it/s]

2024-01-06 14:06:39,877 - BERTopic - Embedding - Completed ✓
2024-01-06 14:06:39,879 - BERTopic - Dimensionality - Fitting the dimensionality reduction algorithm
2024-01-06 14:06:51,405 - BERTopic - Dimensionality - Completed ✓
2024-01-06 14:06:51,407 - BERTopic - Cluster - Start clustering the reduced embeddings
2024-01-06 14:07:07,764 - BERTopic - Cluster - Completed ✓
2024-01-06 14:07:07,771 - BERTopic - Representation - Extracting topics from clusters using representation models.
2024-01-06 14:07:08,843 - BERTopic - Representation - Completed ✓
2024-01-06 14:07:11,996 - BERTopic - Embedding - Transforming documents to embeddings.


Batches:   0%|          | 0/370 [00:00<?, ?it/s]

2024-01-06 14:08:25,932 - BERTopic - Embedding - Completed ✓
2024-01-06 14:08:25,934 - BERTopic - Dimensionality - Fitting the dimensionality reduction algorithm
2024-01-06 14:08:39,773 - BERTopic - Dimensionality - Completed ✓
2024-01-06 14:08:39,776 - BERTopic - Cluster - Start clustering the reduced embeddings
2024-01-06 14:08:45,673 - BERTopic - Cluster - Completed ✓
2024-01-06 14:08:45,679 - BERTopic - Representation - Extracting topics from clusters using representation models.
2024-01-06 14:08:46,721 - BERTopic - Representation - Completed ✓
2024-01-06 14:08:49,825 - BERTopic - Embedding - Transforming documents to embeddings.


Batches:   0%|          | 0/370 [00:00<?, ?it/s]

2024-01-06 14:10:03,637 - BERTopic - Embedding - Completed ✓
2024-01-06 14:10:03,639 - BERTopic - Dimensionality - Fitting the dimensionality reduction algorithm
2024-01-06 14:10:16,140 - BERTopic - Dimensionality - Completed ✓
2024-01-06 14:10:16,142 - BERTopic - Cluster - Start clustering the reduced embeddings
2024-01-06 14:10:18,907 - BERTopic - Cluster - Completed ✓
2024-01-06 14:10:18,913 - BERTopic - Representation - Extracting topics from clusters using representation models.
2024-01-06 14:10:19,904 - BERTopic - Representation - Completed ✓
2024-01-06 14:10:22,914 - BERTopic - Embedding - Transforming documents to embeddings.


Batches:   0%|          | 0/370 [00:00<?, ?it/s]

2024-01-06 14:11:36,902 - BERTopic - Embedding - Completed ✓
2024-01-06 14:11:36,904 - BERTopic - Dimensionality - Fitting the dimensionality reduction algorithm
2024-01-06 14:11:49,248 - BERTopic - Dimensionality - Completed ✓
2024-01-06 14:11:49,249 - BERTopic - Cluster - Start clustering the reduced embeddings
2024-01-06 14:11:51,499 - BERTopic - Cluster - Completed ✓
2024-01-06 14:11:51,505 - BERTopic - Representation - Extracting topics from clusters using representation models.
2024-01-06 14:11:52,522 - BERTopic - Representation - Completed ✓
2024-01-06 14:11:54,985 - BERTopic - Embedding - Transforming documents to embeddings.


Batches:   0%|          | 0/370 [00:00<?, ?it/s]

2024-01-06 14:13:08,766 - BERTopic - Embedding - Completed ✓
2024-01-06 14:13:08,768 - BERTopic - Dimensionality - Fitting the dimensionality reduction algorithm
2024-01-06 14:13:20,997 - BERTopic - Dimensionality - Completed ✓
2024-01-06 14:13:21,000 - BERTopic - Cluster - Start clustering the reduced embeddings
2024-01-06 14:13:22,613 - BERTopic - Cluster - Completed ✓
2024-01-06 14:13:22,619 - BERTopic - Representation - Extracting topics from clusters using representation models.
2024-01-06 14:13:23,596 - BERTopic - Representation - Completed ✓
2024-01-06 14:13:26,022 - BERTopic - Embedding - Transforming documents to embeddings.


Batches:   0%|          | 0/370 [00:00<?, ?it/s]

2024-01-06 14:14:39,920 - BERTopic - Embedding - Completed ✓
2024-01-06 14:14:39,922 - BERTopic - Dimensionality - Fitting the dimensionality reduction algorithm
2024-01-06 14:14:51,697 - BERTopic - Dimensionality - Completed ✓
2024-01-06 14:14:51,699 - BERTopic - Cluster - Start clustering the reduced embeddings
2024-01-06 14:14:53,077 - BERTopic - Cluster - Completed ✓
2024-01-06 14:14:53,083 - BERTopic - Representation - Extracting topics from clusters using representation models.
2024-01-06 14:14:54,044 - BERTopic - Representation - Completed ✓
2024-01-06 14:14:56,765 - BERTopic - Embedding - Transforming documents to embeddings.


Batches:   0%|          | 0/370 [00:00<?, ?it/s]

2024-01-06 14:16:10,687 - BERTopic - Embedding - Completed ✓
2024-01-06 14:16:10,689 - BERTopic - Dimensionality - Fitting the dimensionality reduction algorithm
2024-01-06 14:16:22,840 - BERTopic - Dimensionality - Completed ✓
2024-01-06 14:16:22,842 - BERTopic - Cluster - Start clustering the reduced embeddings
2024-01-06 14:16:24,078 - BERTopic - Cluster - Completed ✓
2024-01-06 14:16:24,083 - BERTopic - Representation - Extracting topics from clusters using representation models.
2024-01-06 14:16:25,037 - BERTopic - Representation - Completed ✓
2024-01-06 14:16:27,430 - BERTopic - Embedding - Transforming documents to embeddings.


Batches:   0%|          | 0/370 [00:00<?, ?it/s]

2024-01-06 14:17:35,906 - BERTopic - Embedding - Completed ✓
2024-01-06 14:17:35,909 - BERTopic - Dimensionality - Fitting the dimensionality reduction algorithm
2024-01-06 14:17:48,484 - BERTopic - Dimensionality - Completed ✓
2024-01-06 14:17:48,486 - BERTopic - Cluster - Start clustering the reduced embeddings
2024-01-06 14:17:49,684 - BERTopic - Cluster - Completed ✓
2024-01-06 14:17:49,689 - BERTopic - Representation - Extracting topics from clusters using representation models.
2024-01-06 14:17:50,651 - BERTopic - Representation - Completed ✓
2024-01-06 14:17:52,983 - BERTopic - Embedding - Transforming documents to embeddings.


Batches:   0%|          | 0/370 [00:00<?, ?it/s]

2024-01-06 14:19:06,792 - BERTopic - Embedding - Completed ✓
2024-01-06 14:19:06,794 - BERTopic - Dimensionality - Fitting the dimensionality reduction algorithm
2024-01-06 14:19:19,486 - BERTopic - Dimensionality - Completed ✓
2024-01-06 14:19:19,487 - BERTopic - Cluster - Start clustering the reduced embeddings
2024-01-06 14:19:20,689 - BERTopic - Cluster - Completed ✓
2024-01-06 14:19:20,694 - BERTopic - Representation - Extracting topics from clusters using representation models.
2024-01-06 14:19:21,638 - BERTopic - Representation - Completed ✓
2024-01-06 14:19:24,363 - BERTopic - Embedding - Transforming documents to embeddings.


Batches:   0%|          | 0/370 [00:00<?, ?it/s]

2024-01-06 14:20:38,187 - BERTopic - Embedding - Completed ✓
2024-01-06 14:20:38,189 - BERTopic - Dimensionality - Fitting the dimensionality reduction algorithm
2024-01-06 14:20:49,824 - BERTopic - Dimensionality - Completed ✓
2024-01-06 14:20:49,826 - BERTopic - Cluster - Start clustering the reduced embeddings
2024-01-06 14:21:06,148 - BERTopic - Cluster - Completed ✓
2024-01-06 14:21:06,155 - BERTopic - Representation - Extracting topics from clusters using representation models.
2024-01-06 14:21:07,244 - BERTopic - Representation - Completed ✓
2024-01-06 14:21:10,415 - BERTopic - Embedding - Transforming documents to embeddings.


Batches:   0%|          | 0/370 [00:00<?, ?it/s]

2024-01-06 14:22:24,366 - BERTopic - Embedding - Completed ✓
2024-01-06 14:22:24,367 - BERTopic - Dimensionality - Fitting the dimensionality reduction algorithm
2024-01-06 14:22:35,773 - BERTopic - Dimensionality - Completed ✓
2024-01-06 14:22:35,776 - BERTopic - Cluster - Start clustering the reduced embeddings
2024-01-06 14:22:53,429 - BERTopic - Cluster - Completed ✓
2024-01-06 14:22:53,436 - BERTopic - Representation - Extracting topics from clusters using representation models.
2024-01-06 14:22:54,547 - BERTopic - Representation - Completed ✓
2024-01-06 14:22:57,664 - BERTopic - Embedding - Transforming documents to embeddings.


Batches:   0%|          | 0/370 [00:00<?, ?it/s]

2024-01-06 14:24:11,638 - BERTopic - Embedding - Completed ✓
2024-01-06 14:24:11,641 - BERTopic - Dimensionality - Fitting the dimensionality reduction algorithm
2024-01-06 14:24:22,768 - BERTopic - Dimensionality - Completed ✓
2024-01-06 14:24:22,771 - BERTopic - Cluster - Start clustering the reduced embeddings
2024-01-06 14:24:39,859 - BERTopic - Cluster - Completed ✓
2024-01-06 14:24:39,868 - BERTopic - Representation - Extracting topics from clusters using representation models.
2024-01-06 14:24:40,992 - BERTopic - Representation - Completed ✓
2024-01-06 14:24:44,504 - BERTopic - Embedding - Transforming documents to embeddings.


Batches:   0%|          | 0/370 [00:00<?, ?it/s]

2024-01-06 14:25:58,496 - BERTopic - Embedding - Completed ✓
2024-01-06 14:25:58,499 - BERTopic - Dimensionality - Fitting the dimensionality reduction algorithm
2024-01-06 14:26:11,383 - BERTopic - Dimensionality - Completed ✓
2024-01-06 14:26:11,386 - BERTopic - Cluster - Start clustering the reduced embeddings
2024-01-06 14:26:27,735 - BERTopic - Cluster - Completed ✓
2024-01-06 14:26:27,742 - BERTopic - Representation - Extracting topics from clusters using representation models.
2024-01-06 14:26:28,834 - BERTopic - Representation - Completed ✓
2024-01-06 14:26:31,940 - BERTopic - Embedding - Transforming documents to embeddings.


Batches:   0%|          | 0/370 [00:00<?, ?it/s]

2024-01-06 14:27:41,999 - BERTopic - Embedding - Completed ✓
2024-01-06 14:27:42,001 - BERTopic - Dimensionality - Fitting the dimensionality reduction algorithm
2024-01-06 14:27:54,320 - BERTopic - Dimensionality - Completed ✓
2024-01-06 14:27:54,323 - BERTopic - Cluster - Start clustering the reduced embeddings
2024-01-06 14:28:11,763 - BERTopic - Cluster - Completed ✓
2024-01-06 14:28:11,771 - BERTopic - Representation - Extracting topics from clusters using representation models.
2024-01-06 14:28:12,844 - BERTopic - Representation - Completed ✓
2024-01-06 14:28:16,269 - BERTopic - Embedding - Transforming documents to embeddings.


Batches:   0%|          | 0/370 [00:00<?, ?it/s]

2024-01-06 14:29:30,306 - BERTopic - Embedding - Completed ✓
2024-01-06 14:29:30,308 - BERTopic - Dimensionality - Fitting the dimensionality reduction algorithm
2024-01-06 14:29:42,422 - BERTopic - Dimensionality - Completed ✓
2024-01-06 14:29:42,426 - BERTopic - Cluster - Start clustering the reduced embeddings
2024-01-06 14:29:59,664 - BERTopic - Cluster - Completed ✓
2024-01-06 14:29:59,671 - BERTopic - Representation - Extracting topics from clusters using representation models.
2024-01-06 14:30:00,766 - BERTopic - Representation - Completed ✓
2024-01-06 14:30:03,780 - BERTopic - Embedding - Transforming documents to embeddings.


Batches:   0%|          | 0/370 [00:00<?, ?it/s]