In [13]:
from sklearn.datasets import fetch_20newsgroups
from bertopic import BERTopic
from sentence_transformers import SentenceTransformer
from umap import UMAP
from hdbscan import HDBSCAN
from sklearn.feature_extraction.text import CountVectorizer
from bertopic.representation import KeyBERTInspired

# --- Configuration: values a reader is likely to tune ---
N_DOCS = 5000          # number of documents taken from the front of the corpus
RANDOM_STATE = 42      # seeds UMAP so repeated runs yield the same topics
NGRAM_RANGE = (1, 2)   # unigrams and bigrams in the topic representations

# Load the 20 Newsgroups posts with headers, footers and quoted replies removed.
docs = fetch_20newsgroups(subset='all',  remove=('headers', 'footers', 'quotes'))['data']

slice_docs = docs[:N_DOCS]

# --- Sub-models ---
embedding_model = SentenceTransformer('all-mpnet-base-v2')

# The n-gram range must be set on the vectorizer itself: BERTopic's own
# `n_gram_range` argument is only used to build its default CountVectorizer
# and is ignored once a custom `vectorizer_model` is supplied.
vectorizer_model = CountVectorizer(stop_words="english", ngram_range=NGRAM_RANGE)
representation_model = KeyBERTInspired()

# UMAP is stochastic; without a fixed random_state the reduced embeddings
# (and therefore the clusters/topics) change on every run.
umap_model = UMAP(n_neighbors=10,
                  n_components=5,
                  metric='cosine',
                  random_state=RANDOM_STATE)

hdbscan_model = HDBSCAN(min_cluster_size=10,
                        metric='euclidean',
                        prediction_data=True)

topic_model = BERTopic(embedding_model=embedding_model,
                       vectorizer_model=vectorizer_model,
                       umap_model=umap_model,
                       hdbscan_model=hdbscan_model,
                       representation_model=representation_model,
                       language="english",
                       top_n_words=10,
                       nr_topics="auto",
                       verbose=True)

# Encode the documents once, up front. This is the slow step; keeping the
# result in `embeddings` lets later cells reuse it (e.g. to re-fit with
# different clustering settings) without re-running the transformer.
embeddings = embedding_model.encode(slice_docs, show_progress_bar=True)

topics, probs = topic_model.fit_transform(slice_docs, embeddings=embeddings)

Batches:   1%|          | 1/157 [14:27<37:35:35, 867.53s/it]


KeyboardInterrupt: 

In [None]:
# Summary of the topics held by the fitted model; the trailing bare name
# makes the notebook render the result as the cell output.
topic_info = topic_model.get_topic_info()
topic_info

In [None]:
# Build the topic heatmap figure and display it as the cell output.
heatmap_fig = topic_model.visualize_heatmap()
heatmap_fig

In [None]:
# Build the per-topic bar chart figure and display it as the cell output.
barchart_fig = topic_model.visualize_barchart()
barchart_fig

In [None]:
# Look up the three topics that best match a free-text search term.
search_term = "Leader"
similar_topics, similarity = topic_model.find_topics(search_term, top_n=3)

In [None]:
# Print every topic returned by the search together with its similarity score.
# Walking the paired results (rather than indexing positions 0, 1 and 2)
# avoids an IndexError when the search yields fewer than three topics.
for rank, (topic_id, score) in enumerate(zip(similar_topics, similarity)):
    # The first entry is printed without a separator; each later entry is
    # preceded by a blank line and a space, matching the original layout.
    prefix = "" if rank == 0 else "\n "
    print("{}Most Similar Topic Info: \n{}".format(prefix, topic_model.get_topic(topic_id)))
    print("Similarity Score: {}".format(score))