In [None]:
!pip install bertopic

In [79]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from bertopic import BERTopic
from bertopic.representation import KeyBERTInspired
from sklearn.model_selection import train_test_split


def set_seed(seed_value):
    random.seed(seed_value)
    np.random.seed(seed_value)
    pd.set_option('mode.chained_assignment', None)
    torch.manual_seed(seed_value)
    if torch.cuda.is_available():
        torch.cuda.manual_seed(seed_value)
        torch.cuda.manual_seed_all(seed_value)
        torch.backends.cudnn.deterministic = True
        torch.backends.cudnn.benchmark = False

set_seed(42)


representation_model = KeyBERTInspired()
df = pd.read_csv('cleaned_data.csv')
text_docs = df.post_text.str.strip("'[]")
text_docs =text_docs.str.replace("'","").str.replace(",","").str.lstrip()


0                                                     NaN
1       Artist and designer Janos Stone is creating a ...
2       The inaugural meeting of Kickstarter’s Communi...
3       This 25-year-old man rushed into a burning hou...
4       An occult-themed JRPG with a fully integrated ...
                              ...                        
9522    This curated collection of Taiwanese teas come...
9523    A luxurious shower head that cleans better and...
9524                                                  NaN
9525                                                  NaN
9526    "If it’s not fun, it’s not worth doing."On mak...
Length: 9527, dtype: object

In [72]:
text_docs

0       NaN
1       NaN
2       NaN
3       NaN
4       NaN
       ... 
9522    NaN
9523    NaN
9524    NaN
9525    NaN
9526    NaN
Length: 9527, dtype: object

In [2]:
text_docs = text_docs[text_docs != ''].reset_index(drop=True)

In [25]:
text_docs

0       preserve portrait possible service veteran sac...
1       stone unique mini recyclable truly age playhou...
2       feedback discussion frank inaugural meeting co...
3       25yearold aid recovery rush five verify burn p...
4       jrpg count fully deck integrate tarot occult e...
                              ...                        
8770    curate magazine print collection tea authentic...
8771    less 75 clean luxurious shower percent less 75...
8772    construct spare marble mason gold kit diy marb...
8773                  link 15 48 additional plus ultimate
8774    worth fun harness kevin magic microbe educatio...
Name: post_text, Length: 8775, dtype: object

In [50]:
topic_model = BERTopic(embedding_model="all-MiniLM-L12-v2", verbose=True,
                       min_topic_size = 75)
topics, probs = topic_model.fit_transform(text_docs)

2023-12-24 10:47:17,982 - BERTopic - Embedding - Transforming documents to embeddings.


Batches:   0%|          | 0/275 [00:00<?, ?it/s]

2023-12-24 10:47:25,691 - BERTopic - Embedding - Completed ✓
2023-12-24 10:47:25,697 - BERTopic - Dimensionality - Fitting the dimensionality reduction algorithm
2023-12-24 10:47:41,152 - BERTopic - Dimensionality - Completed ✓
2023-12-24 10:47:41,154 - BERTopic - Cluster - Start clustering the reduced embeddings
2023-12-24 10:47:41,709 - BERTopic - Cluster - Completed ✓
2023-12-24 10:47:41,717 - BERTopic - Representation - Extracting topics from clusters using representation models.
2023-12-24 10:47:41,935 - BERTopic - Representation - Completed ✓


In [51]:
topic_model.visualize_topics()

In [60]:
topic_distr, _ = topic_model.approximate_distribution(text_docs)

100%|██████████| 9/9 [00:01<00:00,  7.26it/s]


In [61]:
topic_model.get_document_info(text_docs)

Unnamed: 0,Document,Topic,Name,Representation,Representative_Docs,Top_n_words,Probability,Representative_document
0,preserve portrait possible service veteran sac...,10,10_veteran_vet_wwii_war,"[veteran, vet, wwii, war, flag, pilot, militar...",[veteran tyler veteran 3000 8yearold nearly ve...,veteran - vet - wwii - war - flag - pilot - mi...,1.000000,False
1,stone unique mini recyclable truly age playhou...,0,0_musician_comic_strip_magazine_director,"[musician, comic_strip, magazine, director, pr...",[sign draft page cornerstone worth chance link...,musician - comic_strip - magazine - director -...,0.978060,False
2,feedback discussion frank inaugural meeting co...,0,0_musician_comic_strip_magazine_director,"[musician, comic_strip, magazine, director, pr...",[sign draft page cornerstone worth chance link...,musician - comic_strip - magazine - director -...,1.000000,False
3,25yearold aid recovery rush five verify burn p...,1,1_shooting_kill_rare_baby,"[shooting, kill, rare, baby, hurricane, diseas...",[diagnosis 2yearold rare 12 rare disease affec...,shooting - kill - rare - baby - hurricane - di...,0.995632,False
4,jrpg count fully deck integrate tarot occult e...,-1,-1_gaming_podcast_cat_instagram,"[gaming, podcast, cat, instagram, ig, differen...",[brand podcast listen season subscribe link tr...,gaming - podcast - cat - instagram - ig - diff...,0.000000,False
...,...,...,...,...,...,...,...,...
8770,curate magazine print collection tea authentic...,0,0_musician_comic_strip_magazine_director,"[musician, comic_strip, magazine, director, pr...",[sign draft page cornerstone worth chance link...,musician - comic_strip - magazine - director -...,0.989557,False
8771,less 75 clean luxurious shower percent less 75...,-1,-1_gaming_podcast_cat_instagram,"[gaming, podcast, cat, instagram, ig, differen...",[brand podcast listen season subscribe link tr...,gaming - podcast - cat - instagram - ig - diff...,0.000000,False
8772,construct spare marble mason gold kit diy marb...,-1,-1_gaming_podcast_cat_instagram,"[gaming, podcast, cat, instagram, ig, differen...",[brand podcast listen season subscribe link tr...,gaming - podcast - cat - instagram - ig - diff...,0.000000,False
8773,link 15 48 additional plus ultimate,6,6_earbud_speaker_headphone_bluetooth,"[earbud, speaker, headphone, bluetooth, audio,...",[earbud language translate shipping earbud tra...,earbud - speaker - headphone - bluetooth - aud...,0.305663,False


In [46]:
topic_model.probabilities_

array([1., 1., 1., ..., 0., 0., 0.])

In [47]:
topic_labels = topic_model.generate_topic_labels(nr_words=30,
                                            topic_prefix=True,
                                            word_length=30,
                                            separator=", ")

topic_idx = [x.split(',')[0] for x in topic_labels]
topic_words = [','.join(x.split(',')[1:]) for x in topic_labels]
topic_label_df= pd.DataFrame(columns=['topic_idx', 'topic_words'])
topic_label_df['topic_idx'] = topic_idx
topic_label_df['topic_words'] = topic_words
topic_label_df

Unnamed: 0,topic_idx,topic_words
0,-1,"robot, podcast, episode, ig, gaming, 3d, inst..."
1,0,"sign, stranger, christmas, walk, veteran, bab..."
2,1,"charger, smartphone, phone, solar, laptop, ba..."
3,2,"electric, ride, backpack, scooter, luggage, f..."
4,3,"earbud, speaker, headphone, bluetooth, audio,..."
5,4,"gym, workout, body, fitness, exercise, fat, t..."
6,5,"sleep, pillow, smartwatch, wake, wearable, im..."


In [8]:
topic_labels = topic_model_2.generate_topic_labels(nr_words=30,
                                            topic_prefix=True,
                                            word_length=30,
                                            separator=", ")

topic_idx = [x.split(',')[0] for x in topic_labels]
topic_words = [','.join(x.split(',')[1:]) for x in topic_labels]
topic_label_df= pd.DataFrame(columns=['topic_idx', 'topic_words'])
topic_label_df['topic_idx'] = topic_idx
topic_label_df['topic_words'] = topic_words
topic_label_df

Unnamed: 0,topic_idx,topic_words
0,-1,"print, toy, social, news, nonprofit, special,..."
1,0,"bootcamp, startup, ces2018, prelaunch, hardwa..."
2,1,"ride, segway, segwayninebot, segways, scooter..."
3,2,"tarot, cards, deck, card, divination, diverse..."
4,3,"earbud, bluetooth, wearbud, earphone, airpod,..."
5,4,"charger, charging, powerbank, batterys, recha..."
6,5,"comic_strip, comics, comic, manga, graphic, c..."
7,6,"lunch, debt, meal, lunchbox, hungry, hunger, ..."
8,7,"hurricane, harvey, irma, storm, houston, texa..."
9,8,"workout, fitness, exercise, trainer, gym, fit..."


In [None]:
topic_model.get_topic(16)

In [None]:
topic_model.visualize_hierarchy(top_n_topics=10)

In [12]:
topic_model.visualize_barchart(top_n_topics=10)

In [11]:
topic_model.visualize_heatmap(n_clusters=5
                              , width=500, height=500)

In [18]:
label_dict = {
-1: 'Social',
0:'Comics and Entertainement',
1:'Health',
2:'Environment friendly',
3:'Gadgets',
4:'Event',
5:'Non profit',
6:'Bikes',
7:'Audio related',
8:'Food',
9:'Fitness and Fitness Merch',
10:'Tech',
11: 'Games'
}

topic_model.set_topic_labels(label_dict)

In [21]:
topic_model.save('bertopics.pkl')



In [23]:
topic_model.visualize_hierarchy(top_n_topics=12)

In [32]:
topic_model.visualize_term_rank()

In [37]:
topic_model.probabilities_

array([0., 0., 0., ..., 0., 0., 0.])