In [158]:
import pandas as pd 
import numpy as np
import re
from nltk.tokenize import TweetTokenizer
from tqdm.notebook import tqdm
import matplotlib.pyplot as plt
from matplotlib import rcParams
import seaborn as sns
import matplotlib.pyplot as plt
from matplotlib.patches import Patch
from bertopic import BERTopic
from bertopic.representation import KeyBERTInspired
from sklearn.cluster import KMeans
from yellowbrick.cluster import KElbowVisualizer
from bertopic.representation import OpenAI
import openai
from sentence_transformers import SentenceTransformer
import voyageai

In [159]:
#rcParams default settings
"https://matplotlib.org/stable/tutorials/introductory/customizing.html"

#rcParams settings
plt.style.use('ggplot')

rcParams['font.family'] = 'sans-serif'
rcParams['font.style'] = 'normal'

rcParams['figure.facecolor'] = 'white'

rcParams['savefig.bbox'] = 'tight'
rcParams['savefig.dpi'] = 300
rcParams['savefig.transparent'] = True

rcParams['axes.spines.right'] = False
rcParams['axes.spines.top'] = False
rcParams['axes.labelsize'] = 20
rcParams['axes.labelcolor'] = 'black'
rcParams['axes.edgecolor'] = 'grey'
rcParams['axes.linewidth'] = 3
rcParams['axes.facecolor'] = 'white'
rcParams['axes.titlepad'] = 4

rcParams['xtick.color'] = 'grey'
rcParams['ytick.color'] = 'grey'
rcParams['xtick.major.width'] = 2
rcParams['ytick.major.width'] = 0
rcParams['xtick.major.size'] = 5
rcParams['ytick.major.size'] = 0

rcParams['lines.linewidth'] = 3
rcParams['lines.markersize'] = 10

rcParams['grid.color'] = 'grey'
rcParams['grid.linewidth'] = 0.1

## Topic Modelling Pipeline

In [174]:
df = pd.read_csv('cleaned_texts.csv', index_col=[0])

In [175]:
df = df.loc[~df.cleaned_text.isna()]

#### Text extraction

In [176]:
docs = df.cleaned_text.to_list()

#### Training procedure

In [177]:
zeroshot_topic_list= ['energy and climate and sustainability', 'transport and logistic', 'health', 'customer service', 'celebrations and joy']


In [191]:
model = BERTopic(
  zeroshot_topic_list=zeroshot_topic_list,
  zeroshot_min_similarity=.45,
  min_topic_size = 50,
  nr_topics = 21,
  verbose=True
)

In [192]:
# Train model
topics, probs = model.fit_transform(docs)

2024-05-29 21:44:29,876 - BERTopic - Embedding - Transforming documents to embeddings.


Batches:   0%|          | 0/322 [00:00<?, ?it/s]

2024-05-30 11:36:04,269 - BERTopic - Embedding - Completed ✓
2024-05-30 11:36:04,270 - BERTopic - Zeroshot Step 1 - Finding documents that could be assigned to either one of the zero-shot topics
2024-05-30 11:36:04,527 - BERTopic - Zeroshot Step 1 - Completed ✓
2024-05-30 11:36:04,532 - BERTopic - Dimensionality - Fitting the dimensionality reduction algorithm
2024-05-30 11:36:42,749 - BERTopic - Dimensionality - Completed ✓
2024-05-30 11:36:42,754 - BERTopic - Cluster - Start clustering the reduced embeddings
2024-05-30 11:36:43,908 - BERTopic - Cluster - Completed ✓
2024-05-30 11:36:43,909 - BERTopic - Representation - Extracting topics from clusters using representation models.
2024-05-30 11:36:44,394 - BERTopic - Representation - Completed ✓
2024-05-30 11:36:44,396 - BERTopic - Topic reduction - Reducing number of topics
2024-05-30 11:36:44,658 - BERTopic - Topic reduction - Reduced number of topics from 31 to 21
2024-05-30 11:36:44,800 - BERTopic - Zeroshot Step 2 - Clustering doc

In [193]:
model.visualize_documents(docs, reduced_embeddings=umap_embeddings, hide_document_hover=True, hide_annotations=True)

In [194]:
model.get_topic_info()

Unnamed: 0,Topic,Count,Name,Representation,Representative_Docs
0,-1,5787,0_contact_please_product_sorry,"[contact, please, product, sorry, team, office...",
1,0,269,energy and climate and sustainability,"[energy, climate, sustainability, sustainable,...",[save date tune powering digital economy energ...
2,1,102,customer service,"[customer, service, contact, directly, recomme...",[thank message recommend contact customer serv...
3,2,24,celebrations and joy,"[celebration, celebrating, celebrate, life, ho...",[feeling love sending huge shout everyone cele...
4,3,20,health,"[health, people, disease, wellbeing, programme...",[launched corporate health campaign showcase u...
5,4,678,1_chain_supply_logistics_train,"[chain, supply, logistics, train, learn, solut...",[help give customer power uptime software solu...
6,5,588,2_disease_patient_people_health,"[disease, patient, people, health, cancer, lea...",[dear sorry hear country live share contact de...
7,6,393,3_woman_career_gender_inclusive,"[woman, career, gender, inclusive, diversity, ...",[priority processed food company watch video l...
8,7,322,4_emission_climate_sustainability_carbon,"[emission, climate, sustainability, carbon, su...",[people living chronic respiratory disease man...
9,8,320,5_wind_energy_turbine_farm,"[wind, energy, turbine, farm, offshore, power,...",[international woman engineering value equal o...


#### Merge topics

In [169]:
topics_to_merge = []
model.merge_topics(docs, topics_to_merge)
model.get_topic_info()

Unnamed: 0,Topic,Count,Name,Representation,Representative_Docs
0,-1,5576,-1_please_team_contact_thank,"[please, team, contact, thank, help, product, ...",[denis sorry hear global privacy legal regulat...
1,0,711,0_patient_disease_medicine_health,"[patient, disease, medicine, health, people, c...",[advanced research improve outcome investigati...
2,1,675,1_energy_climate_sustainability_emission,"[energy, climate, sustainability, emission, su...",[chief strategy sustainability officer explain...
3,2,615,2_chain_supply_logistics_train,"[chain, supply, logistics, train, learn, rail,...",[experience best world local expertise backed ...
4,3,499,3_woman_career_inclusive_community,"[woman, career, inclusive, community, diversit...",[important focus international woman highlight...
5,4,419,4_love_heart_holiday_christmas,"[love, heart, holiday, christmas, awesome, adv...",[heart deserves touch kindness thank spreading...
6,5,222,5_build_builder_building_time,"[build, builder, building, time, knock, set, d...","[build, build, building find time build]"
7,6,211,6_wind_turbine_farm_offshore,"[wind, turbine, farm, offshore, project, energ...",[wind industry report circularity solution win...
8,7,189,7_food_nutrition_cream_brand,"[food, nutrition, cream, brand, taste, plantba...",[bring progress life complementary business fr...
9,8,166,8_vessel_methanol_container_green,"[vessel, methanol, container, green, sailing, ...",[greeting captain brian sørensen sailing world...


#### Reducing outlier topics

In [190]:
new_topics = model.reduce_outliers(docs, topics)

  0%|          | 0/6 [00:00<?, ?it/s]


NotFittedError: Vocabulary not fitted or provided

In [None]:
model.update_topics(docs, topics=new_topics)



In [188]:
model.visualize_documents(docs, hide_document_hover=True, hide_annotations=True)

In [189]:
model.get_topic_info()

Unnamed: 0,Topic,Count,Name,Representation,Representative_Docs
0,-1,5966,0_contact_please_sorry_product,"[contact, please, sorry, product, team, messag...",
1,0,269,energy and climate and sustainability,"[energy, climate, sustainability, sustainable,...",[save date tune powering digital economy energ...
2,1,102,customer service,"[customer, service, contact, directly, recomme...",[thank message recommend contact customer serv...
3,2,24,celebrations and joy,"[celebration, celebrating, celebrate, life, ho...",[feeling love sending huge shout everyone cele...
4,3,20,health,"[health, people, disease, wellbeing, programme...",[launched corporate health campaign showcase u...
5,4,628,1_disease_patient_people_health,"[disease, patient, people, health, cancer, lea...",[aftab sorry hear able access medicine need qu...
6,5,576,2_chain_supply_logistics_train,"[chain, supply, logistics, train, learn, rail,...",[create safety report need contact detail kind...
7,6,446,3_woman_career_inclusive_diversity,"[woman, career, inclusive, diversity, communit...",[working address unique need cardiorenal patie...
8,7,312,4_race_congratulation_year_season,"[race, congratulation, year, season, team, wee...",[revolutionizing decisionmaking supply chain u...
9,8,236,5_energy_emission_carbon_home,"[energy, emission, carbon, home, solution, tra...",[international woman engineering value equal o...


In [144]:
model.save("../data/out/bertopic_files/bertopic_model_2", serialization="pickle")



In [None]:
!pip install wordcloud

In [None]:
from wordcloud import WordCloud

In [None]:
def create_wordcloud(model, topic):
    text = {word: value for word, value in model.get_topic(topic)}
    wc = WordCloud(background_color="white", max_words=1000)
    wc.generate_from_frequencies(text)
    plt.imshow(wc, interpolation="bilinear")
    plt.axis("off")
    plt.show()

for topic in set(model.topics_):
    create_wordcloud(model, topic=topic)