# **Install and Load Required Libraries**

In [None]:
!pip install bertopic

In [None]:
!pip install sentence-transformers

In [24]:
from bertopic import BERTopic

In [18]:
from sentence_transformers import SentenceTransformer

In [1]:
import pandas as pd

# **Load and Interpret Dataset**

Dataset Kaggle Link: https://www.kaggle.com/datasets/edqian/twitter-climate-change-sentiment-dataset/data

In [6]:
# Location of the Twitter climate-change sentiment CSV.
# NOTE(review): the path looks like a Colab Google Drive mount — adjust this single
# constant when running the notebook anywhere else.
DATA_PATH = "/content/drive/MyDrive/ict-606-aliasghar-2/twitter_sentiment_data.csv"

# Load the raw tweets into a DataFrame.
tweets_df = pd.read_csv(DATA_PATH)

In [7]:
# Preview the first five rows of the raw data.
tweets_df.head(n=5)

Unnamed: 0,sentiment,message,tweetid
0,-1,@tiniebeany climate change is an interesting h...,792927353886371840
1,1,RT @NatGeoChannel: Watch #BeforeTheFlood right...,793124211518832641
2,1,Fabulous! Leonardo #DiCaprio's film on #climat...,793124402388832256
3,1,RT @Mick_Fanning: Just watched this amazing do...,793124635873275904
4,2,"RT @cnalive: Pranita Biswasi, a Lutheran from ...",793125156185137153


In [8]:
# Print the column names, dtypes and non-null counts of the raw data.
tweets_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 43943 entries, 0 to 43942
Data columns (total 3 columns):
 #   Column     Non-Null Count  Dtype 
---  ------     --------------  ----- 
 0   sentiment  43943 non-null  int64 
 1   message    43943 non-null  object
 2   tweetid    43943 non-null  int64 
dtypes: int64(2), object(1)
memory usage: 1.0+ MB


In [9]:
# Count the tweets per sentiment label to see how unevenly the classes are represented.
tweets_df['sentiment'].value_counts()

sentiment
 1    22962
 2     9276
 0     7715
-1     3990
Name: count, dtype: int64

In [11]:
# Select a balanced subset of up to 600 tweets per sentiment label
# (a class with fewer than 600 rows would contribute all of its rows).
# random_state=42 makes the draw repeatable.
# Each group is sampled explicitly and the pieces are concatenated in label order,
# which keeps the 'sentiment' column and the original row index while avoiding
# groupby.apply on the grouping column (deprecated in recent pandas releases).
balanced_data = pd.concat(
    [
        group.sample(min(len(group), 600), random_state=42)
        for _, group in tweets_df.groupby('sentiment')
    ]
)

In [12]:
# Check how many rows each sentiment label has in the sampled subset.
balanced_data['sentiment'].value_counts()

sentiment
-1    600
 0    600
 1    600
 2    600
Name: count, dtype: int64

# **Data Preprocessing**

In [14]:
# Clean the tweet text in a single chained pass; the steps run top to bottom.
balanced_data['message'] = (
    balanced_data['message']
    .str.replace(r"http\S+", "", regex=True)            # drop URLs
    .str.replace(r"@\S+", "", regex=True)               # drop @mentions
    .str.replace(r"[^A-Za-z0-9\s]", " ", regex=True)    # any other symbol becomes a space
    .str.strip()                                        # trim leading/trailing whitespace
    .str.lower()                                        # normalise case
)

In [15]:
# Preview the first five rows after cleaning.
balanced_data.head(n=5)

Unnamed: 0,sentiment,message,tweetid
19851,-1,rt bill nye whines at cnn for having actual s...,856190440516857856
25261,-1,rt dr roger pielke jr the science ain t t...,895910415435268096
22078,-1,rt how bout you refund the billions you ve st...,870499885174243328
12604,-1,can t tax us for this it s not climate change,827495307885805569
22463,-1,rt celebrities who couldn t drag hillary s bl...,871391633585000448


# **Topic Modeling - BERTopic**

In [32]:
# Configure BERTopic:
#   - embed the tweets with the 'all-MiniLM-L6-v2' sentence-transformer model,
#   - use a minimum topic size of 10,
#   - let BERTopic choose the reduced number of topics itself (nr_topics="auto"),
#   - compute per-document topic probabilities (used by the distribution plot later).
# NOTE(review): no random seed is fixed here, so topics may differ between runs — confirm.
topic_model = BERTopic(
    embedding_model='all-MiniLM-L6-v2',
    min_topic_size=10,
    nr_topics="auto",
    calculate_probabilities=True,
)

# Fit on the cleaned tweets; keep the returned topic assignments and probabilities.
topics, probabilities = topic_model.fit_transform(
    balanced_data['message'].tolist()
)


In [33]:
# Get an overview of the topics
topic_info = topic_model.get_topic_info()
print(topic_info.head())

   Topic  Count                          Name  \
0     -1   1015      -1_climate_change_the_rt   
1      0    172      0_real_climate_change_it   
2      1    170         1_trump_he_to_climate   
3      2     87     2_global_warming_hot_fuck   
4      3     59  3_obama_executive_order_isis   

                                      Representation  \
0  [climate, change, the, rt, of, to, global, war...   
1  [real, climate, change, it, is, we, not, you, ...   
2  [trump, he, to, climate, change, president, rt...   
3  [global, warming, hot, fuck, weather, red, coo...   
4  [obama, executive, order, isis, trump, to, syr...   

                                 Representative_Docs  
0  [rt  do you believe in climate change, climate...  
1  [climate change isn t real, what are we going ...  
2  [rt  trump has called climate change a hoax  n...  
3  [that climate change   global warming is a b, ...  
4  [rt  last night  president trump released an e...  


In [34]:
# Explore individual topics
for i in range(5):
    print(f"Topic {i}:", topic_model.get_topic(i))

Topic 0: [('real', 0.02477339767623893), ('climate', 0.021615504235810427), ('change', 0.02143747020237833), ('it', 0.020319995566850064), ('is', 0.02000298370945687), ('we', 0.016652212197140054), ('not', 0.01646317863146391), ('you', 0.01632666138402456), ('the', 0.016241056697253366), ('to', 0.015932880885697276)]
Topic 1: [('trump', 0.05940216559486661), ('he', 0.02276523642200662), ('to', 0.020001845573727597), ('climate', 0.018218928320716504), ('change', 0.01783248376650648), ('president', 0.017762448869797956), ('rt', 0.015566496847908045), ('000', 0.014020010233764741), ('donald', 0.013435944981769598), ('the', 0.01342868733520537)]
Topic 2: [('global', 0.06313962120743523), ('warming', 0.06195492811633997), ('hot', 0.026422282227018343), ('fuck', 0.02147023134341826), ('weather', 0.021245917695724488), ('red', 0.020773657870337105), ('cooling', 0.020683691559369582), ('rt', 0.020588763608265212), ('it', 0.019688892145581897), ('damn', 0.018838147698957114)]
Topic 3: [('obama'

# **Visualizations**

In [35]:
# Visualize topics: the figure returned by visualize_topics() is displayed
# because it is the last expression of the cell.
topic_model.visualize_topics()

In [36]:
# Choose which document's topic probabilities to plot. The index is the position in
# the list that was passed to fit_transform (0 = first row of balanced_data), not the
# DataFrame's index label.
document_index = 0  # change this to inspect another document

# Plot that document's topic probability distribution, using 0.01 as the minimum
# probability threshold.
topic_model.visualize_distribution(
    probabilities[document_index],
    min_probability=0.01,
)

In [37]:
# Recompute the topic representations using unigrams and bigrams (n_gram_range=(1, 2)).
# This refreshes each topic's keywords; it does not reduce the number of topics.
# `topics` is passed by keyword because the positional slot right after the documents
# is not `topics` in every BERTopic release (newer versions place `images` there),
# so a positional call can silently bind the assignments to the wrong parameter.
topic_model.update_topics(
    balanced_data['message'].tolist(),
    topics=topics,
    n_gram_range=(1, 2),
)

# Visualize the hierarchical clustering of the topics.
hierarchical_fig = topic_model.visualize_hierarchy()
hierarchical_fig.show()

In [38]:
# Visualize the similarity between topics as a heatmap.
# The figure is kept in `similarity_fig` and rendered explicitly with .show().
similarity_fig = topic_model.visualize_heatmap()
similarity_fig.show()