# Model Training 
This section trains the BERTopic model on the cleaned dataset.

### Import required libraries

### Download sentence transformer and encoder for BERT

In [None]:
# Importing all required libraries
from bertopic import BERTopic
import pandas as pd 
import os
import pickle

### Import the cleaned dataset for BERT

In [None]:
# The dataset folder lives next to (one level above) the current working directory
DATADIR = "{}/dataset".format(os.path.abspath(os.path.join(os.getcwd(), os.pardir)))
# Load the cleaned tweets used throughout the rest of the notebook
cleaned_df = pd.read_csv(DATADIR + "/cleaned_tweets.csv")

# Preview the first rows of the loaded data
cleaned_df.head()

### Train the BERT model
This section trains the BERTopic model if a saved model does not exist yet. Otherwise it loads the existing model to save time.

In [None]:
# Folder (one level above the working directory) where the fitted model is stored
MODELDIR = f"{os.path.abspath(os.path.join(os.getcwd(), os.pardir))}/model"
model_path = f"{MODELDIR}/bert"


def train_model():
    """Fit a BERTopic model on the cleaned tweets, save it and return it.

    Reads the ``cleaned_tweet`` column of the notebook-level ``cleaned_df``,
    fits a new ``BERTopic`` model on it and pickles the result to
    ``model_path`` (creating ``MODELDIR`` when needed).

    Returns:
        The fitted ``BERTopic`` model.
    """
    os.makedirs(MODELDIR, exist_ok=True)
    model = BERTopic(verbose=True)
    # Tweets that were emptied by cleaning come back from the CSV as NaN;
    # BERTopic only accepts strings, so those rows are dropped here.
    tweets = cleaned_df['cleaned_tweet'].dropna().astype(str).to_list()
    model.fit_transform(tweets)
    model.save(model_path, serialization="pickle")
    return model


# Reuse a previously saved model when available; otherwise train one.
# Either way `bert_model` is bound for the cells that follow.
if os.path.isfile(model_path):
    bert_model = BERTopic.load(model_path)
else:
    bert_model = train_model()


## Interpretation of Results 

### Topic Information
The `get_topic_info` method provides an overview of the topics identified by the model, including their size.

In [None]:
# Display the overview of all topics found by the model
bert_model.get_topic_info()

### Top Words in Each Topic
The `get_topic` method returns the top words for a specific topic. This can help in understanding the main themes of each topic.

In [None]:
# Display the top words of topic 0
bert_model.get_topic(0)

### Topic Frequency
The `get_topic_freq` method shows the frequency of each topic, which helps in identifying the most dominant topics.

In [None]:
# Show the frequency (size) of each topic in descending order
bert_model.get_topic_freq()

## Visualisation for BERT model

In [None]:
# Plot the fitted topics using BERTopic's built-in `visualize_topics` view
bert_model.visualize_topics()

### Visualise Terms 
This method shows a few selected terms per topic as bar charts of their TF-IDF scores.

In [None]:
# Bar charts of selected terms per topic
bert_model.visualize_barchart()

### Visualise Topic Similarity 
This method will visualise how similar certain topics are to each other using a heatmap.

In [None]:
# Heatmap of how similar the topics are to each other
bert_model.visualize_heatmap()

### Visualise Topics Hierarchy
This method will visualize the topics hierarchy.

In [None]:
# Plot the hierarchy of the topics
bert_model.visualize_hierarchy()

### Visualise Topic Word Cloud 
Word clouds can provide an intuitive way to understand the most frequent words in each topic.

In [None]:
from wordcloud import WordCloud
import matplotlib.pyplot as plt

def create_wordcloud(model, topic):
    """Draw a word cloud of the top words of a single topic.

    Args:
        model: Fitted topic model whose ``get_topic(topic)`` yields
            ``(word, weight)`` pairs.
        topic: Identifier of the topic to draw.

    Raises:
        ValueError: If the model has no words for ``topic``.
    """
    topic_words = model.get_topic(topic)
    # BERTopic's get_topic returns False (not an empty list) for an unknown
    # topic; `or ()` keeps the comprehension below from failing on it.
    # Blank words are padding for short topics and carry no meaning.
    frequencies = {word: value for word, value in (topic_words or ()) if word}
    if not frequencies:
        raise ValueError(f"No words available for topic {topic!r}")
    wc = WordCloud().generate_from_frequencies(frequencies)
    plt.imshow(wc, interpolation='bilinear')
    plt.axis("off")
    plt.show()

# Show the word cloud for topic 0
create_wordcloud(bert_model, 0)

## Unsupervised Topic Modeling Evaluation 

### Import necessary packages

`gensim` will be used for Coherence & `scikit-learn` will be used for Silhouette Score

In [None]:
from gensim.corpora import Dictionary
from gensim.models import CoherenceModel

In [None]:
# Prepare the tweets: rows that are NaN after the CSV round-trip have no
# `.split()`, so drop them before whitespace-tokenising
tweets = cleaned_df['cleaned_tweet'].dropna().astype(str).to_list()
texts = [tweet.split() for tweet in tweets]

# create a Gensim dictionary and corpus
dictionary = Dictionary(texts)
corpus = [dictionary.doc2bow(text) for text in texts]

# get topics in the format required by Gensim
topics = bert_model.get_topics()
known_tokens = dictionary.token2id
formatted_topics = []
for topic_id, topic_words in topics.items():
    # Topic -1 collects outlier documents and is not a real topic
    if topic_id == -1:
        continue
    # Gensim rejects topic words missing from the dictionary (including the
    # blank padding BERTopic uses for short topics), so keep only known tokens
    words = [word for word, _ in topic_words if word in known_tokens]
    if words:
        formatted_topics.append(words)

# Calculate Coherence Score using Gensim
coherence_model = CoherenceModel(topics=formatted_topics, texts=texts, dictionary=dictionary, coherence='c_v')
coherence_score = coherence_model.get_coherence()

f"Coherence Score: {coherence_score}"

### Silhouette Score with scikit-learn
This will take some time, depending on the user's system.

In [None]:
from sklearn.metrics import silhouette_score
from sklearn.utils import shuffle

# shuffle and subsample the data (fixed seed so the score is reproducible;
# rows without a cleaned tweet cannot be scored and are dropped first)
subsample_size = 1000
cleaned_df_subsample = shuffle(
    cleaned_df.dropna(subset=['cleaned_tweet']), random_state=42
).head(subsample_size)
subsample_docs = cleaned_df_subsample['cleaned_tweet'].astype(str).to_list()

# get topic assignments and probabilities from the already trained model.
# transform() is used instead of fit_transform() so that the model is NOT
# re-fitted on (and overwritten by) this small subsample.
topics, probabilities = bert_model.transform(subsample_docs)

# convert the probabilities to a 2d array (one row per document)
probabilities_2d = pd.DataFrame(probabilities).values

# calculate Silhouette Score using topic probabilities
# (silhouette_score needs at least two distinct topic labels in the subsample)
sil_score = silhouette_score(probabilities_2d, topics)
f"Silhouette Score: {sil_score}"