In [9]:
!pip install --upgrade scipy



In [4]:
!pip install scipy



In [1]:
!pip install contextualized-topic-models



In [2]:
!pip install bertopic



In [6]:
!pip install gensim



In [8]:
!pip install scipy==1.10.1

Collecting scipy==1.10.1
  Downloading scipy-1.10.1-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (58 kB)
[?25l     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/58.9 kB[0m [31m?[0m eta [36m-:--:--[0m[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m58.9/58.9 kB[0m [31m3.9 MB/s[0m eta [36m0:00:00[0m
Downloading scipy-1.10.1-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (34.4 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m34.4/34.4 MB[0m [31m20.8 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: scipy
  Attempting uninstall: scipy
    Found existing installation: scipy 1.14.1
    Uninstalling scipy-1.14.1:
      Successfully uninstalled scipy-1.14.1
Successfully installed scipy-1.10.1


In [None]:
import pandas as pd
from gensim import corpora
from gensim.models import LdaModel
from contextualized_topic_models.models.ctm import CTM
from contextualized_topic_models.utils.data_preparation import TopicModelDataPreparation
from bertopic import BERTopic
from sentence_transformers import SentenceTransformer
from sklearn.cluster import KMeans

In [None]:
# Column names referenced by the sections below.
label_column = 'label'  # column holding the true label of each tweet
column_name = 'cleaned_tweets'  # column holding the tweet text to be modelled

# Read the labelled dataset from the Excel workbook into a DataFrame.
df = pd.read_excel(io="/content/cleaned_labeled_dataset.xlsx")

## LDA

In [None]:
# Tokenise the tweets into a separate list. df[column_name] is deliberately left
# untouched: the CTM and BERTopic sections further down read that column again
# and need the original text strings, not token lists.
def _tokenize_tweet(text):
    """Split one tweet into whitespace-separated tokens.

    Values that are already tokenised (list/tuple) are returned as a list, so
    the cell can be re-run safely. Anything else (e.g. NaN from an empty
    spreadsheet cell) becomes an empty document instead of crashing gensim.
    """
    if isinstance(text, str):
        return text.split()
    if isinstance(text, (list, tuple)):
        return list(text)
    return []


tokenized_tweets = [_tokenize_tweet(tweet) for tweet in df[column_name]]

# Create a dictionary and a bag-of-words corpus for LDA
dictionary = corpora.Dictionary(tokenized_tweets)
corpus = [dictionary.doc2bow(tokens) for tokens in tokenized_tweets]

# Set the number of topics
NUM_TOPICS = 7
# Fixed seed so that re-running the notebook reproduces the same topics.
LDA_RANDOM_STATE = 42

# Train the LDA model
lda_model = LdaModel(
    corpus,
    num_topics=NUM_TOPICS,
    id2word=dictionary,
    passes=15,
    random_state=LDA_RANDOM_STATE,
)

# Step 1: Find the dominant topic of every document.
# Inference runs exactly once per document; the per-topic document counts and
# the per-row columns below are both derived from this single pass, so they
# cannot disagree with each other.
topic_ids = []
topic_confidences = []
for doc in corpus:
    topics_for_doc = lda_model.get_document_topics(doc)
    # The dominant topic is the (topic_id, probability) pair with the highest
    # probability; on ties the first pair listed wins.
    dominant_topic_id, confidence = max(topics_for_doc, key=lambda pair: pair[1])
    topic_ids.append(dominant_topic_id)
    topic_confidences.append(confidence)

# Step 2: Build the per-topic summary (top words and document count).
topics = lda_model.show_topics(num_topics=NUM_TOPICS, num_words=10, formatted=False)

# "word: score" description of each topic, computed once per topic.
top_words_by_topic = {
    topic_id: ", ".join(f"{word}: {round(score, 4)}" for word, score in words)
    for topic_id, words in topics
}

# Number of documents whose dominant topic is each topic id.
docs_per_topic = pd.Series(topic_ids).value_counts()

# Build the table in one go from a list of records (no concat inside a loop).
topic_words_df = pd.DataFrame(
    [
        {
            'Topic_ID': topic_id,
            'Top_Words': top_words,
            'Document_Count': int(docs_per_topic.get(topic_id, 0)),
        }
        for topic_id, top_words in top_words_by_topic.items()
    ],
    columns=['Topic_ID', 'Top_Words', 'Document_Count'],
)

# Step 3: Add the topic columns to the original DataFrame.
topic_representative_words = [top_words_by_topic[topic_id] for topic_id in topic_ids]

df['Topic_ID'] = topic_ids
df['Topic_Representative_Words'] = topic_representative_words
df['Topic_Confidence'] = topic_confidences

In [None]:
# Save the per-topic LDA summary table to an Excel file, without the index column.
topic_words_df.to_excel('LDA_topic_words_df.xlsx', index=False)

In [None]:
# Save the tweet-level frame, which at this point carries the LDA-derived
# Topic_ID, Topic_Representative_Words and Topic_Confidence columns.
df.to_excel('LDA_labeled_df.xlsx', index=False)

In [None]:
def compute_topic_purity(frame, topic_col, label_col):
    """Compute the purity of every topic in ``frame``.

    Purity of a topic is the share of its rows that carry the topic's most
    frequent label: ``majority_label_count / topic_size``.

    Parameters
    ----------
    frame : pandas.DataFrame
        One row per document.
    topic_col : str
        Name of the column holding each document's assigned topic.
    label_col : str
        Name of the column holding each document's true label.

    Returns
    -------
    pandas.DataFrame
        One row per topic, in order of first appearance, with the columns
        ``Topic_ID``, ``Purity``, ``Majority_Label`` and ``Topic_Size``.
        Rows whose topic is missing (NaN) are skipped. A topic whose rows have
        no label at all gets ``Purity = NaN`` and ``Majority_Label = None``
        instead of raising.
    """
    rows = []
    # sort=False keeps topics in the order they first occur in the frame.
    for topic_id, topic_rows in frame.groupby(topic_col, sort=False):
        # value_counts() ignores missing labels.
        label_counts = topic_rows[label_col].value_counts()
        # The denominator is every row of the topic, labelled or not.
        topic_size = len(topic_rows)

        if label_counts.empty:
            majority_label = None
            purity = float('nan')
        else:
            majority_label = label_counts.idxmax()
            purity = label_counts.max() / topic_size

        rows.append({
            'Topic_ID': topic_id,
            'Purity': purity,
            'Majority_Label': majority_label,
            'Topic_Size': topic_size,
        })

    return pd.DataFrame(rows, columns=['Topic_ID', 'Purity', 'Majority_Label', 'Topic_Size'])


# Purity of the LDA topics, using the label column configured when the data was loaded.
purity_df = compute_topic_purity(df, topic_col='Topic_ID', label_col=label_column)


In [None]:
# Display the per-topic purity table for the LDA assignments.
purity_df

## CTM

In [None]:
# Step 1: Prepare Data for CTM

def _as_text(tweet):
    """Return one tweet as a plain string.

    Token lists (as produced for LDA) are joined back with single spaces, and
    non-text values (e.g. NaN) become an empty string. No row is dropped, so
    the documents stay aligned with the rows of ``df``.
    """
    if isinstance(tweet, str):
        return tweet
    if isinstance(tweet, (list, tuple)):
        return " ".join(tweet)
    return ""


# One string per document: the same text is used for the contextual embeddings
# and for the bag-of-words representation.
documents = [_as_text(tweet) for tweet in df[column_name]]

# Data preparation with the "myrkur/sentence-transformer-parsbert-fa" sentence
# transformer ("HooshvareLab/bert-fa-zwnj-base" was tried previously).
tp = TopicModelDataPreparation("myrkur/sentence-transformer-parsbert-fa")

# Prepare training data for CTM
training_dataset = tp.fit(text_for_contextual=documents, text_for_bow=documents)

# Step 2: Train the CTM Model

# Set the number of topics
NUM_TOPICS = 7

# NOTE(review): the notebook used `ctm_model` in the next cell without ever
# creating it, so a fresh-kernel run failed with NameError. The original
# training call is not visible; apart from the topic count, the library's
# default hyperparameters are used here -- confirm against the run that
# produced the saved results.
ctm_model = CTM(
    bow_size=len(tp.vocab),
    # Embedding width is read from the prepared data instead of being hard-coded.
    contextual_size=training_dataset.X_contextual.shape[1],
    n_components=NUM_TOPICS,
)
ctm_model.fit(training_dataset)

In [None]:
# Document-topic matrix from the trained CTM; it is indexed below as
# [document, topic].
topic_distributions = ctm_model.get_doc_topic_distribution(training_dataset)

# The dominant topic of a document is the column with the largest value in its row.
dominant_topic_ids = topic_distributions.argmax(axis=1)
df['Topic_ID'] = dominant_topic_ids

# Topic-word matrix (one row per topic) and the vocabulary its columns refer to.
topic_word_distribution = ctm_model.get_topic_word_distribution()
vocab = tp.vocab


def _describe_topic(word_weights):
    """Format the ten highest-weighted vocabulary entries of one topic as 'word: weight'."""
    # argsort is ascending: take the last ten and flip them so the strongest comes first.
    strongest_first = word_weights.argsort()[-10:][::-1]
    return ", ".join([f"{vocab[j]}: {round(word_weights[j], 4)}" for j in strongest_first])


# topic index -> human-readable description of the topic
topic_representative_words = {
    topic_index: _describe_topic(word_weights)
    for topic_index, word_weights in enumerate(topic_word_distribution)
}

# Each row gets the description of its dominant topic.
df['Topic_Representative_Words'] = df['Topic_ID'].map(topic_representative_words)

# Confidence = the value the document assigns to its own dominant topic.
df['Topic_Confidence'] = [
    doc_mixture[best_topic]
    for doc_mixture, best_topic in zip(topic_distributions, dominant_topic_ids)
]


# Step 4: Calculate Purity for Each Topic

def _purity_record(topic_id, topic_df):
    """Summarise one topic: its size, majority label, and majority share (purity)."""
    label_counts = topic_df[label_column].value_counts()
    topic_size = len(topic_df)
    return {
        'Topic_ID': topic_id,
        'Purity': label_counts.max() / topic_size,
        'Majority_Label': label_counts.idxmax(),
        'Topic_Size': topic_size,
    }


# One record per topic, in the order the topics first appear in df.
purity_data = [
    _purity_record(topic_id, df[df['Topic_ID'] == topic_id])
    for topic_id in df['Topic_ID'].unique()
]

# Collect the per-topic records into a DataFrame.
purity_df = pd.DataFrame(purity_data)

In [None]:
# Display the per-topic purity table for the CTM assignments.
purity_df

In [None]:
# Save the CTM purity table to an Excel file, without the index column.
purity_df.to_excel('CTM_purity_df.xlsx', index=False)

In [None]:
# Save the tweet-level frame; its Topic_ID, Topic_Representative_Words and
# Topic_Confidence columns now hold the CTM values (they replaced the LDA ones).
df.to_excel('CTM_labeled_df.xlsx', index=False)

## BERTopic

In [None]:
# Sentence-embedding model used to encode the tweets.
embedding_model = SentenceTransformer("myrkur/sentence-transformer-parsbert-fa")

# KMeans is passed in BERTopic's clustering slot. The cluster count reuses
# NUM_TOPICS so that all three models are compared on the same number of
# topics, and the seed makes the clustering repeatable across runs.
cluster_model = KMeans(n_clusters=NUM_TOPICS, random_state=42)


def _as_document(tweet):
    """Return one tweet as a string, which is what BERTopic expects.

    Token lists are joined with single spaces. Any other non-string value goes
    through ``str()``, which matches how the later merge cell casts the text
    column with ``astype(str)``.
    """
    if isinstance(tweet, str):
        return tweet
    if isinstance(tweet, (list, tuple)):
        return " ".join(tweet)
    return str(tweet)


# One document per row of df, in row order.
docs = [_as_document(tweet) for tweet in df[column_name]]

# NOTE(review): calculate_probabilities is documented for HDBSCAN; with KMeans
# as the cluster model `probs` is presumably None -- confirm before relying on it.
topic_model = BERTopic(embedding_model=embedding_model, hdbscan_model=cluster_model, calculate_probabilities=True)
topics, probs = topic_model.fit_transform(docs)

# Per-topic summary table.
info_df = topic_model.get_topic_info()

In [None]:
# Save BERTopic's topic-info table to an Excel file, without the index column.
info_df.to_excel('BERTopic_kmeans_info_df.xlsx', index=False)

In [None]:
# Display BERTopic's topic-info table.
info_df

In [None]:
# Per-document BERTopic output for `docs`; later cells read its 'Document' and
# 'Topic' columns.
document_info = topic_model.get_document_info(docs)
document_info

In [None]:
# Save the per-document BERTopic table to an Excel file, without the index column.
document_info.to_excel('BERTopic_kmeans_document_df.xlsx', index=False)

In [None]:
# Attach BERTopic's per-document output to the labelled tweets BY ROW POSITION.
#
# `docs` was built from df in row order and `document_info` was produced from
# `docs`, so row i of both frames describes the same tweet. Joining on the
# tweet text instead would multiply rows whenever a text occurs more than once
# (k identical tweets -> k*k merged rows), inflating topic sizes and skewing
# the purity computed afterwards.
if len(document_info) != len(df):
    raise ValueError(
        f"document_info has {len(document_info)} rows but df has {len(df)}; "
        "re-run the BERTopic cells on the current df before merging."
    )

merged_df = pd.merge(
    df.reset_index(drop=True),
    document_info.reset_index(drop=True),
    left_index=True,
    right_index=True,
    how='left',
    # Fail loudly if the join is ever not strictly row-to-row.
    validate='one_to_one',
)
merged_df

In [None]:
# Purity of each BERTopic topic: the share of the topic's rows that carry the
# topic's most frequent label.
purity_data = []

# groupby skips rows whose 'Topic' is missing, and sort=False keeps the topics
# in the order they first appear in merged_df.
for topic_id, topic_df in merged_df.groupby('Topic', sort=False):

    # Label frequencies inside this topic (missing labels are not counted).
    label_counts = topic_df[label_column].value_counts()
    # The denominator is every row of the topic, labelled or not.
    topic_size = len(topic_df)

    if label_counts.empty:
        # No labelled row in this topic: report it instead of crashing in idxmax().
        majority_label = None
        purity = float('nan')
    else:
        majority_label = label_counts.idxmax()
        purity = label_counts.max() / topic_size

    purity_data.append({
        'Topic_ID': topic_id,
        'Purity': purity,
        'Majority_Label': majority_label,
        'Topic_Size': topic_size
    })

# Collect the per-topic records; the explicit columns keep the schema stable
# even when there are no topics.
purity_df = pd.DataFrame(purity_data, columns=['Topic_ID', 'Purity', 'Majority_Label', 'Topic_Size'])

# Display the purity DataFrame
print("Purity DataFrame:")
purity_df