Load the data first. Make sure the preprocessing code has been run in R and that the CSV file "python_data.csv" has been uploaded for use in Python.


In [None]:
!pip install bitermplus

import bitermplus as btm
import numpy as np
import pandas as pd
import itertools
from tabulate import tabulate

# Importing data: the first line of the CSV is skipped and the data column is
# labelled 'texts'.
df = pd.read_csv(
    'python_data.csv', skiprows=[0], names=['texts'])

# pandas parses empty cells (and tokens such as "NA" or "null") as NaN, and a
# NaN entry is not a valid document for the vectorizer used below. Missing
# values are replaced by empty strings instead of being dropped, so that the
# position of every document still matches its row in the CSV. astype(str)
# guards against cells that were parsed as numbers.
texts = df['texts'].fillna('').astype(str).str.strip().tolist()

# Creating the biterms
# Word frequencies and vocabulary for the whole corpus
X, vocabulary, vocab_dict = btm.get_words_freqs(texts)
# Documents encoded against the vocabulary
docs_vec = btm.get_vectorized_docs(texts, vocabulary)
# Biterms built from the vectorized documents; these are what the model is
# fitted on
biterms = btm.get_biterms(docs_vec)

We use the entropy score to optimize the number of topics K.

In [None]:
# Candidate numbers of topics K to compare
K_values = [10, 15, 20, 25, 30, 35]

# For every candidate K, train five separate BTM models and report the
# average entropy of their topic-word matrices. The fit is repeated because
# each run produces a different model.
for K in K_values:
    entropies = []
    for _ in range(5):
        model = btm.BTM(X, vocabulary, T=K, M=20)
        model.fit(biterms, iterations=1000)
        entropies.append(btm.entropy(model.matrix_topics_words_))

    # Average over the five runs for this K
    mean_entropy = np.asarray(entropies).mean()
    print(f"K={K}, Mean Entropy: {mean_entropy}")

Next, we tune the alpha and beta parameters of the BTM topic model by comparing coherence scores. For each combination of alpha and beta, the coherence score is measured five times and the mean is reported, because every fit produces a different model.

The optimal number of topics K, as determined above, has to be set in the cell below.




In [None]:
# Number of topics K. Set this to the optimum found in the entropy comparison
# above; it is defined once here instead of being buried in the model call.
N_TOPICS = 15

# Number of independent fits per (alpha, beta) pair. The fits are repeated
# because each run produces a different model; their scores are averaged.
N_RUNS = 5

# Define the parameter grid
param_grid = {
    'alpha': [0.01, 0.05, 0.1, 0.5, 3.3],
    'beta': [0.01, 0.05, 0.1, 0.5],
}

# One row per parameter combination: [alpha, beta, mean coherence]
table_data = []

# Perform grid search. The two axes are looked up by name so that the
# (alpha, beta) unpacking cannot be silently swapped if the keys of
# param_grid are reordered or extended.
for alpha, beta in itertools.product(param_grid['alpha'], param_grid['beta']):
    coherence_scores = []

    # Run the model and compute the coherence score N_RUNS times
    for _ in range(N_RUNS):
        model = btm.BTM(
            X, vocabulary, T=N_TOPICS, M=20, alpha=alpha, beta=beta)
        model.fit(biterms, iterations=1000)

        # Compute the coherence and reduce it to a single score for this fit
        coherence = btm.coherence(model.matrix_topics_words_, X, M=20)
        coherence_scores.append(coherence.mean())

    # Mean coherence over the repeated fits (np.mean, as in the entropy cell)
    mean_coherence = np.mean(coherence_scores)

    # Append parameter values and mean coherence score to the table data
    table_data.append([alpha, beta, mean_coherence])

# Print the mean coherence scores in a table
headers = ['Alpha', 'Beta', 'Mean Coherence']
print(tabulate(table_data, headers=headers, floatfmt=".4f"))