Load the data first. Make sure the R preprocessing code has been run and that the resulting CSV file "python_data.csv" has been uploaded so that it can be read from Python.

In [None]:
!pip install bitermplus

import bitermplus as btm
import numpy as np
import pandas as pd
import itertools
from tabulate import tabulate

# Load the preprocessed corpus. The first row of the file is skipped and the
# data is exposed as a single column named 'texts'.
df = pd.read_csv('python_data.csv', names=['texts'], skiprows=[0])

# Strip surrounding whitespace from every document and keep them as a plain list.
stripped_texts = df['texts'].str.strip()
texts = stripped_texts.tolist()

# Build the inputs for the biterm topic model:
#   1. word frequencies plus the vocabulary derived from the raw texts,
#   2. each document vectorized against that vocabulary,
#   3. the biterms generated from the vectorized documents.
words_freqs = btm.get_words_freqs(texts)
X, vocabulary, vocab_dict = words_freqs
docs_vec = btm.get_vectorized_docs(texts, vocabulary)
biterms = btm.get_biterms(docs_vec)

Running the BTM model with the optimized parameters

In [None]:
# Hyperparameters chosen for the final model, kept together so they are easy to tune.
# T is presumably the number of topics (the later cell tabulates topics 0-14)
# and M the number of top words used for coherence; confirm against the bitermplus docs.
btm_params = {
    'seed': 920,
    'T': 15,
    'M': 20,
    'alpha': 0.1,
    'beta': 0.01,
}

model = btm.BTM(X, vocabulary, **btm_params)

# Fit the model on the biterms.
model.fit(biterms, iterations=1000)

Print the top 10 words for each topic of the fitted model.

In [None]:
!pip install tmplot
!pip install tomotopy

import tomotopy
import tmplot as tmp

# Run the fitted model over the vectorized documents. The result is kept as
# `p_zd` but is not used further in this cell.
p_zd = model.transform(docs_vec)
phi = tmp.get_phi(model)

# Topics to tabulate (ids 0-14) and how many terms to show per topic.
selected_topics = list(range(15))
top_n = 10

# Collect the top terms of every selected topic first and build the table once
# at the end, instead of growing a DataFrame column by column.
top_terms_by_topic = {}

for topic in selected_topics:
    terms_probs = tmp.calc_terms_probs_ratio(phi, topic=topic, lambda_=0.6)

    # Deliberately NOT named `df`: that name holds the corpus loaded in the
    # first cell and must not be overwritten by a per-topic temporary.
    topic_terms = pd.DataFrame(terms_probs)

    # Keep only the conditional term probability rows and rank them, highest first.
    selected_rows = topic_terms[topic_terms['Type'] == 'Conditional term probability, p(w|t)']
    selected_rows_sorted = selected_rows.sort_values('Probability', ascending=False)

    # One column per topic, holding that topic's top terms.
    top_terms_by_topic[f'Topic {topic}'] = selected_rows_sorted['Terms'].head(top_n).values

# DataFrame with the top words for the selected topics (one column per topic).
df_selected_topics = pd.DataFrame(top_terms_by_topic)

# Print the DataFrame with the top words for the selected topics
print(df_selected_topics)

Visualization of the model:

In [None]:
# Build tmplot's report for the fitted model, passing the raw texts as the documents.
# As the last expression of the cell, the returned object is what the notebook displays.
# NOTE(review): presumably relies on model.transform() having been run in the
# earlier cell - confirm against the tmplot documentation.
tmp.report(model=model, docs=texts)



Calculating the coherence score based on the top 5, 10 and 20 words.

In [None]:
# Summarise topic coherence using the top 5, 10 and 20 words.
# For each setting two lines are printed: the mean of the coherence scores,
# then their 2.5th and 97.5th percentiles.
# After the loop, `coherence`, `mean_coherence` and `ci` hold the values for M=20.
for top_words in (5, 10, 20):
    coherence = btm.coherence(model.matrix_topics_words_, X, M=top_words)
    mean_coherence = np.mean(coherence)
    # Note: `ci` is the empirical 2.5-97.5 percentile range of the scores in
    # `coherence`, not a confidence interval for the mean.
    ci = np.percentile(coherence, [2.5, 97.5])
    print(mean_coherence)
    print(ci)
