In [None]:
import json

import bertopic
import gensim
import numpy as np
import pandas as pd

In [None]:
# Directory that holds the notebook inputs read below (data.parquet, vocabulary.json).
data_directory = "data/"

In [None]:
# Read the dataset and convert it to a list with one dict per row.
data = (
    pd.read_parquet(f"{data_directory}/data.parquet")
    .to_dict(orient="records")
)

In [None]:
# Load the vocabulary mapping from JSON. The encoding is pinned to UTF-8 so
# non-ASCII tokens decode the same way on every platform instead of depending
# on the locale's default encoding.
with open(f"{data_directory}/vocabulary.json", encoding="utf-8") as f:
    vocabulary = json.load(f)

# LDA

In [None]:
# LDA run
# Sweep settings: topic counts to try and number of fits per topic count.
num_topicss = [50, 100, 150, 200, 250, 300]
num_repeats = 5

# Stack every record's "bow" vector into one array and wrap it as a gensim corpus.
X_bow = gensim.matutils.Scipy2Corpus(np.array([record["bow"] for record in data]))

# Invert the vocabulary (token -> entry carrying an "idx" field) into
# idx -> token, which is what gets passed to LdaModel as id2word.
idx2vocab = {entry["idx"]: token for token, entry in vocabulary.items()}


def run_lda(num_topics, corpus=None):
    """Fit an LDA model and return the words of every topic.

    Args:
        num_topics: Number of topics to fit.
        corpus: Optional gensim-style corpus to train on. When omitted, the
            notebook-level bag-of-words corpus ``X_bow`` is used, so existing
            ``run_lda(num_topics)`` calls behave exactly as before.

    Returns:
        A list with one entry per topic; each entry is the list of that
        topic's words as reported by ``show_topics`` (weights dropped).
    """
    if corpus is None:
        # Looked up at call time so the default always tracks the current X_bow.
        corpus = X_bow
    lda = gensim.models.LdaModel(corpus, num_topics=num_topics, id2word=idx2vocab)
    # formatted=False yields (topic_id, [(word, weight), ...]) pairs rather than strings.
    topic_probs = lda.show_topics(num_topics=num_topics, formatted=False)
    return [[word for word, _ in word_weights] for _, word_weights in topic_probs]

# Sweep the topic counts; each count is fitted `num_repeats` times and every
# fit is appended as its own JSON line, so finished runs are already on disk
# if a later fit fails.
for num_topics in num_topicss:
    print(f"num_topics={num_topics}")
    for i in range(num_repeats):
        print(f"repeat={i}")
        topics = run_lda(num_topics)
        output = dict(model="lda", num_topics=num_topics, topics=topics)
        with open(f"ignore/output.jsonl", "a") as f:
            print(json.dumps(output), file=f)

# TF-IDF counterpart of the bag-of-words corpus, built from each record's "tfidf" vector.
X_tfidf = gensim.matutils.Scipy2Corpus(np.array([record["tfidf"] for record in data]))

def run_lda_tfidf(num_topics):
    """Fit LDA on the notebook-level ``X_tfidf`` corpus and return each topic's words.

    Args:
        num_topics: Number of topics to fit.

    Returns:
        One list of words per topic, taken from ``show_topics`` with the
        weights discarded.
    """
    model = gensim.models.LdaModel(X_tfidf, num_topics=num_topics, id2word=idx2vocab)
    topics = []
    # formatted=False gives (topic_id, [(word, weight), ...]) pairs.
    for _, word_weights in model.show_topics(num_topics=num_topics, formatted=False):
        topics.append([word for word, _ in word_weights])
    return topics


# Sweep the topic counts for LDA trained on the TF-IDF corpus; every fit is
# appended as one JSON line.
# NOTE(review): this sweep appends to ignore/output_lda.jsonl while the
# bag-of-words sweep writes ignore/output.jsonl — confirm the split is intended.
for num_topics in num_topicss:
    print(f"num_topics={num_topics}")
    for i in range(num_repeats):
        print(f"repeat={i}")
        # Must be run_lda_tfidf: calling run_lda here would refit the
        # bag-of-words model and store it under the "lda_tf_idf" label.
        topics = run_lda_tfidf(num_topics)
        output = {
            "model": "lda_tf_idf",
            "num_topics": num_topics,
            "topics": topics,
        }
        with open("ignore/output_lda.jsonl", "a") as f:
            f.write(json.dumps(output) + "\n")

# BERTopic

In [None]:
# Pull the embedding vectors and the raw documents out of the records; both
# are read by fit_bertopic as notebook-level globals.
embeddings = np.array([record["embedding"] for record in data])
texts = [record["text"] for record in data]

In [None]:
def fit_bertopic(num_topics):
    """Fit a multilingual BERTopic model on the notebook-level texts and embeddings.

    Args:
        num_topics: Forwarded to BERTopic as ``nr_topics``.

    Returns:
        The ``Representation`` column of ``get_topic_info()`` converted to a list.
    """
    topic_model = bertopic.BERTopic(nr_topics=num_topics, language="multilingual")
    # The embeddings are supplied alongside the texts; the fit's return value is not used.
    topic_model.fit_transform(texts, embeddings)
    # NOTE(review): get_topic_info() may include BERTopic's outlier topic (-1) —
    # confirm whether that row should be part of the returned topics.
    topic_info = topic_model.get_topic_info()
    return topic_info["Representation"].tolist()


# Sweep settings for BERTopic (same values as the LDA sweep).
num_repeats = 5
num_topicss = [50, 100, 150, 200, 250, 300]

# Fit BERTopic `num_repeats` times per topic count, appending one JSON line per fit.
# NOTE(review): BERTopic results are appended to ignore/output_lda.jsonl, the
# same file as the TF-IDF LDA sweep — confirm this file name is intended.
for num_topics in num_topicss:
    print(f"num_topics={num_topics}")
    for i in range(num_repeats):
        print(f"repeat={i}")
        topics = fit_bertopic(num_topics)
        output = dict(model="bertopic", num_topics=num_topics, topics=topics)
        with open(f"ignore/output_lda.jsonl", "a") as f:
            f.write(f"{json.dumps(output)}\n")

num_topics=50
repeat=0
repeat=1
repeat=2
repeat=3
repeat=4
num_topics=100
repeat=0
repeat=1
repeat=2
repeat=3
repeat=4
num_topics=150
repeat=0
repeat=1
repeat=2
repeat=3
repeat=4
num_topics=200
repeat=0
repeat=1
repeat=2
repeat=3
repeat=4
num_topics=250
repeat=0
repeat=1
repeat=2
repeat=3
repeat=4
num_topics=300
repeat=0
repeat=1
repeat=2
repeat=3
repeat=4


# CTM

In [None]:
import json

import numpy as np
import pandas as pd

import octis.models.contextualized_topic_models.datasets.dataset
import octis.models.contextualized_topic_models.models.ctm


def fit_ctm(data, dataset, num_topics, num_epochs, vocabulary):
    """Train an OCTIS CTM on ``dataset`` and return its topics.

    Args:
        data: Sequence of records; only ``data[0]["embedding"]`` is read, and
            its length becomes the model's ``bert_input_size``.
        dataset: Training dataset handed to ``model.fit``.
        num_topics: Number of topics for the model.
        num_epochs: Number of training epochs.
        vocabulary: Sized container; its length becomes ``input_size``.

    Returns:
        Whatever ``model.get_topics()`` returns after fitting.
    """
    ctm_module = octis.models.contextualized_topic_models.models.ctm
    vocab_size = len(vocabulary)
    embedding_dim = len(data[0]["embedding"])

    model = ctm_module.CTM(
        input_size=vocab_size,
        bert_input_size=embedding_dim,
        num_topics=num_topics,
        num_epochs=num_epochs,
    )
    model.fit(dataset)
    return model.get_topics()


def run(features="tfidf"):
    """Sweep CTM over the topic counts and append every result as a JSON line.

    Reads the notebook-level ``data`` and ``vocabulary`` and appends to
    ``ignore/output.jsonl``.

    Args:
        features: Which per-record vector is used as the dataset's ``X``
            input: ``"tfidf"`` (default, results labelled ``"ctm_tfidf"``) or
            ``"bow"`` (results labelled ``"ctm"``). This replaces switching
            variants by commenting code in and out.

    Raises:
        ValueError: If ``features`` is not one of the supported keys.
    """
    # Record key -> label stored in the "model" field of each output line.
    model_names = {"bow": "ctm", "tfidf": "ctm_tfidf"}
    if features not in model_names:
        raise ValueError(
            f"features must be one of {sorted(model_names)}, got {features!r}"
        )

    dataset = octis.models.contextualized_topic_models.datasets.dataset.CTMDataset(
        X=np.array([d[features] for d in data]),
        X_bert=np.array([d["embedding"] for d in data]),
        idx2token={entry["idx"]: word for word, entry in vocabulary.items()},
    )

    num_topicss = [50, 100, 150, 200, 250, 300]
    num_epochs = 100
    num_repeats = 5

    for num_topics in num_topicss:
        for _ in range(num_repeats):
            topics = fit_ctm(data, dataset, num_topics, num_epochs, vocabulary)
            output = {
                "model": model_names[features],
                "num_topics": num_topics,
                "topics": topics,
            }
            # Append after every fit so completed runs survive a later failure.
            with open("ignore/output.jsonl", "a") as f:
                f.write(json.dumps(output) + "\n")

# Execute the CTM sweep defined above.
run()