In [None]:
import pandas as pd
from top2vec import Top2Vec


# Load the clustered-embeddings table and drop rows that have no text,
# since Top2Vec can only be trained on actual documents.
df = pd.read_csv("../clustering/intermediate_data/clustered_embeddings.csv")
df = df[df["text"].notnull()]

texts = df["text"].tolist()
print(f"Training Top2Vec on {len(texts)} documents...")

# Train the topic model on the raw documents.
model = Top2Vec(texts, speed="learn", workers=4)

# Per-topic word lists, the matching word scores, and the topic numbers.
topic_words, word_scores, topic_nums = model.get_topics()

# Upper bound on the number of representative documents exported per topic.
n_docs_per_topic = 3

# search_documents_by_topic raises ValueError when asked for more documents
# than a topic contains, so look up every topic's size up front and clamp
# the request instead of aborting the whole export on one small topic.
topic_sizes, sized_topic_nums = model.get_topic_sizes()
docs_available = dict(zip(sized_topic_nums, topic_sizes))

csv_data = []

for words, scores, topic_num in zip(topic_words, word_scores, topic_nums):
    # Plain int() so a NumPy integer never reaches the library's validation.
    num_docs = int(min(n_docs_per_topic, docs_available.get(topic_num, n_docs_per_topic)))

    # Only the document texts are exported; document scores and ids are discarded.
    docs, _, _ = model.search_documents_by_topic(topic_num=topic_num, num_docs=num_docs)

    csv_data.append({
        "topic_num": topic_num,
        "top_words": ", ".join(words),
        "word_scores": ", ".join([f"{s:.4f}" for s in scores]),
        "representative_docs": " ||| ".join(docs),
    })

# One row per topic; the filename is kept in a single place so the written
# file and the confirmation message cannot drift apart.
output_path = "top2vec_clustered_topics.csv"
output_df = pd.DataFrame(csv_data)
output_df.to_csv(output_path, index=False)

print(f"Topics and representative documents saved to {output_path}")
