In [3]:
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from gensim import corpora, models
import spacy

# Load the clustered rows and keep only those that have both a text and a
# cluster value.
df = pd.read_csv("../clustering/intermediate_data/clustered_embeddings.csv")
df = df.dropna(subset=["text", "cluster"])

# One row per cluster: all of that cluster's texts joined with single spaces.
grouped = (
    df.groupby("cluster")["text"]
    .agg(" ".join)
    .reset_index()
    .rename(columns={"text": "combined_text"})
)

# spaCy pipeline used by apply_lda for tokenisation and stop-word filtering.
nlp = spacy.load("en_core_web_sm")

def apply_lda(texts, num_topics=1, num_words=5, *, passes=10, random_state=None):
    """Fit an LDA model on *texts* and return the top words of each topic.

    Each text is tokenised with the module-level spaCy pipeline ``nlp``;
    only alphabetic, non-stop-word tokens are kept, lower-cased.

    Args:
        texts: Iterable of raw document strings.
        num_topics: Number of LDA topics to fit.
        num_words: Number of top words to return per topic.
        passes: Number of training passes over the corpus (forwarded to
            ``LdaModel``).
        random_state: Seed forwarded to ``LdaModel``. Pass an int to make the
            resulting topics reproducible; ``None`` keeps gensim's default.

    Returns:
        A list with ``num_topics`` entries, each a list of up to ``num_words``
        words. If no usable tokens remain after filtering, every entry is an
        empty list.
    """
    # nlp.pipe batches the documents instead of invoking the pipeline once
    # per text.
    tokenized = [
        [token.text.lower() for token in doc if token.is_alpha and not token.is_stop]
        for doc in nlp.pipe(texts)
    ]
    dictionary = corpora.Dictionary(tokenized)

    # LdaModel refuses to train on an empty vocabulary; report "no topic
    # words" instead of crashing the whole per-cluster loop.
    if len(dictionary) == 0:
        return [[] for _ in range(num_topics)]

    corpus = [dictionary.doc2bow(tokens) for tokens in tokenized]
    lda_model = models.LdaModel(
        corpus,
        num_topics=num_topics,
        id2word=dictionary,
        passes=passes,
        random_state=random_state,
    )
    # show_topic yields (word, probability) pairs; only the words are returned.
    return [
        [word for word, _prob in lda_model.show_topic(topic_id, topn=num_words)]
        for topic_id in range(num_topics)
    ]

# Print the top LDA words of the single topic fitted on each cluster's text.
for cluster_id, cluster_text in zip(grouped["cluster"], grouped["combined_text"]):
    topic_words = apply_lda([cluster_text])[0]
    print(f"\nCluster {cluster_id}")
    print("LDA:", topic_words)


Cluster -1
LDA: ['b', 'x', 'graph', 'equal', 'edge']

Cluster 0
LDA: ['person', 'hands', 'people', 'number', 'shook']

Cluster 1
LDA: ['s', 'size', 'neighbors', 'n', 'set']

Cluster 2
LDA: ['matching', 'graph', 'vertices', 'perfect', 'x']

Cluster 3
LDA: ['music', 'lecture', 'learn', 'graphs', 'graph']

Cluster 4
LDA: ['lecture', 'mathematics', 'theory', 'music', 'welcome']

Cluster 5
LDA: ['banks', 'island', 'islands', 'bank', 'bridge']

Cluster 6
LDA: ['graph', 's', 'person', 'people', 'vertex']

Cluster 7
LDA: ['b', 'interval', 'point', 'zero', 'distance']

Cluster 8
LDA: ['graph', 'bipartite', 's', 'vertices', 'y']

Cluster 9
LDA: ['neighbors', 'v', 'minus', 'n', 'neighbor']

Cluster 10
LDA: ['vertices', 'order', 'v', 'lets', 'vertex']

Cluster 11
LDA: ['level', 'neighbors', 'common', 'ancestor', 'look']

Cluster 12
LDA: ['okay', 'going', 'appear', 'ones', 'positive']

Cluster 13
LDA: ['numbers', 'thousand', 'belongs', 'number', 'e']

Cluster 14
LDA: ['bfs', 'vertex', 'search', 'g