In [1]:
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from gensim import corpora, models
import spacy

# Load the clustered embeddings, keeping only rows that carry both a text and
# a cluster label.
df = pd.read_csv(
    "../clustering/intermediate_data/clustered_embeddings.csv"
).dropna(subset=["text", "cluster"])

# Build one space-joined document per cluster: columns "cluster" and
# "combined_text".
grouped = (
    df.groupby("cluster")["text"]
    .apply(" ".join)
    .rename("combined_text")
    .reset_index()
)

# spaCy pipeline used by apply_lda for tokenization and stop-word filtering.
nlp = spacy.load("en_core_web_sm")

def apply_lda(texts, num_topics=1, num_words=5, random_state=None):
    """Fit an LDA model on ``texts`` and return the top words of each topic.

    Every text is tokenized with the module-level spaCy pipeline ``nlp``;
    only alphabetic, non-stop-word tokens are kept, lower-cased.

    Args:
        texts: Iterable of strings; each string is treated as one document.
        num_topics: Number of topics to fit.
        num_words: Number of words requested per topic.
        random_state: Optional seed forwarded to ``LdaModel``. The default
            (``None``) leaves the model unseeded, as before, so repeated
            runs may yield different words; pass an int for reproducible
            output.

    Returns:
        A list with ``num_topics`` entries, each a list of the words that
        ``show_topic`` reports for that topic. Every entry is an empty list
        when no token survives the filtering.
    """
    tokenized = [
        [
            token.text.lower()
            for token in nlp(text)
            if token.is_alpha and not token.is_stop
        ]
        for text in texts
    ]
    dictionary = corpora.Dictionary(tokenized)
    if len(dictionary) == 0:
        # LdaModel refuses to train on an empty vocabulary (ValueError);
        # report "no words" instead of aborting the caller's loop.
        return [[] for _ in range(num_topics)]

    corpus = [dictionary.doc2bow(doc_tokens) for doc_tokens in tokenized]
    lda_model = models.LdaModel(
        corpus,
        num_topics=num_topics,
        id2word=dictionary,
        passes=10,
        random_state=random_state,
    )
    return [
        [word for word, _ in lda_model.show_topic(topic_id, topn=num_words)]
        for topic_id in range(num_topics)
    ]

# Print the top LDA words of each cluster. Iterating the two columns directly
# avoids building a Series per row (as iterrows does) and drops the unused
# row index.
for cluster_id, combined_text in zip(grouped["cluster"], grouped["combined_text"]):
    print(f"\nCluster {cluster_id}")
    # Each cluster is passed as a single document; with the default
    # num_topics=1 the only topic is at index 0.
    print("LDA:", apply_lda([combined_text])[0])


Cluster -1
LDA: ['b', 'x', 'graph', 'okay', 'equal']

Cluster 0
LDA: ['person', 'hands', 'people', 'number', 'shook']

Cluster 1
LDA: ['hope', 'lecture', 'thank', 'class', 'modules']

Cluster 2
LDA: ['banks', 'island', 'bridge', 'islands', 'bank']

Cluster 3
LDA: ['s', 'size', 'neighbors', 'n', 'set']

Cluster 4
LDA: ['music', 'learn', 'lecture', 'graphs', 'graph']

Cluster 5
LDA: ['lecture', 'mathematics', 'theory', 'music', 'welcome']

Cluster 6
LDA: ['n', 'number', 'nt', 'edge', 'divide']

Cluster 7
LDA: ['node', 'leaf', 'neighbor', 'nodes', 'look']

Cluster 8
LDA: ['x', 'matching', 'perfect', 'prime', 's']

Cluster 9
LDA: ['matching', 'maximum', 'maximal', 'vertex', 'edges']

Cluster 10
LDA: ['matching', 'vertices', 'graph', 'seven', 'number']

Cluster 11
LDA: ['graph', 's', 'people', 'constructed', 'person']

Cluster 12
LDA: ['b', 'interval', 'zero', 'distance', 'point']

Cluster 13
LDA: ['prime', 'fraction', 'b', 'new', 'x']

Cluster 14
LDA: ['fraction', 'enumeration', 'second',