In [2]:
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
import spacy

# Load the clustered rows and discard any that lack a text or a cluster label.
df = pd.read_csv("../clustering/intermediate_data/clustered_embeddings.csv")
df = df.dropna(subset=["text", "cluster"])

# Collapse each cluster into a single space-joined document:
# one row per cluster, columns "cluster" and "combined_text".
grouped = (
    df.groupby("cluster")["text"]
    .apply(" ".join)
    .reset_index(name="combined_text")
)

# spaCy pipeline used by textrank_keywords() for noun-chunk extraction.
nlp = spacy.load("en_core_web_sm")

def textrank_keywords(text, top_n=5):
    """Return up to ``top_n`` keywords for ``text``, best first.

    Despite the name, no TextRank graph is built: the text is parsed with
    the module-level spaCy pipeline ``nlp``, its noun chunks are treated as
    individual documents, and every term is scored by the sum of its TF-IDF
    weights across those chunks (English stop words are excluded).

    Args:
        text: The text to extract keywords from.
        top_n: Maximum number of keywords to return.

    Returns:
        A list of at most ``top_n`` terms ordered by descending score. The
        list is empty when the text yields no noun chunks or when every
        chunk consists solely of stop words.
    """
    doc = nlp(text)
    candidates = [chunk.text for chunk in doc.noun_chunks]
    if not candidates:
        # Nothing to vectorize; fit_transform would raise on an empty corpus.
        return []

    vectorizer = TfidfVectorizer(stop_words='english')
    try:
        X = vectorizer.fit_transform(candidates)
    except ValueError:
        # scikit-learn raises ValueError ("empty vocabulary") when no term
        # survives tokenization/stop-word removal; report "no keywords"
        # instead of aborting the whole per-cluster loop.
        return []

    # Column sums give one aggregate TF-IDF score per vocabulary term.
    scores = X.sum(axis=0).A1
    # Fetch the vocabulary once rather than once per selected index.
    feature_names = vectorizer.get_feature_names_out()
    top_indices = scores.argsort()[::-1][:top_n]
    keywords = [feature_names[i] for i in top_indices]
    return keywords

# Report the top keywords for every cluster, one labelled section each.
for cluster_id, cluster_text in zip(grouped["cluster"], grouped["combined_text"]):
    print(f"\nCluster {cluster_id}")
    print("TextRank:", textrank_keywords(cluster_text))


Cluster -1
TextRank: ['edge', 'vertex', 'graph', 'edges', 'vertices']

Cluster 0
TextRank: ['person', 'hands', 'people', 'handshakes', 'number']

Cluster 1
TextRank: ['size', 'neighbors', 'set', 'subset', 'union']

Cluster 2
TextRank: ['matching', 'vertices', 'graph', 'perfect', 'maximum']

Cluster 3
TextRank: ['lecture', 'graphs', 'notion', 'graph', 'music']

Cluster 4
TextRank: ['lecture', 'mathematics', 'theory', 'mooc', 'number']

Cluster 5
TextRank: ['banks', 'island', 'bridge', 'islands', 'bank']

Cluster 6
TextRank: ['graph', 'people', 'edge', 'person', 'friends']

Cluster 7
TextRank: ['interval', 'point', 'distance', 'right', 'left']

Cluster 8
TextRank: ['graph', 'bipartite', 'vertices', 'edge', 'edges']

Cluster 9
TextRank: ['neighbors', 'neighbor', 'course', 'amp', 'node']

Cluster 10
TextRank: ['vertices', 'order', 'stack', 'ordering', 'vertex']

Cluster 11
TextRank: ['level', 'neighbors', 'vertices', 'vertex', 'common']

Cluster 12
TextRank: ['ones', 'contradiction', 'for