# 03 Topic Clustering

In [5]:
# 03_topic_clustering.ipynb



import pandas as pd
import nltk
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.decomposition import LatentDirichletAllocation
import matplotlib.pyplot as plt

# Fetch the NLTK resources used below: the Punkt sentence-tokenizer data that
# word_tokenize relies on, and the stopword lists. Recent NLTK releases (3.9+)
# load Punkt from the "punkt_tab" package instead of the pickled "punkt" one,
# so both are requested to keep the notebook working across versions.
for resource in ("punkt", "punkt_tab", "stopwords"):
    nltk.download(resource)
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize

# Read the raw arXiv paper export (path is relative to the notebook's folder)
csv_path = "../data/raw/arxiv_papers.csv"
df = pd.read_csv(csv_path)

# Preprocess summaries
stop_words = set(stopwords.words("english"))


def preprocess(text: object) -> str:
    """Normalize one summary into a space-separated string of content words.

    The text is lower-cased and tokenized with NLTK's ``word_tokenize``; only
    purely alphabetic tokens that are not in ``stop_words`` are kept.

    Args:
        text: Raw summary text. Non-string values (for example the NaN that
            pandas produces for an empty CSV cell) are treated as an empty
            summary.

    Returns:
        The retained tokens joined by single spaces, or ``""`` when ``text``
        is not a string or no token survives the filter.
    """
    # A missing CSV cell arrives as float NaN, which has no .lower(); map it to
    # an empty document instead of letting one bad row abort the whole .apply().
    if not isinstance(text, str):
        return ""
    tokens = word_tokenize(text.lower())
    return " ".join(
        word for word in tokens if word.isalpha() and word not in stop_words
    )


df["processed_summary"] = df["summary"].apply(preprocess)

# Build the bag-of-words document-term matrix. Terms appearing in more than
# 90% of summaries or in fewer than 5 summaries are dropped, together with
# scikit-learn's built-in English stopword list.
vectorizer = CountVectorizer(
    stop_words="english",
    min_df=5,
    max_df=0.9,
)
doc_term_matrix = vectorizer.fit_transform(df["processed_summary"])

# Fit a 5-topic LDA model on the term counts; the fixed seed keeps runs
# repeatable. fit() returns the estimator itself, so `lda` is the fitted model.
lda = LatentDirichletAllocation(
    n_components=5,
    random_state=42,
).fit(doc_term_matrix)

# Report every topic as its ten highest-weighted vocabulary terms
words = vectorizer.get_feature_names_out()
for idx, topic_weights in enumerate(lda.components_):
    # argsort is ascending, so reverse it and keep the leading ten indices
    top_indices = topic_weights.argsort()[::-1][:10]
    top_terms = (words[i] for i in top_indices)
    print(f"\n🧠 Topic #{idx + 1}:")
    print(" ".join(top_terms))


[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\Aabharan\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\Aabharan\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!



🧠 Topic #1:
models model learning data using dataset segmentation performance training datasets

🧠 Topic #2:
generative learning data model image models neural propose methods framework

🧠 Topic #3:
network learning spatial model training data models energy computational time

🧠 Topic #4:
methods metrics information performance different framework propose domain propagation tasks

🧠 Topic #5:
learning models network modeling model approach networks analysis layers challenges
