# Clustering Algorithms for Topic Modeling

### Objective
Apply LDA and K-Means to group similar documents and discover topics from text data.

### 1. Import Libraries

In [None]:

import numpy as np
import pandas as pd
from sklearn.datasets import fetch_20newsgroups
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.decomposition import LatentDirichletAllocation
from sklearn.cluster import KMeans
import nltk
import re
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
# Fetch the NLTK data packages this notebook relies on.
# 'stopwords': the corpus read via stopwords.words('english') during preprocessing.
nltk.download('stopwords')
# 'wordnet': the lexical database used by WordNetLemmatizer.
nltk.download('wordnet')
# 'omw-1.4': Open Multilingual Wordnet data; presumably required by the WordNet
# reader on some NLTK versions -- TODO confirm it is still needed.
nltk.download('omw-1.4')


### 2. Load and Preprocess Dataset

In [None]:

# Load the 20 Newsgroups corpus (train and test splits together), asking
# scikit-learn to strip headers, footers and quoted replies from each post.
newsgroups = fetch_20newsgroups(
    subset='all',
    remove=('headers', 'footers', 'quotes'),
)

# Work on the first 1000 posts only, to keep the notebook fast.
documents = newsgroups.data[:1000]

# Preprocessing function
# Cache for the NLTK resources used by preprocess(). It is filled on first use
# so the stop-word corpus is read once, not once per token.
_PREPROCESS_CACHE = {}


def _get_preprocess_resources():
    """Return the cached ``(stop_words, lemmatizer)`` pair, building it on first use."""
    if not _PREPROCESS_CACHE:
        # A frozenset gives O(1) membership tests; the list returned by
        # stopwords.words() would be scanned linearly for every token.
        _PREPROCESS_CACHE['stop_words'] = frozenset(stopwords.words('english'))
        _PREPROCESS_CACHE['lemmatizer'] = WordNetLemmatizer()
    return _PREPROCESS_CACHE['stop_words'], _PREPROCESS_CACHE['lemmatizer']


def preprocess(text):
    """Normalize a raw document into a space-separated string of lemmas.

    Steps, in order: lowercase the text, replace every run of non-word
    characters with a single space, split on whitespace, drop English
    stop words, and lemmatize the remaining tokens.

    Args:
        text: The raw document as a string.

    Returns:
        The cleaned tokens joined by single spaces. The result is an empty
        string when no token survives.
    """
    stop_words, lemmatizer = _get_preprocess_resources()
    text = text.lower()
    text = re.sub(r'\W+', ' ', text)
    # Stop words are removed before lemmatizing, so the filter sees the
    # surface form of each token.
    tokens = [w for w in text.split() if w not in stop_words]
    return ' '.join(lemmatizer.lemmatize(w) for w in tokens)

# Clean every document in order; the raw texts are replaced by their
# preprocessed versions.
documents = list(map(preprocess, documents))


### 3. LDA Topic Modeling

In [None]:

# Build a bag-of-words count matrix for LDA. Terms that appear in more than
# 90% of the documents (max_df) or in fewer than 2 documents (min_df) are
# dropped from the vocabulary.
vectorizer_lda = CountVectorizer(max_df=0.9, min_df=2, stop_words='english')
doc_term_matrix = vectorizer_lda.fit_transform(documents)

# Fit a 5-topic model; random_state makes the run reproducible.
lda = LatentDirichletAllocation(n_components=5, random_state=42)
lda.fit(doc_term_matrix)

# Look the vocabulary up once instead of rebuilding it for every printed word.
lda_terms = vectorizer_lda.get_feature_names_out()

# Display topics
for idx, topic in enumerate(lda.components_, start=1):
    print(f"\nTopic #{idx}:")
    # argsort() is ascending, so take the last ten indices and reverse them:
    # the highest-weighted word is printed first, matching the ordering used
    # for the K-Means cluster keywords.
    top_indices = topic.argsort()[-10:][::-1]
    print([lda_terms[i] for i in top_indices])


### 4. K-Means Clustering

In [None]:

# Represent each document as a TF-IDF vector for K-Means.
vectorizer_kmeans = TfidfVectorizer(stop_words='english')
X = vectorizer_kmeans.fit_transform(documents)

# n_init is set explicitly because its default differs between scikit-learn
# releases (10 in older versions, 'auto' from 1.4). Pinning it keeps results
# comparable across versions and avoids the FutureWarning that 1.2/1.3 emit
# when it is left unset.
kmeans = KMeans(n_clusters=5, n_init=10, random_state=42)
kmeans.fit(X)

# For every centroid, vocabulary indices sorted from highest to lowest weight.
order_centroids = kmeans.cluster_centers_.argsort()[:, ::-1]
terms = vectorizer_kmeans.get_feature_names_out()

# Display cluster keywords
# Iterate over the centroid rows rather than a hard-coded range so the loop
# always matches the number of clusters that was actually fitted.
for cluster_no, ranked_indices in enumerate(order_centroids, start=1):
    print(f"\nCluster #{cluster_no}:")
    print([terms[ind] for ind in ranked_indices[:10]])


### 5. Conclusion


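- **LDA** models each document as a mixture of topics and summarizes each topic by its highest-weighted words, which makes the topics easy to interpret.

- **K-Means** assigns each document to a single cluster based on the similarity of its TF-IDF feature vector.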

- Both techniques are valuable for understanding large text corpora.
