In [1]:
import numpy as np
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
from sklearn.decomposition import TruncatedSVD, NMF
from sklearn.cluster import KMeans
from sklearn.decomposition import LatentDirichletAllocation
import nltk
from nltk.corpus import stopwords

import matplotlib.pyplot as plt
stop_words = stopwords.words('english')

import warnings

# Suppress the specific deprecation warning
warnings.filterwarnings("ignore", category=DeprecationWarning)

In [2]:
from sklearn.datasets import fetch_20newsgroups

In [3]:
# Fetch the dataset
newsgroups = fetch_20newsgroups(subset='all', categories=['rec.sport.hockey', 'sci.space', 'comp.graphics'], remove=('headers', 'footers', 'quotes'))
documents = newsgroups.data

# Display a sample
print("Sample document:\n", documents[0])

Sample document:
 
    Yes.  There are many methods of rendering, raytracing is one of them.
You didn't say what you mean by rendering, so I won't guess.  Methods of
rendering include:

        o  Pencil and graph paper, doing the math by hand

        o  Wireframe rendering of the 2D projection

        o  Hidden line rendering

        o  Scanline rendering using:
            - Painter's algorithm.
            - BSP trees.
            - Z buffer
            - Other

        o  Raytracing

        o  Radiosity

        o  Holographic projection to film

        o  Combination of any of the above


In [4]:
def preprocess(documents):
    processed_docs = []
    for doc in documents:
        tokens = doc.lower().split()
        filtered_tokens = [token for token in tokens if token not in stop_words and token.isalpha()]
        processed_docs.append(" ".join(filtered_tokens))
    return processed_docs

processed_docs = preprocess(documents)
print("Sample preprocessed document:\n", processed_docs[0])

Sample preprocessed document:
 many methods raytracing one say mean methods rendering pencil graph math hand wireframe rendering projection hidden line rendering scanline rendering bsp z buffer raytracing radiosity holographic projection film combination


In [5]:
tfidf_vectorizer = TfidfVectorizer(max_features=1000)  # Limiting to 1000 features for simplicity
tfidf_matrix = tfidf_vectorizer.fit_transform(processed_docs)

print("TF-IDF Matrix shape:", tfidf_matrix.shape)

TF-IDF Matrix shape: (2959, 1000)


In [6]:
num_clusters = 3
km_model = KMeans(n_clusters=num_clusters, random_state=42)
km_model.fit(tfidf_matrix)

clusters = km_model.labels_.tolist()
print("Sample Cluster Assignments:", clusters[:10])

Sample Cluster Assignments: [0, 2, 2, 0, 0, 1, 0, 1, 1, 0]


In [7]:
from collections import defaultdict

# Group documents by their assigned cluster
clustered_docs = defaultdict(list)
for idx, cluster in enumerate(clusters):
    clustered_docs[cluster].append(processed_docs[idx])

# Display four sample documents for each cluster
num_samples = 4  # Number of documents to display per cluster
for cluster_id, docs in clustered_docs.items():
    print(f"\nCluster {cluster_id} (Topic {cluster_id + 1}):")
    print(f"Number of documents in this cluster: {len(docs)}")
    print(f"Displaying {min(num_samples, len(docs))} sample documents:\n")

    for i in range(min(num_samples, len(docs))):
        print(f"Sample document {i + 1}:\n{''.join(docs[i][:100])}...\n")  # Displaying the first 100 words


Cluster 0 (Topic 1):
Number of documents in this cluster: 1577
Displaying 4 sample documents:

Sample document 1:
many methods raytracing one say mean methods rendering pencil graph math hand wireframe rendering pr...

Sample document 2:
good reason many companies use bits pixel use double buffering animating write one entire screen upd...

Sample document 3:
working system uses given set key frame positions control imaginary camera using splines described s...

Sample document 4:
...


Cluster 2 (Topic 3):
Number of documents in this cluster: 754
Displaying 4 sample documents:

Sample document 1:
borrowed version book wealth pages information electronic artists organizations around globe email m...

Sample document 2:
posted may distribute program freely use fee author responsible damage caused important changes sinc...

Sample document 3:
looking source orbital element sets believe one please let know possible sources reach thanks...

Sample document 4:
hi recently found xv subdirec

In [18]:
lsa_model = TruncatedSVD(n_components=3, random_state=42)
lsa_topic_matrix = lsa_model.fit_transform(tfidf_matrix)
lsa_topic_matrix.shape

(2959, 3)

In [34]:
num_samples = 4

for topic_idx in range(lsa_model.n_components):
    topic_column = lsa_topic_matrix[:, topic_idx]
    top_doc_indices = np.argsort(topic_column)[::-1][:num_samples]
    print(top_doc_indices, "space", topic_column)    

[1189 1935 1670  870] space [0.0578136  0.13448193 0.29303069 ... 0.13180603 0.26704499 0.06709812]
[ 870 1670  310 1480] space [0.00207832 0.10956224 0.2794804  ... 0.05678892 0.14614462 0.05643416]
[2816 1868 1714 1826] space [-0.00366969 -0.01670209 -0.09512956 ... -0.07766955  0.09103031
 -0.02987059]
