### k-means Clustering on SVD Dataset

In [None]:
from sklearn.cluster import KMeans
from sklearn.decomposition import TruncatedSVD  # For dimensionality reduction, if needed
from sklearn.feature_extraction.text import TfidfVectorizer

In [None]:
# Build a 3-cluster k-means estimator (k-means++ seeding, one initialisation,
# at most 100 iterations), report its configuration, and fit it on X.
# For very large datasets MiniBatchKMeans can be swapped in instead; it must
# be imported from sklearn.cluster first:
# km = MiniBatchKMeans(n_clusters=3, init='k-means++', n_init=1, init_size=1000, batch_size=1000)
km = KMeans(
    n_clusters=3,
    init='k-means++',
    n_init=1,
    max_iter=100,
)
print(f"Clustering sparse data with {km}")
km.fit(X)

In [None]:
# Rank the feature indices of each centroid from highest to lowest weight
# (argsort is ascending, so every row is reversed afterwards).
#
# If X was reduced with TruncatedSVD, the centroids are in the reduced space
# and have to be mapped back through the fitted SVD object first, e.g.:
#   svd = TruncatedSVD(n_components=100)  # example value, adjust as needed
#   original_space_centroids = svd.inverse_transform(km.cluster_centers_)
#   order_centroids = original_space_centroids.argsort()[:, ::-1]
# Without SVD the centroids can be ranked directly:
order_centroids = km.cluster_centers_.argsort(axis=-1)[:, ::-1]

In [None]:
# Get feature names; the cluster report indexes this array with the feature
# indices stored in order_centroids.
# NOTE(review): `vectorizer` is not defined in this section - presumably the
# fitted TfidfVectorizer that produced X; confirm.
terms = vectorizer.get_feature_names_out()

In [None]:
# Print the ten highest-ranked terms of every cluster, one cluster per line.
# The loop walks the rows of order_centroids (one row per centroid) rather
# than a hard-coded cluster count, so it stays correct if n_clusters changes.
print("Top terms per cluster:")
for i, ranked_features in enumerate(order_centroids):
    # Each term keeps its leading space so the output format is unchanged.
    top_terms = ''.join(f' {terms[ind]}' for ind in ranked_features[:10])
    print(f"Cluster {i}: {top_terms}")

### Hierarchical Clustering on SVD Dataset

In [None]:
import numpy as np
import matplotlib.pyplot as plt
from sklearn.metrics.pairwise import cosine_similarity
from scipy.cluster.hierarchy import ward, dendrogram
from scipy.spatial.distance import squareform

In [None]:
# Turn the pairwise cosine similarity of the samples in X into a distance
# matrix: distance = 1 - similarity.
dist = np.subtract(1, cosine_similarity(X))

In [None]:
# Convert the square distance matrix to the condensed (vector) form that
# scipy's linkage functions take.
# checks=False skips squareform's exact symmetry / zero-diagonal validation:
# 1 - cosine_similarity(X) commonly leaves floating-point noise on the
# diagonal, which would otherwise raise a ValueError. The diagonal is not part
# of the condensed form, so skipping the check does not change the result.
dist = squareform(dist, checks=False)

In [None]:
# Perform hierarchical clustering using the Ward method on `dist`; the result
# is the linkage matrix consumed by dendrogram().
# NOTE(review): scipy documents Ward linkage as correctly defined only for
# Euclidean distances, whereas `dist` is derived from cosine similarity -
# confirm this is intended (or consider an 'average'/'complete' linkage).
linkage_matrix = ward(dist)

In [None]:
# Draw the dendrogram on a 10x8-inch figure. With orientation='right' the
# x-axis is labelled as the merge distance and the y-axis as the sample index.
fig, ax = plt.subplots(figsize=(10, 8))
dendrogram(linkage_matrix, ax=ax, orientation='right')
ax.set_title('Hierarchical Clustering Dendrogram')
ax.set_xlabel('Distance')
ax.set_ylabel('Sample index')
fig.tight_layout()  # adjust padding so titles and labels fit the figure
plt.show()