In [1]:
# Import required libraries
import os
import warnings
from collections import Counter

import joblib
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
from scipy.sparse import vstack
from sklearn.cluster import KMeans
from sklearn.decomposition import PCA
from sklearn.manifold import TSNE
from sklearn.metrics import silhouette_score, adjusted_rand_score, normalized_mutual_info_score
# NOTE(review): this mutes every warning category for the whole kernel session,
# including any deprecation or convergence notices the libraries may raise.
warnings.filterwarnings('ignore')

# Plot appearance: stock matplotlib style, seaborn "husl" colour cycle,
# and a 12x8 inch default canvas for every figure created afterwards.
plt.style.use('default')
sns.set_palette(palette="husl")
plt.rcParams.update({'figure.figsize': (12, 8)})

In [5]:
# Load the preprocessed splits and the fitted TF-IDF vectorizer from disk
artifacts_dir = '../artifacts'

print("Loading preprocessed data for clustering analysis...")
# NOTE: joblib.load unpickles its input, which can execute arbitrary code.
# Only point artifacts_dir at files produced by a trusted pipeline run.
data = joblib.load(os.path.join(artifacts_dir, 'preprocessed_data.joblib'))
vectorizer = joblib.load(os.path.join(artifacts_dir, 'tfidf_vectorizer.joblib'))

# Pull the individual splits and metadata out of the loaded mapping
X_train = data['X_train']
X_val = data['X_val']
X_test = data['X_test']
y_train = data['y_train']
y_val = data['y_val']
y_test = data['y_test']
feature_names = data['feature_names']
class_names = data['class_names']

# Clustering is unsupervised, so the train/val/test separation is not needed:
# stack the feature matrices row-wise and concatenate the labels in the same
# order so row i of X_all corresponds to y_all[i]. Labels are kept only for
# evaluating the clusters afterwards.
# format='csr' makes the result's sparse format explicit (row-sliceable)
# instead of depending on the formats of the input blocks.
X_all = vstack([X_train, X_val, X_test], format='csr')
y_all = pd.concat([y_train, y_val, y_test], ignore_index=True)

# Guard against misaligned artifacts before any metric is computed on them
assert X_all.shape[0] == len(y_all), (
    f"Feature/label row mismatch: {X_all.shape[0]} rows vs {len(y_all)} labels"
)

print(f"Combined dataset for clustering: {X_all.shape}")
print(f"Total documents: {len(y_all)}")
print(f"Feature dimensions: {X_all.shape[1]}")
print(f"True labels available for evaluation: {len(class_names)} classes")

Loading preprocessed data for clustering analysis...
Combined dataset for clustering: (2464, 5000)
Total documents: 2464
Feature dimensions: 5000
True labels available for evaluation: 13 classes


In [6]:
# Sweep candidate cluster counts and record two model-selection signals per k:
# the k-means inertia (for an elbow plot) and the mean silhouette coefficient.
print("Analyzing optimal number of clusters...")

# Candidate k values: 2 through 20 inclusive
k_range = range(2, 21)
inertias, silhouette_scores = [], []

print("Computing clustering metrics for different k values...")
for k in k_range:
    # Progress prefix; the score printed below lands on the same output line
    print(f"Testing k={k}...", end=' ')

    # random_state is pinned and each fit uses 10 initialisations (n_init=10)
    kmeans = KMeans(n_clusters=k, random_state=42, n_init=10).fit(X_all)
    cluster_labels = kmeans.labels_

    # Score the partition, then store both metrics in k_range order
    silhouette_avg = silhouette_score(X_all, cluster_labels)
    silhouette_scores.append(silhouette_avg)
    inertias.append(kmeans.inertia_)

    print(f"Silhouette: {silhouette_avg:.3f}")

print("\nCluster analysis completed!")

Analyzing optimal number of clusters...
Computing clustering metrics for different k values...
Testing k=2... Silhouette: 0.022
Testing k=3... Silhouette: 0.024
Testing k=4... Silhouette: 0.027
Testing k=5... Silhouette: 0.028
Testing k=6... Silhouette: 0.031
Testing k=7... Silhouette: 0.034
Testing k=8... Silhouette: 0.027
Testing k=9... Silhouette: 0.033
Testing k=10... Silhouette: 0.036
Testing k=11... Silhouette: 0.034
Testing k=12... Silhouette: 0.033
Testing k=13... Silhouette: 0.039
Testing k=14... Silhouette: 0.039
Testing k=15... Silhouette: 0.040
Testing k=16... Silhouette: 0.041
Testing k=17... Silhouette: 0.043
Testing k=18... Silhouette: 0.043
Testing k=19... Silhouette: 0.045
Testing k=20... Silhouette: 0.047

Cluster analysis completed!
