In [1]:
# Import required libraries
import os
import warnings
from collections import Counter

import joblib
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
from scipy.sparse import vstack
from sklearn.cluster import KMeans
from sklearn.decomposition import PCA
from sklearn.manifold import TSNE
from sklearn.metrics import silhouette_score, adjusted_rand_score, normalized_mutual_info_score
# Silence every warning category for the rest of the session.
# NOTE(review): this is a blanket filter, so library warnings raised by later
# cells will not be displayed either.
warnings.simplefilter('ignore')

# Plot appearance: matplotlib's default style sheet, seaborn's "husl" colour
# cycle, and a 12x8 default figure size.
plt.style.use('default')
sns.set_palette("husl")
plt.rcParams.update({'figure.figsize': (12, 8)})

In [None]:
# Load preprocessed data
# Both artifacts are read with joblib, which is pickle-based: unpickling can
# execute arbitrary code, so only load files written by a trusted
# preprocessing run.
artifacts_dir = '../artifacts'

print("Loading preprocessed data for clustering analysis...")
data = joblib.load(os.path.join(artifacts_dir, 'preprocessed_data.joblib'))
vectorizer = joblib.load(os.path.join(artifacts_dir, 'tfidf_vectorizer.joblib'))

# Extract data components - the per-split names are kept so they remain
# available to any cell that wants an individual split.
X_train, X_val, X_test = data['X_train'], data['X_val'], data['X_test']
y_train, y_val, y_test = data['y_train'], data['y_val'], data['y_test']
feature_names = data['feature_names']
class_names = data['class_names']

# Combine all data for clustering (unsupervised learning).
# The output format is pinned to CSR so the stacked matrix supports row
# indexing regardless of the storage format of the individual splits.
X_all = vstack([X_train, X_val, X_test], format='csr')

# Labels are stacked in the same train/val/test order as the feature rows.
# NOTE(review): pd.concat requires the label splits to be pandas objects
# (Series/DataFrame) - confirm against the preprocessing notebook.
y_all = pd.concat([y_train, y_val, y_test], ignore_index=True)

# Row i of X_all must correspond to label i of y_all; fail loudly here rather
# than let a misaligned artifact produce meaningless evaluation scores.
assert X_all.shape[0] == len(y_all), (
    f"Feature/label row mismatch: {X_all.shape[0]} feature rows vs {len(y_all)} labels"
)

print(f"Combined dataset for clustering: {X_all.shape}")
print(f"Total documents: {len(y_all)}")
print(f"Feature dimensions: {X_all.shape[1]}")
print(f"True labels available for evaluation: {len(class_names)} classes")