In [16]:
import os
import gc
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import plotly.express as px
import plotly.graph_objects as go
from plotly.subplots import make_subplots

import torch
from transformers import AutoTokenizer, AutoModel
from sklearn.cluster import KMeans
from sklearn.decomposition import PCA
from sklearn.metrics import silhouette_score
from sklearn.metrics.pairwise import cosine_similarity

In [17]:
# Configuration: the embedding back-ends to compare.
# Each key is the label passed around as `embedder_name` (printed in reports and
# plot titles). Each value names the loader ("type") and the checkpoint ("model").
# "trust_remote_code" is forwarded to the Hugging Face loader when present.
EMBEDDERS = {
    "BAAI/bge-large-en": {
        "type": "huggingface",
        "model": "BAAI/bge-large-en",
    },
    "all-MiniLM-L6-v2": {
        "type": "sentence-transformers",
        "model": "all-MiniLM-L6-v2",
    },
    "nomic-embed-text-v2-moe": {
        "type": "huggingface",
        "model": "nomic-ai/nomic-embed-text-v2-moe",
        "trust_remote_code": True,
    },
}

In [None]:
class DocumentClusterAnalyzer:
    def __init__(self, data_dir=None, use_gpu=True):
        self.data_dir = data_dir
        self.documents = []
        self.filenames = []
        self.results = {}
        self.current_embedder = None
        self.current_model = None
        self.current_tokenizer = None
        self.device = self.get_device(use_gpu)
        print(f"Using device: {self.device}")

    def get_device(self, use_gpu=True):
        """Determine the best device to use"""
        if use_gpu and torch.cuda.is_available():
            return torch.device("cuda")
        elif use_gpu and hasattr(torch.backends, 'mps') and torch.backends.mps.is_available():
            return torch.device("mps")  # For Apple Silicon
        else:
            return torch.device("cpu")

    def load_documents(self):
        """Load all text files from the specified directory"""
        self.documents = []
        self.filenames = []

        if not self.data_dir or not os.path.exists(self.data_dir):
            print(f"Directory {self.data_dir} does not exist.")
            return self.documents, self.filenames

        txt_files = [f for f in os.listdir(self.data_dir) if f.endswith('.txt')]
        if not txt_files:
            print(f"No .txt files found in {self.data_dir}")
            return self.documents, self.filenames

        for filename in txt_files:
            filepath = os.path.join(self.data_dir, filename)
            try:
                with open(filepath, 'r', encoding='utf-8', errors='ignore') as file:
                    content = file.read().strip()
                    if content:  # Only add non-empty files
                        self.documents.append(content)
                        self.filenames.append(filename)
            except Exception as e:
                print(f"Error reading {filename}: {str(e)}")

        print(f"Loaded {len(self.documents)} documents")
        return self.documents, self.filenames

    def cleanup_embedder(self):
        """Clean up the current embedder to free memory"""
        if self.current_model is not None:
            del self.current_model
            self.current_model = None

        if self.current_tokenizer is not None:
            del self.current_tokenizer
            self.current_tokenizer = None

        self.current_embedder = None

        # Force garbage collection
        gc.collect()
        if torch.cuda.is_available():
            torch.cuda.empty_cache()

    @staticmethod
    def mean_pooling(model_output, attention_mask):
        """Mean pooling for Hugging Face models"""
        token_embeddings = model_output.last_hidden_state
        input_mask_expanded = attention_mask.unsqueeze(-1).expand(token_embeddings.size()).float()
        return torch.sum(token_embeddings * input_mask_expanded, 1) / torch.clamp(input_mask_expanded.sum(1), min=1e-9)

    def get_embeddings_huggingface(self, texts, model_name, trust_remote_code=False):
        """Get embeddings using Hugging Face models with GPU support"""
        if not texts:
            return np.array([])

        # Clean up any previous model
        self.cleanup_embedder()

        # Load new model
        self.current_tokenizer = AutoTokenizer.from_pretrained(model_name)
        self.current_model = AutoModel.from_pretrained(
            model_name,
            trust_remote_code=trust_remote_code,
            torch_dtype=torch.float16 if self.device.type == "cuda" else torch.float32
        ).to(self.device)
        self.current_embedder = model_name

        # Process in batches to handle memory constraints
        batch_size = 16 if self.device.type == "cuda" else 8  # Larger batches on GPU
        embeddings = []

        for i in range(0, len(texts), batch_size):
            batch_texts = texts[i:i+batch_size]

            inputs = self.current_tokenizer(
                batch_texts,
                padding=True,
                truncation=True,
                return_tensors="pt",
                max_length=512
            ).to(self.device)

            with torch.no_grad():
                outputs = self.current_model(**inputs)

            # Move to CPU for numpy conversion
            batch_embeddings = self.mean_pooling(outputs, inputs['attention_mask']).cpu().numpy()
            embeddings.append(batch_embeddings)

        # Clean up after processing
        self.cleanup_embedder()

        return np.vstack(embeddings) if embeddings else np.array([])

    def get_embeddings_sentence_transformers(self, texts, model_name):
        """Get embeddings using SentenceTransformers with GPU support"""
        if not texts:
            return np.array([])

        # Clean up any previous model
        self.cleanup_embedder()

        # Import here to avoid unnecessary dependencies if not used
        from sentence_transformers import SentenceTransformer

        # Load new model with device support
        device = "cuda" if self.device.type == "cuda" else "cpu"
        self.current_model = SentenceTransformer(model_name, device=device)
        self.current_embedder = model_name

        # Process in batches
        batch_size = 64 if self.device.type == "cuda" else 32  # Larger batches on GPU
        embeddings = []

        for i in range(0, len(texts), batch_size):
            batch_texts = texts[i:i+batch_size]
            batch_embeddings = self.current_model.encode(
                batch_texts,
                batch_size=batch_size,
                convert_to_numpy=True,
                device=device
            )
            embeddings.append(batch_embeddings)

        # Clean up after processing
        self.cleanup_embedder()

        return np.vstack(embeddings) if embeddings else np.array([])

    def get_embeddings(self, embedder_name, embedder_config):
        """Get embeddings based on embedder type"""
        print(f"Generating embeddings using {embedder_name} on {self.device}...")

        if not self.documents:
            print("No documents to process")
            return np.array([])

        if embedder_config["type"] == "huggingface":
            trust_remote_code = embedder_config.get("trust_remote_code", True)
            return self.get_embeddings_huggingface(
                self.documents, embedder_config["model"], trust_remote_code
            )
        elif embedder_config["type"] == "sentence-transformers":
            return self.get_embeddings_sentence_transformers(self.documents, embedder_config["model"])
        else:
            raise ValueError(f"Unknown embedder type: {embedder_config['type']}")

    def find_optimal_clusters(self, embeddings, max_clusters=10):
        """Find optimal number of clusters using elbow method and silhouette analysis"""
        if len(embeddings) <= 2:
            return 1

        max_clusters = min(max_clusters, len(embeddings) - 1)
        wcss = []  # Within-cluster sum of squares
        silhouette_scores = []

        k_range = range(1, max_clusters + 1)

        for k in k_range:
            kmeans = KMeans(n_clusters=k, random_state=42, n_init=10)
            clusters = kmeans.fit_predict(embeddings)
            wcss.append(kmeans.inertia_)

            if k > 1:  # Silhouette score requires at least 2 clusters
                try:
                    silhouette_scores.append(silhouette_score(embeddings, clusters))
                except:
                    silhouette_scores.append(0)
            else:
                silhouette_scores.append(0)

        # Calculate the elbow point (using the knee point detection method)
        if len(wcss) > 2:
            deltas = np.diff(wcss)
            derivatives = np.diff(deltas)
            if len(derivatives) > 0:
                elbow_point = np.argmin(derivatives) + 2  # +2 because we started at k=1 and took two diffs
            else:
                elbow_point = 2
        else:
            elbow_point = 2

        # Use silhouette score to validate
        if len(silhouette_scores) > 1:
            optimal_k_silhouette = np.argmax(silhouette_scores[1:]) + 2  # +2 because we started at k=1
        else:
            optimal_k_silhouette = 1

        # Choose the best k based on both methods
        optimal_k = max(1, min(elbow_point, optimal_k_silhouette))

        # Plot the results
        fig, (ax1, ax2) = plt.subplots(1, 2, figsize=(15, 5))

        # Elbow method plot
        ax1.plot(k_range, wcss, 'bo-')
        ax1.set_xlabel('Number of clusters')
        ax1.set_ylabel('WCSS')
        ax1.set_title('Elbow Method')
        ax1.axvline(x=optimal_k, color='r', linestyle='--', label=f'Optimal k: {optimal_k}')
        ax1.legend()

        # Silhouette score plot
        ax2.plot(k_range, silhouette_scores, 'bo-')
        ax2.set_xlabel('Number of clusters')
        ax2.set_ylabel('Silhouette Score')
        ax2.set_title('Silhouette Analysis')
        ax2.axvline(x=optimal_k, color='r', linestyle='--', label=f'Optimal k: {optimal_k}')
        ax2.legend()

        plt.tight_layout()
        plt.show()

        return optimal_k

    def perform_clustering(self, embeddings, n_clusters):
        """Perform K-means clustering and return results"""
        kmeans = KMeans(n_clusters=n_clusters, random_state=42, n_init=10)
        clusters = kmeans.fit_predict(embeddings)

        # Reduce dimensionality for visualization
        pca = PCA(n_components=2)
        reduced_embeddings = pca.fit_transform(embeddings)

        return clusters, reduced_embeddings, pca.explained_variance_ratio_

    def analyze_embedder(self, embedder_name, embedder_config):
        """Complete analysis for a single embedder"""
        print(f"\n{'='*60}")
        print(f"Analyzing with {embedder_name}")
        print(f"{'='*60}")

        # Get embeddings
        try:
            embeddings = self.get_embeddings(embedder_name, embedder_config)
            if embeddings.size == 0:
                print("No embeddings generated")
                return None

            print(f"Embedding shape: {embeddings.shape}")

            if len(self.documents) <= 1:
                print("Not enough documents for clustering")
                return None

            # Find optimal cluster count
            optimal_clusters = self.find_optimal_clusters(embeddings)
            # optimal_clusters = 3
            print(f"Optimal number of clusters: {optimal_clusters}")

            # Perform clustering
            clusters, reduced_embeddings, variance_ratio = self.perform_clustering(
                embeddings, optimal_clusters)

            # Calculate cluster statistics
            cluster_assignments = {}
            for i, cluster_id in enumerate(clusters):
                if cluster_id not in cluster_assignments:
                    cluster_assignments[cluster_id] = []
                cluster_assignments[cluster_id].append({
                    'index': i,
                    'filename': self.filenames[i],
                    'embedding': reduced_embeddings[i]
                })

            # Calculate intra-cluster and inter-cluster similarities
            similarity_matrix = cosine_similarity(embeddings)
            intra_cluster_similarities = []
            inter_cluster_similarities = []

            for cluster_id, items in cluster_assignments.items():
                indices = [item['index'] for item in items]

                # Intra-cluster similarities
                if len(indices) > 1:
                    cluster_similarities = similarity_matrix[np.ix_(indices, indices)]
                    np.fill_diagonal(cluster_similarities, 0)  # Remove self-similarities
                    intra_sum = np.sum(cluster_similarities) / (len(indices) * (len(indices) - 1))
                    intra_cluster_similarities.append(intra_sum)

                # Inter-cluster similarities
                other_indices = [i for i in range(len(embeddings)) if i not in indices]
                if other_indices:
                    inter_similarities = similarity_matrix[np.ix_(indices, other_indices)]
                    inter_sum = np.mean(inter_similarities)
                    inter_cluster_similarities.append(inter_sum)

            avg_intra_similarity = np.mean(intra_cluster_similarities) if intra_cluster_similarities else 0
            avg_inter_similarity = np.mean(inter_cluster_similarities) if inter_cluster_similarities else 0

            # Create interactive visualization
            self.create_interactive_plot(
                reduced_embeddings, clusters, embedder_name,
                variance_ratio, optimal_clusters
            )

            # Store results
            result = {
                'embeddings': embeddings,
                'reduced_embeddings': reduced_embeddings,
                'clusters': clusters,
                'cluster_assignments': cluster_assignments,
                'variance_ratio': variance_ratio,
                'optimal_clusters': optimal_clusters,
                'avg_intra_similarity': avg_intra_similarity,
                'avg_inter_similarity': avg_inter_similarity
            }

            # Print cluster details
            self.print_cluster_details(cluster_assignments, embedder_name)

            return result

        except Exception as e:
            print(f"Error with {embedder_name}: {str(e)}")
            import traceback
            traceback.print_exc()
            # Ensure we clean up even if there's an error
            self.cleanup_embedder()
            return None

    def create_interactive_plot(self, reduced_embeddings, clusters, embedder_name,
                               variance_ratio, n_clusters):
        """Create an interactive plot using Plotly"""
        # Create DataFrame for plotting
        plot_df = pd.DataFrame({
            'x': reduced_embeddings[:, 0],
            'y': reduced_embeddings[:, 1],
            'cluster': [f'Cluster {c}' for c in clusters],
            'filename': self.filenames,
            'content': [doc[:100] + '...' if len(doc) > 100 else doc for doc in self.documents]
        })

        # Create the plot
        fig = px.scatter(
            plot_df, x='x', y='y', color='cluster',
            hover_data=['filename', 'content'],
            title=f'{embedder_name} - {n_clusters} Clusters '
                  f'(PCA: {variance_ratio[0]*100:.1f}% + {variance_ratio[1]*100:.1f}% variance explained)',
            labels={'x': f'PCA Component 1 ({variance_ratio[0]*100:.1f}%)',
                    'y': f'PCA Component 2 ({variance_ratio[1]*100:.1f}%)'}
        )

        # Update layout for better readability
        fig.update_traces(
            marker=dict(size=12, line=dict(width=1, color='DarkSlateGrey')),
            selector=dict(mode='markers')
        )

        fig.update_layout(
            width=800,
            height=600,
            hovermode='closest'
        )

        fig.show()

    def print_cluster_details(self, cluster_assignments, embedder_name):
        """Print detailed cluster information"""
        print(f"\nCluster assignments for {embedder_name}:")
        print("-" * 60)

        for cluster_id in sorted(cluster_assignments.keys()):
            items = cluster_assignments[cluster_id]
            print(f"Cluster {cluster_id}:")
            print(f"  Number of documents: {len(items)}")
            print("  Documents:")
            for item in items:
                print(f"    - {item['filename']} (Index: {item['index']})")
            print()

    def analyze_all_embedders(self):
        """Run analysis for all configured embedders"""
        self.results = {}

        for embedder_name, embedder_config in EMBEDDERS.items():
            result = self.analyze_embedder(embedder_name, embedder_config)
            if result:
                self.results[embedder_name] = result

        # Compare embedders if we have multiple results
        if len(self.results) > 1:
            self.compare_embedders()

    def compare_embedders(self):
        """Compare results across different embedders"""
        print("\n" + "="*60)
        print("COMPARISON OF EMBEDDERS")
        print("="*60)

        comparison_data = []

        for embedder_name, result in self.results.items():
            comparison_data.append({
                'Embedder': embedder_name,
                'Clusters': result['optimal_clusters'],
                'Avg Intra-Cluster Similarity': result['avg_intra_similarity'],
                'Avg Inter-Cluster Similarity': result['avg_inter_similarity'],
                'Separation Score': result['avg_intra_similarity'] - result['avg_inter_similarity']
            })

        # Create comparison DataFrame
        comparison_df = pd.DataFrame(comparison_data)
        print(comparison_df.to_string(index=False))

        # Visual comparison
        fig = make_subplots(
            rows=2, cols=2,
            subplot_titles=('Number of Clusters', 'Intra-Cluster Similarity',
                           'Inter-Cluster Similarity', 'Separation Score')
        )

        # Number of clusters
        fig.add_trace(
            go.Bar(x=comparison_df['Embedder'], y=comparison_df['Clusters'],
                  name='Clusters'),
            row=1, col=1
        )

        # Intra-cluster similarity
        fig.add_trace(
            go.Bar(x=comparison_df['Embedder'], y=comparison_df['Avg Intra-Cluster Similarity'],
                  name='Intra-Cluster'),
            row=1, col=2
        )

        # Inter-cluster similarity
        fig.add_trace(
            go.Bar(x=comparison_df['Embedder'], y=comparison_df['Avg Inter-Cluster Similarity'],
                  name='Inter-Cluster'),
            row=2, col=1
        )

        # Separation score
        fig.add_trace(
            go.Bar(x=comparison_df['Embedder'], y=comparison_df['Separation Score'],
                  name='Separation'),
            row=2, col=2
        )

        fig.update_layout(height=600, width=800, title_text="Embedder Comparison", showlegend=False)
        fig.show()

In [22]:
if __name__ == "__main__":
    # Folder holding the .txt corpus -- CHANGE THIS TO YOUR ACTUAL PATH.
    # The value below is a Google Drive location as seen from Google Colab;
    # mount the drive first with:
    #     from google.colab import drive
    #     drive.mount('/content/drive')
    # For local execution use something like "./data" instead.
    data_dir = "/content/drive/MyDrive/Data/TextTopo/TXT/"

    # Build the analyzer and read the corpus
    analyzer = DocumentClusterAnalyzer(data_dir, use_gpu=True)
    documents, filenames = analyzer.load_documents()

    if documents:
        # Run analysis for all embedders
        analyzer.analyze_all_embedders()
    else:
        print("No documents found. Please check the data directory path.")
        print("Current data directory:", data_dir)
        print("Make sure the directory exists and contains .txt files.")

Using device: cuda
Loaded 80 documents

Analyzing with BAAI/bge-large-en
Generating embeddings using BAAI/bge-large-en on cuda...
Embedding shape: (80, 1024)
Optimal number of clusters: 3



Cluster assignments for BAAI/bge-large-en:
------------------------------------------------------------
Cluster 0:
  Number of documents: 19
  Documents:
    - MHKY Passport MP ABD.txt (Index: 1)
    - MHKY Passport MP ABD Sp.txt (Index: 2)
    - MHIL MP Denial Notification.txt (Index: 6)
    - MHIL MP Denial Notification Sp.txt (Index: 7)
    - MHID MP Denial Notification.txt (Index: 8)
    - MHCA UM Marketplace.txt (Index: 17)
    - MHID MP Denial Notification Sp.txt (Index: 26)
    - MHFL MP Denial.txt (Index: 28)
    - MHOH Denial.txt (Index: 37)
    - MHNV MP Denial Letter.txt (Index: 38)
    - MHMI MP Pre Service Denial.txt (Index: 48)
    - MHMS Denial.txt (Index: 50)
    - MHWI MP Denial.txt (Index: 51)
    - MHWA MP Hospital EN.txt (Index: 57)
    - MHWI MP Denial SP.txt (Index: 65)
    - MHWA MP General EN.txt (Index: 69)
    - MHUT MP Denial.txt (Index: 70)
    - MHTX MP Adverse Detm EN SP.txt (Index: 72)
    - MHSC Denial.txt (Index: 76)

Cluster 1:
  Number of documents: 


Cluster assignments for all-MiniLM-L6-v2:
------------------------------------------------------------
Cluster 0:
  Number of documents: 22
  Documents:
    - MHKY MP Notice of Auth Sp.txt (Index: 5)
    - MHID MP Notice of Auth Sp.txt (Index: 9)
    - MHMI MP Approval Letter SP.txt (Index: 14)
    - MHCA MP Approval Letter SP.txt (Index: 18)
    - MHIL MP Notice of Auth Sp.txt (Index: 20)
    - MHFL MP Denial Sp.txt (Index: 22)
    - MHCA UM Marketplace Sp.txt (Index: 23)
    - MHFL MP Approval Letter SP.txt (Index: 24)
    - MHFL MP Notice of Auth Sp.txt (Index: 29)
    - MHNV MP Denial Letter ES.txt (Index: 30)
    - MHNV Admin Denial Letter ES.txt (Index: 32)
    - MHNM MP Extension Sp.txt (Index: 35)
    - MHTX MP Administrative Denial Sp.txt (Index: 39)
    - MHNV MP Approval Letter ES.txt (Index: 40)
    - MHNM Marketplace Denial Sp.txt (Index: 42)
    - MHNM Marketplace Admin Sp.txt (Index: 46)
    - MHNM MP Approval Letter SP.txt (Index: 47)
    - MHMI MP Pre Service Denial S


Install Nomic's megablocks fork for better speed: `pip install git+https://github.com/nomic-ai/megablocks.git`



Embedding shape: (80, 768)
Optimal number of clusters: 3



Cluster assignments for nomic-embed-text-v2-moe:
------------------------------------------------------------
Cluster 0:
  Number of documents: 33
  Documents:
    - MHFL MP Approval Letter.txt (Index: 0)
    - MHKY MP Notice of Authorization.txt (Index: 4)
    - MHKY MP Notice of Auth Sp.txt (Index: 5)
    - MHID MP Notice of Auth Sp.txt (Index: 9)
    - MHID MP Notice of Authorization.txt (Index: 10)
    - MHMI MP Notice of Auth Ab.txt (Index: 13)
    - MHMI MP Approval Letter SP.txt (Index: 14)
    - MHMI MP Approval Letter.txt (Index: 16)
    - MHCA MP Approval Letter SP.txt (Index: 18)
    - MHIL MP Notice of Authorization.txt (Index: 19)
    - MHIL MP Notice of Auth Sp.txt (Index: 20)
    - MHCA MP Approval Letter.txt (Index: 21)
    - MHFL MP Approval Letter SP.txt (Index: 24)
    - MHFL MP Notice of Auth Sp.txt (Index: 29)
    - MHNM MP Approval Letter.txt (Index: 33)
    - MHNV MP Approval Letter.txt (Index: 36)
    - MHNV MP Approval Letter ES.txt (Index: 40)
    - MHMS Noti