In [None]:
import os
import gc
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import plotly.express as px
import plotly.graph_objects as go
from plotly.subplots import make_subplots

import torch
from transformers import AutoTokenizer, AutoModel
from sklearn.cluster import KMeans, DBSCAN, AgglomerativeClustering
from sklearn.mixture import GaussianMixture
from sklearn.decomposition import PCA
from sklearn.manifold import TSNE
from sklearn.metrics import silhouette_score
from sklearn.metrics.pairwise import cosine_similarity
import umap.umap_ as umap

In [None]:
# Registry of the embedding models to compare.
# Each key is the display name used in printed output and plot titles.
# Each value describes how to load the model:
#   "type"              - which loader handles it ("huggingface" or "sentence-transformers")
#   "model"             - the model identifier handed to that loader
#   "trust_remote_code" - optional; forwarded to the Hugging Face model loader
EMBEDDERS = {
    "BAAI/bge-large-en": {
        "type": "huggingface",
        "model": "BAAI/bge-large-en",
    },
    "all-MiniLM-L6-v2": {
        "type": "sentence-transformers",
        "model": "all-MiniLM-L6-v2",
    },
    "nomic-embed-text-v2-moe": {
        "type": "huggingface",
        "model": "nomic-ai/nomic-embed-text-v2-moe",
        "trust_remote_code": True,
    },
}

In [None]:
def get_device(use_gpu=True):
    """Pick the torch device that models should run on.

    Args:
        use_gpu: When falsy, the CPU device is returned unconditionally.

    Returns:
        ``torch.device("cuda")`` if CUDA is available, otherwise
        ``torch.device("mps")`` if this torch build exposes an available MPS
        backend (Apple Silicon), otherwise ``torch.device("cpu")``.
    """
    if not use_gpu:
        return torch.device("cpu")

    if torch.cuda.is_available():
        return torch.device("cuda")

    # Older torch builds have no `torch.backends.mps` attribute at all.
    mps_ready = hasattr(torch.backends, 'mps') and torch.backends.mps.is_available()
    return torch.device("mps" if mps_ready else "cpu")

In [None]:
def load_documents(data_dir):
    """Load every non-empty ``.txt`` file found directly inside ``data_dir``.

    Files are read as UTF-8 with undecodable bytes dropped, and surrounding
    whitespace is stripped. Files that are empty after stripping are skipped.
    Files are processed in sorted filename order so that document indices
    (and therefore seeded clustering results) are reproducible across runs
    and file systems.

    Args:
        data_dir: Path of the directory to scan. Sub-directories are not
            searched.

    Returns:
        A ``(documents, filenames)`` tuple of two parallel lists: the text of
        each loaded file and its file name. Both lists are empty when
        ``data_dir`` is falsy, is not a directory, or holds no ``.txt`` files.
        Files that cannot be read are reported on stdout and skipped.
    """
    documents = []
    filenames = []

    # isdir (not exists): a path that points at a regular file would otherwise
    # make os.listdir raise NotADirectoryError below.
    if not data_dir or not os.path.isdir(data_dir):
        print(f"Directory {data_dir} does not exist.")
        return documents, filenames

    # os.listdir returns entries in arbitrary order; sort for determinism.
    txt_files = sorted(f for f in os.listdir(data_dir) if f.endswith('.txt'))
    if not txt_files:
        print(f"No .txt files found in {data_dir}")
        return documents, filenames

    for filename in txt_files:
        filepath = os.path.join(data_dir, filename)
        try:
            with open(filepath, 'r', encoding='utf-8', errors='ignore') as file:
                content = file.read().strip()
        except (OSError, UnicodeError) as e:
            # Best effort: one unreadable file must not abort the whole load.
            print(f"Error reading {filename}: {str(e)}")
            continue

        if content:  # Only keep non-empty files
            documents.append(content)
            filenames.append(filename)

    print(f"Loaded {len(documents)} documents")
    return documents, filenames

In [None]:
def mean_pooling(model_output, attention_mask):
    """Average token embeddings over the sequence axis, ignoring masked tokens.

    Args:
        model_output: Object exposing a ``last_hidden_state`` tensor.
        attention_mask: Tensor that is broadcast (after a trailing axis is
            added) to the shape of ``last_hidden_state``; positions with 0
            contribute nothing to the average.

    Returns:
        Tensor with the sequence axis (dim 1) reduced away.
    """
    hidden_states = model_output.last_hidden_state
    weights = attention_mask.unsqueeze(-1).expand(hidden_states.size()).float()

    weighted_sum = (hidden_states * weights).sum(dim=1)
    # Clamp so a fully masked row divides by a tiny number instead of zero.
    token_counts = weights.sum(dim=1).clamp(min=1e-9)
    return weighted_sum / token_counts

In [None]:
def get_embeddings_huggingface(texts, model_name, device, trust_remote_code=False):
    """Embed ``texts`` with a Hugging Face ``AutoModel`` and mean pooling.

    The tokenizer and model are loaded on every call, run batch by batch
    without gradient tracking, and released again before returning.

    Args:
        texts: List of strings to embed.
        model_name: Identifier passed to ``AutoTokenizer`` / ``AutoModel``.
        device: ``torch.device`` to run on. On CUDA the model is loaded in
            float16 and batches of 16 are used; otherwise float32 and 8.
        trust_remote_code: Forwarded to ``AutoModel.from_pretrained``.

    Returns:
        NumPy array with one row per text, or an empty array when ``texts``
        is empty.
    """
    if not texts:
        return np.array([])

    tokenizer = AutoTokenizer.from_pretrained(model_name)

    on_cuda = device.type == "cuda"
    model = AutoModel.from_pretrained(
        model_name,
        trust_remote_code=trust_remote_code,
        torch_dtype=torch.float16 if on_cuda else torch.float32
    )
    model = model.to(device)

    step = 16 if on_cuda else 8
    pooled_batches = []

    for start in range(0, len(texts), step):
        # Inputs longer than 512 tokens are truncated by the tokenizer.
        encoded = tokenizer(
            texts[start:start + step],
            padding=True,
            truncation=True,
            return_tensors="pt",
            max_length=512
        ).to(device)

        with torch.no_grad():
            model_out = model(**encoded)

        pooled = mean_pooling(model_out, encoded['attention_mask'])
        pooled_batches.append(pooled.cpu().numpy())

    # Release the model before the next embedder is loaded.
    del model, tokenizer
    gc.collect()
    if torch.cuda.is_available():
        torch.cuda.empty_cache()

    if not pooled_batches:
        return np.array([])
    return np.vstack(pooled_batches)

In [None]:
def get_embeddings_sentence_transformers(texts, model_name, device):
    """Embed ``texts`` with a SentenceTransformers model.

    The model is loaded on every call and released again before returning.

    Args:
        texts: List of strings to embed.
        model_name: Identifier passed to ``SentenceTransformer``.
        device: ``torch.device`` to run on. Its string form (e.g. ``"cuda"``,
            ``"mps"``, ``"cpu"``) is handed to SentenceTransformers, so the
            accelerator that was selected is actually used.

    Returns:
        NumPy array with one row per text, or an empty array when ``texts``
        is empty.
    """
    if not texts:
        return np.array([])

    # Imported lazily so the notebook can be used without this package when
    # no sentence-transformers embedder is configured.
    from sentence_transformers import SentenceTransformer

    # Previously every non-CUDA device was mapped to "cpu", which silently
    # ignored an MPS device on Apple Silicon.
    st_device = str(device)
    model = SentenceTransformer(model_name, device=st_device)

    batch_size = 64 if device.type == "cuda" else 32

    # encode() already splits its input into batches of `batch_size`, so no
    # outer batching loop is needed.
    embeddings = model.encode(
        texts,
        batch_size=batch_size,
        convert_to_numpy=True,
        device=st_device
    )

    # Release the model before the next embedder is loaded.
    del model
    gc.collect()
    if torch.cuda.is_available():
        torch.cuda.empty_cache()

    return np.asarray(embeddings)

In [None]:
def get_embeddings(embedder_name, embedder_config, texts, device):
    """Dispatch to the embedding backend named by ``embedder_config["type"]``.

    Args:
        embedder_name: Display name, used only in the progress message.
        embedder_config: Dict with a ``"type"`` key (``"huggingface"`` or
            ``"sentence-transformers"``), a ``"model"`` key, and optionally
            ``"trust_remote_code"`` (Hugging Face only, defaults to False).
        texts: List of strings to embed.
        device: Device handed through to the backend.

    Returns:
        The backend's embedding array, or an empty array when ``texts`` is
        empty.

    Raises:
        ValueError: If the configured type is not one of the two known ones.
    """
    print(f"Generating embeddings using {embedder_name} on {device}...")

    if not texts:
        print("No documents to process")
        return np.array([])

    backend = embedder_config["type"]

    if backend == "huggingface":
        allow_remote = embedder_config.get("trust_remote_code", False)
        return get_embeddings_huggingface(texts, embedder_config["model"], device, allow_remote)

    if backend == "sentence-transformers":
        return get_embeddings_sentence_transformers(texts, embedder_config["model"], device)

    raise ValueError(f"Unknown embedder type: {embedder_config['type']}")

In [None]:
def _elbow_point(wcss):
    """Return the k (1-based, ``wcss[0]`` is k=1) where the WCSS curve bends most."""
    if len(wcss) <= 2:
        return 2
    # Second difference = discrete curvature; entry j is centred on k = j + 2.
    # WCSS drops steeply and then flattens, so the elbow is where this value
    # is LARGEST. (Taking the minimum would pick the flattest part instead.)
    curvature = np.diff(wcss, n=2)
    return int(np.argmax(curvature)) + 2


def find_optimal_clusters(embeddings, max_clusters=10):
    """Estimate a good KMeans cluster count and plot the diagnostics.

    KMeans is fitted for every k from 1 to ``max_clusters`` (capped at
    ``len(embeddings) - 1``). Two candidates are derived: the elbow of the
    within-cluster sum of squares (WCSS) curve and the k with the best
    silhouette score. The smaller of the two is returned. A two-panel
    matplotlib figure showing both curves is displayed as a side effect.

    Args:
        embeddings: Array-like with one row per sample.
        max_clusters: Largest k to try.

    Returns:
        The chosen number of clusters as an ``int`` (at least 1). Returns 1
        immediately, without plotting, when there are two or fewer samples.
    """
    if len(embeddings) <= 2:
        return 1

    max_clusters = min(max_clusters, len(embeddings) - 1)
    wcss = []
    silhouette_scores = []

    k_range = range(1, max_clusters + 1)

    for k in k_range:
        kmeans = KMeans(n_clusters=k, random_state=42, n_init=10)
        clusters = kmeans.fit_predict(embeddings)
        wcss.append(kmeans.inertia_)

        if k == 1:
            # Silhouette is undefined for a single cluster; 0 keeps the list
            # aligned with k_range for plotting.
            silhouette_scores.append(0)
            continue

        try:
            silhouette_scores.append(silhouette_score(embeddings, clusters))
        except ValueError:
            # Raised when the labels are unusable for a silhouette score,
            # e.g. duplicate points collapsed into fewer distinct clusters.
            silhouette_scores.append(0)

    elbow_point = _elbow_point(wcss)

    # Index 0 is the k=1 placeholder, so search from k=2 onwards.
    if len(silhouette_scores) > 1:
        optimal_k_silhouette = int(np.argmax(silhouette_scores[1:])) + 2
    else:
        optimal_k_silhouette = 1

    optimal_k = max(1, min(elbow_point, optimal_k_silhouette))

    # Plot both diagnostics side by side with the chosen k marked.
    fig, (ax1, ax2) = plt.subplots(1, 2, figsize=(15, 5))
    ax1.plot(k_range, wcss, 'bo-')
    ax1.set_xlabel('Number of clusters')
    ax1.set_ylabel('WCSS')
    ax1.set_title('Elbow Method')
    ax1.axvline(x=optimal_k, color='r', linestyle='--', label=f'Optimal k: {optimal_k}')
    ax1.legend()

    ax2.plot(k_range, silhouette_scores, 'bo-')
    ax2.set_xlabel('Number of clusters')
    ax2.set_ylabel('Silhouette Score')
    ax2.set_title('Silhouette Analysis')
    ax2.axvline(x=optimal_k, color='r', linestyle='--', label=f'Optimal k: {optimal_k}')
    ax2.legend()

    plt.tight_layout()
    plt.show()

    return optimal_k

In [None]:
def perform_clustering(embeddings, clustering_method, clustering_params):
    """Cluster ``embeddings`` with the selected algorithm.

    Args:
        embeddings: Array-like with one row per sample.
        clustering_method: One of ``'kmeans'``, ``'dbscan'``, ``'agg'``
            (agglomerative) or ``'gmm'`` (Gaussian mixture).
        clustering_params: Dict of optional settings. Recognised keys are
            ``n_clusters`` (kmeans, agg, and as a fallback for gmm),
            ``n_components`` (gmm), ``eps`` and ``min_samples`` (dbscan).
            A key that is missing OR set to ``None`` falls back to its
            default (8 clusters/components, ``eps=0.5``, ``min_samples=5``).

    Returns:
        The label array produced by the estimator's ``fit_predict``.

    Raises:
        ValueError: If ``clustering_method`` is not recognised.
    """
    def _param(name, default):
        # The notebook's default config passes {'n_clusters': None}; a plain
        # dict.get(name, default) would hand that None to the estimator and
        # make 'agg' / 'gmm' fail.
        value = clustering_params.get(name)
        return default if value is None else value

    if clustering_method == 'kmeans':
        clusterer = KMeans(n_clusters=_param('n_clusters', 8), random_state=42, n_init=10)
    elif clustering_method == 'dbscan':
        clusterer = DBSCAN(eps=_param('eps', 0.5), min_samples=_param('min_samples', 5))
    elif clustering_method == 'agg':
        clusterer = AgglomerativeClustering(n_clusters=_param('n_clusters', 8))
    elif clustering_method == 'gmm':
        # n_components wins; n_clusters is accepted as an alias.
        n_components = _param('n_components', _param('n_clusters', 8))
        clusterer = GaussianMixture(n_components=n_components, random_state=42)
    else:
        raise ValueError(f"Unknown clustering method: {clustering_method}")

    clusters = clusterer.fit_predict(embeddings)
    return clusters

In [None]:
def perform_dim_reduction(embeddings, dim_red_method, n_components=2):
    """Project ``embeddings`` to ``n_components`` dimensions for plotting.

    Args:
        embeddings: Array-like with one row per sample.
        dim_red_method: ``'pca'``, ``'tsne'`` or ``'umap'``.
        n_components: Number of output dimensions.

    Returns:
        ``(reduced, variance_ratio)``. ``reduced`` is the projected array.
        ``variance_ratio`` is PCA's explained-variance ratio per component;
        for the other methods it is a list of ``None`` with one entry per
        component.

    Raises:
        ValueError: If ``dim_red_method`` is not recognised.
    """
    if dim_red_method == 'pca':
        reducer = PCA(n_components=n_components)
        reduced = reducer.fit_transform(embeddings)
        variance_ratio = reducer.explained_variance_ratio_
    elif dim_red_method == 'tsne':
        # scikit-learn requires perplexity < n_samples; its default of 30
        # therefore fails on small document sets. Cap it just below the
        # sample count and keep the default otherwise.
        perplexity = min(30.0, max(1.0, len(embeddings) - 1))
        reducer = TSNE(n_components=n_components, random_state=42, perplexity=perplexity)
        reduced = reducer.fit_transform(embeddings)
        variance_ratio = [None] * n_components
    elif dim_red_method == 'umap':
        reducer = umap.UMAP(n_components=n_components, random_state=42)
        reduced = reducer.fit_transform(embeddings)
        variance_ratio = [None] * n_components
    else:
        raise ValueError(f"Unknown dim reduction method: {dim_red_method}")

    return reduced, variance_ratio

In [None]:
def create_interactive_plot(reduced_embeddings, clusters, embedder_name, variance_ratio, num_clusters, filenames, documents, dim_red_method):
    """Show a Plotly scatter of the 2-D projection, coloured by cluster.

    Hovering a point shows its file name and the first 100 characters of the
    document. When ``variance_ratio[0]`` is not ``None`` the title and axis
    labels report the explained variance of the first two components;
    otherwise they name the reduction method in upper case.

    Args:
        reduced_embeddings: 2-D array; columns 0 and 1 become x and y.
        clusters: Cluster label per point.
        embedder_name: Name shown in the title.
        variance_ratio: Sequence whose first two entries are explained
            variance ratios, or ``None`` entries when not available.
        num_clusters: Cluster count shown in the title.
        filenames: File name per point.
        documents: Document text per point.
        dim_red_method: Reduction method name used in the fallback labels.
    """
    def _preview(doc):
        # Keep hover text short: first 100 characters plus an ellipsis.
        if len(doc) > 100:
            return doc[:100] + '...'
        return doc

    xs = reduced_embeddings[:, 0]
    ys = reduced_embeddings[:, 1]
    cluster_names = [f'Cluster {c}' for c in clusters]
    previews = [_preview(doc) for doc in documents]

    plot_df = pd.DataFrame({
        'x': xs,
        'y': ys,
        'cluster': cluster_names,
        'filename': filenames,
        'content': previews
    })

    if variance_ratio[0] is None:
        title = f'{embedder_name} - {num_clusters} Clusters ({dim_red_method.upper()})'
        labels = {'x': f'{dim_red_method.upper()} Component 1', 'y': f'{dim_red_method.upper()} Component 2'}
    else:
        title = f'{embedder_name} - {num_clusters} Clusters (PCA: {variance_ratio[0]*100:.1f}% + {variance_ratio[1]*100:.1f}% variance explained)'
        labels = {'x': f'PCA Component 1 ({variance_ratio[0]*100:.1f}%)', 'y': f'PCA Component 2 ({variance_ratio[1]*100:.1f}%)'}

    fig = px.scatter(
        plot_df,
        x='x',
        y='y',
        color='cluster',
        hover_data=['filename', 'content'],
        title=title,
        labels=labels
    )

    marker_style = dict(size=12, line=dict(width=1, color='DarkSlateGrey'))
    fig.update_traces(marker=marker_style)
    fig.update_layout(width=800, height=600, hovermode='closest')
    fig.show()

In [None]:
def print_cluster_details(cluster_assignments, embedder_name):
    """Print, per cluster in ascending id order, its size and member files.

    Args:
        cluster_assignments: Dict mapping cluster id to a list of dicts, each
            with at least ``'filename'`` and ``'index'`` keys.
        embedder_name: Name shown in the heading line.
    """
    print(f"\nCluster assignments for {embedder_name}:")
    print("-" * 60)

    ordered = sorted(cluster_assignments.items(), key=lambda pair: pair[0])
    for cluster_id, members in ordered:
        print(f"Cluster {cluster_id}:")
        print(f"  Number of documents: {len(members)}")
        print("  Documents:")
        for item in members:
            print(f"    - {item['filename']} (Index: {item['index']})")
        print()

In [None]:
def analyze_embedder(embedder_name, embedder_config, documents, filenames, device, clustering_method, dim_red_method, clustering_params):
    """Run the full embed -> cluster -> reduce -> plot pipeline for one embedder.

    Args:
        embedder_name: Display name of the embedder.
        embedder_config: Configuration dict passed to ``get_embeddings``.
        documents: List of document texts.
        filenames: File name per document, parallel to ``documents``.
        device: Device handed to the embedding backend.
        clustering_method: Method name passed to ``perform_clustering``.
        dim_red_method: Method name passed to ``perform_dim_reduction``.
        clustering_params: Dict of clustering settings. It is copied, never
            mutated. For ``'kmeans'`` an ``n_clusters`` of ``None`` triggers
            automatic selection via ``find_optimal_clusters``.

    Returns:
        A dict with keys ``embeddings``, ``reduced_embeddings``, ``clusters``,
        ``cluster_assignments``, ``variance_ratio``, ``optimal_clusters``
        (requested/selected count, may be ``None``), ``num_clusters`` (count
        actually found, noise excluded), ``avg_intra_similarity`` and
        ``avg_inter_similarity``. Returns ``None`` when there are fewer than
        two documents, when no embeddings are produced, or when any step
        raises (the error and traceback are printed instead).
    """
    print(f"\n{'='*60}")
    print(f"Analyzing with {embedder_name}")
    print(f"{'='*60}")

    try:
        # Checked before embedding so that a model is not loaded and run only
        # for the result to be thrown away.
        if len(documents) <= 1:
            print("Not enough documents for clustering")
            return None

        embeddings = get_embeddings(embedder_name, embedder_config, documents, device)
        if embeddings.size == 0:
            print("No embeddings generated")
            return None

        print(f"Embedding shape: {embeddings.shape}")

        local_params = clustering_params.copy()
        if clustering_method == 'kmeans' and local_params.get('n_clusters') is None:
            optimal_clusters = find_optimal_clusters(embeddings, max_clusters=10)
            local_params['n_clusters'] = optimal_clusters
        else:
            optimal_clusters = local_params.get('n_clusters')  # May be None for some methods

        print(f"Number of clusters: {optimal_clusters if optimal_clusters else 'Auto (e.g., DBSCAN)'}")

        clusters = perform_clustering(embeddings, clustering_method, local_params)
        reduced_embeddings, variance_ratio = perform_dim_reduction(embeddings, dim_red_method)

        # Calculate actual number of clusters (excluding noise for DBSCAN)
        unique_clusters = set(clusters)
        actual_num_clusters = len(unique_clusters) - (1 if -1 in unique_clusters else 0)

        # Group documents by cluster label.
        cluster_assignments = {}
        for i, cluster_id in enumerate(clusters):
            cluster_assignments.setdefault(cluster_id, []).append({
                'index': i,
                'filename': filenames[i],
                'embedding': reduced_embeddings[i]
            })

        # Cosine similarity inside each cluster vs. towards everything else.
        similarity_matrix = cosine_similarity(embeddings)
        labels = np.asarray(clusters)
        intra_cluster_similarities = []
        inter_cluster_similarities = []

        for cluster_id in cluster_assignments:
            if cluster_id == -1:
                # Label -1 is DBSCAN noise, not a cluster; it is excluded from
                # the cluster count above and so also from the averages.
                continue

            # Boolean mask instead of `i not in list`, which was quadratic.
            member_mask = labels == cluster_id
            members = np.flatnonzero(member_mask)
            others = np.flatnonzero(~member_mask)

            if members.size > 1:
                # Fancy indexing returns a copy, so zeroing the diagonal does
                # not touch similarity_matrix.
                cluster_similarities = similarity_matrix[np.ix_(members, members)]
                np.fill_diagonal(cluster_similarities, 0)  # drop self-similarity
                intra_sum = np.sum(cluster_similarities) / (members.size * (members.size - 1))
                intra_cluster_similarities.append(intra_sum)

            if others.size:
                inter_similarities = similarity_matrix[np.ix_(members, others)]
                inter_cluster_similarities.append(np.mean(inter_similarities))

        avg_intra_similarity = np.mean(intra_cluster_similarities) if intra_cluster_similarities else 0
        avg_inter_similarity = np.mean(inter_cluster_similarities) if inter_cluster_similarities else 0

        create_interactive_plot(
            reduced_embeddings, clusters, embedder_name,
            variance_ratio, actual_num_clusters, filenames, documents, dim_red_method
        )

        result = {
            'embeddings': embeddings,
            'reduced_embeddings': reduced_embeddings,
            'clusters': clusters,
            'cluster_assignments': cluster_assignments,
            'variance_ratio': variance_ratio,
            'optimal_clusters': optimal_clusters,
            'num_clusters': actual_num_clusters,
            'avg_intra_similarity': avg_intra_similarity,
            'avg_inter_similarity': avg_inter_similarity
        }

        print_cluster_details(cluster_assignments, embedder_name)

        return result

    except Exception as e:
        # Best effort: one failing embedder must not abort the comparison of
        # the others, so report the failure and signal it with None.
        print(f"Error with {embedder_name}: {str(e)}")
        import traceback
        traceback.print_exc()
        return None

In [None]:
def analyze_all_embedders(embedders, documents, filenames, device, clustering_method, dim_red_method, clustering_params):
    """Analyze the documents with every embedder and compare the outcomes.

    Args:
        embedders: Dict mapping embedder name to its configuration dict.
        documents: List of document texts.
        filenames: File name per document.
        device: Device handed to the embedding backends.
        clustering_method: Clustering method name.
        dim_red_method: Dimensionality reduction method name.
        clustering_params: Dict of clustering settings.

    Returns:
        Dict mapping embedder name to its result for every embedder whose
        analysis returned a truthy result. When at least two embedders
        succeeded, ``compare_embedders`` is called on that dict first.
    """
    results = {}

    for name, config in embedders.items():
        outcome = analyze_embedder(
            name, config, documents, filenames, device,
            clustering_method, dim_red_method, clustering_params
        )
        if not outcome:
            # Failed or skipped embedders are left out of the comparison.
            continue
        results[name] = outcome

    # A comparison needs at least two successful embedders.
    if len(results) >= 2:
        compare_embedders(results)

    return results

In [None]:
def compare_embedders(results):
    """Print a comparison table and show a 2x2 bar-chart grid across embedders.

    For every embedder the table lists the number of clusters, the average
    intra- and inter-cluster cosine similarity, and their difference
    ("Separation Score").

    Args:
        results: Dict mapping embedder name to a result dict with the keys
            ``optimal_clusters``, ``clusters``, ``avg_intra_similarity`` and
            ``avg_inter_similarity``.
    """
    print("\n" + "="*60)
    print("COMPARISON OF EMBEDDERS")
    print("="*60)

    comparison_data = []

    for embedder_name, result in results.items():
        n_clusters = result['optimal_clusters']
        if n_clusters is None:
            # No explicit cluster count was configured (e.g. DBSCAN), so
            # count the labels that were actually assigned. Label -1 marks
            # noise and is not a cluster.
            found = set(result['clusters'])
            n_clusters = len(found) - (1 if -1 in found else 0)

        comparison_data.append({
            'Embedder': embedder_name,
            'Clusters': n_clusters,
            'Avg Intra-Cluster Similarity': result['avg_intra_similarity'],
            'Avg Inter-Cluster Similarity': result['avg_inter_similarity'],
            'Separation Score': result['avg_intra_similarity'] - result['avg_inter_similarity']
        })

    comparison_df = pd.DataFrame(comparison_data)
    print(comparison_df.to_string(index=False))

    fig = make_subplots(
        rows=2, cols=2,
        subplot_titles=('Number of Clusters', 'Intra-Cluster Similarity',
                       'Inter-Cluster Similarity', 'Separation Score')
    )

    # One bar chart per metric, laid out in the same order as the titles.
    fig.add_trace(go.Bar(x=comparison_df['Embedder'], y=comparison_df['Clusters'], name='Clusters'), row=1, col=1)
    fig.add_trace(go.Bar(x=comparison_df['Embedder'], y=comparison_df['Avg Intra-Cluster Similarity'], name='Intra-Cluster'), row=1, col=2)
    fig.add_trace(go.Bar(x=comparison_df['Embedder'], y=comparison_df['Avg Inter-Cluster Similarity'], name='Inter-Cluster'), row=2, col=1)
    fig.add_trace(go.Bar(x=comparison_df['Embedder'], y=comparison_df['Separation Score'], name='Separation'), row=2, col=2)

    fig.update_layout(height=600, width=800, title_text="Embedder Comparison", showlegend=False)
    fig.show()

In [None]:
if __name__ == "__main__":
    # ---- Settings a reader is expected to tune ----

    # Directory holding the .txt documents to analyze.
    data_dir = "../Data/TXT/Sorted/English"  # CHANGE THIS TO YOUR ACTUAL PATH

    # Clustering algorithm. Options: 'kmeans', 'dbscan', 'agg', 'gmm'
    clustering_method = 'kmeans'

    # Projection used for the 2-D plot. Options: 'pca', 'tsne', 'umap'
    dim_red_method = 'pca'

    # n_clusters=None means automatic selection (kmeans only).
    # For dbscan use e.g. {'eps': 0.5, 'min_samples': 5}
    clustering_params = {'n_clusters': None}

    # Set to False to force CPU execution.
    use_gpu = True

    # ---- Run ----
    device = get_device(use_gpu)
    print(f"Using device: {device}")

    documents, filenames = load_documents(data_dir)

    if documents:
        analyze_all_embedders(EMBEDDERS, documents, filenames, device, clustering_method, dim_red_method, clustering_params)
    else:
        print("No documents found. Please check the data directory path.")
        print("Current data directory:", data_dir)
        print("Make sure the directory exists and contains .txt files.")