In [1]:
import numpy as np
import pandas as pd
import time
import psutil
import os
import mmap
import tempfile
from typing import List, Tuple, Dict, Any
import warnings
warnings.filterwarnings('ignore')


In [5]:
# Core libraries
from sentence_transformers import SentenceTransformer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
from scipy.sparse import csr_matrix
import faiss
from datasketch import MinHashLSH, MinHash
import pickle
import json

In [8]:
try:
    import onnx
    import onnxruntime as ort
    from optimum.onnxruntime import ORTModelForFeatureExtraction
    from transformers import AutoTokenizer
    ONNX_AVAILABLE = True
except ImportError:
    print("ONNX libraries not available. Install with: pip install onnx onnxruntime optimum[onnxruntime]")
    ONNX_AVAILABLE = False


In [9]:
class PerformanceMonitor:
    """Track wall-clock time and resident memory (RSS) around a piece of work.

    Typical use: ``start()``, run the workload (optionally calling
    ``update_peak_memory()`` along the way), ``stop()``, then ``get_stats()``.
    Peak memory is only sampled when ``update_peak_memory()`` or ``stop()``
    runs, so spikes between samples are not observed.
    """

    def __init__(self):
        self.reset()

    def reset(self):
        """Forget any previous measurement."""
        self.start_time = None
        self.end_time = None
        self.start_memory = None
        self.peak_memory = 0

    @staticmethod
    def _current_rss_mb():
        """Resident set size of the current process, in MB."""
        return psutil.Process().memory_info().rss / 1024 / 1024

    def start(self):
        """Begin a measurement; the current RSS becomes the baseline."""
        self.start_time = time.time()
        # A monitor may be reused: clear the previous end marker so that
        # get_stats() never mixes an old end time with a new start time.
        self.end_time = None
        self.start_memory = self._current_rss_mb()
        self.peak_memory = self.start_memory

    def update_peak_memory(self):
        """Sample RSS now and keep the largest value seen so far."""
        self.peak_memory = max(self.peak_memory, self._current_rss_mb())

    def stop(self):
        """End the measurement and take a final memory sample."""
        self.end_time = time.time()
        self.update_peak_memory()

    def get_stats(self):
        """Return ``execution_time`` (s), ``memory_used`` (MB) and ``peak_memory_mb``.

        ``execution_time`` is 0 until both ``start()`` and ``stop()`` have run.
        ``memory_used`` is 0 if ``start()`` was never called (there is no
        baseline to subtract) instead of raising ``TypeError``.
        """
        finished = self.start_time is not None and self.end_time is not None
        elapsed = self.end_time - self.start_time if finished else 0
        if self.start_memory is None:
            memory_used = 0
        else:
            memory_used = self.peak_memory - self.start_memory
        return {
            'execution_time': elapsed,
            'memory_used': memory_used,
            'peak_memory_mb': self.peak_memory
        }

In [10]:
class DatasetGenerator:
    """Generate synthetic dataset for testing"""

    # category -> (item names, feature words). Every category has its own
    # vocabulary so that e.g. a 'Sports' product is never described as a book.
    _CATALOG = {
        'Electronics': (
            ['Phone', 'Laptop', 'Tablet', 'Headphones', 'Camera', 'Smart Watch'],
            ['4K', 'Wireless', 'Bluetooth', 'Fast Charging', 'Waterproof'],
        ),
        'Clothing': (
            ['T-Shirt', 'Jeans', 'Shoes', 'Jacket', 'Dress', 'Hat'],
            ['Cotton', 'Comfortable', 'Stylish', 'Durable', 'Breathable'],
        ),
        'Books': (
            ['Book', 'Novel', 'Guide', 'Manual', 'Dictionary'],
            ['Bestseller', 'Educational', 'Popular', 'New Release', 'Classic'],
        ),
        'Home & Garden': (
            ['Lamp', 'Planter', 'Garden Hose', 'Cushion', 'Tool Set'],
            ['Weatherproof', 'Eco-Friendly', 'Compact', 'Handmade', 'Rust-Resistant'],
        ),
        'Sports': (
            ['Yoga Mat', 'Dumbbells', 'Tennis Racket', 'Football', 'Water Bottle'],
            ['Lightweight', 'Non-Slip', 'Professional', 'Adjustable', 'Durable'],
        ),
        'Toys': (
            ['Puzzle', 'Action Figure', 'Board Game', 'Building Blocks', 'Doll'],
            ['Educational', 'Interactive', 'Collectible', 'Safe', 'Colorful'],
        ),
    }
    _BRANDS = ['Apple', 'Samsung', 'Nike', 'Adidas', 'Sony', 'Microsoft', 'Amazon', 'Generic']
    _COLUMNS = ['id', 'name', 'description', 'category', 'brand', 'price']

    @staticmethod
    def create_product_dataset(n_samples=1000, seed=None):
        """Create a product catalog dataset.

        Args:
            n_samples: Number of product rows to generate.
            seed: Optional integer seed. When given, a private random stream
                is used so the same seed always yields the same frame. When
                omitted, NumPy's global random state is used.

        Returns:
            DataFrame with columns id, name, description, category, brand and
            price. The columns are present even when ``n_samples`` is 0.
        """
        # np.random (module) and RandomState expose the same choice /
        # random_sample / uniform functions, so either can serve as the source.
        rng = np.random if seed is None else np.random.RandomState(seed)
        categories = list(DatasetGenerator._CATALOG)

        products = []
        for i in range(n_samples):
            category = str(rng.choice(categories))
            brand = str(rng.choice(DatasetGenerator._BRANDS))

            items, features = DatasetGenerator._CATALOG[category]
            item = str(rng.choice(items))
            feature = str(rng.choice(features))

            # About 30% of products get a "Pro" suffix, producing near-duplicate
            # names that differ from a sibling product by a single token.
            if rng.random_sample() < 0.3:
                name = f"{brand} {item} {feature} Pro"
            else:
                name = f"{brand} {item} {feature}"

            description = f"High quality {item.lower()} from {brand} with {feature.lower()} technology"

            products.append({
                'id': f'PROD_{i:04d}',
                'name': name,
                'description': description,
                'category': category,
                'brand': brand,
                'price': float(rng.uniform(10, 1000))
            })

        return pd.DataFrame(products, columns=DatasetGenerator._COLUMNS)

In [1]:
class BaselineFuzzyMatcher:
    """Reference matcher with no optimizations applied.

    Scores queries against the corpus twice: once with dense sentence
    embeddings and once with densified TF-IDF vectors.
    """

    def __init__(self):
        encoder = SentenceTransformer('all-MiniLM-L6-v2')
        vectorizer = TfidfVectorizer(max_features=1000, stop_words='english')
        self.models = {'sentence_transformer': encoder, 'tfidf': vectorizer}
        self.embeddings = {}

    def fit(self, texts: List[str]):
        """Embed the corpus with both models and keep the dense results."""
        dense_vectors = self.models['sentence_transformer'].encode(texts)
        self.embeddings['sentence_transformer'] = dense_vectors

        # TF-IDF is deliberately converted to a dense array here.
        self.embeddings['tfidf'] = self.models['tfidf'].fit_transform(texts).toarray()

    def find_matches(self, query_texts: List[str], top_k=5):
        """Return, per model, the top_k corpus indices for each query (best first)."""

        def rank(query_vectors, corpus_vectors):
            scores = cosine_similarity(query_vectors, corpus_vectors)
            return [np.argsort(row)[-top_k:][::-1] for row in scores]

        return {
            'sentence_transformer': rank(
                self.models['sentence_transformer'].encode(query_texts),
                self.embeddings['sentence_transformer'],
            ),
            'tfidf': rank(
                self.models['tfidf'].transform(query_texts).toarray(),
                self.embeddings['tfidf'],
            ),
        }

  from .autonotebook import tqdm as notebook_tqdm


In [None]:
class ONNXOptimizedMatcher:
    """Sentence-embedding matcher that runs the encoder through ONNX Runtime.

    ``convert_to_onnx()`` must succeed before ``fit()`` / ``find_matches()``
    can produce results; until then ``encode_texts()`` returns ``None`` and
    ``find_matches()`` returns ``{}``. Note that this class exports and runs
    the model on the CPU provider only; it does not perform a quantization
    step itself.
    """

    def __init__(self):
        self.model_path = None
        self.tokenizer = None
        self.ort_session = None
        self.embeddings = {}

    def convert_to_onnx(self, model_name='sentence-transformers/all-MiniLM-L6-v2'):
        """Export ``model_name`` to ONNX and open an inference session on it.

        The exported model is written to ``./onnx_<model_name>`` (slashes
        replaced by underscores).

        Returns:
            True on success, False if export or session creation failed (the
            error is printed, not raised).

        Raises:
            ImportError: if the ONNX libraries could not be imported.
        """
        if not ONNX_AVAILABLE:
            raise ImportError("ONNX libraries not available")

        try:
            # Load tokenizer and model with ONNX export
            self.tokenizer = AutoTokenizer.from_pretrained(model_name)
            model = ORTModelForFeatureExtraction.from_pretrained(
                model_name,
                export=True,
            )

            model_dir = f"./onnx_{model_name.replace('/', '_')}"
            model.save_pretrained(model_dir)
            self.model_path = f"{model_dir}/model.onnx"

            self.ort_session = ort.InferenceSession(
                self.model_path,
                providers=['CPUExecutionProvider']
            )

            return True
        except Exception as e:
            # Best-effort: callers decide what to do with a failed conversion.
            print(f"ONNX conversion failed: {e}")
            return False

    def encode_texts(self, texts: List[str]):
        """Embed ``texts`` in batches of 32 with attention-masked mean pooling.

        Returns:
            Array with one row per text, or ``None`` when no session or
            tokenizer has been set up yet. An empty input yields an array
            with zero rows.
        """
        if self.ort_session is None or self.tokenizer is None:
            return None

        # Only feed tensors the exported graph actually declares; some exports
        # have no token_type_ids input and reject unknown names.
        expected_inputs = {node.name for node in self.ort_session.get_inputs()}

        pooled_batches = []
        batch_size = 32

        for i in range(0, len(texts), batch_size):
            batch = list(texts[i:i + batch_size])
            inputs = self.tokenizer(
                batch,
                padding=True,
                truncation=True,
                return_tensors="np",
                max_length=512
            )

            feed = {}
            for name in expected_inputs:
                if name in inputs:
                    feed[name] = inputs[name]
                elif name == "token_type_ids":
                    # Graph wants segment ids but the tokenizer produced none:
                    # single-segment input is all zeros.
                    feed[name] = np.zeros_like(inputs["input_ids"])

            token_embeddings = self.ort_session.run(None, feed)[0]

            # Average over real tokens only. Padding positions carry no
            # content, and including them would make an embedding depend on
            # how long the other texts in the batch happen to be.
            mask = inputs["attention_mask"][..., np.newaxis].astype(token_embeddings.dtype)
            summed = (token_embeddings * mask).sum(axis=1)
            token_counts = np.clip(mask.sum(axis=1), 1e-9, None)
            pooled_batches.append(summed / token_counts)

        if not pooled_batches:
            return np.empty((0, 0), dtype=np.float32)
        return np.vstack(pooled_batches)

    def fit(self, texts: List[str]):
        """Embed and store the corpus (stores ``None`` if not converted yet)."""
        self.embeddings['onnx'] = self.encode_texts(texts)

    def find_matches(self, query_texts: List[str], top_k=5):
        """Return ``{'onnx': [top_k corpus indices per query, best first]}``.

        Returns ``{}`` when the model has not been converted or fitted.
        """
        corpus_embeddings = self.embeddings.get('onnx')
        if corpus_embeddings is None:
            return {}
        query_embeddings = self.encode_texts(query_texts)
        if query_embeddings is None:
            return {}
        similarities = cosine_similarity(query_embeddings, corpus_embeddings)
        return {
            'onnx': [np.argsort(sim)[-top_k:][::-1] for sim in similarities]
        }


In [13]:
class PrecomputedSimilarityMatcher:
    """TF-IDF matcher that vectorizes the corpus once, at fit time.

    ``fit`` stores the corpus TF-IDF matrix (reused by every query) and the
    full corpus-vs-corpus similarity matrix in ``similarity_matrices['tfidf']``.
    The latter is N x N, so its memory cost grows quadratically with the
    corpus size; ``find_matches`` itself does not read it.
    """

    def __init__(self):
        self.similarity_matrices = {}
        self.texts = []
        self.embeddings = {}
        self.vectorizer = None

    def fit(self, texts: List[str]):
        """Fit the vectorizer and precompute the corpus matrices."""
        self.texts = texts

        # Use lightweight TF-IDF for precomputation
        vectorizer = TfidfVectorizer(max_features=500, stop_words='english')
        tfidf_matrix = vectorizer.fit_transform(texts)

        # Precompute full similarity matrix
        self.similarity_matrices['tfidf'] = cosine_similarity(tfidf_matrix)

        # Keep the (sparse) corpus vectors so queries never have to
        # re-vectorize the corpus.
        self.embeddings['tfidf'] = tfidf_matrix
        self.vectorizer = vectorizer

    def find_matches(self, query_texts: List[str], top_k=5):
        """Return ``{'precomputed': [top_k corpus indices per query, best first]}``.

        Returns ``{}`` if ``fit`` has not been called.
        """
        corpus_matrix = self.embeddings.get('tfidf')
        if self.vectorizer is None or corpus_matrix is None:
            return {}
        if len(query_texts) == 0:
            return {'precomputed': []}

        # One transform and one similarity call for all queries, against the
        # corpus matrix computed in fit().
        query_matrix = self.vectorizer.transform(query_texts)
        similarities = cosine_similarity(query_matrix, corpus_matrix)

        results = [np.argsort(row)[-top_k:][::-1] for row in similarities]
        return {'precomputed': results}

In [14]:
class LSHMatcher:
    """Approximate matcher built on MinHash + Locality Sensitive Hashing.

    Texts are tokenized on whitespace (lower-cased) and hashed into MinHash
    signatures. The LSH index returns candidates whose estimated Jaccard
    similarity is likely above ``threshold``; those candidates are then
    ranked by their estimated Jaccard similarity to the query.
    """

    def __init__(self, threshold=0.5, num_perm=128):
        self.threshold = threshold
        self.num_perm = num_perm
        self.lsh = MinHashLSH(threshold=threshold, num_perm=num_perm)
        self.minhashes = {}
        self.texts = []

    def _create_minhash(self, text: str):
        """Create MinHash for text"""
        minhash = MinHash(num_perm=self.num_perm)
        # Simple tokenization
        for token in text.lower().split():
            minhash.update(token.encode('utf8'))
        return minhash

    def fit(self, texts: List[str]):
        """Build the LSH index for ``texts`` (replacing any earlier index)."""
        self.texts = texts

        # Start from a fresh index: keys are row positions, so fitting a
        # second time on the old index would try to insert duplicate keys.
        self.lsh = MinHashLSH(threshold=self.threshold, num_perm=self.num_perm)
        self.minhashes = {}

        for i, text in enumerate(texts):
            minhash = self._create_minhash(text)
            self.minhashes[i] = minhash
            self.lsh.insert(i, minhash)

    def find_matches(self, query_texts: List[str], top_k=5):
        """Return ``{'lsh': [up to top_k corpus indices per query]}``.

        LSH candidates come first, ordered by estimated Jaccard similarity to
        the query (best first). If the index yields fewer than ``top_k``
        candidates, the list is padded with randomly chosen other indices so
        that callers still receive ``top_k`` entries where possible; padded
        entries carry no similarity guarantee.
        """
        results = []
        corpus_size = len(self.texts)

        for query in query_texts:
            query_minhash = self._create_minhash(query)
            candidates = list(self.lsh.query(query_minhash))

            # The index returns candidates in no meaningful order; rank them
            # so that truncating to top_k keeps the closest ones.
            candidates.sort(
                key=lambda idx: query_minhash.jaccard(self.minhashes[idx]),
                reverse=True,
            )

            # If not enough candidates, pad with random indices
            if len(candidates) < top_k:
                available_indices = sorted(set(range(corpus_size)) - set(candidates))
                n_pad = min(top_k - len(candidates), len(available_indices))
                if n_pad > 0:
                    padding = np.random.choice(available_indices, n_pad, replace=False)
                    candidates.extend(int(idx) for idx in padding)

            results.append(candidates[:top_k])

        return {'lsh': results}

In [15]:
class MemoryMappedEmbeddings:
    """Array view over an embeddings file accessed through ``mmap``.

    The file must contain raw values of ``dtype`` whose count matches
    ``shape``. Rows handed out by ``__getitem__`` and ``get_batch`` are
    copies, so they stay valid after ``close()``. Can be used as a context
    manager.

    Raises (from the constructor):
        OSError / ValueError: if the file cannot be opened, mapped (e.g. it is
            empty) or reshaped to ``shape``. Any handle opened so far is
            closed before the error propagates.
    """

    def __init__(self, embeddings_file, shape, dtype=np.float32):
        self.shape = shape
        self.dtype = dtype
        self.embeddings = None
        self.mmap = None
        self.file = open(embeddings_file, 'r+b')
        try:
            self.mmap = mmap.mmap(self.file.fileno(), 0)
            self.embeddings = np.frombuffer(self.mmap, dtype=dtype).reshape(shape)
        except Exception:
            # Do not leak the file descriptor / mapping on a failed open.
            self.close()
            raise

    def __enter__(self):
        return self

    def __exit__(self, exc_type, exc_value, traceback):
        self.close()
        return False

    def __getitem__(self, idx):
        return self.embeddings[idx].copy()

    def get_batch(self, indices):
        """Return the rows at ``indices`` as a new array (one row per index)."""
        # Integer-array indexing gathers all rows in one call and always
        # returns a copy, never a view into the mapping.
        row_ids = np.asarray(list(indices), dtype=np.intp)
        return self.embeddings[row_ids]

    def close(self):
        """Release the array view, the mapping and the file. Safe to call twice."""
        # The array view must be dropped first: a mapping with live exported
        # buffers refuses to close.
        self.embeddings = None
        try:
            mapping = getattr(self, 'mmap', None)
            if mapping is not None:
                mapping.close()
        finally:
            # Close the file even if the mapping could not be closed.
            handle = getattr(self, 'file', None)
            if handle is not None:
                handle.close()

class MemoryMappedMatcher:
    """Sentence-embedding matcher whose corpus vectors live in a memory-mapped file.

    ``fit`` writes the corpus embeddings to a temporary file and maps it;
    ``find_matches`` walks that mapping in fixed-size row batches. Call
    ``close()`` (or let the object be garbage collected) to release the
    mapping and delete the temporary file.
    """

    def __init__(self):
        self.mmap_embeddings = None
        self.temp_files = []
        self.texts = []
        self.model = SentenceTransformer('all-MiniLM-L6-v2')

    def fit(self, texts: List[str]):
        """Embed ``texts`` and expose the result through a memory map."""
        self.texts = texts

        # Generate embeddings
        embeddings = np.asarray(self.model.encode(texts), dtype=np.float32)

        # A refit replaces the previous mapping: release it and its file
        # first, otherwise both would linger until the object is destroyed.
        self.close()

        # Save to temporary file for memory mapping
        temp_file = tempfile.NamedTemporaryFile(delete=False, suffix='.mmap')
        temp_file.close()
        self.temp_files.append(temp_file.name)

        # Write embeddings to file
        embeddings.tofile(temp_file.name)

        # Create memory-mapped access
        self.mmap_embeddings = MemoryMappedEmbeddings(
            temp_file.name,
            embeddings.shape,
            dtype=np.float32
        )

    def find_matches(self, query_texts: List[str], top_k=5):
        """Return ``{'memory_mapped': [top_k corpus indices per query, best first]}``.

        Returns ``{}`` if ``fit`` has not been called.
        """
        if self.mmap_embeddings is None:
            return {}

        # Encode queries normally (small dataset)
        query_embeddings = self.model.encode(query_texts)
        if len(query_embeddings) == 0:
            return {'memory_mapped': []}

        # Walk the mapping once, scoring every query against each batch, so
        # the file is not re-read for every single query.
        n_rows = self.mmap_embeddings.shape[0]
        batch_size = 100
        score_blocks = []
        for start in range(0, n_rows, batch_size):
            stop = min(start + batch_size, n_rows)
            batch_embeddings = self.mmap_embeddings.get_batch(list(range(start, stop)))
            score_blocks.append(cosine_similarity(query_embeddings, batch_embeddings))

        if not score_blocks:
            return {'memory_mapped': [[] for _ in query_embeddings]}

        scores = np.hstack(score_blocks)
        # Stable sort on the negated scores: descending similarity, ties kept
        # in corpus order.
        results = [
            np.argsort(-row, kind='stable')[:top_k].tolist()
            for row in scores
        ]

        return {'memory_mapped': results}

    def close(self):
        """Release the mapping and delete temporary files. Safe to call twice."""
        mapping = getattr(self, 'mmap_embeddings', None)
        if mapping is not None:
            mapping.close()
            self.mmap_embeddings = None

        for temp_file in getattr(self, 'temp_files', []):
            try:
                os.unlink(temp_file)
            except OSError:
                # Best-effort: the file may already be gone or still locked.
                pass
        self.temp_files = []

    def __del__(self):
        """Cleanup temporary files"""
        try:
            self.close()
        except Exception:
            # Finalizers must not raise; at interpreter shutdown the modules
            # used by close() may already be torn down.
            pass

In [16]:
class SparseEmbeddingMatcher:
    """Matcher that prunes small embedding components and scores in sparse form."""

    def __init__(self, sparsity_threshold=0.1):
        self.sparsity_threshold = sparsity_threshold
        self.sparse_embeddings = None
        self.texts = []
        self.model = SentenceTransformer('all-MiniLM-L6-v2')

    def _sparsify(self, embeddings, threshold):
        """Return a CSR copy of ``embeddings`` with entries where |value| < threshold removed."""
        pruned = csr_matrix(embeddings)
        too_small = np.abs(pruned.data) < threshold
        pruned.data[too_small] = 0
        pruned.eliminate_zeros()
        return pruned

    def _sparse_cosine_similarity(self, A, B):
        """Cosine similarity of every row of A against every row of B (sparse result)."""

        def row_norms(matrix):
            # Euclidean norm per row, floored at 1e-8 so all-zero rows do not
            # cause a division by zero.
            squared = matrix.multiply(matrix).sum(axis=1)
            return np.maximum(np.sqrt(squared), 1e-8)

        numerator = A @ B.T
        inverse_a = 1 / row_norms(A)
        inverse_b = 1 / row_norms(B).T

        # Scale rows by 1/||a|| and columns by 1/||b||.
        return numerator.multiply(inverse_a).multiply(inverse_b)

    def fit(self, texts: List[str]):
        """Embed the corpus, prune it to sparse form and report the sparsity."""
        self.texts = texts

        corpus_dense = self.model.encode(texts)
        self.sparse_embeddings = self._sparsify(corpus_dense, self.sparsity_threshold)

        density = self.sparse_embeddings.nnz / np.prod(self.sparse_embeddings.shape)
        print(f"Sparsity: {1 - density:.2%}")

    def find_matches(self, query_texts: List[str], top_k=5):
        """Return ``{'sparse_embeddings': [...]}`` with top_k indices per query, or ``{}`` if unfitted."""
        if self.sparse_embeddings is None:
            return {}

        # Queries go through the same pruning as the corpus.
        query_sparse = self._sparsify(
            self.model.encode(query_texts), self.sparsity_threshold
        )

        score_rows = self._sparse_cosine_similarity(
            query_sparse, self.sparse_embeddings
        ).toarray()

        ranked = [np.argsort(row)[-top_k:][::-1] for row in score_rows]
        return {'sparse_embeddings': ranked}

In [17]:
class BatchProcessingMatcher:
    """Sentence-embedding matcher that encodes corpus and queries chunk by chunk."""

    def __init__(self, batch_size=64):
        self.batch_size = batch_size
        self.embeddings = None
        self.texts = []
        self.model = SentenceTransformer('all-MiniLM-L6-v2')

    def fit(self, texts: List[str]):
        """Encode ``texts`` in chunks of ``batch_size`` and stack the results."""
        self.texts = texts

        chunk_starts = range(0, len(texts), self.batch_size)
        encoded_chunks = [
            self.model.encode(texts[start:start + self.batch_size], show_progress_bar=False)
            for start in chunk_starts
        ]

        self.embeddings = np.vstack(encoded_chunks)

    def find_matches(self, query_texts: List[str], top_k=5):
        """Return ``{'batch_processing': [...]}`` with top_k indices per query, or ``{}`` if unfitted."""
        if self.embeddings is None:
            return {}

        # Queries use a smaller chunk size than the corpus: at most 16.
        query_chunk = min(self.batch_size, 16)

        ranked = []
        for start in range(0, len(query_texts), query_chunk):
            chunk_vectors = self.model.encode(
                query_texts[start:start + query_chunk], show_progress_bar=False
            )
            chunk_scores = cosine_similarity(chunk_vectors, self.embeddings)
            ranked.extend(np.argsort(row)[-top_k:][::-1] for row in chunk_scores)

        return {'batch_processing': ranked}

In [18]:
class UltimateCombinedMatcher:
    """Ensemble that runs every optimized matcher and merges their rankings."""

    def __init__(self):
        self.matchers = {}
        self.texts = []

        # The ONNX matcher is only registered when its libraries imported.
        if ONNX_AVAILABLE:
            self.matchers['onnx'] = ONNXOptimizedMatcher()
        self.matchers['precomputed'] = PrecomputedSimilarityMatcher()
        self.matchers['lsh'] = LSHMatcher()
        self.matchers['memory_mapped'] = MemoryMappedMatcher()
        self.matchers['sparse_embeddings'] = SparseEmbeddingMatcher()
        self.matchers['batch_processing'] = BatchProcessingMatcher()

    def fit(self, texts: List[str]):
        """Fit each registered matcher; a failure in one is printed and does not stop the rest."""
        self.texts = texts

        for name, matcher in self.matchers.items():
            try:
                # The ONNX matcher has to be converted before it can be fitted.
                needs_conversion = name == 'onnx' and hasattr(matcher, 'convert_to_onnx')
                if needs_conversion and not matcher.convert_to_onnx():
                    print(f"Skipping {name} due to conversion failure")
                    continue
                matcher.fit(texts)
                print(f"✅ {name} fitted successfully")
            except Exception as e:
                print(f"❌ {name} failed: {e}")

    def find_matches(self, query_texts: List[str], top_k=5):
        """Return every method's results plus a weighted-vote ``'ultimate_ensemble'`` ranking."""
        all_results = {}

        # Collect per-method results; a matcher that raises is reported and skipped.
        for name, matcher in self.matchers.items():
            try:
                all_results.update(matcher.find_matches(query_texts, top_k))
            except Exception as e:
                print(f"Error in {name}: {e}")

        # Per-method vote weight; methods not listed here fall back to 0.1.
        method_weights = {
            'onnx': 0.2,
            'precomputed': 0.15,
            'lsh': 0.15,
            'memory_mapped': 0.2,
            'sparse_embeddings': 0.15,
            'batch_processing': 0.15
        }

        ensemble_results = []
        for query_pos in range(len(query_texts)):
            tally = {}

            for method, per_query in all_results.items():
                if query_pos >= len(per_query):
                    continue
                weight = method_weights.get(method, 0.1)
                # A candidate at rank r (0 = best) earns (top_k - r) * weight.
                for rank, candidate in enumerate(per_query[query_pos][:top_k]):
                    tally[candidate] = tally.get(candidate, 0) + (top_k - rank) * weight

            # Highest total first; ties keep the order in which candidates
            # were first seen.
            winners = sorted(tally, key=tally.get, reverse=True)[:top_k]
            ensemble_results.append(list(winners))

        all_results['ultimate_ensemble'] = ensemble_results
        return all_results

In [35]:
import numpy as np

def convert_ndarray(obj):
    """Recursively replace NumPy values inside ``obj`` with plain Python ones.

    Handles dicts (values, and NumPy-scalar keys), lists, tuples, NumPy
    arrays (converted to nested lists) and NumPy scalars such as
    ``np.int64`` (converted to the matching Python scalar). Anything else is
    returned unchanged. The result of converting typical matcher output is
    accepted by ``json.dumps``.
    """
    if isinstance(obj, dict):
        return {
            (k.item() if isinstance(k, np.generic) else k): convert_ndarray(v)
            for k, v in obj.items()
        }
    elif isinstance(obj, list):
        return [convert_ndarray(i) for i in obj]
    elif isinstance(obj, tuple):
        return tuple(convert_ndarray(i) for i in obj)
    elif isinstance(obj, np.ndarray):
        return obj.tolist()
    elif isinstance(obj, np.generic):
        # NumPy scalars (e.g. indices inside a plain list) are not JSON
        # serializable until unwrapped.
        return obj.item()
    else:
        return obj

In [2]:
import numpy as np
import psutil
import time
import tracemalloc

def run_comprehensive_comparison(dataset_path="alethia/notebooks/Amazon-Google", split="train.csv"):
    """Benchmark every matcher on the Amazon-Google dataset and print rankings.

    For each matcher, fit + query time is measured with a monotonic clock and
    Python-level allocations are measured with ``tracemalloc``. Matcher
    construction happens before timing starts and is not included.

    Args:
        dataset_path: Directory passed to ``DatasetLoader.load_amazon_google``.
        split: File name of the split to load.

    Returns:
        Dict mapping matcher name to a result dict. Every entry has
        ``success``, ``time``, ``memory`` and ``peak_memory``; successful
        entries also have ``matches`` and ``avg_matches``, failed entries
        have ``error``.
    """
    from alethia.datasets import DatasetLoader

    print("🚀 COMPREHENSIVE ML FUZZY MATCHING OPTIMIZATION COMPARISON")
    print("=" * 70)
    print("📊 Loading real benchmark dataset...")

    # Load real data
    texts, query_texts, ground_truth = DatasetLoader.load_amazon_google(
        dataset_path=dataset_path, split=split
    )

    print(f"Dataset size: {len(texts)} items")
    print(f"Query size: {len(query_texts)} queries\n")

    matchers = {
        'baseline': BaselineFuzzyMatcher(),
        'onnx': ONNXOptimizedMatcher(),
        'precomputed': PrecomputedSimilarityMatcher(),
        'lsh': LSHMatcher(),
        'memory_mapped': MemoryMappedMatcher(),
        'sparse_embeddings': SparseEmbeddingMatcher(),
        'batch_processing': BatchProcessingMatcher(),
        'ultimate_combined': UltimateCombinedMatcher()
    }

    results = {}

    for name, matcher in matchers.items():
        print(f"🔧 Testing {name.upper()} approach...")

        start = time.perf_counter()
        tracemalloc.start()

        try:
            if name == 'onnx':
                if not matcher.convert_to_onnx():
                    raise RuntimeError("ONNX conversion failed")
            matcher.fit(texts)
            matches = matcher.find_matches(query_texts, top_k=5)

            current, peak = tracemalloc.get_traced_memory()
            elapsed = time.perf_counter() - start

            # A matcher that produced nothing is a failure, not a result.
            if not matches:
                raise RuntimeError("matcher returned no results")

            first_method_matches = list(matches.values())[0]
            if len(first_method_matches):
                avg_matches = float(np.mean([len(m) for m in first_method_matches]))
            else:
                avg_matches = 0.0

            results[name] = {
                'success': True,
                'time': round(elapsed, 4),
                'memory': round(current / 1024 / 1024, 2),
                'peak_memory': round(peak / 1024 / 1024, 2),
                'matches': matches,
                'avg_matches': avg_matches
            }

            print(f"   ✅ Time: {results[name]['time']}s")
            print(f"   ✅ Memory: {results[name]['memory']}MB")
            print(f"   ✅ Peak Memory: {results[name]['peak_memory']}MB\n")

        except Exception as e:
            current, peak = tracemalloc.get_traced_memory()
            elapsed = time.perf_counter() - start
            results[name] = {
                'success': False,
                'error': str(e),
                'time': round(elapsed, 4),
                'memory': round(current / 1024 / 1024, 2),
                'peak_memory': round(peak / 1024 / 1024, 2),
            }
            print(f"   ❌ Failed: {e}\n")

        finally:
            # Always stop tracing so one matcher's allocations never leak into
            # the next measurement, even on an interrupt.
            tracemalloc.stop()

    # Generate Rankings
    success_results = {k: v for k, v in results.items() if v['success']}
    print("\n📈 COMPREHENSIVE COMPARISON RESULTS")
    print("=" * 70)

    if not success_results:
        # Nothing to rank or recommend.
        print("\n❌ No strategy completed successfully.")
        return results

    # Rankings
    speed_ranking = sorted(success_results.items(), key=lambda x: x[1]['time'])
    memory_ranking = sorted(success_results.items(), key=lambda x: x[1]['memory'])

    print("\n🏆 PERFORMANCE RANKINGS:\n----------------------------------------")
    print("⚡ SPEED RANKING (fastest to slowest):")
    for i, (name, r) in enumerate(speed_ranking, 1):
        print(f"   {i}. {name.upper():<20}: {r['time']}s")

    print("\n🧠 MEMORY RANKING (most to least efficient):")
    for i, (name, r) in enumerate(memory_ranking, 1):
        print(f"   {i}. {name.upper():<20}: {r['memory']}MB")

    # Match sample output
    print("\n🎯 MATCH QUALITY ANALYSIS:\n----------------------------------------")
    for name, r in success_results.items():
        print(f"{name.upper():<20}: {r['avg_matches']:.2f} avg matches")
        sample_match = list(r['matches'].values())[0][0:5] if r['matches'] else []
        print(f"  Sample match: {sample_match}...")

    # Strategy recommendations
    best_speed = speed_ranking[0][0]
    best_memory = memory_ranking[0][0]

    print("\n🌟 RECOMMENDATIONS:\n----------------------------------------")
    print(f"• For SPEED: {best_speed.upper()} ({success_results[best_speed]['time']}s)")
    print(f"• For MEMORY: {best_memory.upper()} ({success_results[best_memory]['memory']}MB)")

    return results


In [3]:
# Additional utility functions for the benchmark
def compare_specific_strategies(strategy_names, dataset_size=1000):
    """Compare only specific strategies with custom dataset size.

    Only the requested matchers are constructed, so selecting a cheap
    strategy does not pay for loading the models of the others.

    Args:
        strategy_names: Names of the strategies to run. Recognised names are
            'baseline', 'onnx', 'precomputed', 'lsh', 'memory_mapped',
            'sparse_embeddings', 'batch_processing' and 'ultimate_combined'.
            Unknown names are reported and skipped.
        dataset_size: Number of synthetic products to generate.

    Returns:
        Dict mapping strategy name to ``{'success': True, 'execution_time',
        'memory_used', 'matches'}`` or ``{'success': False, 'error'}``.
        Empty if no requested strategy is available.
    """

    print("🎯 TARGETED STRATEGY COMPARISON")
    print(f"Strategies: {', '.join(strategy_names)}")
    print(f"Dataset size: {dataset_size}")
    print("=" * 50)

    # Generate smaller dataset
    df = DatasetGenerator.create_product_dataset(n_samples=dataset_size)
    texts = (df['name'] + ' ' + df['description']).tolist()

    query_texts = [
        "Apple iPhone wireless",
        "Nike running shoes",
        "Samsung TV 4K"
    ]

    # Factories rather than instances: construction (which loads models for
    # several matchers) is deferred until a strategy is actually selected.
    matcher_factories = {
        'baseline': BaselineFuzzyMatcher,
        'onnx': ONNXOptimizedMatcher if ONNX_AVAILABLE else None,
        'precomputed': PrecomputedSimilarityMatcher,
        'lsh': LSHMatcher,
        'memory_mapped': MemoryMappedMatcher,
        'sparse_embeddings': SparseEmbeddingMatcher,
        'batch_processing': BatchProcessingMatcher,
        'ultimate_combined': UltimateCombinedMatcher
    }

    unknown = [name for name in strategy_names if name not in matcher_factories]
    if unknown:
        print(f"⚠️ Unknown strategies ignored: {', '.join(unknown)}")

    selected_factories = {name: factory for name, factory in matcher_factories.items()
                          if name in strategy_names and factory is not None}

    if not selected_factories:
        print("❌ No valid strategies found!")
        return {}

    results = {}

    for name, factory in selected_factories.items():
        print(f"🔧 Testing {name.upper()}...")
        monitor = PerformanceMonitor()

        try:
            # Built before the clock starts, so timings cover fit + query only.
            matcher = factory()

            monitor.start()
            # The ONNX matcher yields no results unless it has been converted;
            # treat a failed conversion as a failure rather than an empty success.
            if name == 'onnx' and not matcher.convert_to_onnx():
                raise RuntimeError("ONNX conversion failed")
            matcher.fit(texts)
            matches = matcher.find_matches(query_texts, top_k=3)
            monitor.stop()

            stats = monitor.get_stats()
            results[name] = {
                'execution_time': stats['execution_time'],
                'memory_used': stats['memory_used'],
                'matches': len(matches),
                'success': True
            }

            print(f"   ✅ Time: {stats['execution_time']:.2f}s, Memory: {stats['memory_used']:.1f}MB")

        except Exception as e:
            print(f"   ❌ Failed: {str(e)}")
            results[name] = {'success': False, 'error': str(e)}

    return results


def benchmark_scaling_performance():
    """Run a fixed set of strategies at several dataset sizes and print how they scale.

    Returns:
        Dict mapping dataset size to the per-strategy results returned by
        ``compare_specific_strategies`` for that size.
    """

    print("📈 SCALING PERFORMANCE ANALYSIS")
    print("=" * 40)

    dataset_sizes = [100, 500, 1000, 2000]
    strategies = ['baseline', 'lsh', 'sparse_embeddings']

    scaling_results = {}
    for n_items in dataset_sizes:
        print(f"\n📊 Testing with {n_items} samples...")
        scaling_results[n_items] = compare_specific_strategies(strategies, n_items)

    print("\n📈 SCALING ANALYSIS:")
    print("-" * 30)

    for strategy in strategies:
        print(f"\n{strategy.upper()} Scaling:")
        times, memories = [], []

        for n_items in dataset_sizes:
            outcome = scaling_results.get(n_items, {}).get(strategy)
            # Skip sizes where the strategy did not run or did not succeed.
            if outcome is None or not outcome['success']:
                continue
            elapsed = outcome['execution_time']
            used = outcome['memory_used']
            times.append(elapsed)
            memories.append(used)
            print(f"  {n_items:>4} samples: {elapsed:.2f}s, {used:.1f}MB")

        if len(times) < 2:
            continue

        # Ratio of the last successful run to the first; a non-positive
        # first value is reported as infinite growth.
        time_scaling = times[-1] / times[0] if times[0] > 0 else float('inf')
        memory_scaling = memories[-1] / memories[0] if memories[0] > 0 else float('inf')
        print(f"  Scaling factor: Time {time_scaling:.2f}x, Memory {memory_scaling:.2f}x")

    return scaling_results

In [22]:
# Quick smoke test: run only the baseline and LSH strategies on a
# 500-product synthetic dataset.
quick_results = compare_specific_strategies(['baseline', 'lsh'], dataset_size=500)

🎯 TARGETED STRATEGY COMPARISON
Strategies: baseline, lsh
Dataset size: 500
🔧 Testing BASELINE...
   ✅ Time: 0.68s, Memory: 393.0MB
🔧 Testing LSH...
   ✅ Time: 0.64s, Memory: 1.2MB


In [24]:
# Custom run: edit the two values below to pick which strategies to compare
# and how many synthetic products to generate.
my_strategies = ['baseline', 'lsh', 'sparse_embeddings']  # strategy names to run
my_dataset_size = 800  # number of synthetic products
custom_results = compare_specific_strategies(my_strategies, my_dataset_size)

🎯 TARGETED STRATEGY COMPARISON
Strategies: baseline, lsh, sparse_embeddings
Dataset size: 800
🔧 Testing BASELINE...
   ✅ Time: 0.33s, Memory: 3.6MB
🔧 Testing LSH...
   ✅ Time: 1.00s, Memory: 1.7MB
🔧 Testing SPARSE_EMBEDDINGS...
Sparsity: 94.84%
   ✅ Time: 0.38s, Memory: 8.9MB


In [33]:
from datetime import datetime
import json
import numpy as np
import pandas as pd


In [None]:
# Full benchmark over all matchers, using the function's default dataset path and split.
full_results = run_comprehensive_comparison()