**Setup and Installation**

In [4]:
!pip install -q langchain
!pip install -q langchain_community
!pip install -q transformers
!pip install -q torch
!pip install -q numpy
!pip install -q scikit-learn
!pip install -q sentence-transformers

**Import required libraries**

In [5]:
import re
import time
from typing import List, Dict, Any

import numpy as np
import torch
from langchain_core.embeddings import Embeddings
from sentence_transformers import SentenceTransformer
from sklearn.metrics.pairwise import cosine_similarity
from transformers import AutoTokenizer, AutoModel

**Basic Custom Embeddings**

In [None]:
#First, let's implement a simple custom embedding model that demonstrates the basic structure:

class SimpleCustomEmbeddings(Embeddings):
    """Toy embedding model producing deterministic pseudo-random unit vectors.

    The vector for a text is derived only from the sum of its character code
    points, so it carries no semantic meaning: texts whose characters add up
    to the same value (for example anagrams) receive identical embeddings.
    The class demonstrates the structure of a custom ``Embeddings``
    implementation.
    """

    def __init__(self, dimension: int = 512):
        """Set up the embedder.

        Args:
            dimension: Number of components in every generated embedding.
        """
        self.dimension = dimension
        # One entry is appended to each list per embedded text
        # (documents and queries alike).
        self.metrics: Dict[str, List[Any]] = {
            'processing_times': [],
            'text_lengths': []
        }

    def _validate_input(self, text: str) -> str:
        """Validate and clean input text.

        Args:
            text: Raw input text.

        Returns:
            The text with leading and trailing whitespace removed.

        Raises:
            ValueError: If ``text`` is not a string or is empty after stripping.
        """
        if not isinstance(text, str):
            raise ValueError("Input must be a string")
        cleaned_text = text.strip()
        if not cleaned_text:
            raise ValueError("Input text cannot be empty")
        return cleaned_text

    def _compute_embedding(self, text: str) -> List[float]:
        """Compute a deterministic embedding based on text characteristics.

        Args:
            text: Already validated text.

        Returns:
            A list of ``self.dimension`` floats scaled to unit L2 norm.
        """
        # Create a simple hash of the text
        hash_value = sum(ord(c) for c in text)
        # Keep the seed inside the range NumPy accepts (0 to 2**32 - 1)
        seed = hash_value % (2**32 - 1)

        # A private RandomState yields the same stream as np.random.seed(seed)
        # followed by np.random.uniform, but leaves the global NumPy RNG
        # untouched for the rest of the notebook.
        rng = np.random.RandomState(seed)
        embedding = rng.uniform(-1, 1, self.dimension)

        # Normalize the embedding (skip the division for a zero vector)
        norm = np.linalg.norm(embedding)
        if norm > 0:
            embedding = embedding / norm
        return embedding.tolist()

    def _monitor_performance(self, text: str, start_time: float):
        """Record elapsed time and text length for one embedded text.

        Args:
            text: The text as passed by the caller (before stripping).
            start_time: ``time.perf_counter()`` reading taken before processing.
        """
        # perf_counter is monotonic and high resolution, unlike time.time()
        self.metrics['processing_times'].append(time.perf_counter() - start_time)
        self.metrics['text_lengths'].append(len(text))

    def embed_documents(self, texts: List[str]) -> List[List[float]]:
        """Generate embeddings for a list of documents.

        Each document goes through the same validate/compute/record pipeline
        as a query, so this delegates to ``embed_query``.

        Args:
            texts: Documents to embed.

        Returns:
            One embedding per document, in input order.

        Raises:
            ValueError: If any document is not a string or is empty.
        """
        return [self.embed_query(text) for text in texts]

    def embed_query(self, text: str) -> List[float]:
        """Generate embedding for a single query text.

        Args:
            text: Query text.

        Returns:
            The embedding as a list of floats.

        Raises:
            ValueError: If ``text`` is not a string or is empty.
        """
        start_time = time.perf_counter()
        validated_text = self._validate_input(text)
        embedding = self._compute_embedding(validated_text)
        self._monitor_performance(text, start_time)
        return embedding

"""Let's test our simple custom embeddings:"""

# Initialize the simple custom embeddings with 64-dimensional vectors
# (the class default is 512).
simple_embedder = SimpleCustomEmbeddings(dimension=64)

# Test texts; this list is reused by the transformer test and the model
# comparison further down the notebook.
test_texts = [
    "Machine learning is fascinating",
    "AI is transforming industries",
    "Neural networks are powerful",
    "Data science is the future"
]

print("Testing Simple Custom Embeddings:")
print("\nGenerating embeddings for multiple documents...")
doc_embeddings = simple_embedder.embed_documents(test_texts)

print("\nGenerating embedding for a single query...")
query_embedding = simple_embedder.embed_query("What is machine learning?")

# Both lengths should equal the dimension passed to the constructor above.
print("\nEmbedding Statistics:")
print(f"Document embedding dimension: {len(doc_embeddings[0])}")
print(f"Query embedding dimension: {len(query_embedding)}")

# Print performance metrics, averaged over every text embedded by this
# embedder instance so far (the documents and the query).
print("\nPerformance Metrics:")
avg_time = np.mean(simple_embedder.metrics['processing_times'])
avg_length = np.mean(simple_embedder.metrics['text_lengths'])
print(f"Average processing time: {avg_time:.4f} seconds")
print(f"Average text length: {avg_length:.1f} characters")


**Transformer-Based Custom Embeddings**

In [None]:
#Now let's implement a more sophisticated embedding model using transformers:

class TransformerCustomEmbeddings(Embeddings):
    """Custom embeddings backed by a ``SentenceTransformer`` model.

    Alongside the embeddings, the class keeps simple per-text timing and
    length metrics in ``self.metrics``.
    """

    def __init__(self, model_name: str = "sentence-transformers/all-mpnet-base-v2"):
        """Load the model and set up metrics.

        Args:
            model_name: Name passed to ``SentenceTransformer``.
        """
        self.model = SentenceTransformer(model_name)
        # One entry is appended to each list per embedded text.
        self.metrics: Dict[str, List[Any]] = {
            'processing_times': [],
            'text_lengths': []
        }

    def _validate_input(self, text: str) -> str:
        """Validate and clean input text.

        Args:
            text: Raw input text.

        Returns:
            The text with leading and trailing whitespace removed.

        Raises:
            ValueError: If ``text`` is not a string or is empty after stripping.
        """
        if not isinstance(text, str):
            raise ValueError("Input must be a string")
        cleaned_text = text.strip()
        if not cleaned_text:
            raise ValueError("Input text cannot be empty")
        return cleaned_text

    def _compute_embedding(self, text: str) -> List[float]:
        """Compute the embedding of one text using the transformer model."""
        embedding = self.model.encode(text)
        return embedding.tolist()

    def _record_metrics(self, text: str, elapsed: float):
        """Append one (elapsed seconds, text length) sample to the metrics."""
        self.metrics['processing_times'].append(elapsed)
        self.metrics['text_lengths'].append(len(text))

    def _monitor_performance(self, text: str, start_time: float):
        """Record elapsed time and text length for one embedded text.

        Args:
            text: The text as passed by the caller (before stripping).
            start_time: ``time.perf_counter()`` reading taken before processing.
        """
        self._record_metrics(text, time.perf_counter() - start_time)

    def embed_documents(self, texts: List[str]) -> List[List[float]]:
        """Generate embeddings for a list of documents in one batch.

        Args:
            texts: Documents to embed.

        Returns:
            One embedding per document, in input order; ``[]`` for no documents.

        Raises:
            ValueError: If any document is not a string or is empty.
        """
        if not texts:
            # Nothing to encode; avoid calling the model with an empty batch.
            return []

        start_time = time.perf_counter()
        validated_texts = [self._validate_input(text) for text in texts]
        embeddings = self.model.encode(validated_texts)

        # The batch is encoded in a single call, so attribute an equal share of
        # the elapsed time to each document. Logging the full batch time once
        # per document would inflate the average processing time.
        per_text_time = (time.perf_counter() - start_time) / len(texts)
        for text in texts:
            self._record_metrics(text, per_text_time)

        return embeddings.tolist()

    def embed_query(self, text: str) -> List[float]:
        """Generate embedding for a single query text.

        Args:
            text: Query text.

        Returns:
            The embedding as a list of floats.

        Raises:
            ValueError: If ``text`` is not a string or is empty.
        """
        start_time = time.perf_counter()
        validated_text = self._validate_input(text)
        embedding = self._compute_embedding(validated_text)
        self._monitor_performance(text, start_time)
        return embedding

"""Test the transformer-based embeddings:"""

print("Testing Transformer-Based Embeddings:")
# The whole demo is wrapped in a broad try/except so that a failure here is
# printed instead of stopping the notebook; the comparison cell later checks
# whether `transformer_embedder` exists before using it.
try:
    # Initialize the transformer-based embeddings (default model name)
    transformer_embedder = TransformerCustomEmbeddings()

    print("\nGenerating embeddings using transformer model...")
    transformer_doc_embeddings = transformer_embedder.embed_documents(test_texts)
    transformer_query_embedding = transformer_embedder.embed_query("What is machine learning?")

    print("\nEmbedding Statistics:")
    print(f"Document embedding dimension: {len(transformer_doc_embeddings[0])}")
    print(f"Query embedding dimension: {len(transformer_query_embedding)}")

    # Print performance metrics (this rebinds avg_time / avg_length from the
    # simple-embedder cell)
    print("\nPerformance Metrics:")
    avg_time = np.mean(transformer_embedder.metrics['processing_times'])
    avg_length = np.mean(transformer_embedder.metrics['text_lengths'])
    print(f"Average processing time: {avg_time:.4f} seconds")
    print(f"Average text length: {avg_length:.1f} characters")
except Exception as e:
    print(f"Error testing transformer embeddings: {str(e)}")

**Domain-Specific Custom Embeddings**

In [None]:
#Let's implement a domain-specific embedding model that combines multiple embedding sources:

class DomainSpecificEmbeddings(Embeddings):
    """Sentence-transformer embeddings with domain-specific preprocessing.

    Known abbreviations are expanded to their full terms before encoding, and
    embeddings of texts that mention a domain term are L2-normalised. Simple
    per-text timing and length metrics are kept in ``self.metrics``.
    """

    def __init__(self, base_model_name: str = "sentence-transformers/all-mpnet-base-v2"):
        """Load the base model and set up metrics and the domain vocabulary.

        Args:
            base_model_name: Name passed to ``SentenceTransformer``.
        """
        self.base_model = SentenceTransformer(base_model_name)
        # One entry is appended to each list per embedded text.
        self.metrics: Dict[str, List[Any]] = {
            'processing_times': [],
            'text_lengths': []
        }

        # Domain-specific vocabulary (example): lowercase abbreviation -> full term
        self.domain_vocab = {
            'ml': 'machine learning',
            'ai': 'artificial intelligence',
            'dl': 'deep learning',
            'nn': 'neural network'
        }

    def _validate_input(self, text: str) -> str:
        """Validate and clean input text.

        Args:
            text: Raw input text.

        Returns:
            The text with leading and trailing whitespace removed.

        Raises:
            ValueError: If ``text`` is not a string or is empty after stripping.
        """
        if not isinstance(text, str):
            raise ValueError("Input must be a string")
        cleaned_text = text.strip()
        if not cleaned_text:
            raise ValueError("Input text cannot be empty")
        return cleaned_text

    def _record_metrics(self, text: str, elapsed: float):
        """Append one (elapsed seconds, text length) sample to the metrics."""
        self.metrics['processing_times'].append(elapsed)
        self.metrics['text_lengths'].append(len(text))

    def _monitor_performance(self, text: str, start_time: float):
        """Record elapsed time and text length for one embedded text.

        Args:
            text: The text as passed by the caller.
            start_time: ``time.perf_counter()`` reading taken before processing.
        """
        self._record_metrics(text, time.perf_counter() - start_time)

    def _preprocess_text(self, text: str) -> str:
        """Apply domain-specific preprocessing.

        Lower-cases the text and expands every abbreviation from
        ``self.domain_vocab`` that appears as a whole word, optionally followed
        by a plural "s" (so "NNs" becomes "neural networks").

        Args:
            text: Input text.

        Returns:
            The lower-cased text with abbreviations expanded.
        """
        text = text.lower()
        if not self.domain_vocab:
            return text

        # Word boundaries (rather than surrounding spaces) also catch an
        # abbreviation at the start or end of the text or next to punctuation.
        # Longest abbreviations come first so a shorter one cannot shadow them.
        alternatives = "|".join(
            re.escape(abbrev)
            for abbrev in sorted(self.domain_vocab, key=len, reverse=True)
        )
        pattern = re.compile(rf"\b({alternatives})(s?)\b")
        # Single pass: text inserted by one expansion is never re-scanned.
        return pattern.sub(
            lambda match: self.domain_vocab[match.group(1)] + match.group(2),
            text,
        )

    def _enhance_embedding(self, base_embedding: List[float], text: str) -> List[float]:
        """Enhance base embedding with domain-specific features.

        Args:
            base_embedding: Embedding produced by the base model.
            text: Text in which to look for full domain terms; callers pass
                the preprocessed text so expanded abbreviations are counted.

        Returns:
            The (possibly adjusted) embedding as a list of floats.
        """
        # Simple example: Adjust embeddings based on domain term presence
        embedding = np.array(base_embedding)

        # Count domain terms
        lowered = text.lower()
        domain_term_count = sum(1 for term in self.domain_vocab.values()
                                if term in lowered)

        # Slightly adjust embedding based on domain term presence
        # NOTE(review): scaling every component by the same factor and then
        # dividing by the L2 norm cancels the scaling, so the net effect is
        # only the normalisation and the vector's direction is unchanged.
        # A real domain boost would need a direction-changing adjustment.
        if domain_term_count > 0:
            adjustment = 0.1 * domain_term_count
            embedding = embedding * (1 + adjustment)
            norm = np.linalg.norm(embedding)
            if norm > 0:
                embedding = embedding / norm

        return embedding.tolist()

    def embed_documents(self, texts: List[str]) -> List[List[float]]:
        """Generate domain-aware embeddings for documents.

        Args:
            texts: Documents to embed.

        Returns:
            One embedding per document, in input order; ``[]`` for no documents.

        Raises:
            ValueError: If any document is not a string or is empty.
        """
        if not texts:
            # Nothing to encode; avoid calling the model with an empty batch.
            return []

        start_time = time.perf_counter()

        # Validate and preprocess texts
        processed_texts = [
            self._preprocess_text(self._validate_input(text)) for text in texts
        ]

        # Get base embeddings
        base_embeddings = self.base_model.encode(processed_texts)

        # Enhance embeddings; domain terms are counted on the preprocessed text
        # so that expanded abbreviations are seen.
        enhanced_embeddings = [
            self._enhance_embedding(emb.tolist(), processed)
            for emb, processed in zip(base_embeddings, processed_texts)
        ]

        # The batch is encoded in one call, so attribute an equal share of the
        # elapsed time to each document.
        per_text_time = (time.perf_counter() - start_time) / len(texts)
        for text in texts:
            self._record_metrics(text, per_text_time)

        return enhanced_embeddings

    def embed_query(self, text: str) -> List[float]:
        """Generate domain-aware embedding for query.

        Args:
            text: Query text.

        Returns:
            The embedding as a list of floats.

        Raises:
            ValueError: If ``text`` is not a string or is empty.
        """
        start_time = time.perf_counter()
        processed_text = self._preprocess_text(self._validate_input(text))
        base_embedding = self.base_model.encode(processed_text)
        enhanced_embedding = self._enhance_embedding(base_embedding.tolist(), processed_text)
        self._monitor_performance(text, start_time)
        return enhanced_embedding

"""Test the domain-specific embeddings:"""

print("Testing Domain-Specific Embeddings:")
# The whole demo is wrapped in a broad try/except so that a failure here is
# printed instead of stopping the notebook; the comparison cell later checks
# whether `domain_embedder` exists before using it.
try:
    # Initialize the domain-specific embeddings (default base model)
    domain_embedder = DomainSpecificEmbeddings()

    # Test with domain-specific texts: they use abbreviations (ML, DL, AI, NNs)
    # that the embedder's domain vocabulary targets.
    domain_texts = [
        "ML and DL are advancing rapidly",
        "AI is transforming industries",
        "NNs are the foundation of deep learning",
        "The future of ML looks promising"
    ]

    print("\nGenerating domain-specific embeddings...")
    domain_doc_embeddings = domain_embedder.embed_documents(domain_texts)
    domain_query_embedding = domain_embedder.embed_query("What is ML and AI?")

    print("\nEmbedding Statistics:")
    print(f"Document embedding dimension: {len(domain_doc_embeddings[0])}")
    print(f"Query embedding dimension: {len(domain_query_embedding)}")
except Exception as e:
    print(f"Error testing domain embeddings: {str(e)}")

**Comparing Different Embedding Models**

In [None]:
#Let's compare the different embedding approaches:

def compare_embeddings(texts: List[str], query: str, embedders: Dict[str, Embeddings]) -> Dict[str, Dict[str, Any]]:
    """Compare different embedding models on the same documents and query.

    For every embedder, the documents and the query are embedded and the
    cosine similarity between the query and each document is computed.

    Args:
        texts: Documents to embed.
        query: Query text compared against every document.
        embedders: Mapping from a display name to an embedder exposing
            ``embed_documents`` and ``embed_query``.

    Returns:
        A dict keyed by embedder name. On success the entry holds
        ``'similarities'`` (one float per document, in input order),
        ``'processing_time'`` (seconds for embedding plus similarity) and
        ``'status': 'success'``. If the embedder raised, the entry holds
        ``'status': 'error'`` and ``'error'`` (the exception message).
    """
    results = {}

    for name, embedder in embedders.items():
        # A failing embedder must not abort the comparison of the others, so
        # any exception is captured and reported in that embedder's entry.
        try:
            # perf_counter is monotonic and high resolution, unlike time.time()
            start_time = time.perf_counter()

            # Generate embeddings
            doc_embeddings = embedder.embed_documents(texts)
            query_embedding = embedder.embed_query(query)

            # Calculate similarities in one vectorised call: the result row
            # holds the query-vs-document similarity for every document.
            if len(doc_embeddings) > 0:
                query_matrix = np.asarray(query_embedding).reshape(1, -1)
                doc_matrix = np.asarray(doc_embeddings)
                # tolist() yields plain Python floats, which are JSON-safe
                similarities = cosine_similarity(query_matrix, doc_matrix)[0].tolist()
            else:
                similarities = []

            processing_time = time.perf_counter() - start_time

            results[name] = {
                'similarities': similarities,
                'processing_time': processing_time,
                'status': 'success'
            }
        except Exception as e:
            results[name] = {
                'status': 'error',
                'error': str(e)
            }

    return results

print("Comparing Embedding Models:")

# The simple embedder always exists at this point; the model-backed embedders
# are included only if the cells that create them managed to bind the names.
embedders = {'Simple': simple_embedder}
if 'transformer_embedder' in locals():
    embedders['Transformer'] = transformer_embedder
if 'domain_embedder' in locals():
    embedders['Domain-Specific'] = domain_embedder

query = "What is machine learning?"
comparison_results = compare_embeddings(test_texts, query, embedders)

# Report one section per model: timing and per-document similarity on
# success, the captured error message otherwise.
print("\nEmbedding Models Comparison:")
for model_name, result in comparison_results.items():
    print(f"\n{model_name} Embeddings:")
    if result['status'] != 'success':
        print(f"Error: {result['error']}")
        continue
    print(f"Processing time: {result['processing_time']:.4f} seconds")
    print("Similarities with query:")
    for text, sim in zip(test_texts, result['similarities']):
        print(f"  {text}: {sim:.4f}")

**Save Results**

In [None]:
#Save the comparison results for later analysis:

import json
from datetime import datetime

# Prepare results for saving. Failed models keep their error message so the
# saved file explains why a model has no timings or similarities; fields that
# do not apply to an entry are stored as null.
save_results = {
    'timestamp': datetime.now().isoformat(),
    'comparison_results': {
        name: {
            'status': outcome['status'],
            'processing_time': outcome.get('processing_time'),
            'similarities': outcome.get('similarities'),
            'error': outcome.get('error')
        }
        for name, outcome in comparison_results.items()
    },
    'test_texts': test_texts,
    'query': query
}

# Save results to file (explicit encoding so the result does not depend on the
# platform default)
with open('embedding_comparison_results.json', 'w', encoding='utf-8') as f:
    json.dump(save_results, f, indent=2)

print("\nResults saved to embedding_comparison_results.json")