# AWS Services Embeddings Creation

In [None]:
import json
from pathlib import Path
from typing import List, Dict, Any
from sentence_transformers import SentenceTransformer
import numpy as np
from tqdm import tqdm
import re

from utils.embedding_checkpoint import EmbeddingCheckpointManager

In [None]:
class AWSEmbeddingCreator:
    """Turn AWS service/method documentation into text and embed it.

    Each service and method JSON document is flattened into one or more
    plain-text "formats" (identifier names split into words, whitespace
    normalised), encoded with a SentenceTransformer model, and written to
    its own JSON file under the output directory. Progress is recorded
    through an ``EmbeddingCheckpointManager`` so an interrupted run can
    skip work that was already marked as processed.
    """

    # "fooBar" -> "foo Bar": a lowercase letter directly followed by an uppercase one.
    _LOWER_UPPER_BOUNDARY = re.compile(r'([a-z])([A-Z])')
    # "HTTPServer" -> "HTTP Server": an uppercase run followed by a capitalised word.
    _ACRONYM_WORD_BOUNDARY = re.compile(r'([A-Z]+)([A-Z][a-z])')
    # Purely alphabetic words that start with a capital and contain a later capital.
    _MIXED_CASE_WORD = re.compile(r'\b[A-Z][a-z]*[A-Z][a-zA-Z]*\b')

    def __init__(self, model_name: str = "Qwen/Qwen3-Embedding-0.6B", batch_size: int = 32,
                 checkpoint_file: str = "embeddings/embeddings_checkpoint.json"):
        """Load the embedding model and set up checkpointing.

        Args:
            model_name: Model identifier handed to ``SentenceTransformer``.
            batch_size: Batch size forwarded to ``model.encode``.
            checkpoint_file: Path handed to ``EmbeddingCheckpointManager``.
        """
        # The model is requested with the flash_attention_2 implementation,
        # automatic device placement, and left-side padding in the tokenizer.
        self.model = SentenceTransformer(
            model_name,
            model_kwargs={"attn_implementation": "flash_attention_2", "device_map": "auto"},
            tokenizer_kwargs={"padding_side": "left"}
        )
        self.batch_size = batch_size
        self.checkpoint_manager = EmbeddingCheckpointManager(checkpoint_file)

    # ------------------------------------------------------------------
    # Private helpers
    # ------------------------------------------------------------------

    @staticmethod
    def _read_json(path: Path) -> Any:
        """Load a JSON document, always decoding it as UTF-8."""
        # An explicit encoding keeps reads independent of the platform locale.
        with open(path, 'r', encoding='utf-8') as f:
            return json.load(f)

    @staticmethod
    def _write_json(path: Path, data: Dict[str, Any]) -> None:
        """Write ``data`` as indented JSON, creating parent directories first."""
        path.parent.mkdir(parents=True, exist_ok=True)
        with open(path, 'w', encoding='utf-8') as f:
            json.dump(data, f, indent=2)

    def _describe(self, raw_description: str) -> str:
        """Normalise whitespace in a description and split mixed-case names."""
        return self.convert_names_in_description(self.clean_text(raw_description))

    def _format_named_entry(self, entry: Dict[str, Any]) -> str:
        """Render a ``{'name', 'description'}`` dict as ``"name: description"``.

        Returns just the name when there is no description, and an empty
        string when there is no name (such entries are dropped by callers).
        """
        name = self.convert_case_to_words(entry.get('name', ''))
        if not name:
            return ""
        description = self._describe(entry.get('description', ''))
        return f"{name}: {description}" if description else name

    # ------------------------------------------------------------------
    # Text normalisation
    # ------------------------------------------------------------------

    def clean_text(self, text: str) -> str:
        """Collapse every run of whitespace (including newlines) to one space.

        Falsy input (``None`` or ``""``) yields an empty string.
        """
        if not text:
            return ""
        # split() with no arguments also discards leading/trailing whitespace.
        return " ".join(text.split())

    def convert_case_to_words(self, name: str) -> str:
        """Convert PascalCase, camelCase, or snake_case to space-separated words.

        Letter case is preserved; only spaces are inserted. Falsy input
        yields an empty string.
        """
        if not name:
            return ""

        # snake_case: underscores become spaces.
        spaced = name.replace("_", " ")
        # PascalCase / camelCase: split at lower->upper transitions first,
        # then detach a trailing capitalised word from an acronym run.
        spaced = self._LOWER_UPPER_BOUNDARY.sub(r'\1 \2', spaced)
        spaced = self._ACRONYM_WORD_BOUNDARY.sub(r'\1 \2', spaced)
        return spaced.strip()

    def convert_names_in_description(self, description: str) -> str:
        """Split mixed-case identifiers embedded in free text into words.

        Only purely alphabetic words that begin with an uppercase letter and
        contain at least one further uppercase letter are rewritten (e.g.
        ``DescribeInstances``). Words that start lowercase (``maxResults``)
        or contain digits (``EC2``) do not match the pattern and are left
        untouched.
        """
        if not description:
            return ""

        converted = self._MIXED_CASE_WORD.sub(
            lambda match: self.convert_case_to_words(match.group()),
            description,
        )
        return converted.strip()

    # ------------------------------------------------------------------
    # Format builders
    # ------------------------------------------------------------------

    def format_parameters(self, parameters: List[Dict[str, Any]]) -> str:
        """Format parameters as ``"Parameters: a: desc | b | ..."``.

        Entries without a name are skipped. Returns an empty string when
        nothing usable remains.
        """
        if not parameters:
            return ""

        entries = [text for text in map(self._format_named_entry, parameters) if text]
        if not entries:
            return ""
        return "Parameters: " + " | ".join(entries)

    def format_return_structure(self, return_structure: List[Dict[str, Any]]) -> str:
        """Format return fields as ``"Returns: a: desc | b | ..."``.

        Only the entries listed under each item's ``nested_items`` are
        rendered; the top-level items themselves contribute nothing.
        Returns an empty string when nothing usable remains.
        """
        if not return_structure:
            return ""

        entries = []
        for item in return_structure:
            # A missing or empty nested_items value means nothing to render.
            for nested in item.get('nested_items') or []:
                text = self._format_named_entry(nested)
                if text:
                    entries.append(text)

        if not entries:
            return ""
        return "Returns: " + " | ".join(entries)

    def create_service_format(self, service_data: Dict[str, Any]) -> str:
        """Create the single text representation for a service.

        Reads ``service_data['service_name']`` and
        ``service_data['client']['description']`` (the description is
        optional). The result is ``"<name words>. <description>"`` or just
        the name words when there is no description.

        Raises:
            KeyError: If ``service_name`` or ``client`` is missing.
        """
        name_words = self.convert_case_to_words(service_data['service_name'])
        description = self._describe(service_data['client'].get('description', ''))
        return f"{name_words}. {description}" if description else name_words

    def create_method_formats(self, method_data: Dict[str, Any],
                            service_name: str, service_description: str) -> Dict[str, str]:
        """Create the three text representations for a method.

        Returns a dict with the keys:
            ``method_only``: method name and description.
            ``with_service_params``: service, method, and parameters.
            ``with_service_params_returns``: the previous text plus returns.

        ``service_description`` is inserted as given; it is not cleaned here.
        """
        method_name_words = self.convert_case_to_words(method_data.get('method_name', ''))
        method_description = self._describe(method_data.get('description', ''))
        service_name_words = self.convert_case_to_words(service_name)

        params = self.format_parameters(method_data.get('parameters', []))
        returns = self.format_return_structure(method_data.get('return_structure', []))

        # Format 1: method name + description
        if method_description:
            method_only = f"{method_name_words}. {method_description}"
        else:
            method_only = method_name_words

        # Format 2: service + method + parameters
        parts = []
        if service_name_words and service_description:
            parts.append(f"Service: {service_name_words}. {service_description}")
        elif service_name_words:
            parts.append(f"Service: {service_name_words}")
        parts.append(f"Method: {method_only}")
        if params:
            parts.append(params)
        with_service_params = " ".join(parts)

        # Format 3: format 2 with the return description appended, if any
        if returns:
            with_service_params_returns = with_service_params + " " + returns
        else:
            with_service_params_returns = with_service_params

        return {
            'method_only': method_only,
            'with_service_params': with_service_params,
            'with_service_params_returns': with_service_params_returns,
        }

    # ------------------------------------------------------------------
    # Embedding and persistence
    # ------------------------------------------------------------------

    def create_embeddings_batch(self, texts: List[str]) -> np.ndarray:
        """Create embeddings for a batch of texts.

        Empty and whitespace-only texts are dropped before encoding, so the
        result can have fewer rows than ``texts``. When nothing is left an
        empty array is returned and the model is not called; callers that
        index ``[0]`` on that result get an ``IndexError``.
        """
        non_empty = [t for t in texts if t and t.strip()]
        if not non_empty:
            return np.array([])

        return self.model.encode(non_empty, batch_size=self.batch_size, show_progress_bar=False)

    def save_service_embedding(self, service_name: str, embedding: np.ndarray,
                              formatted_text: str, output_dir: Path):
        """Write one service embedding to ``<output_dir>/services/<service_name>.json``."""
        self._write_json(
            output_dir / "services" / f"{service_name}.json",
            {
                "service_name": service_name,
                "formatted_text": formatted_text,
                "embedding": embedding.tolist()
            },
        )

    def save_method_embedding(self, service_name: str, method_name: str,
                            formats: Dict[str, str], embeddings: Dict[str, np.ndarray],
                            output_dir: Path):
        """Write one method's embeddings to ``<output_dir>/methods/<service_name>/<method_name>.json``."""
        self._write_json(
            output_dir / "methods" / service_name / f"{method_name}.json",
            {
                "service_name": service_name,
                "method_name": method_name,
                "formats": formats,
                "embeddings": {k: v.tolist() for k, v in embeddings.items()}
            },
        )

    # ------------------------------------------------------------------
    # Pipeline phases
    # ------------------------------------------------------------------

    def process_services(self, services_dir: Path, output_dir: Path) -> Dict[str, str]:
        """Embed every service JSON file in ``services_dir`` with checkpoint support.

        Services already marked as processed are not re-embedded, but their
        descriptions are still loaded because the methods phase needs them.

        Returns:
            Mapping of service name to its cleaned description.
        """
        self.checkpoint_manager.set_phase("services")

        service_descriptions = {}  # Store for later use with methods
        service_files = list(services_dir.glob("*.json"))

        # Check resume info
        resume_info = self.checkpoint_manager.get_resume_info()
        print(f"Resume info: {resume_info}")

        # NOTE(review): the processed-check below keys on the file stem, while
        # newly processed services are recorded under the JSON 'service_name'
        # field. Presumably the two are identical - confirm against the data.
        services_to_process = []
        for service_file in service_files:
            service_name = service_file.stem
            if self.checkpoint_manager.is_service_processed(service_name):
                # Still need to load description for methods processing
                service_data = self._read_json(service_file)
                service_descriptions[service_name] = self._describe(
                    service_data['client'].get('description', ''))
            else:
                services_to_process.append(service_file)

        if not services_to_process:
            print("All services already processed!")
            return service_descriptions

        print(f"Processing {len(services_to_process)} remaining services (out of {len(service_files)} total)...")

        for service_file in tqdm(services_to_process, desc="Processing services"):
            # The file is closed as soon as it is parsed, before encoding starts.
            service_data = self._read_json(service_file)
            service_name = service_data['service_name']

            # Store clean description for methods
            service_descriptions[service_name] = self._describe(
                service_data['client'].get('description', ''))

            formatted_text = self.create_service_format(service_data)
            embedding = self.create_embeddings_batch([formatted_text])[0]
            self.save_service_embedding(service_name, embedding, formatted_text, output_dir)

            # Mark as processed only after the embedding file was written
            self.checkpoint_manager.mark_service_as_processed(service_name)

        return service_descriptions

    def process_methods(self, methods_dir: Path, service_descriptions: Dict[str, str],
                       output_dir: Path):
        """Embed every method JSON file under ``methods_dir`` with checkpoint support.

        ``methods_dir`` is expected to contain one sub-directory per service;
        the directory name is used as the service name and as the key into
        ``service_descriptions`` (a missing key gives an empty description).
        Method files without a ``method_name`` are skipped.
        """
        self.checkpoint_manager.set_phase("methods")

        service_dirs = [d for d in methods_dir.iterdir() if d.is_dir()]
        print(f"\nProcessing methods for {len(service_dirs)} services...")

        for service_dir in tqdm(service_dirs, desc="Processing method services"):
            service_name = service_dir.name
            service_desc = service_descriptions.get(service_name, "")

            # NOTE(review): as with services, the processed-check uses the file
            # stem while the mark below uses the JSON 'method_name' field.
            methods_to_process = [
                method_file
                for method_file in service_dir.glob("*.json")
                if not self.checkpoint_manager.is_method_processed(service_name, method_file.stem)
            ]

            if not methods_to_process:
                continue  # Skip if all methods in this service are processed

            for method_file in tqdm(methods_to_process, desc=f"Methods in {service_name}", leave=False):
                method_data = self._read_json(method_file)
                method_name = method_data.get('method_name', '')

                if not method_name:  # Skip if no method name
                    continue

                formats = self.create_method_formats(method_data, service_name, service_desc)

                # One embedding per format, keyed by the format name
                method_embeddings = {
                    format_name: self.create_embeddings_batch([format_text])[0]
                    for format_name, format_text in formats.items()
                }

                self.save_method_embedding(service_name, method_name, formats,
                                         method_embeddings, output_dir)

                # Mark as processed only after the embedding file was written
                self.checkpoint_manager.mark_method_as_processed(service_name, method_name)

    # ------------------------------------------------------------------
    # Checkpoint passthroughs
    # ------------------------------------------------------------------

    def get_checkpoint_status(self):
        """Return the checkpoint manager's resume info."""
        return self.checkpoint_manager.get_resume_info()

    def reset_checkpoint(self):
        """Reset the checkpoint so the next run starts fresh."""
        self.checkpoint_manager.reset_checkpoint()

In [None]:
# Build the embedding creator with the default model and checkpoint file
creator = AWSEmbeddingCreator(batch_size=32)

# Input documentation folders and the folder that receives the embeddings
docs_dir = Path("docs_subset")
services_dir = docs_dir / "services_subset"
methods_dir = docs_dir / "methods_subset"
output_dir = Path("embeddings")

# Create the output directory (and any missing parents) if it doesn't exist
output_dir.mkdir(parents=True, exist_ok=True)

# Show where a previous run left off before doing any work
checkpoint_status = creator.get_checkpoint_status()
print("🔄 Checkpoint Status:")
print(f"📍 Current phase: {checkpoint_status['phase']}")
print(f"✅ Services processed: {checkpoint_status['services_processed']}")
print(f"✅ Methods processed: {checkpoint_status['methods_processed']}")
print("=" * 50)

In [None]:
# Uncomment the line below if you want to start fresh and reset the checkpoint
# creator.reset_checkpoint()
# print("🔄 Checkpoint reset! Starting fresh...")

In [None]:
# Services phase: embed each service and keep the returned
# name -> cleaned description mapping for the methods phase.
print("Processing services...")
service_descriptions = creator.process_services(
    services_dir=services_dir,
    output_dir=output_dir,
)
processed_count = len(service_descriptions)
print(f"Processed {processed_count} services")

In [None]:
# Methods phase: embed every method, reusing the service descriptions
# gathered during the services phase.
print("\nProcessing methods...")
creator.process_methods(
    methods_dir=methods_dir,
    service_descriptions=service_descriptions,
    output_dir=output_dir,
)

In [None]:
# Count total files created
services_out = output_dir / "services"
methods_out = output_dir / "methods"

# The output sub-directories are only created when an embedding is saved, so
# a run with nothing to save leaves them missing. Count a missing directory
# as zero files instead of letting iterdir() raise FileNotFoundError.
service_files = len(list(services_out.glob("*.json"))) if services_out.is_dir() else 0
method_files = (
    sum(len(list(service_dir.glob("*.json")))
        for service_dir in methods_out.iterdir()
        if service_dir.is_dir())
    if methods_out.is_dir()
    else 0
)

# Final checkpoint status (printed by the next cell)
final_status = creator.get_checkpoint_status()

print("\n ✅ Embedding creation complete!")
print(f"📂 Created {service_files} service embedding files")
print(f"📂 Created {method_files} method embedding files")

In [None]:
# Report the checkpoint state captured after both phases finished
print("\n🔄 Final Checkpoint Status:")
print(f"📍 Phase: {final_status['phase']}")
print(f"✅ Services processed: {final_status['services_processed']}")
print(f"✅ Methods processed: {final_status['methods_processed']}")
print("\n💾 Progress saved to embeddings_checkpoint.json")

In [None]:
# Describe how the embedded texts were built, so consumers of the
# embedding files know what each format contains.
method_format_descriptions = {
    "method_only": "Method name + description",
    "with_service_params": "Service name + service description + method name + method description + parameters",
    "with_service_params_returns": "Service name + service description + method name + method description + parameters + returns",
}

processing_notes = {
    "case_handling": "All PascalCase, camelCase, and snake_case names are converted to space-separated words",
    "description_processing": "PascalCase/camelCase names within descriptions are converted to words",
    "file_structure": "Each service and method gets its own JSON file with embedding and formatted text",
}

metadata = {
    "service_format": "Service name (converted to words) + description (with PascalCase/camelCase converted)",
    "method_formats": method_format_descriptions,
    "notes": processing_notes,
}

# Written next to the embeddings as embedding_metadata.json
metadata_path = output_dir / "embedding_metadata.json"
metadata_path.write_text(json.dumps(metadata, indent=2))