### Token Management


Without limits:
User: "Tell me about..."
AI: Generates 2000 tokens = $0.024

With max_tokens=500:
User: "Tell me about..."
AI: Generates 500 tokens = $0.006

Savings: 75% of the output-token cost per query (at the $0.012 per 1,000 output tokens implied above)

In [1]:
def estimate_tokens(text, chars_per_token=4):
    """Roughly estimate how many tokens ``text`` contains.

    This is a character-count heuristic, not a real tokenizer.

    Args:
        text: The text to measure (anything that supports ``len``).
        chars_per_token: Assumed average number of characters per token.
            Defaults to 4, the original rule of thumb; pass another positive
            integer when a different ratio fits the text better.

    Returns:
        ``len(text) // chars_per_token`` (rounded down).

    Raises:
        ValueError: If ``chars_per_token`` is zero or negative.
    """
    # Reject ratios that would divide by zero or yield negative counts.
    if chars_per_token <= 0:
        raise ValueError("chars_per_token must be a positive number")
    return len(text) // chars_per_token

def truncate_context(messages, max_tokens=4000):
    """Drop the oldest messages until the conversation fits a token budget.

    The list is trimmed in place and the same list object is returned, so
    callers that ignore the return value still see the shortened history.

    Args:
        messages: List of dicts, oldest first, each with a 'content' entry
            (presumably a string) that is passed to ``estimate_tokens``.
        max_tokens: Largest estimated token total to keep.

    Returns:
        ``messages`` itself, with zero or more leading entries removed. At
        least one message is always kept, even if it alone exceeds
        ``max_tokens``.
    """
    # Estimate each message once; the running total is then adjusted by
    # subtraction instead of re-summing the whole list after every removal.
    costs = [estimate_tokens(m['content']) for m in messages]
    remaining = sum(costs)

    # Count how many of the oldest messages must go, never dropping the last.
    drop = 0
    while remaining > max_tokens and len(messages) - drop > 1:
        remaining -= costs[drop]
        drop += 1

    # One slice deletion replaces repeated pop(0) calls; skipped entirely
    # when nothing needs trimming so the input is left untouched.
    if drop:
        del messages[:drop]

    return messages

In [2]:
# Response Caching

import hashlib
import json

# In-memory cache: MD5 hex digest of the query string -> generated response.
# NOTE(review): nothing in this section evicts entries, so the dict grows with
# every distinct query — confirm that is acceptable for the intended workload.
response_cache = {}

def get_cached_or_generate(query):
    """Return the response for ``query``, generating it only on a cache miss.

    The cache key is the MD5 hex digest of the UTF-8 encoded query, so only
    byte-identical queries share an entry. On a miss the response comes from
    ``call_bedrock`` and is stored in ``response_cache`` before being
    returned; on a hit "Cache hit!" is printed and the stored value returned.
    """
    digest = hashlib.md5(query.encode()).hexdigest()

    # Miss: generate, remember, and hand back the fresh response.
    if digest not in response_cache:
        fresh = call_bedrock(query)
        response_cache[digest] = fresh
        return fresh

    # Hit: reuse what was stored earlier.
    print("Cache hit!")
    return response_cache[digest]


In [4]:
# Semantic Caching

from sentence_transformers import SentenceTransformer

# Embedding model used by the semantic-cache lookup below; created once at
# cell/module level so every lookup reuses the same instance.
# NOTE(review): needs the third-party `sentence-transformers` package — the
# recorded output of this cell shows it failing with ModuleNotFoundError.
model = SentenceTransformer('all-MiniLM-L6-v2')

def find_similar_cached_query(new_query, threshold=0.9):
    """Return a cached response for a sufficiently similar earlier query.

    ``new_query`` is embedded once, then compared against the embedding of
    each key in ``cache`` in iteration order. The response of the first entry
    whose similarity is strictly greater than ``threshold`` is returned
    (first match, not best match). Returns ``None`` when nothing qualifies,
    meaning the caller has to generate a new response.

    Every cached query is re-embedded on each call.

    NOTE(review): ``cache`` and ``cosine_similarity`` are not defined in this
    section — confirm they are in scope. ``cache`` is presumably a mapping of
    query text -> response; ``response_cache`` is keyed by MD5 digest and
    cannot be used here as-is.
    """
    query_vec = model.encode(new_query)

    # Lazy scan: stops embedding cached queries at the first match.
    matches = (
        stored_response
        for stored_query, stored_response in cache.items()
        if cosine_similarity(query_vec, model.encode(stored_query)) > threshold
    )
    return next(matches, None)

ModuleNotFoundError: No module named 'sentence_transformers'