In [None]:
#!/usr/bin/env python
# coding: utf-8

# # Ada's Spark Memory Pinecone POC
# 
# This notebook implements a proof-of-concept using Pinecone to create a vector database for Ada's memory.
# 
# ## Setup Instructions
# 
# ### 1. Install Required Packages
# 
# Run the following in your terminal or in a notebook cell:
# ```
# pip install --upgrade pinecone python-dotenv tqdm numpy pandas matplotlib seaborn umap-learn scikit-learn plotly
# ```
# 
# ### 2. Create a .env File
# 
# Create a file named `.env` in the same directory as this notebook with your Pinecone API key:
# ```
# PINECONE_API_KEY=your_api_key_here
# ```
# 
# ### 3. Run the Notebook
# 
# Execute the cells below to create your Pinecone vector database and test search capabilities.


In [None]:
# On my personal machine: conda activate pinecone_poc

In [None]:
# Import necessary libraries
import json
import os
import numpy as np
import pandas as pd
from tqdm.auto import tqdm
from pinecone import Pinecone, ServerlessSpec
from dotenv import load_dotenv
import time
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from umap import UMAP
from sklearn.metrics.pairwise import cosine_similarity
import plotly.express as px
import plotly.graph_objects as go
from datetime import datetime

In [None]:
# Load the generated Q&A pairs from disk.
# The encoding is pinned to UTF-8 (the standard JSON encoding) instead of relying
# on the platform default, which is not UTF-8 on every OS and can garble or
# reject non-ASCII text in the questions and answers.
with open('../generated_qa_pairs_combined_clean_20250603_214922.json', 'r', encoding='utf-8') as f:
    data = json.load(f)


In [None]:
# Summarize the dataset: total count first, then one line per question
question_total = len(data)
print(f"Loaded {question_total} questions:")
for entry in data:
    qid, text, answers = entry['question_id'], entry['question_text'], entry['answers']
    print(f"{qid}: {text} ({len(answers)} answers)")

In [None]:
# Load environment variables from .env file.
# The Pinecone client cell reads PINECONE_API_KEY from the environment, so this
# has to run before that cell.
load_dotenv()

In [None]:
# Build the Pinecone client from the API key found in the environment;
# stop immediately with an explanatory error when the key is missing or empty.
api_key = os.environ.get("PINECONE_API_KEY")
if api_key:
    pc = Pinecone(api_key=api_key)
else:
    raise ValueError("PINECONE_API_KEY not found in environment variables. Please check your .env file.")

In [None]:
# List the indexes visible to this client
existing_indexes = pc.list_indexes()
print("Existing indexes:", existing_indexes)

In [None]:
# Embedding model passed to pc.inference.embed, paired with its expected vector
# length (llama-text-embed-v2 is expected to produce 1024-dimensional vectors)
model_name, model_dimension = "llama-text-embed-v2", 1024

print(f"Using embedding model: {model_name} with dimension {model_dimension}")

In [None]:
# Smoke-test the embedding API with a single sentence before doing real work.
# Reset the response first so the error handler below never reports a stale
# value left over from a previous run of this cell.
test_embed = None
try:
    test_embed = pc.inference.embed(
        model=model_name,
        inputs=["This is a test sentence."],
        parameters={"input_type": "passage"}
    )
    print("✅ Embedding API test successful!")

    # The response item may be a structured object, a mapping, or a bare vector.
    # On a mapping, `.values` is a bound method rather than the data, so fall
    # back to key access whenever the attribute turns out to be callable.
    first_embedding = test_embed[0]
    vector_values = getattr(first_embedding, 'values', first_embedding)
    if callable(vector_values):
        vector_values = first_embedding['values']
    actual_dimension = len(vector_values)
    print(f"  Produced a vector with dimension: {actual_dimension}")

    # Verify dimension matches expected; adopt the real value so the index
    # created afterwards is sized correctly.
    if actual_dimension != model_dimension:
        print(f"⚠️ Warning: Actual dimension ({actual_dimension}) doesn't match expected ({model_dimension})")
        # Update model_dimension to the actual value
        model_dimension = actual_dimension
        print(f"  Updated model_dimension to: {model_dimension}")

except Exception as e:
    print(f"❌ Embedding API test failed: {str(e)}")
    print("  Check your API key and model availability in your Pinecone account.")
    # Print the type and structure of the response for debugging, but only when
    # this run actually received one.
    if test_embed is not None:
        print(f"  Response type: {type(test_embed)}")
        try:
            print(f"  First item type: {type(test_embed[0])}")
            print(f"  Response structure: {dir(test_embed[0])}")
        except Exception as debug_error:
            # Diagnostics are best-effort and must not mask the original error
            print(f"  (could not inspect the first response item: {debug_error})")
    raise

### Create Pinecone Index
Now let's create a Pinecone index to store our question/answer embeddings.

In [None]:
# Name of the index used by the rest of the notebook
index_name = "adas-memory-qa-poc"

# Start from a clean slate: drop any existing index that carries this name
index_already_exists = any(existing.name == index_name for existing in pc.list_indexes())
if index_already_exists:
    print(f"Deleting existing index: {index_name}")
    pc.delete_index(index_name)


In [None]:
# Provision a new serverless index sized for the embedding model
serverless_spec = ServerlessSpec(
    cloud="aws",
    region="us-east-1",  # Choose the region closest to you
)
pc.create_index(
    name=index_name,
    dimension=model_dimension,  # Use the dimension from the model
    metric="cosine",  # Change to match your desired similarity metric
    spec=serverless_spec,
)

print(f"Created index '{index_name}' with dimension {model_dimension}")

In [None]:
# Connect to the newly created index
index = pc.Index(index_name)

# Fetch and print the index statistics as a quick sanity check.
# NOTE(review): this only displays the stats; it does not itself wait for or
# verify that the index is ready.
index_stats = index.describe_index_stats()
print(f"Index stats: {index_stats}")

### Generate Embeddings and Insert Data
 
Now let's generate embeddings for all questions and insert them into Pinecone.


In [None]:
# Collect the question text of every entry; these strings are what gets embedded
questions = []
for qa_entry in data:
    questions.append(qa_entry['question_text'])
print(f"Generating embeddings for {len(questions)} questions using {model_name}...")

In [None]:
# Generate embeddings in batches to respect API limits
def generate_embeddings_in_batches(texts, model_name, batch_size=90, input_type="query", client=None):
    """
    Embed a collection of texts with a Pinecone-hosted model, one batch per request.

    Parameters:
    - texts: Iterable of strings to embed
    - model_name: Name of the hosted embedding model
    - batch_size: Number of texts per request; the default of 90 stays safely
      under the 96 limit
    - input_type: Value sent as the "input_type" embedding parameter
      (defaults to "query", the value this function always used)
    - client: Pinecone client to call; defaults to the notebook-level `pc`

    Returns:
    - List with one embedding vector per input text, in input order

    Raises:
    - ValueError: If batch_size is smaller than 1 (a negative size would
      otherwise silently produce no embeddings at all)
    """
    if batch_size < 1:
        raise ValueError(f"batch_size must be a positive integer, got {batch_size!r}")

    # Resolve the client lazily so an explicit `client` works without a global `pc`
    embed_client = pc if client is None else client
    texts = list(texts)  # accept any iterable, including generators
    all_embeddings = []

    print(f"Generating embeddings for {len(texts)} questions in batches of {batch_size}...")

    batch_starts = range(0, len(texts), batch_size)
    for batch_number, start in enumerate(tqdm(batch_starts)):
        # Small delay between requests to be nice to the API; there is nothing
        # to wait for before the first request or after the last one.
        if batch_number:
            time.sleep(0.1)

        # Generate embeddings for this batch
        batch_response = embed_client.inference.embed(
            model=model_name,
            inputs=texts[start:start + batch_size],
            parameters={"input_type": input_type}
        )

        for embedding_obj in batch_response:
            # Items may be structured objects, mappings, or bare vectors. On a
            # mapping `.values` is a bound method, so use key access instead.
            values = getattr(embedding_obj, 'values', embedding_obj)
            if callable(values):
                values = embedding_obj['values']
            all_embeddings.append(values)

    return all_embeddings

In [None]:
# Embed every question with the batched helper (default batch size)
embeddings = generate_embeddings_in_batches(
    texts=questions,
    model_name=model_name,
)

In [None]:
# Report how many vectors came back and the length of the first one
embedding_count = len(embeddings)
first_embedding_length = len(embeddings[0])
print(f"Generated {embedding_count} embeddings of dimension {first_embedding_length}")

In [None]:
# Verify we got the expected dimension
if not embeddings:
    # Nothing to inspect: fail with a clear message instead of an IndexError
    raise ValueError("No embeddings were generated; check that the dataset contains questions.")

embedding_dimension = len(embeddings[0])
if embedding_dimension != model_dimension:
    print(f"Warning: Embedding dimension ({embedding_dimension}) doesn't match expected dimension ({model_dimension})")
    # The index was created earlier in the notebook using the old model_dimension,
    # so updating the variable alone does not make the upsert succeed.
    print(f"Warning: index '{index_name}' was created with dimension {model_dimension}; "
          f"recreate it with dimension {embedding_dimension} before upserting")
    # Update model_dimension to the actual value
    model_dimension = embedding_dimension

In [None]:
# Build one Pinecone record per question: ID, embedding vector, and metadata
pinecone_records = [
    {
        "id": qa_entry['question_id'],    # question_id doubles as the vector ID
        "values": embeddings[position],   # embedding stored at the same position as the question
        "metadata": {
            "question_text": qa_entry['question_text'],
            "category": qa_entry['category'],
            # Answers are serialized into one JSON string since Pinecone metadata has size limits
            "answers_json": json.dumps(qa_entry['answers']),
        },
    }
    for position, qa_entry in enumerate(data)
]

print(f"Prepared {len(pinecone_records)} records for insertion")

In [None]:
# Upsert the prepared records into Pinecone, 100 records per request
batch_size = 100
batch_starts = range(0, len(pinecone_records), batch_size)
for start in batch_starts:
    stop = start + batch_size
    index.upsert(vectors=pinecone_records[start:stop])

print(f"Inserted {len(pinecone_records)} records into Pinecone")


In [None]:
# Wait for the upserted records to show up in the index statistics.
# Polling replaces a fixed 10-second sleep: it finishes as soon as the records
# are counted, and it warns instead of silently continuing if they never appear.
print("Waiting for indexing to complete...")
# Records sharing an ID overwrite each other, so count distinct IDs
expected_vector_count = len({record["id"] for record in pinecone_records})
wait_deadline = time.monotonic() + 60  # give up after one minute
while index.describe_index_stats()['total_vector_count'] < expected_vector_count:
    if time.monotonic() >= wait_deadline:
        print(f"⚠️ Warning: index still reports fewer than {expected_vector_count} vectors after 60 seconds")
        break
    time.sleep(1)
print("Waiting complete")

In [None]:
# Confirm the upsert by reading the vector count back from the index
index_stats = index.describe_index_stats()
stored_vector_count = index_stats['total_vector_count']
print(f"Index now contains {stored_vector_count} vectors")

### Search Function

Let's create a function to search for similar questions based on user queries.


In [None]:
def search_similar_questions(query_text, limit=5, include_answers=True):
    """
    Search for questions similar to the query text

    Parameters:
    - query_text: The text to search for
    - limit: Maximum number of results to return
    - include_answers: Whether to include answers in the results

    Returns:
    - List of dicts with "question_id", "question_text", "category" and "score"
      (plus "answers" when requested and stored), in the order returned by the
      index. Empty when the query is blank or limit is below 1.
    """
    # Nothing meaningful to search for: skip the embedding and query round-trips
    if not query_text or not query_text.strip() or limit < 1:
        return []

    # Generate embedding for the query using Pinecone's hosted model
    query_embedding = pc.inference.embed(
        model=model_name,
        inputs=[query_text],
        parameters={"input_type": "query"}
    )

    # The response item may be a structured object, a mapping, or a bare vector.
    # On a mapping `.values` is a bound method, so use key access instead.
    first_embedding = query_embedding[0]
    query_vector = getattr(first_embedding, 'values', first_embedding)
    if callable(query_vector):
        query_vector = first_embedding['values']

    # Search Pinecone
    results = index.query(
        vector=query_vector,
        top_k=limit,
        include_metadata=True
    )

    # Process results
    formatted_results = []
    for match in results.matches:
        # A match without stored metadata yields empty fields instead of a crash
        metadata = match.metadata or {}
        result_item = {
            "question_id": match.id,
            "question_text": metadata.get("question_text", ""),
            "category": metadata.get("category", ""),
            "score": match.score,  # Similarity score reported by the index
        }

        # Parse the JSON string back to a list if answers are included
        if include_answers and "answers_json" in metadata:
            result_item["answers"] = json.loads(metadata["answers_json"])

        formatted_results.append(result_item)

    return formatted_results

In [None]:
# Sample queries used to exercise the search function.
# The list deliberately includes near-paraphrases (the first two entries) so the
# matches for differently worded versions of the same question can be compared.
test_queries = [
    "What was Ada's personality like?",
    "What was Ada like as a person?",
    "How did Ada have fun during treatment?",
    "What were Ada's favorite activities or things to do?",
    "What impact did Ada have on people around her?",
    "Tell me about a funny Ada story",
    "How did the family handle difficult times?"
]

In [None]:
# Run every sample query and print its top two matches
separator = "=" * 80
for query in test_queries:
    print("\n" + separator)
    print(f"Query: '{query}'")
    print(separator)

    results = search_similar_questions(query, limit=2)

    print("\nTop matches:")
    for rank, hit in enumerate(results, start=1):
        print(f"\n{rank}. Question ID: {hit['question_id']}")
        print(f"   Question: {hit['question_text']}")
        print(f"   Category: {hit['category']}")
        print(f"   Similarity Score: {hit['score']:.4f}")

        # Answers are only present when the search was asked to include them
        if "answers" not in hit:
            continue

        answers = hit['answers']
        print(f"   Number of answers: {len(answers)}")
        if not answers:
            continue

        # Show the first answer as an example, truncated to 200 characters
        answer_text = answers[0]['answer_text']
        if len(answer_text) > 200:
            print(f"   First answer: {answer_text[:200]}...")
        else:
            print(f"   First answer: {answer_text}")


# Potential Future Directions to Improve Search

In [None]:
# see chat here for more details: https://claude.ai/chat/ee414354-7c79-40d3-9806-6cda04792aaf

# FUTURE IMPROVEMENTS
# ==================
#
# 1. Hybrid Search
#    Implement hybrid search to combine vector similarity with keyword matching.
#    This helps when users use different terminology than what's in our questions.
#
# 2. Basic Re-ranking
#    Add post-processing that re-ranks results based on exact phrase matches,
#    keyword overlap, and category-specific boosts. This can improve relevancy
#    without external APIs.
#
# 3. Question Expansion
#    Generate alternative phrasings for each question in the dataset.
#    For example, "What was Ada like?" could become "Tell me about Ada's personality".
#    This creates more entry points for queries.
#
# 4. Synonym Handling
#    Create mappings of synonyms for important terms (e.g., "funny" → "humorous",
#    "activities" → "hobbies"). Expand queries with these synonyms to improve matching.
#
# 5. Multi-Strategy Search
#    Try several search approaches in parallel (vector, hybrid, expanded) and
#    combine the results with different weights.
#
# 6. Content Expansion
#    Add more questions and answers to the dataset. Search quality typically
#    improves dramatically with more content, especially when targeting different
#    ways people might phrase the same question.
#
# 7. Track & Learn from Queries
#    Log actual search queries and use them to identify gaps in coverage.
#    Add new questions based on common user searches.
#
# 8. Parameter Tuning
#    Experiment with different values for hybrid search alpha, re-ranking weights,
#    and similarity thresholds to optimize for our specific use case.