# 06 - Vector Store Setup

This notebook sets up ChromaDB for semantic search and context retrieval.

## Objectives
- Initialize ChromaDB with persistent storage
- Chunk and embed training contexts from SoQG
- Build a searchable vector index
- Test retrieval with sample queries
- Integrate with the retrieval pipeline

## Why Vector Store?
The vector store enables:
1. **Local Context Retrieval** - Find similar contexts from training data
2. **Hybrid Search** - Combine with Wikipedia/Gemini for comprehensive results
3. **Fast Inference** - Sub-second similarity search

## 1. Setup and Imports

In [None]:
import pandas as pd
import numpy as np
from pathlib import Path
from tqdm import tqdm
import json
from datetime import datetime

import chromadb
from chromadb.config import Settings
from sentence_transformers import SentenceTransformer

# Seed NumPy's global RNG once, up front.
SEED = 42
np.random.seed(SEED)

# Processed datasets are read from DATA_DIR; the ChromaDB files and the
# artifacts written later in this notebook go under VECTOR_STORE_DIR.
DATA_DIR = Path("../datasets/processed")
VECTOR_STORE_DIR = Path("../backend/vector_store")
VECTOR_STORE_DIR.mkdir(parents=True, exist_ok=True)

print("Vector store will be saved to: {}".format(VECTOR_STORE_DIR.absolute()))

## 2. Configuration

In [None]:
# Settings shared by the chunking, embedding and ChromaDB cells below.
config = dict(
    embedding_model="all-MiniLM-L6-v2",
    embedding_dim=384,
    chunk_size=500,
    chunk_overlap=100,
    collection_name="soqg_contexts",
    distance_metric="cosine",
)

print("Vector Store Configuration:")
for setting, value in config.items():
    print(f"  {setting}: {value}")

## 3. Load Training Data

In [None]:
# Read the cleaned training split and print a quick overview of it.
train_path = DATA_DIR / "train_clean.csv"
train_df = pd.read_csv(train_path)

print(f"Loaded {len(train_df)} training samples")

column_names = train_df.columns.tolist()
print(f"\nColumns: {column_names}")

question_types = train_df['question_type'].unique().tolist()
print(f"\nQuestion types: {question_types}")

In [None]:
# Keep each distinct, non-missing context once.
unique_contexts = train_df['context'].dropna().unique()
contexts = unique_contexts.tolist()
print(f"Unique contexts: {len(contexts)}")

# Character lengths, used to judge how much chunking will be needed.
context_lengths = list(map(len, contexts))
print("\nContext length stats:")
for label, stat in (("Mean", np.mean), ("Median", np.median)):
    print(f"  {label}: {stat(context_lengths):.0f} chars")
print(f"  Max: {np.max(context_lengths)} chars")

## 4. Text Chunking

Split long contexts into overlapping chunks for better retrieval.

In [None]:
def chunk_text(text, chunk_size=500, overlap=100):
    """Split text into overlapping chunks.

    Text no longer than ``chunk_size`` is returned unchanged as a single
    chunk. Longer text is cut into windows of at most ``chunk_size``
    characters, with each window starting ``overlap`` characters before the
    previous one ended. A window is shortened to end at its last space when
    doing so still moves the next window forward; otherwise it is cut
    mid-word. Chunks are whitespace-stripped and empty chunks are dropped.

    Args:
        text: The string to split.
        chunk_size: Maximum number of characters per chunk.
        overlap: Number of characters shared between consecutive chunks.

    Returns:
        A list of chunk strings in document order.

    Raises:
        ValueError: If ``text`` needs splitting and ``chunk_size`` is not
            positive or ``overlap`` is not in ``[0, chunk_size)``.
    """
    text_len = len(text)
    if text_len <= chunk_size:
        return [text]

    # Checked only once splitting is required, so short inputs keep returning
    # ``[text]`` exactly as before regardless of the other arguments.
    if chunk_size <= 0:
        raise ValueError(f"chunk_size must be positive, got {chunk_size}")
    if not 0 <= overlap < chunk_size:
        raise ValueError(
            f"overlap must satisfy 0 <= overlap < chunk_size, got overlap={overlap}, chunk_size={chunk_size}"
        )

    chunks = []
    start = 0

    while start < text_len:
        end = min(start + chunk_size, text_len)

        if end < text_len:
            space_idx = text.rfind(' ', start, end)
            # Snap to a word boundary only if the next start (end - overlap)
            # still lands after the current start. Snapping to a space within
            # `overlap` characters of `start` would stall the cursor or move
            # it backwards, and the loop would never finish.
            if space_idx > start + overlap:
                end = space_idx

        chunk = text[start:end].strip()
        if chunk:
            chunks.append(chunk)

        if end >= text_len:
            break
        start = end - overlap

    return chunks

In [None]:
# Parallel lists: all_chunks[i] is described by chunk_metadata[i].
all_chunks = []
chunk_metadata = []

for context_id, passage in enumerate(tqdm(contexts, desc="Chunking")):
    pieces = chunk_text(passage, config['chunk_size'], config['chunk_overlap'])
    piece_count = len(pieces)

    all_chunks.extend(pieces)
    chunk_metadata.extend(
        {
            "context_id": context_id,
            "chunk_idx": position,
            "total_chunks": piece_count,
            "char_length": len(piece),
        }
        for position, piece in enumerate(pieces)
    )

chunks_per_context = len(all_chunks) / len(contexts)
print(f"\nCreated {len(all_chunks)} chunks from {len(contexts)} contexts")
print(f"Average chunks per context: {chunks_per_context:.2f}")

## 5. Initialize Embedding Model

In [None]:
model_name = config['embedding_model']
embedding_model = SentenceTransformer(model_name)

# Encode one throwaway sentence to confirm the model loads and to read the
# width of the vectors it produces.
test_embedding = embedding_model.encode(["test sentence"])
embedding_width = test_embedding.shape[1]

print(f"Embedding model: {model_name}")
print(f"Embedding dimension: {embedding_width}")

## 6. Generate Embeddings

In [None]:
# Encode the chunks in fixed-size batches, collecting one vector per chunk.
batch_size = 64
all_embeddings = []

batch_starts = range(0, len(all_chunks), batch_size)
for offset in tqdm(batch_starts, desc="Embedding"):
    chunk_batch = all_chunks[offset:offset + batch_size]
    all_embeddings.extend(
        embedding_model.encode(chunk_batch, show_progress_bar=False)
    )

# Stack the per-chunk vectors into one array so they can be sliced by row later.
all_embeddings = np.array(all_embeddings)
print(f"\nGenerated embeddings shape: {all_embeddings.shape}")

## 7. Initialize ChromaDB

In [None]:
client = chromadb.PersistentClient(path=str(VECTOR_STORE_DIR))

# Start from a clean collection so re-running the notebook does not collide
# with chunk IDs added by an earlier run. Deletion is best-effort: on a first
# run there is nothing to delete. Catch Exception rather than using a bare
# `except`, so KeyboardInterrupt/SystemExit still propagate, and report what
# happened instead of hiding it.
# NOTE(review): the exception type raised for a missing collection is not
# pinned here; narrow this once the ChromaDB version in use is confirmed.
try:
    client.delete_collection(config['collection_name'])
except Exception as exc:
    print(f"No existing collection deleted ({type(exc).__name__}: {exc})")
else:
    print(f"Deleted existing collection: {config['collection_name']}")

# If the delete above failed for a real reason and the collection still
# exists, create_collection is expected to raise, which surfaces the problem.
collection = client.create_collection(
    name=config['collection_name'],
    metadata={"hnsw:space": config['distance_metric']}
)

print(f"Created collection: {config['collection_name']}")

## 8. Add Documents to Collection

In [None]:
# Insert chunks, vectors and metadata in aligned batches; IDs are
# "chunk_<global index>" so they line up with positions in all_chunks.
batch_size = 1000
chunk_total = len(all_chunks)

for start in tqdm(range(0, chunk_total, batch_size), desc="Adding to ChromaDB"):
    stop = min(start + batch_size, chunk_total)
    window = slice(start, stop)

    collection.add(
        ids=[f"chunk_{position}" for position in range(start, stop)],
        documents=all_chunks[window],
        embeddings=all_embeddings[window].tolist(),
        metadatas=chunk_metadata[window],
    )

print(f"\nTotal documents in collection: {collection.count()}")

## 9. Test Retrieval

In [None]:
def search(query, n_results=5):
    """Return the raw ChromaDB query result for the chunks nearest to ``query``.

    Uses the notebook-level ``embedding_model`` to encode the query and the
    notebook-level ``collection`` to look it up. The result includes
    documents, distances and metadatas.
    """
    encoded_query = embedding_model.encode([query])
    return collection.query(
        query_embeddings=encoded_query.tolist(),
        n_results=n_results,
        include=["documents", "distances", "metadatas"],
    )

In [None]:
# A handful of topical queries to eyeball what the index returns.
test_queries = [
    "climate change and global warming effects",
    "machine learning algorithms and neural networks",
    "photosynthesis in plants",
    "economic policies and market regulation",
    "human rights and social justice"
]

print("Retrieval Test Results:")
print("=" * 80)

for query in test_queries:
    results = search(query, n_results=3)
    print(f"\nQuery: {query}")
    print("-" * 60)

    top_docs = results['documents'][0]
    top_dists = results['distances'][0]
    for rank, (doc, dist) in enumerate(zip(top_docs, top_dists), start=1):
        # The collection was created with the cosine metric, so 1 - distance
        # is reported as the similarity.
        print(f"  [{rank}] Similarity: {1 - dist:.4f}")
        print(f"      {doc[:150]}...")

## 10. Create Retrieval Service Class

In [None]:
class VectorStoreRetriever:
    """Query a persisted ChromaDB collection with sentence-transformer embeddings."""

    def __init__(self, persist_dir, collection_name, embedding_model_name='all-MiniLM-L6-v2'):
        """Open the existing collection under ``persist_dir`` and load the encoder."""
        self.client = chromadb.PersistentClient(path=str(persist_dir))
        self.collection = self.client.get_collection(collection_name)
        self.embedding_model = SentenceTransformer(embedding_model_name)

    def search(self, query, n_results=5, min_similarity=0.3):
        """Return up to ``n_results`` hits whose similarity is at least ``min_similarity``.

        Each hit is a dict with ``text``, ``similarity`` (computed as
        ``1 - distance``) and ``metadata``, in the order the collection
        returned them.
        """
        encoded = self.embedding_model.encode([query]).tolist()
        raw = self.collection.query(
            query_embeddings=encoded,
            n_results=n_results,
            include=["documents", "distances", "metadatas"],
        )

        # One query was sent, so only the first result list is of interest.
        hits = zip(raw['documents'][0], raw['distances'][0], raw['metadatas'][0])
        scored = ((text, 1 - distance, meta) for text, distance, meta in hits)
        return [
            {"text": text, "similarity": float(score), "metadata": meta}
            for text, score, meta in scored
            if score >= min_similarity
        ]

    def get_context_for_question(self, question, keyphrases=None, n_results=5):
        """Retrieve hits for ``question`` (plus optional keyphrases) and join their text.

        Returns a dict with ``context`` (hit texts separated by blank lines)
        and ``sources`` (the list returned by ``search``).
        """
        # Keyphrases, when given, are appended to the question, space-separated.
        query = " ".join([question, *keyphrases]) if keyphrases else question
        sources = self.search(query, n_results=n_results)
        return {
            "context": "\n\n".join(hit['text'] for hit in sources),
            "sources": sources,
        }

In [None]:
# Re-open the persisted collection through the service class to check that it
# works independently of the in-memory `collection` object used above.
retriever = VectorStoreRetriever(
    persist_dir=VECTOR_STORE_DIR,
    collection_name=config['collection_name'],
    embedding_model_name=config['embedding_model']
)

test_question = "What are the effects of deforestation?"
test_result = retriever.get_context_for_question(
    test_question,
    keyphrases=["environment", "trees", "ecosystem"]
)

print("Test Retrieval:")
print(f"Query: {test_question}")
print(f"Retrieved {len(test_result['sources'])} sources")

# search() drops hits below its similarity threshold, so the list may be
# empty; guard the [0] access instead of letting the cell raise IndexError.
if test_result['sources']:
    best_source = test_result['sources'][0]
    print(f"\nFirst source (similarity: {best_source['similarity']:.3f}):")
    print(best_source['text'][:300])
else:
    print("\nNo sources met the similarity threshold for this query.")

## 11. Save Retriever Module

In [None]:
# Source of a standalone retriever module. It mirrors the VectorStoreRetriever
# class defined earlier in this notebook, with type hints added, so it can be
# imported outside the notebook.
retriever_code = '''
import chromadb
from sentence_transformers import SentenceTransformer
from typing import List, Dict, Optional

class VectorStoreRetriever:
    def __init__(self, persist_dir: str, collection_name: str, embedding_model_name: str = 'all-MiniLM-L6-v2'):
        self.client = chromadb.PersistentClient(path=persist_dir)
        self.collection = self.client.get_collection(collection_name)
        self.embedding_model = SentenceTransformer(embedding_model_name)
    
    def search(self, query: str, n_results: int = 5, min_similarity: float = 0.3) -> List[Dict]:
        query_embedding = self.embedding_model.encode([query]).tolist()
        results = self.collection.query(
            query_embeddings=query_embedding,
            n_results=n_results,
            include=["documents", "distances", "metadatas"]
        )
        formatted = []
        for doc, dist, meta in zip(results['documents'][0], results['distances'][0], results['metadatas'][0]):
            similarity = 1 - dist
            if similarity >= min_similarity:
                formatted.append({"text": doc, "similarity": float(similarity), "metadata": meta})
        return formatted
    
    def get_context_for_question(self, question: str, keyphrases: Optional[List[str]] = None, n_results: int = 5) -> Dict:
        query = question + " " + " ".join(keyphrases) if keyphrases else question
        results = self.search(query, n_results=n_results)
        combined_context = "\\n\\n".join([r['text'] for r in results])
        return {"context": combined_context, "sources": results}
'''

# Write the module next to the persisted index.
retriever_path = VECTOR_STORE_DIR / "retriever.py"
retriever_path.write_text(retriever_code)

print(f"Retriever module saved to {retriever_path}")

## 12. Save Configuration

In [None]:
# Record when the index was built and how large it is, then persist the
# settings alongside the vector store.
config.update(
    created_at=datetime.now().isoformat(),
    total_chunks=len(all_chunks),
    total_contexts=len(contexts),
)

config_json = json.dumps(config, indent=2)
(VECTOR_STORE_DIR / "config.json").write_text(config_json)

print("Configuration saved:")
print(config_json)

## 13. Retrieval Quality Analysis

In [None]:
# Rows with a missing context or question cannot be scored: the substring
# check below needs strings on both sides and the encoder needs text. Cap the
# sample at the number of usable rows so a small dataset does not make
# DataFrame.sample raise.
eval_pool = train_df.dropna(subset=['context', 'target'])
sample_questions = eval_pool.sample(min(100, len(eval_pool)), random_state=SEED)

retrieval_scores = []

for _, row in tqdm(sample_questions.iterrows(), total=len(sample_questions), desc="Testing"):
    context = row['context']
    question = row['target']

    results = retriever.search(question, n_results=5)

    if results:
        top_similarity = results[0]['similarity']
        # A hit counts when a retrieved chunk is contained in the row's source
        # context, or contains it (short contexts are stored as one chunk).
        found_exact = any(context in r['text'] or r['text'] in context for r in results)
    else:
        # No hit passed the retriever's similarity threshold.
        top_similarity = 0
        found_exact = False

    retrieval_scores.append({
        'top_similarity': top_similarity,
        'found_exact': found_exact
    })

# Name the columns explicitly so the frame has them even when nothing was scored.
scores_df = pd.DataFrame(retrieval_scores, columns=['top_similarity', 'found_exact'])

print("\nRetrieval Quality Metrics:")
print(f"  Mean top similarity: {scores_df['top_similarity'].mean():.4f}")
print(f"  Exact match rate: {scores_df['found_exact'].mean()*100:.1f}%")
print(f"  Queries with similarity > 0.5: {(scores_df['top_similarity'] > 0.5).mean()*100:.1f}%")

In [None]:
import matplotlib.pyplot as plt

# Histogram of the best similarity per sampled question, with the mean marked.
top_scores = scores_df['top_similarity']
mean_score = top_scores.mean()

fig, ax = plt.subplots(figsize=(10, 5))
ax.hist(top_scores, bins=30, color='#3B82F6', edgecolor='white', alpha=0.7)
ax.axvline(mean_score, color='red', linestyle='--', label=f"Mean: {mean_score:.3f}")
ax.set_xlabel('Top Similarity Score')
ax.set_ylabel('Frequency')
ax.set_title('Distribution of Top Retrieval Similarity Scores')
ax.legend()
fig.tight_layout()
fig.savefig(VECTOR_STORE_DIR / "retrieval_quality.png", dpi=150)
plt.show()

## 14. Summary

### What Was Built
- ChromaDB vector store of overlapping text chunks built from the training contexts (the exact count is saved as `total_chunks` in `config.json`)
- Sentence-transformer embeddings (all-MiniLM-L6-v2)
- Semantic search retrieval service
- Quality evaluation on sample questions

### Files Created
| File | Purpose |
|------|--------|
| `vector_store/` | ChromaDB persistent storage |
| `retriever.py` | Python module for retrieval |
| `config.json` | Vector store configuration |
| `retrieval_quality.png` | Quality analysis chart |

### Integration
Load the retriever in the backend (the paths below assume the working directory is `backend/`):
```python
from vector_store.retriever import VectorStoreRetriever

retriever = VectorStoreRetriever(
    persist_dir="./vector_store",
    collection_name="soqg_contexts"
)

results = retriever.search("your query here")
```

### Next Steps
1. Integrate retriever with FastAPI backend
2. Combine with Wikipedia/Gemini for hybrid search
3. Test end-to-end question generation pipeline