In [3]:
# 02_build_vector_index.ipynb - Vector Embedding & Indexing Pipeline
# --------------------------------------------------
# - Loads preprocessed ESG/Biodiversity document chunks
# - Computes embeddings using SentenceTransformers
# - Stores vectors in ChromaDB (persistent)
# - Adds traceable metadata for better RAG citations
# - Includes semantic query interface with preview
# --------------------------------------------------

# 1. IMPORTS & SETUP
import os
import pickle
from pathlib import Path
from llama_index.core.schema import Document
from sentence_transformers import SentenceTransformer
import chromadb
from dotenv import load_dotenv
from tqdm import tqdm

# Load environment variables
# NOTE(review): nothing in the visible cell reads an environment variable
# after this call — confirm whether a later cell depends on it.
load_dotenv()

# 2. LOAD PRE-PROCESSED CHUNKS
# The path is relative, so it resolves against the current working directory
# of the process running this cell.
chunked_docs_path = Path("../outputs/flattened_docs.pkl")
# Fail fast with an explicit message instead of letting open() raise later.
if not chunked_docs_path.exists():
    raise FileNotFoundError(f"Missing preprocessed chunks: {chunked_docs_path}")

# SECURITY NOTE: unpickling can execute arbitrary code embedded in the file.
# Only load a pickle that you produced yourself; never point this at a file
# from an untrusted source.
# The loaded objects are later accessed via `.text` and `.metadata.get(...)`
# (presumably llama_index Document/node objects — TODO confirm).
with open(chunked_docs_path, "rb") as f:
    flattened_docs = pickle.load(f)

print(f"Loaded {len(flattened_docs)} chunked documents.")

# 3. INITIALIZE EMBEDDING MODEL
# The same model instance encodes both the stored chunks and, later, every
# query string, so the two live in one vector space. The model name is also
# written into the collection metadata for traceability.
embedding_model_name = "all-MiniLM-L6-v2"
embedding_model = SentenceTransformer(embedding_model_name)
print(f"Loaded embedding model: {embedding_model_name}")

# 4. INITIALIZE VECTOR DATABASE (CHROMADB)
# The on-disk store lives under a relative path; create it if needed.
vector_db_path = Path("../data/vector_db")
vector_db_path.mkdir(parents=True, exist_ok=True)

client = chromadb.PersistentClient(path=str(vector_db_path))

# Single source of truth for the collection name used below.
collection_name = "biodiversity_docs"

# Rebuild from scratch: drop any previous collection so re-running the cell
# does not collide with ids ("chunk_<i>") that already exist.
try:
    client.delete_collection(collection_name)
except Exception as exc:
    # Best-effort: the collection may simply not exist yet. The exception
    # type raised for a missing collection is not pinned here, so the catch
    # stays broad — but the cause is reported instead of being swallowed,
    # which makes a real storage problem visible before create_collection.
    print(
        f"Skipped deleting collection '{collection_name}': "
        f"{type(exc).__name__}: {exc}"
    )

# Create new collection with metadata:
# - "hnsw:space": "cosine" selects cosine distance for the index
# - "embedding_model" records which model produced the stored vectors
collection = client.create_collection(
    name=collection_name,
    metadata={
        "hnsw:space": "cosine",
        "embedding_model": embedding_model_name
    }
)
print("ChromaDB collection initialized.")

# 5. COMPUTE EMBEDDINGS & STORE CHUNKS

def sanitize_metadata(metadata: dict):
    """Return a copy of ``metadata`` whose values are all str, int, float, or bool.

    Values that already have one of those types are kept unchanged, ``None``
    becomes the string ``'N/A'``, and anything else is converted with ``str()``.
    """
    primitive_types = (str, int, float, bool)
    cleaned = {}
    for key, value in metadata.items():
        if isinstance(value, primitive_types):
            cleaned[key] = value
        elif value is None:
            cleaned[key] = "N/A"
        else:
            cleaned[key] = str(value)
    return cleaned

print("Indexing document chunks...")

# Embed every chunk text in a single encode() call, then convert the result
# to plain Python lists so it can be handed to ChromaDB.
chunk_texts = [chunk.text for chunk in flattened_docs]
chunk_embeddings = embedding_model.encode(chunk_texts, show_progress_bar=True).tolist()

# Track outcomes so the final summary reflects what was actually stored
# rather than how many chunks were attempted.
indexed_count = 0
failed_count = 0

for i, chunk in enumerate(tqdm(flattened_docs)):
    # Keep only the fields needed to cite a hit; fall back to a synthetic
    # file name / "N/A" page when the chunk carries no such metadata.
    raw_metadata = {
        "file_name": chunk.metadata.get("file_name", f"chunk_{i}.txt"),
        "chunk_id": i,
        "source_page": chunk.metadata.get("page_label", "N/A")
    }
    metadata = sanitize_metadata(raw_metadata)

    try:
        collection.add(
            ids=[f"chunk_{i}"],
            embeddings=[chunk_embeddings[i]],
            metadatas=[metadata],
            # Store a placeholder when the chunk text is empty or None.
            documents=[chunk.text or "[No content]"]
        )
    except Exception as e:
        # Best-effort per chunk: report and continue with the next one.
        failed_count += 1
        print(f"Failed to index chunk {i}: {e}")
    else:
        indexed_count += 1

print(f"Indexed {indexed_count} document chunks.")
if failed_count:
    print(f"Failed to index {failed_count} of {len(flattened_docs)} document chunks.")

# 6. QUERY FUNCTION WITH COSINE SIMILARITY SCORING
def query_vector_db(query, top_k=3):
    """Print the ``top_k`` chunks most similar to ``query``.

    The query string is encoded with the module-level ``embedding_model`` and
    looked up in the module-level ``collection``. For each hit a header line
    (file name, chunk id, page, similarity) and a preview of at most 500
    characters are printed. The function returns ``None`` in every case.
    """
    query_vector = embedding_model.encode(query).tolist()
    response = collection.query(
        query_embeddings=[query_vector],
        n_results=top_k,
        include=["documents", "metadatas", "distances"],
    )

    # Guard clause: bail out when there is no response, no "documents"
    # entry, or an empty hit list for our single query.
    if not (response and response.get("documents") and response["documents"][0]):
        print("No matching documents found.")
        return

    print(f"\nQuery: {query}")
    hit_pairs = zip(response["metadatas"][0], response["distances"][0])
    for idx, (meta, distance) in enumerate(hit_pairs):
        body = response["documents"][0][idx]
        if body:
            preview_text = body[:500]
        else:
            preview_text = "[No preview available]"
        # The collection is created with cosine distance, so 1 - distance
        # is the cosine similarity.
        similarity = 1 - distance

        print(f"\nMatch {idx+1}: {meta['file_name']} | Chunk ID: {meta['chunk_id']} | Page: {meta.get('source_page', 'N/A')} | Similarity: {similarity:.4f}")
        print(preview_text)

    print("\nQuery completed.")

# 7. TEST QUERY (Optional)
# Smoke test: run one sample query and print its top three matches.
# The recorded cell output shows this guard is satisfied when the cell is
# executed, so the query runs right after indexing.
if __name__ == "__main__":
    test_query = "Biodiversity investment risks"
    query_vector_db(test_query, top_k=3)


Loaded 2652 chunked documents.
Loaded embedding model: all-MiniLM-L6-v2
ChromaDB collection initialized.
Indexing document chunks...


Batches:   0%|          | 0/83 [00:00<?, ?it/s]

100%|██████████████████████████████████████████████████████████████████████████████████| 2652/2652 [00:06<00:00, 416.10it/s]

Indexed 2652 document chunks.

Query: Biodiversity investment risks

Match 1: An Investor’s Guide to Biodiversity Risks.pdf | Chunk ID: 128 | Page: 1 | Similarity: 0.7581
In addition, several nature-related risks could have reverberating 
effects, which makes assessing financial risks from nature loss even more difficult. 
Furthermore, some key data points are rarely disclosed and may be inaccurate when they are. 
Collecting data for supply chains is an even greater challenge.  
However, none of these hurdles are insurmountable and investors may want to integrate biodiversity 
into their investment decisions for a number of reasons. Investors can take the followi

Match 2: JPM_The_Sustainable_Inve_2024-07-11_4700380.pdf | Chunk ID: 500 | Page: 1 | Similarity: 0.7229
During this period we have been 
surprised by the volume of questions from investors on the topic: despite being one of 
the most challenging themes to invest in (in our opinion), our Biodiversity report 
remains our most p


