# In [None]:
import os
import json
from pathlib import Path
from llama_index.core import SimpleDirectoryReader
from sentence_transformers import SentenceTransformer
import chromadb
from dotenv import load_dotenv

# Load environment variables from a .env file, if one is present (python-dotenv).
# NOTE(review): nothing in this cell reads an environment variable explicitly;
# presumably one of the libraries below consumes them -- confirm or drop.
load_dotenv()

# Set paths. Both are relative, so they resolve against the current working
# directory of the process/kernel running this code.
doc_path = Path('../data/raw')
vector_db_path = Path('../data/vector_db')
# Create the vector-store directory (and any missing parents) up front;
# exist_ok=True makes re-running this cell harmless.
vector_db_path.mkdir(parents=True, exist_ok=True)

# Load documents: read everything under doc_path into a list of Document
# objects (each exposes .text and .metadata, which the indexing loop uses).
# NOTE(review): depending on the file type the reader may emit several
# Document objects per file (e.g. one per PDF page) -- confirm, since the
# count printed below is then not a file count.
documents = SimpleDirectoryReader(input_dir=doc_path).load_data()
print(f"Loaded {len(documents)} documents.")

# Initialize embedding model. The same instance is used for both indexing and
# querying so that document and query vectors live in the same space.
embedding_model = SentenceTransformer("all-MiniLM-L6-v2")

# Initialize ChromaDB with on-disk persistence under vector_db_path, then
# open the collection, creating it on first run and reusing it afterwards.
client = chromadb.PersistentClient(path=str(vector_db_path))
collection = client.get_or_create_collection(name="biodiversity_docs")

# Process and store embeddings.
# Each record is written together with its cleaned text as the Chroma
# "document", so a query can return the matching passage and not only the
# metadata. Without it, results["documents"] contains None for every hit.
for i, doc in enumerate(documents):
    # Flatten newlines to spaces so the text embeds and prints as one passage.
    text = doc.text.replace("\n", " ").strip()
    metadata = {"file_name": doc.metadata.get("file_name", f"doc_{i}.pdf")}

    if not text:
        # Nothing to embed (e.g. a blank page): an empty-string vector would
        # only add noise to search results, so skip it. The index i is still
        # consumed, which keeps the ids of the other documents stable.
        print(f"Skipped (no text): {metadata['file_name']}")
        continue

    embedding = embedding_model.encode(text).tolist()

    # upsert rather than add: the store is persistent and the ids are
    # deterministic (doc_<i>), so re-running this cell must overwrite the
    # existing records instead of colliding with them.
    collection.upsert(
        ids=[f"doc_{i}"],
        embeddings=[embedding],
        metadatas=[metadata],
        documents=[text],
    )
    print(f"Indexed: {metadata['file_name']}")

print("✅ Vector Indexing Complete!")

# Simple search function
def query_vector_db(query, top_k=3):
    """Print the closest indexed records for a free-text query.

    The query is embedded with the module-level ``embedding_model`` and
    matched against the module-level Chroma ``collection``. For every hit the
    source file name and the first 500 characters of the stored text are
    printed.

    Args:
        query: Free-text search string.
        top_k: Maximum number of matches to request from the collection.

    Returns:
        None. Results are printed, not returned.
    """
    query_embedding = embedding_model.encode(query).tolist()
    results = collection.query(query_embeddings=[query_embedding], n_results=top_k)

    # A single query embedding was sent, so every result field holds exactly
    # one inner list; index 0 selects the hits for that query.
    metadatas = (results.get("metadatas") or [[]])[0]
    documents_field = results.get("documents")
    # Records indexed without text come back as None entries (or the whole
    # field may be missing); pad so the loop below still lists every match.
    texts = documents_field[0] if documents_field else [None] * len(metadatas)

    for i, (meta, text) in enumerate(zip(metadatas, texts)):
        # Metadata can also be None for records stored without any.
        file_name = (meta or {}).get("file_name", "<unknown>")
        print(f"\n🔎 Match {i+1}: {file_name}")
        if text is None:
            # Slicing None would raise TypeError; say what is missing instead.
            print("(no document text stored for this record)")
        else:
            print(text[:500])

# Example query: smoke-test the index by printing the best matches for a
# sample question (uses the function's default top_k of 3).
test_query = "Biodiversity investment risks"
query_vector_db(test_query)