In [1]:
# 03_answer_with_gpt.ipynb - ESG Biodiversity RAG Assistant
# SECTION 1: Imports & Setup
import os
import time
from pathlib import Path
from dotenv import load_dotenv
import chromadb
from sentence_transformers import SentenceTransformer
import openai

# Load environment variables.
# The .env location can be overridden with the DOTENV_PATH environment variable so the
# notebook is not tied to one machine; the default keeps the path it was written against.
DOTENV_PATH = os.getenv("DOTENV_PATH", "/Users/jfsg/biodiversity-rag-nlp/.env")
load_dotenv(dotenv_path=DOTENV_PATH, override=True)
openai.api_key = os.getenv("OPENAI_API_KEY")
if not openai.api_key:
    # Fail fast here instead of at the first GPT call several cells later.
    raise RuntimeError(
        f"OPENAI_API_KEY is not set (checked the environment and {DOTENV_PATH})."
    )

# Load ChromaDB client and collection (path is relative to the notebook's working directory)
vector_db_path = Path("../data/vector_db")
chroma_client = chromadb.PersistentClient(path=str(vector_db_path))
collection = chroma_client.get_collection(name="biodiversity_docs")

# Load embedding model used to embed queries in retrieve_relevant_chunks
embedding_model = SentenceTransformer("all-MiniLM-L6-v2")

In [2]:
# SECTION 2: Core Functions
def retrieve_relevant_chunks(query, top_k=5):
    """Look up the stored chunks closest to ``query`` and build a citation line for each.

    The query is embedded with the module-level ``embedding_model`` and searched in the
    module-level Chroma ``collection``.

    Args:
        query: Natural-language question to search for.
        top_k: Number of results requested from the collection.

    Returns:
        A ``(chunks, citations)`` tuple of two lists in matching order: the chunk texts and
        one human-readable citation string per chunk. Metadata fields that are missing are
        shown as ``N/A``.
    """
    query_embedding = embedding_model.encode(query).tolist()
    results = collection.query(
        query_embeddings=[query_embedding],
        n_results=top_k,
        include=["documents", "metadatas", "distances"]
    )
    chunks, citations = [], []
    # Results are nested one list per query embedding; a single query was sent, hence [0].
    for doc, meta, dist in zip(results["documents"][0], results["metadatas"][0], results["distances"][0]):
        # A chunk stored without metadata comes back as None; treat it as an empty mapping
        # so the citation degrades to "N/A" fields instead of raising.
        meta = meta or {}
        # NOTE(review): 1 - distance is a similarity only if the collection was created
        # with cosine distance — confirm against the indexing notebook.
        similarity = 1 - dist
        citation = (
            f"{meta.get('file_name', 'N/A')} | Chunk ID: {meta.get('chunk_id', 'N/A')} | "
            f"Page: {meta.get('source_page', 'N/A')} | Similarity: {similarity:.4f}"
        )
        chunks.append(doc)
        citations.append(citation)
    return chunks, citations

def compose_prompt(user_query, context_chunks):
    """Assemble the full prompt text sent to the model.

    The prompt consists of four sections separated by blank lines: fixed assistant
    instructions, the user's question, the retrieved chunks numbered ``Context 1``,
    ``Context 2``, ... and a trailing ``### Answer:`` header for the model to complete.

    Args:
        user_query: The question being asked.
        context_chunks: Iterable of chunk texts to include as context, in order.

    Returns:
        The prompt as a single string.
    """
    instructions = (
        "You are a helpful ESG research assistant. Answer using ONLY the provided context.\n"
        "Be precise and do not hallucinate. Format the answer in Markdown."
    )

    # Number the chunks starting at 1 so the model can tell them apart.
    numbered_blocks = []
    for position, passage in enumerate(context_chunks, start=1):
        numbered_blocks.append(f"Context {position}:\n{passage}")

    sections = [
        instructions,
        f"### User Question:\n{user_query}",
        "### Context:\n" + "\n\n".join(numbered_blocks),
        "### Answer:",
    ]
    return "\n\n".join(sections)

def ask_gpt(prompt, model="gpt-4", temperature=0.3, max_tokens=1024):
    """Send ``prompt`` to the OpenAI chat completion API and return the reply text.

    The request always carries a fixed system message followed by ``prompt`` as the user
    message. Works with both generations of the ``openai`` package: the 1.x module-level
    client (``openai.chat.completions``) when available, otherwise the legacy
    ``openai.ChatCompletion`` interface.

    Args:
        prompt: Full prompt text to send as the user message.
        model: Chat model name.
        temperature: Sampling temperature passed to the API.
        max_tokens: Upper bound on generated tokens passed to the API.

    Returns:
        The first choice's message content with surrounding whitespace removed, or an
        empty string if the API returned no content.
    """
    request = dict(
        model=model,
        messages=[
            {"role": "system", "content": "You are an expert assistant in ESG and biodiversity investing."},
            {"role": "user", "content": prompt}
        ],
        temperature=temperature,
        max_tokens=max_tokens,
    )
    if hasattr(openai, "OpenAI"):
        # openai>=1.0: ChatCompletion was removed; responses are objects, not dicts.
        response = openai.chat.completions.create(**request)
        content = response.choices[0].message.content
    else:
        # Legacy openai<1.0 interface.
        response = openai.ChatCompletion.create(**request)
        content = response['choices'][0]['message']['content']
    # content can be None (e.g. no text returned); avoid calling .strip() on None.
    return (content or "").strip()

def answer_query_with_rag(user_query, top_k=5):
    """Run retrieval, prompt assembly and generation for one question, printing the result.

    Progress messages, the model's answer and a numbered bibliography of the retrieved
    chunks' citations are written to stdout. Nothing is returned.

    Args:
        user_query: The question to answer.
        top_k: Number of chunks to retrieve as context.
    """
    # Step 1: retrieval
    print("\nRetrieving relevant documents...")
    passages, sources = retrieve_relevant_chunks(user_query, top_k)

    # Step 2: prompt assembly
    print("Assembling prompt for GPT...")
    full_prompt = compose_prompt(user_query, passages)

    # Step 3: generation
    print("Querying GPT...")
    reply = ask_gpt(full_prompt)

    # Step 4: report
    print("\n--- Final Answer (Markdown Rendered Below) ---\n")
    print(reply)
    print("\n--- Bibliography ---\n")
    for number, source in enumerate(sources, start=1):
        print(f"[{number}] {source}")

In [3]:
# SECTION 3: Choose Execution Mode
# A. Single Static Question Mode
# user_query = "What are the financial risks of biodiversity loss for institutional investors?"
# answer_query_with_rag(user_query)

# B. Batch Mode - Multiple Questions
batch_queries = [
    "What are the financial risks of biodiversity loss for institutional investors?",
    "What is the role of TNFD in biodiversity risk disclosure?",
    "How can investors integrate biodiversity KPIs into portfolios?",
    "Examples of companies with high biodiversity risk exposure"
]

# Seconds to wait between consecutive GPT calls.
PAUSE_BETWEEN_QUERIES_S = 10

for index, q in enumerate(batch_queries):
    # Pause only *between* queries (never after the last one) to avoid hitting the
    # API's token/rate limits without adding a pointless wait at the end of the run.
    if index > 0:
        time.sleep(PAUSE_BETWEEN_QUERIES_S)

    print("\n" + "="*80)
    print(f"Question: {q}")
    print("="*80)
    answer_query_with_rag(q)
    

# C. Interactive Mode
# while True:
#     user_query = input("\nAsk a question (or 'exit' to quit): ")
#     if user_query.lower() == 'exit':
#         break
#     answer_query_with_rag(user_query)


Question: What are the financial risks of biodiversity loss for institutional investors?

Retrieving relevant documents...
Assembling prompt for GPT...
Querying GPT...

--- Final Answer (Markdown Rendered Below) ---

The financial risks of biodiversity loss for institutional investors are multifaceted and can impact various industries at the company or portfolio level. These risks include:

1. **Operational or Supply-Chain Disruptions**: Companies that depend on ecosystem services or at-risk natural commodities may face operational or supply-chain disruptions due to biodiversity loss. Such disruptions can affect the share of revenues or assets of these companies.

2. **Regulatory, Legal, or Reputational Risks**: As the business environment evolves, companies may face transition risks related to regulatory, legal, or reputational issues. These risks can affect the share of revenues or assets of companies that are exposed to them.

3. **Physical Risks**: Companies may be exposed to a ra