In [1]:
from dotenv import load_dotenv
import os

# Load variables from a .env file using python-dotenv's default lookup.
load_dotenv()

api_key = os.getenv("OPENAI_API_KEY")

# Never echo the secret itself: notebook outputs are saved with the file and
# are easily committed or shared. The last four characters are enough to tell
# which key was picked up.
if api_key:
    print("Loaded API key:", "****" + api_key[-4:])
else:
    print("Loaded API key:", "Not loaded")


Loaded API key: ****wjkA


In [2]:
# Re-run the default .env lookup; as the last expression in the cell, the
# return value of load_dotenv() is what the notebook displays.
load_dotenv()

True

In [3]:
from dotenv import load_dotenv
# Load the .env file from an explicit location. override=True replaces any
# variable that is already set in the process environment with the value
# from the file.
# NOTE(review): the absolute path is specific to one machine — consider a
# location derived from the project root instead.
load_dotenv(dotenv_path="/Users/jfsg/biodiversity-rag-nlp/.env", override=True)


True

In [4]:
import os
# Compare the two ways of reading the variable. os.getenv() reads from
# os.environ, so both lines are expected to show the same value.
# Only the last four characters are printed so the secret never lands in the
# saved notebook output; an unset variable is still reported as None.
for label, value in (
    ("Key from env file:", os.getenv("OPENAI_API_KEY")),
    ("Key from os.environ directly:", os.environ.get("OPENAI_API_KEY")),
):
    print(label, ("****" + value[-4:]) if value else value)


Key from env file: ****nr0A
Key from os.environ directly: ****nr0A


In [2]:
# 1. IMPORTS & SETUP
import os
from pathlib import Path
from dotenv import load_dotenv
import chromadb
from sentence_transformers import SentenceTransformer
from openai import OpenAI  # For new v1+ OpenAI client

# Load environment variables.
# override=True lets the values in .env replace variables that are already set
# in the process environment. A plain load_dotenv() leaves an existing
# OPENAI_API_KEY untouched, so a stale key in the environment would shadow the
# one in .env.
# NOTE(review): this appears to be the cause of the 401 recorded in this
# notebook — confirm which key is the current one.
load_dotenv(override=True)

# Initialize OpenAI client
openai_client = OpenAI(api_key=os.getenv("OPENAI_API_KEY"))

# 2. LOAD CHROMADB CLIENT AND COLLECTION
# The path is relative, so it is resolved against the kernel's working directory.
vector_db_path = Path("../data/vector_db")
chroma_client = chromadb.PersistentClient(path=str(vector_db_path))
collection = chroma_client.get_collection(name="biodiversity_docs")

# 3. LOAD EMBEDDING MODEL (same as used during indexing)
embedding_model = SentenceTransformer("all-MiniLM-L6-v2")

# 4. DEFINE RAG QUERY PIPELINE
def retrieve_relevant_chunks(query, top_k=5):
    """Fetch the stored chunks closest to ``query`` plus a citation for each.

    The query is encoded with the module-level ``embedding_model`` and sent to
    the module-level Chroma ``collection``.

    Args:
        query: Text to search for.
        top_k: Number of results requested from the collection.

    Returns:
        A ``(chunks, citations)`` tuple of two parallel lists: the document
        texts and one formatted citation string per document.

    Raises:
        KeyError: If a result's metadata lacks ``file_name`` or ``chunk_id``.
    """
    query_vector = embedding_model.encode(query).tolist()
    hits = collection.query(
        query_embeddings=[query_vector],
        n_results=top_k,
        include=["documents", "metadatas", "distances"],
    )

    # Exactly one query embedding was submitted, so only the first row of each
    # result field is relevant.
    rows = list(zip(hits["documents"][0], hits["metadatas"][0], hits["distances"][0]))

    chunks = [text for text, _, _ in rows]
    # NOTE(review): reporting 1 - distance as "Similarity" presumes the
    # collection uses a distance for which that is meaningful (e.g. cosine) —
    # confirm the collection's configured metric.
    citations = [
        f"{info['file_name']} | Chunk ID: {info['chunk_id']} | Page: {info.get('source_page', 'N/A')} | Similarity: {1 - distance:.4f}"
        for _, info, distance in rows
    ]

    return chunks, citations

# 5. PROMPT COMPOSER
def compose_prompt(user_query, context_chunks):
    """Build the full prompt text sent to the model.

    The prompt consists of four sections separated by blank lines: the fixed
    instructions, the user's question, the numbered context chunks, and an
    empty answer header for the model to complete.

    Args:
        user_query: The question to answer.
        context_chunks: Iterable of retrieved text chunks, in display order.

    Returns:
        The assembled prompt as a single string.
    """
    instructions = "".join([
        "You are a helpful ESG research assistant. You answer questions using ONLY the provided context. ",
        "Do not invent or hallucinate facts. Be precise and evidence-based.\n\n",
        "When applicable, summarize key ideas clearly.\n\n",
        "Answer in Markdown format.",
    ])

    # Number the chunks from 1 so the model can refer to them individually.
    numbered_chunks = []
    for position, chunk in enumerate(context_chunks, start=1):
        numbered_chunks.append(f"Context {position}:\n{chunk}")
    context_block = "\n\n".join(numbered_chunks)

    sections = [
        instructions,
        f"### User Question:\n{user_query}",
        f"### Context:\n{context_block}",
        "### Answer:",
    ]
    return "\n\n".join(sections)

# 6. CALL OPENAI GPT-4
def ask_gpt4(prompt):
    """Send ``prompt`` to the ``gpt-4`` chat model and return the reply text.

    Uses the module-level ``openai_client``. The request carries a fixed
    system message plus ``prompt`` as the user message, with
    ``temperature=0.3`` and a 1024-token completion cap.

    Args:
        prompt: Full text of the user message.

    Returns:
        The first choice's message content with surrounding whitespace
        removed, or an empty string when the response carries no text content.
    """
    response = openai_client.chat.completions.create(
        model="gpt-4",
        messages=[
            {"role": "system", "content": "You are an expert assistant in ESG and biodiversity investing."},
            {"role": "user", "content": prompt}
        ],
        temperature=0.3,
        max_tokens=1024
    )
    # The API declares message.content as optional; calling .strip() on None
    # would raise AttributeError, so fall back to an empty string.
    content = response.choices[0].message.content
    return (content or "").strip()

# 7. RAG + DISPLAY
def answer_query_with_rag(user_query, top_k=5):
    """Run the retrieve → prompt → generate pipeline and print the outcome.

    Progress messages are printed before each stage, followed by the model's
    answer and a numbered bibliography of the retrieved chunks.

    Args:
        user_query: The question to answer.
        top_k: Number of chunks to retrieve as context.

    Returns:
        None. All results are written to standard output.
    """
    print("Retrieving relevant documents...")
    context_chunks, sources = retrieve_relevant_chunks(user_query, top_k=top_k)

    print("Assembling prompt for GPT-4...")
    full_prompt = compose_prompt(user_query, context_chunks)

    print("Querying GPT-4...")
    reply = ask_gpt4(full_prompt)

    # Header and answer are emitted on consecutive lines.
    print("\n--- Final Answer (Markdown Rendered Below) ---\n", reply, sep="\n")

    print("\n--- Bibliography ---\n")
    # Entries are numbered from 1 in retrieval order.
    for number, source in enumerate(sources, start=1):
        print(f"[{number}] {source}")

# 8. TEST QUERY
if __name__ == "__main__":
    # Sample question used to exercise the whole pipeline end to end.
    user_query = (
        "What are the financial risks of biodiversity loss "
        "for institutional investors?"
    )
    answer_query_with_rag(user_query=user_query, top_k=5)


Retrieving relevant documents...
Assembling prompt for GPT-4...
Querying GPT-4...


AuthenticationError: Error code: 401 - {'error': {'message': 'Incorrect API key provided: sk-proj-************************************************************************************************************************wjkA. You can find your API key at https://platform.openai.com/account/api-keys.', 'type': 'invalid_request_error', 'param': None, 'code': 'invalid_api_key'}}

In [3]:
import openai
# Show which version of the openai package is installed in this kernel.
print(openai.__version__)


1.68.2
