In [None]:
!pip install langchain langchain-community openai tiktoken unstructured chromadb --quiet
!pip install beautifulsoup4 requests --quiet



In [None]:
!pip install langchain-google-genai faiss-cpu pypdf langchain  --quiet



In [None]:
!pip install google-generativeai==0.8.5 google-ai-generativelanguage==0.6.15 langchain-google-genai faiss-cpu pypdf langchain --quiet


# single page scrape for demo

## Website scraping

In [None]:
import requests
from bs4 import BeautifulSoup

url = "https://huyenchip.com/2024/07/25/genai-platform.html"
# A timeout stops the cell from hanging forever on a stalled connection, and
# raise_for_status() stops an HTTP error page from being treated as article text.
page = requests.get(url, timeout=30)
page.raise_for_status()
html = page.text
soup = BeautifulSoup(html, "html.parser")
text = soup.get_text()
print(text[:1000])  # just preview


In [None]:
import google.generativeai as genai
from langchain_google_genai import ChatGoogleGenerativeAI

print("Setup successful ✅")


##  Wrap the scraped text as a LangChain Document

In [None]:
from langchain.docstore.document import Document

docs = [Document(page_content=text, metadata={"source": url})]


## Split the document into chunks

In [None]:
from langchain.text_splitter import RecursiveCharacterTextSplitter

text_splitter = RecursiveCharacterTextSplitter(
    chunk_size=1000,  # characters
    chunk_overlap=200
)

chunks = text_splitter.split_documents(docs)
print(f"Number of chunks: {len(chunks)}")
print(chunks[0].page_content[:200])


## Create Embeddings + Store in FAISS

In [None]:
from langchain_google_genai import GoogleGenerativeAIEmbeddings
from langchain_community.vectorstores import FAISS

# 1️⃣ Create the embedding model
embedding_model = GoogleGenerativeAIEmbeddings(model="models/embedding-001")

# 2️⃣ Create FAISS vector store from chunks
vectorstore = FAISS.from_documents(chunks, embedding_model)

# 3️⃣ Save the FAISS index locally (optional)
vectorstore.save_local("faiss_index")

In [None]:
!pip install --upgrade --quiet  langchain-community

## Load FAISS and Ask Questions

In [None]:
# Load the vectorstore
vectorstore = FAISS.load_local("faiss_index", embedding_model, allow_dangerous_deserialization=True)

# Search for similar chunks
query = "What is a GenAI platform?"
docs = vectorstore.similarity_search(query, k=3)

for i, doc in enumerate(docs, start=1):
    print(f"Result {i}:")
    print(doc.page_content[:300])
    print("---")


## Use LLM for Final Answer (RAG)

In [None]:
from langchain.chains import RetrievalQA
from langchain_google_genai import ChatGoogleGenerativeAI

llm = ChatGoogleGenerativeAI(model="gemini-1.5-flash", temperature=0)


# Create RetrievalQA chain; return_source_documents=True makes the result
# dict carry the retrieved chunks under "source_documents".
qa_chain = RetrievalQA.from_chain_type(
    llm=llm,
    retriever=vectorstore.as_retriever(),
    return_source_documents=True
)

# Ask a question. invoke() replaces the deprecated direct call `qa_chain({...})`
# and returns the same dict ("result", "source_documents").
response = qa_chain.invoke({"query": "What is a GenAI platform?"})
print("Answer:", response["result"])
print("\nSources:", [doc.metadata["source"] for doc in response["source_documents"]])


## Minimal working RAG pipeline

## Multi-URL Data Loader

In [None]:
import requests
from bs4 import BeautifulSoup

def scrape_url(url, timeout=30):
    """Fetch `url` and return the page's text with one text node per line.

    Args:
        url: Address of the page to download.
        timeout: Seconds to wait for the server before giving up.

    Returns:
        The text extracted by BeautifulSoup, nodes joined with newlines.

    Raises:
        requests.RequestException: on connection errors, timeouts, or a
            4xx/5xx response, so an error page is never indexed as content.
    """
    page = requests.get(url, timeout=timeout)
    page.raise_for_status()
    soup = BeautifulSoup(page.text, "html.parser")
    return soup.get_text(separator="\n")

urls = [
    "https://huyenchip.com/2024/07/25/genai-platform.html",
    "https://lilianweng.github.io/posts/2024-07-07-hallucination/",
    "https://jina.ai/news/what-is-colbert-and-late-interaction-and-why-they-matter-in-search/",
    "https://quoraengineering.quora.com/Building-Embedding-Search-at-Quora"
]

documents = []
for url in urls:
    text = scrape_url(url)
    documents.append({"url": url, "text": text})


## Chunking

In [None]:
from langchain.text_splitter import RecursiveCharacterTextSplitter

text_splitter = RecursiveCharacterTextSplitter(
    chunk_size=1000,
    chunk_overlap=200
)

chunks = []
for doc in documents:
    for chunk in text_splitter.split_text(doc["text"]):
        chunks.append({"text": chunk, "url": doc["url"]})


## Embeddings + Vector Store (FAISS)

In [None]:
from langchain_community.vectorstores import FAISS
from langchain_google_genai import GoogleGenerativeAIEmbeddings

embedding_model = GoogleGenerativeAIEmbeddings(model="models/embedding-001")

texts = [c["text"] for c in chunks]
metadatas = [{"url": c["url"]} for c in chunks]

vectorstore = FAISS.from_texts(texts, embedding_model, metadatas=metadatas)


## Retrieval + LLM Answer

In [None]:
from langchain_google_genai import ChatGoogleGenerativeAI

llm = ChatGoogleGenerativeAI(model="gemini-1.5-flash", temperature=0)

def answer_query(query):
    """Answer `query` from the four most similar chunks in `vectorstore`.

    Args:
        query: The user's question.

    Returns:
        A `(answer_text, sources)` tuple, where `sources` holds the distinct
        "url" metadata values of the retrieved chunks in retrieval order.
    """
    docs = vectorstore.similarity_search(query, k=4)
    context = "\n\n".join(d.page_content for d in docs)
    # dict.fromkeys de-duplicates while keeping rank order; a set would
    # return the URLs in an arbitrary order from run to run.
    sources = list(dict.fromkeys(d.metadata["url"] for d in docs))

    prompt = f"Answer the question based on the following context:\n\n{context}\n\nQuestion: {query}\nAnswer with citations."
    # invoke() replaces the deprecated predict(); the reply text is in .content.
    response = llm.invoke(prompt)
    return response.content, sources

question = "What is a GenAI platform?"
answer, sources = answer_query(question)
print(answer)
print("Sources:", sources)


## Structure the Pipeline Properly

In [None]:
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain.vectorstores import FAISS
from langchain_google_genai import GoogleGenerativeAIEmbeddings

# 1. Split text
splitter = RecursiveCharacterTextSplitter(chunk_size=1000, chunk_overlap=200)
chunks = splitter.split_text(text)

# 2. Create embeddings
embeddings = GoogleGenerativeAIEmbeddings(model="models/embedding-001")

# 3. Store in FAISS
db = FAISS.from_texts(chunks, embeddings)


## Add the Retrieval Step
When a user asks something:

In [None]:
query = "What is a GenAI platform?"
docs = db.similarity_search(query, k=3)  # Get top 3 relevant chunks
context = " ".join([d.page_content for d in docs])

prompt = f"Answer based on the following context:\n{context}\n\nQuestion: {query}"
response = llm.invoke(prompt)
# invoke() returns a message object; print its text, not the object's repr.
print(response.content)


## Wrap scraped text into Document objects

In [None]:
from langchain.schema import Document

docs = [Document(page_content=text, metadata={"source": url})]


## Chunk the text (so retrieval is more accurate)

In [None]:
from langchain.text_splitter import RecursiveCharacterTextSplitter

splitter = RecursiveCharacterTextSplitter(chunk_size=1000, chunk_overlap=200)
chunks = splitter.split_documents(docs)


## Embed chunks and store in FAISS

In [None]:
from langchain_google_genai import GoogleGenerativeAIEmbeddings
from langchain.vectorstores import FAISS

embeddings = GoogleGenerativeAIEmbeddings(model="models/text-embedding-004")
vectorstore = FAISS.from_documents(chunks, embeddings)


## Create retriever + chain

In [None]:
retriever = vectorstore.as_retriever()


## Ask questions with retrieval + Gemini

In [None]:
from langchain.chains import RetrievalQA
from langchain_google_genai import ChatGoogleGenerativeAI

llm = ChatGoogleGenerativeAI(model="gemini-1.5-pro")

qa_chain = RetrievalQA.from_chain_type(llm, retriever=retriever)
# invoke() replaces the deprecated run(); the answer text is under "result".
result = qa_chain.invoke({"query": "What is a GenAI platform?"})["result"]
print(result)


## Step 1 — Data Ingestion from URLs

In [None]:
import os
import requests
from bs4 import BeautifulSoup

# Create data folder
data_dir = "/content/drive/MyDrive/RAG_demo/data"
os.makedirs(data_dir, exist_ok=True)

urls = [
    "https://huyenchip.com/2024/07/25/genai-platform.html",
    "https://lilianweng.github.io/posts/2024-07-07-hallucination/",
    "https://jina.ai/news/what-is-colbert-and-late-interaction-and-why-they-matter-in-search/",
    "https://quoraengineering.quora.com/Building-Embedding-Search-at-Quora"
]

def download_webpage(url, save_dir):
    """Download `url`, extract its visible text and save it as a .txt file.

    The file name comes from the last non-empty path segment of the URL, with
    a trailing .html/.htm/.txt extension removed and ".txt" appended. URLs
    that end in "/" therefore get distinct names instead of all being written
    to "index.txt" and overwriting each other.

    Args:
        url: Page to download.
        save_dir: Existing directory the text file is written into.

    Returns:
        None. Prints a success or failure line.
    """
    from urllib.parse import urlparse  # local import keeps the cell self-contained

    response = requests.get(url, timeout=30)
    if response.status_code != 200:
        print(f"❌ Failed: {url}")
        return

    soup = BeautifulSoup(response.text, "html.parser")
    text = soup.get_text(separator="\n", strip=True)  # Extract clean text

    # Use the URL path only (no query string) and ignore empty segments so a
    # trailing slash does not collapse the name to "index".
    segments = [part for part in urlparse(url).path.split("/") if part]
    slug = segments[-1] if segments else "index"
    stem, ext = os.path.splitext(slug)
    if ext.lower() in (".html", ".htm", ".txt"):
        slug = stem
    filename = slug + ".txt"

    save_path = os.path.join(save_dir, filename)
    with open(save_path, "w", encoding="utf-8") as f:
        f.write(text)
    print(f"✅ Saved: {save_path}")

for url in urls:
    download_webpage(url, data_dir)


## Render the page (reliable for JS-heavy sites) — use Playwright in Colab

In [None]:
!pip install playwright==1.49.0  # or latest
!playwright install chromium


In [None]:
import asyncio
from playwright.async_api import async_playwright
from bs4 import BeautifulSoup
import os

async def render_and_save_text(url, save_path):
    """Render `url` in headless Chromium and save the page's text to `save_path`.

    Args:
        url: Page to load; navigation waits for network idle, up to 30 s.
        save_path: Destination file. Its parent directory is created if missing.

    Returns:
        None. Prints the saved path.
    """
    async with async_playwright() as p:
        browser = await p.chromium.launch(headless=True, args=['--no-sandbox','--disable-dev-shm-usage'])
        try:
            page = await browser.new_page(user_agent="Mozilla/5.0 (Windows NT 10.0; Win64; x64) Chrome/116")
            await page.goto(url, wait_until="networkidle", timeout=30000)
            html = await page.content()
        finally:
            # Close the browser even when navigation times out or fails.
            await browser.close()

    soup = BeautifulSoup(html, "html.parser")
    text = soup.get_text(separator="\n", strip=True)
    # dirname is "" for a bare file name, and os.makedirs("") raises.
    parent_dir = os.path.dirname(save_path)
    if parent_dir:
        os.makedirs(parent_dir, exist_ok=True)
    with open(save_path, "w", encoding="utf-8") as f:
        f.write(text)
    print("Saved:", save_path)

# Example:
save_path = "/content/drive/MyDrive/RAG_demo/data/quora_engineering.txt"
# Run the async function
await render_and_save_text("https://quoraengineering.quora.com/Building-Embedding-Search-at-Quora", save_path)

In [None]:
import requests
from bs4 import BeautifulSoup

url = "https://lilianweng.github.io/posts/2024-07-07-hallucination/"
html = requests.get(url).text
soup = BeautifulSoup(html, "html.parser")

# Remove scripts/styles
for tag in soup(["script", "style"]):
    tag.decompose()

text = soup.get_text(separator="\n", strip=True)
with open("/content/drive/MyDrive/RAG_demo/data/hallucination.txt", "w", encoding="utf-8") as f:
    f.write(text)


In [None]:
from langchain_community.document_loaders import UnstructuredURLLoader
from langchain.text_splitter import RecursiveCharacterTextSplitter

# 1. Load the article from URL
url = "https://jina.ai/news/what-is-colbert-and-late-interaction-and-why-they-matter-in-search/"
loader = UnstructuredURLLoader(urls=[url])
docs = loader.load()

# 2. Add metadata for tracking
for doc in docs:
    doc.metadata["source"] = url

# 3. Split into smaller chunks
splitter = RecursiveCharacterTextSplitter(chunk_size=1000, chunk_overlap=200)
split_docs = splitter.split_documents(docs)

# 4. Add to your existing vectorstore
vectorstore.add_documents(split_docs)

print(f"✅ Re-ingested {len(split_docs)} chunks from {url}")


In [None]:
from langchain_community.document_loaders import UnstructuredURLLoader
from langchain.text_splitter import RecursiveCharacterTextSplitter
# This cell embeds with HuggingFaceEmbeddings, so import that class here. The
# earlier OpenAIEmbeddings import was unused, and langchain-openai is only
# installed by a later cell.
from langchain_community.embeddings import HuggingFaceEmbeddings
from langchain_community.vectorstores import FAISS

# 1. Load the article
url = "https://jina.ai/news/what-is-colbert-and-late-interaction-and-why-they-matter-in-search/"
loader = UnstructuredURLLoader(urls=[url])
docs = loader.load()

# 2. Add metadata
for doc in docs:
    doc.metadata["source"] = url

# 3. Split into chunks
splitter = RecursiveCharacterTextSplitter(chunk_size=1000, chunk_overlap=200)
split_docs = splitter.split_documents(docs)

# 4. Load existing FAISS vectorstore
# NOTE(review): assumes the saved index was built with this same embedding
# model; confirm, since new vectors must match the index's dimension.
embeddings = HuggingFaceEmbeddings(model_name="sentence-transformers/all-MiniLM-L6-v2")
vectorstore = FAISS.load_local("/content/drive/MyDrive/RAG_demo/faiss_index", embeddings, allow_dangerous_deserialization=True)

# 5. Add documents (embeddings happen here automatically)
vectorstore.add_documents(split_docs)

# 6. Save updated FAISS
vectorstore.save_local("/content/drive/MyDrive/RAG_demo/faiss_index")

print(f"✅ Added {len(split_docs)} chunks from {url} and updated FAISS index.")


In [None]:
# Verify removal
sources = {doc.metadata.get("source", "Unknown") for doc in vectorstore.docstore._dict.values()}
print("\nRemaining sources:")
for s in sorted(sources):
    print("-", s)

In [None]:
from langchain_community.embeddings import HuggingFaceEmbeddings  # used below; was missing from this cell
from langchain_community.vectorstores import FAISS  # or Chroma, depending on your setup


# Create embeddings
embeddings = HuggingFaceEmbeddings(model_name="sentence-transformers/all-MiniLM-L6-v2")
# --- Step 1: Load your existing vectorstore ---
vectorstore_path = "/content/drive/MyDrive/RAG_demo/faiss_index"  # change if needed
vectorstore = FAISS.load_local(vectorstore_path, embeddings, allow_dangerous_deserialization=True)

# --- Step 2: Filter out docs with old source ---
# One pass over the stored documents; each one carries its own metadata, so
# there is no need to zip the docstore values with themselves.
docs_to_keep = []
metadatas_to_keep = []

for doc in vectorstore.docstore._dict.values():
    if "index.txt" not in doc.metadata.get("source", ""):
        docs_to_keep.append(doc)
        metadatas_to_keep.append(doc.metadata)

# --- Step 3: Rebuild vectorstore without old docs ---
# from_texts re-embeds every kept chunk with `embeddings`.
new_vectorstore = FAISS.from_texts(
    [doc.page_content for doc in docs_to_keep],
    embeddings,
    metadatas=metadatas_to_keep
)

# --- Step 4: Save cleaned store ---
# This overwrites the index on disk; the in-memory `vectorstore` variable
# still holds the store as it was loaded above.
new_vectorstore.save_local(vectorstore_path)

print(f"Cleanup complete. Removed 'index.txt' entries. New total docs: {len(docs_to_keep)}")


## Step 2: Chunking & Embeddings

In [None]:
!pip install langchain faiss-cpu sentence-transformers langchain-community

from langchain_community.document_loaders import TextLoader
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain_community.vectorstores import FAISS
from langchain_community.embeddings import HuggingFaceEmbeddings
import os

# Path to your folder
folder_path = "/content/drive/MyDrive/RAG_demo/data"
faiss_index_path = os.path.join(folder_path, "faiss_index")

# Step 1: Load all .txt files
documents = []
# sorted() makes the load order (and so the chunk order) reproducible;
# os.listdir returns entries in arbitrary order.
for file_name in sorted(os.listdir(folder_path)):
    if file_name.endswith(".txt"):
        # The scraping cells write these files as UTF-8, so read them the same way.
        loader = TextLoader(os.path.join(folder_path, file_name), encoding="utf-8")
        documents.extend(loader.load())

print(f"Loaded {len(documents)} documents from {folder_path}")

# Step 2: Split into chunks
text_splitter = RecursiveCharacterTextSplitter(
    chunk_size=1000,  # characters
    chunk_overlap=200  # overlap for better context
)
docs = text_splitter.split_documents(documents)
print(f"Split into {len(docs)} chunks")

# Step 3: Create embeddings
embeddings = HuggingFaceEmbeddings(model_name="sentence-transformers/all-MiniLM-L6-v2")

# Step 4: Store in FAISS
db = FAISS.from_documents(docs, embeddings)

# Save FAISS index
db.save_local(faiss_index_path)
print("FAISS index saved successfully.")

In [None]:
!pip install langchain faiss-cpu sentence-transformers langchain-community --quiet

## Load FAISS Index

In [None]:
from langchain_community.vectorstores import FAISS
from langchain_community.embeddings import HuggingFaceEmbeddings
import os

# Paths
folder_path = "/content/drive/MyDrive/RAG_demo/data"
faiss_index_path = os.path.join(folder_path, "faiss_index")

# Load the embedding model
embeddings = HuggingFaceEmbeddings(model_name="sentence-transformers/all-MiniLM-L6-v2")

# Load the index
vectorstore = FAISS.load_local(faiss_index_path, embeddings=embeddings, allow_dangerous_deserialization=True)

## Query the Index

In [None]:
query = "What is hallucination in AI?"
docs = vectorstore.similarity_search(query, k=3)

for i, doc in enumerate(docs, 1):
    print(f"\n--- Result {i} ---\n")
    print(doc.page_content)


## Store the API key

In [None]:
import os
from getpass import getpass

# Prompt you to enter API key securely
os.environ["GOOGLE_API_KEY"] = getpass("Enter your Gemini API key: ")

# Check (for debugging only; remove print later)
print("Key set successfully!")


## FAISS index + retrieval + LLM answering pipeline

In [None]:
# --- Load FAISS index and set up QA pipeline ---

from langchain.vectorstores import FAISS
from langchain.embeddings import HuggingFaceEmbeddings
from langchain_google_genai import ChatGoogleGenerativeAI
from langchain.chains import RetrievalQA
from langchain.prompts import PromptTemplate
import os
from langchain_google_genai import ChatGoogleGenerativeAI

# 1. Load embeddings model
embeddings = HuggingFaceEmbeddings(model_name="all-MiniLM-L6-v2")

# 2. Load FAISS index from path
FAISS_PATH = "/content/drive/MyDrive/RAG_demo/faiss_index"
faiss_index = FAISS.load_local(FAISS_PATH, embeddings, allow_dangerous_deserialization=True)


from getpass import getpass  # imported here so this cell does not rely on an earlier cell

# Prompt for the API key without echoing it
os.environ["GOOGLE_API_KEY"] = getpass("Enter your Google API key: ")

# 3. Initialize LLM (once; a second, identical construction used to overwrite this one)
llm = ChatGoogleGenerativeAI(
    model="gemini-1.5-flash",
    temperature=0,
    google_api_key=os.environ["GOOGLE_API_KEY"]  # force usage of API key
)

# 4. Create QA chain with custom prompt
template = """
Use the provided documents to answer the question.
If the answer is not in the documents, say:
"The answer is not available in the provided context."

Context:
{context}

Question: {question}
Answer:
"""
QA_PROMPT = PromptTemplate(template=template, input_variables=["context", "question"])

qa = RetrievalQA.from_chain_type(
    llm=llm,
    retriever=faiss_index.as_retriever(),
    chain_type="stuff",
    chain_type_kwargs={"prompt": QA_PROMPT}
)

# 5. Wrap into a function
def query_docs(question):
    """Run `question` through the `qa` chain and return the answer text."""
    # invoke() replaces the deprecated run(); the answer is under "result".
    return qa.invoke({"query": question})["result"]

# Example usage:
print(query_docs("What does the document say about RAG pipelines?"))


In [None]:
!pip install langchain-openai --quiet

In [None]:
# --- Step 1: Imports ---
from langchain_community.vectorstores import FAISS
from langchain_openai import ChatOpenAI, OpenAIEmbeddings
from langchain.prompts import PromptTemplate
from langchain.chains import RetrievalQA
import os
from langchain.vectorstores import FAISS
from langchain.embeddings import HuggingFaceEmbeddings
from langchain_google_genai import ChatGoogleGenerativeAI
from langchain.chains import RetrievalQA
from langchain.prompts import PromptTemplate
import os
from typing import List
from langchain_google_genai import ChatGoogleGenerativeAI
from getpass import getpass

# --- Step 2: Configuration ---
FAISS_PATH = "/content/drive/MyDrive/RAG_demo/faiss_index"
# Prompt you to enter API key securely
os.environ["GOOGLE_API_KEY"] = getpass("Enter your Google API key: ")

# --- Step 3: Load FAISS index ---
embedding_model = HuggingFaceEmbeddings(model_name="all-MiniLM-L6-v2")
vectorstore = FAISS.load_local(FAISS_PATH, embedding_model, allow_dangerous_deserialization=True)

# --- Step 4: Setup LLM ---
llm = ChatGoogleGenerativeAI(
    model="gemini-1.5-flash",
    temperature=0,
    google_api_key=os.environ["GOOGLE_API_KEY"]
)

# --- Step 5: Create prompt template (optional for better control) ---
prompt_template = """
Use the provided context to answer the question.
If the answer is not in the context, say you don't know.
Also, return the source documents.

Context:
{context}

Question:
{question}

Answer:
"""
prompt = PromptTemplate(input_variables=["context", "question"], template=prompt_template)

# --- Step 6: Wrap into function ---
def query_with_citations(question: str):
    """Answer `question` from the top-3 retrieved chunks and list their source files.

    Args:
        question: The user's question.

    Returns:
        A dict with "answer" (the stripped LLM reply text) and "sources"
        (sorted, de-duplicated base names of each chunk's "source" metadata,
        or "Unknown" where that key is absent).
    """
    retriever = vectorstore.as_retriever(search_kwargs={"k": 3})
    # invoke() replaces the deprecated get_relevant_documents().
    docs = retriever.invoke(question)

    context = "\n\n".join(doc.page_content for doc in docs)
    formatted_prompt = prompt.format(context=context, question=question)

    response = llm.invoke(formatted_prompt)

    # Extract sources, remove duplicates, keep only file names
    sources = sorted({os.path.basename(doc.metadata.get("source", "Unknown")) for doc in docs})

    return {
        "answer": response.content.strip(),
        "sources": sources
    }

# --- Step 7: Test query ---
if __name__ == "__main__":
    user_query = "What is genai?"
    result = query_with_citations(user_query)
    print("\nAnswer:", result["answer"])
    print("Sources:", ", ".join(result["sources"]))


## Add Re-ranking (over-fetch 10 candidates, keep the top 3)

In [None]:
# --- Step 1: Imports ---
from langchain_community.vectorstores import FAISS
from langchain_openai import ChatOpenAI, OpenAIEmbeddings
from langchain.prompts import PromptTemplate
from langchain.chains import RetrievalQA
import os
from langchain.vectorstores import FAISS
from langchain.embeddings import HuggingFaceEmbeddings
from langchain_google_genai import ChatGoogleGenerativeAI
from langchain.chains import RetrievalQA
from langchain.prompts import PromptTemplate
import os
from typing import List
from langchain_google_genai import ChatGoogleGenerativeAI
from getpass import getpass

# --- Step 2: Configuration ---
FAISS_PATH = "/content/drive/MyDrive/RAG_demo/faiss_index"
# Prompt you to enter API key securely
os.environ["GOOGLE_API_KEY"] = getpass("Enter your Google API key: ")

# --- Step 3: Load FAISS index ---
embedding_model = HuggingFaceEmbeddings(model_name="all-MiniLM-L6-v2")
vectorstore = FAISS.load_local(FAISS_PATH, embedding_model, allow_dangerous_deserialization=True)

# --- Step 4: Setup LLM ---
llm = ChatGoogleGenerativeAI(
    model="gemini-1.5-flash",
    temperature=0,
    google_api_key=os.environ["GOOGLE_API_KEY"]
)

# --- Step 5: Create prompt template ---
prompt_template = """
Use the provided context to answer the question.
If the answer is not in the context, say: I couldn't find anything in my knowledge base about that topic. I can only answer questions related to AI, RAG, and the documents you provided.

Context:
{context}

Question:
{question}

Answer:
"""
prompt = PromptTemplate(input_variables=["context", "question"], template=prompt_template)

# --- Step 6: Wrap into function ---
def query_with_citations(question: str):
    """Answer `question` from the retrieved chunks and list their sources.

    Ten chunks are fetched and the first three are kept as context. When the
    reply contains the fallback sentence from the prompt, no sources are
    reported.

    Args:
        question: The user's question.

    Returns:
        A dict with "answer" (the stripped LLM reply text) and "sources"
        (distinct "source" metadata values in retrieval order, or [] for the
        fallback reply).
    """
    retriever = vectorstore.as_retriever(search_kwargs={"k": 10})  # fetch more
    # invoke() replaces the deprecated get_relevant_documents().
    docs = retriever.invoke(question)

    # Keep top 3 by vector similarity
    docs = docs[:3]

    context = "\n\n".join(doc.page_content for doc in docs)
    formatted_prompt = prompt.format(context=context, question=question)

    response = llm.invoke(formatted_prompt)
    answer_text = response.content.strip()

    # Only show sources if it's not the fallback
    fallback_msg = "I couldn't find anything in my knowledge base about that topic"
    if fallback_msg.lower() in answer_text.lower():
        sources = []  # No sources for out-of-scope answers
    else:
        # dict.fromkeys de-duplicates while keeping rank order; a set would
        # return the sources in an arbitrary order from run to run.
        sources = list(dict.fromkeys(doc.metadata.get("source", "Unknown") for doc in docs))

    return {
        "answer": answer_text,
        "sources": sources
    }

# --- Step 7: Test query ---
if __name__ == "__main__":
    user_query = "what is genai?"
    result = query_with_citations(user_query)

    print("\nAnswer:", result["answer"])
    if result["sources"]:
        print("Sources:", ", ".join(result["sources"]))



In [None]:
# --- Step 0: Install needed packages ---
!pip install faiss-cpu langchain-community langchain-google-genai sentence-transformers --quiet

## Full RAG Pipeline with citations

In [None]:
# --- Step 1: Imports ---
from langchain_community.vectorstores import FAISS
from langchain_openai import ChatOpenAI, OpenAIEmbeddings
from langchain.prompts import PromptTemplate
from langchain.chains import RetrievalQA
import os
from langchain.vectorstores import FAISS
from langchain.embeddings import HuggingFaceEmbeddings
from langchain_google_genai import ChatGoogleGenerativeAI
from langchain.chains import RetrievalQA
from langchain.prompts import PromptTemplate
import os
from typing import List
from langchain_google_genai import ChatGoogleGenerativeAI
from getpass import getpass

# --- Step 2: Configuration ---
FAISS_PATH = "/content/drive/MyDrive/RAG_demo/faiss_index"
# Prompt you to enter API key securely
os.environ["GOOGLE_API_KEY"] = getpass("Enter your Google API key: ")

# --- Step 3: Load FAISS index ---
embedding_model = HuggingFaceEmbeddings(model_name="all-MiniLM-L6-v2")
vectorstore = FAISS.load_local(FAISS_PATH, embedding_model, allow_dangerous_deserialization=True)

# --- Step 4: Setup LLM ---
llm = ChatGoogleGenerativeAI(
    model="gemini-1.5-flash",
    temperature=0,
    google_api_key=os.environ["GOOGLE_API_KEY"]
)

# --- Step 5: Create prompt template ---
prompt_template = """
Use the provided context to answer the question.
If the answer is not in the context, say: I couldn't find anything in my knowledge base about that topic. I can only answer questions related to AI, RAG, and the documents you provided.

Context:
{context}

Question:
{question}

Answer:
"""
prompt = PromptTemplate(input_variables=["context", "question"], template=prompt_template)

# --- Step 6: Wrap into function ---
def query_with_citations(question: str):
    """Answer `question` from the retrieved chunks and cite their source URLs.

    Ten chunks are fetched and the first three are kept as context. Known
    local file paths in the "source" metadata are mapped back to the URL they
    were scraped from; anything else is reported unchanged. When the reply
    contains the fallback sentence from the prompt, no sources are reported.

    Args:
        question: The user's question.

    Returns:
        A dict with "answer" (the stripped LLM reply text) and "sources"
        (distinct sources in retrieval order, or [] for the fallback reply).
    """
    retriever = vectorstore.as_retriever(search_kwargs={"k": 10})  # fetch more
    # invoke() replaces the deprecated get_relevant_documents().
    docs = retriever.invoke(question)

    # Keep top 3 by vector similarity
    docs = docs[:3]

    context = "\n\n".join(doc.page_content for doc in docs)
    formatted_prompt = prompt.format(context=context, question=question)

    response = llm.invoke(formatted_prompt)
    answer_text = response.content.strip()

    # Mapping from file paths to desired URL format
    source_url_map = {
        "/content/drive/MyDrive/RAG_demo/data/genai-platform.txt": "https://huyenchip.com/2024/07/25/genai-platform.html",
        "/content/drive/MyDrive/RAG_demo/data/hallucination.txt": "https://lilianweng.github.io/posts/2024-07-07-hallucination/",
        "/content/drive/MyDrive/RAG_demo/data/quora_engineering.txt": "https://quoraengineering.quora.com/Building-Embedding-Search-at-Quora"
    }

    def format_sources(sources):
        """Convert FAISS/metadata sources to consistent URL format."""
        return [source_url_map.get(src, src) for src in sources]

    # Only show sources if it's not the fallback
    fallback_msg = "I couldn't find anything in my knowledge base about that topic"
    if fallback_msg.lower() in answer_text.lower():
        sources = []  # No sources for out-of-scope answers
    else:
        # dict.fromkeys de-duplicates while keeping rank order; a set would
        # return the sources in an arbitrary order from run to run.
        sources = list(dict.fromkeys(doc.metadata.get("source", "Unknown") for doc in docs))
        sources = format_sources(sources)  # <-- Apply URL mapping here

    return {
        "answer": answer_text,
        "sources": sources
    }

# --- Step 7: Test query ---
if __name__ == "__main__":
    user_query = "What is embedding based retieval?"
    result = query_with_citations(user_query)

    print("\nAnswer:", result["answer"])
    if result["sources"]:
        print("Sources:", ", ".join(result["sources"]))




# HYBRID RETRIEVAL (BM25 + FAISS) + reranker

In [None]:
!pip install rank_bm25 sentence-transformers faiss-cpu --quiet

In [None]:
# ===== HYBRID RETRIEVAL (BM25 + FAISS) + optional reranker =====
from rank_bm25 import BM25Okapi
from sentence_transformers import CrossEncoder
import os, pickle
from typing import List
from collections import OrderedDict

# ---to match your environment ---
FAISS_PATH = "/content/drive/MyDrive/RAG_demo/faiss_index"   # saved FAISS
USE_RERANKER = True           # set False if you don't want Cross-Encoder reranking
RERANKER_MODEL = "cross-encoder/ms-marco-MiniLM-L-6-v2"  # small & fast
BM25_TOP_K = 10               # initial BM25 candidates
FAISS_TOP_K = 10              # initial FAISS candidates
FINAL_TOP_K = 3               # final docs to pass to LLM
# ----------------------------------------------------------------


# Prompt you to enter API key securely
os.environ["GOOGLE_API_KEY"] = getpass("Enter your Google API key: ")

# ---Load FAISS index ---
embedding_model = HuggingFaceEmbeddings(model_name="all-MiniLM-L6-v2")
vectorstore = FAISS.load_local(FAISS_PATH, embedding_model, allow_dangerous_deserialization=True)

# ---Setup LLM ---
llm = ChatGoogleGenerativeAI(
    model="gemini-1.5-flash",
    temperature=0,
    google_api_key=os.environ["GOOGLE_API_KEY"]
)

# ---Create prompt template ---
prompt_template = """
Use the provided context to answer the question.
If the answer is not in the context, say: I couldn't find anything in my knowledge base about that topic. I can only answer questions related to AI, RAG, and the documents you provided.

Context:
{context}

Question:
{question}

Answer:
"""
prompt = PromptTemplate(input_variables=["context", "question"], template=prompt_template)

# 1) Build BM25 corpus from your FAISS docstore (texts and metadata)
def build_bm25_from_vectorstore(vectorstore):
    """Build a BM25 index over every chunk held in the vectorstore's docstore.

    Args:
        vectorstore: Store whose `docstore._dict` values expose
            `page_content` and `metadata`.

    Returns:
        `(bm25, docs, tokenized, sources)`: the BM25Okapi index, the chunk
        texts, their whitespace-split tokens, and each chunk's "source"
        metadata ("Unknown" when absent), all in docstore order.
    """
    stored = list(vectorstore.docstore._dict.values())
    chunk_texts = [entry.page_content for entry in stored]
    chunk_sources = [entry.metadata.get("source", "Unknown") for entry in stored]
    # Plain whitespace tokenisation; no lower-casing or punctuation stripping.
    token_lists = [chunk.split() for chunk in chunk_texts]
    return BM25Okapi(token_lists), chunk_texts, token_lists, chunk_sources

print("Building BM25 index from vectorstore (this may take a sec)...")
bm25, bm25_docs, bm25_tokenized, bm25_sources = build_bm25_from_vectorstore(vectorstore)
print(f"BM25 built over {len(bm25_docs)} chunks.")

# Optionally persist BM25 for faster reload
BM25_PICKLE = os.path.join(FAISS_PATH, "bm25.pkl")
with open(BM25_PICKLE, "wb") as f:
    pickle.dump({"bm25_docs": bm25_docs, "bm25_sources": bm25_sources}, f)

# 2) Optional: load cross-encoder reranker
if USE_RERANKER:
    print("Loading cross-encoder reranker (may take memory/time)...")
    reranker = CrossEncoder(RERANKER_MODEL)
else:
    reranker = None

# 3) Helper: map doc text -> Document object(s) from vectorstore
# We'll build a quick map from text to the stored Document instance(s)
text_to_docs = {}
for doc in vectorstore.docstore._dict.values():
    text = doc.page_content
    text_to_docs.setdefault(text, []).append(doc)

# 4) Hybrid search function
def hybrid_search(query: str, bm_k:int=BM25_TOP_K, faiss_k:int=FAISS_TOP_K, top_k:int=FINAL_TOP_K) -> List:
    """Return up to `top_k` Documents for `query` from BM25 and FAISS candidates.

    Steps:
      1. Take the `bm_k` highest-scoring chunks from the module-level `bm25` index.
      2. Take the `faiss_k` most similar chunks from `vectorstore`.
      3. Merge both lists, de-duplicating on whitespace-stripped text.
      4. If `reranker` is set, order the merged candidates by its score for
         (query, text). Otherwise order them by reciprocal rank fusion of
         their BM25 and FAISS ranks, so neither retriever can crowd out the
         other.

    Args:
        query: The user's question.
        bm_k: Number of BM25 candidates to consider.
        faiss_k: Number of FAISS candidates to consider.
        top_k: Number of documents to return.

    Returns:
        A list of at most `top_k` Document objects, best first.
    """
    # 4a) BM25 candidates (by index of bm25_docs)
    tokenized_q = query.split()
    bm25_scores = bm25.get_scores(tokenized_q)
    bm25_indices = sorted(range(len(bm25_scores)), key=lambda i: bm25_scores[i], reverse=True)[:bm_k]
    bm25_candidates = [(bm25_docs[idx], bm25_sources[idx], bm25_scores[idx]) for idx in bm25_indices]

    # 4b) FAISS candidates (semantic). similarity_search is called directly
    # instead of going through the deprecated retriever.get_relevant_documents().
    faiss_docs = vectorstore.similarity_search(query, k=faiss_k)
    faiss_candidates = [(d.page_content, d.metadata.get("source", "Unknown")) for d in faiss_docs]

    # 4c) Merge candidates, remembering each one's rank in both lists
    merged = OrderedDict()  # preserve first appearance
    for rank, (txt, src, score) in enumerate(bm25_candidates, start=1):
        key = txt.strip()
        if key not in merged:
            merged[key] = {
                "text": txt,
                "source": src,
                "bm25_score": score,
                # A zero score means no query term matched, so it earns no BM25 rank.
                "bm25_rank": rank if score != 0 else None,
                "faiss_rank": None,
            }
    for rank, (txt, src) in enumerate(faiss_candidates, start=1):
        entry = merged.setdefault(
            txt.strip(),
            {"text": txt, "source": src, "bm25_score": None, "bm25_rank": None, "faiss_rank": None},
        )
        if entry["faiss_rank"] is None:
            entry["faiss_rank"] = rank

    candidates = list(merged.values())  # list of dicts

    # 4d) (Optional) Rerank candidates with cross-encoder: produce scores for (query, text)
    if reranker and candidates:
        pairs = [(query, c["text"]) for c in candidates]
        scores = reranker.predict(pairs)
        for c, s in zip(candidates, scores):
            c["rerank_score"] = float(s)
        # Sort by rerank_score desc
        candidates = sorted(candidates, key=lambda x: x["rerank_score"], reverse=True)
    else:
        # Reciprocal rank fusion: sum 1 / (rrf_k + rank) over the lists a
        # candidate appears in. Sorting BM25 hits first would mean FAISS
        # results never reach the LLM whenever bm_k >= top_k.
        rrf_k = 60  # conventional RRF damping constant

        def fused_score(c):
            ranks = (c["bm25_rank"], c["faiss_rank"])
            return sum(1.0 / (rrf_k + r) for r in ranks if r is not None)

        candidates = sorted(candidates, key=fused_score, reverse=True)

    # 4e) Return top_k Document objects (use text_to_docs map to return actual Document objs)
    final_docs = []
    for c in candidates[:top_k]:
        text = c["text"]
        docs_list = text_to_docs.get(text, [])
        if docs_list:
            # there may be multiple Doc objects with same text; pick first
            final_docs.append(docs_list[0])
        else:
            # Shouldn't happen if bm25 built from same corpus; but fallback: create pseudo doc
            from langchain.docstore.document import Document
            final_docs.append(Document(page_content=text, metadata={"source": c.get("source", "Unknown")}))
    return final_docs

# 5) Integrate into your query_with_citations
import os
source_url_map = {
    "/content/drive/MyDrive/RAG_demo/data/genai-platform.txt": "https://huyenchip.com/2024/07/25/genai-platform.html",
    "/content/drive/MyDrive/RAG_demo/data/hallucination.txt": "https://lilianweng.github.io/posts/2024-07-07-hallucination/",
    "/content/drive/MyDrive/RAG_demo/data/quora_engineering.txt": "https://quoraengineering.quora.com/Building-Embedding-Search-at-Quora"
}
def format_sources(sources):
    """Translate local corpus file paths into their public article URLs.

    Every entry of ``sources`` is looked up in ``source_url_map``; entries
    without a mapping are passed through unchanged. Order and duplicates
    are preserved.
    """
    resolved = []
    for path_or_url in sources:
        resolved.append(source_url_map.get(path_or_url, path_or_url))
    return resolved

# Make sure prompt, llm exist in your notebook
def query_with_citations_hybrid(question: str):
    """Answer ``question`` from hybrid-retrieved context and list the cited sources.

    Retrieval is delegated to ``hybrid_search`` with the notebook-level
    ``BM25_TOP_K`` / ``FAISS_TOP_K`` / ``FINAL_TOP_K`` settings; the notebook-level
    ``prompt`` and ``llm`` objects produce the answer.

    Returns:
        dict with ``"answer"`` (the stripped LLM reply) and ``"sources"``
        (de-duplicated, first-seen order, passed through ``format_sources``).
        ``"sources"`` is empty when the reply contains the out-of-scope
        fallback sentence.
    """
    retrieved = hybrid_search(question, bm_k=BM25_TOP_K, faiss_k=FAISS_TOP_K, top_k=FINAL_TOP_K)

    # Stitch the retrieved chunks into a single context block for the prompt.
    context = "\n\n".join(doc.page_content for doc in retrieved)
    llm_reply = llm.invoke(prompt.format(context=context, question=question))
    answer_text = llm_reply.content.strip()

    # Out-of-scope replies carry no citations (case-insensitive match).
    fallback_msg = "i couldn't find anything in my knowledge base about that topic"
    if fallback_msg in answer_text.lower():
        return {"answer": answer_text, "sources": []}

    # An ordered mapping doubles as an order-preserving set of source names.
    first_seen = OrderedDict()
    for doc in retrieved:
        first_seen.setdefault(doc.metadata.get("source", "Unknown"), None)
    return {"answer": answer_text, "sources": format_sources(list(first_seen))}

# 6) Quick test
print("Running quick hybrid test...")
q = "What is ColBERT and why late interaction matters?"
res = query_with_citations_hybrid(q)
print("\nAnswer:", res["answer"][:400])
print("Sources:", res["sources"])


# EVALUATION METRICS

##  LLM-as-a-judge

In [None]:
# --- LLM Judge Setup ---
# Prompt you to enter API key securely
os.environ["GOOGLE_API_KEY"] = getpass("Enter your Google API key: ")

# Initialize a separate LLM for judging
judge_llm = ChatGoogleGenerativeAI(
    model="gemini-1.5-flash",  # You can choose a different model if needed
    temperature=0,
    google_api_key=os.environ["GOOGLE_API_KEY"]
)


# Example ground truth set
gold_data = [
    {
        "question": "What is ColBERT and why late interaction matters?",
        "ground_truth": "ColBERT is a retrieval model developed at Stanford that uses BERT embeddings with a late interaction mechanism. Late interaction improves efficiency by separating query and document processing until the final scoring step, balancing accuracy with scalability.",
        "expected_citations": [
            "https://jina.ai/news/what-is-colbert-and-late-interaction-and-why-they-matter-in-search/"
        ]
    }
    # Add more entries here...
]

def llm_judge(question, system_answer, citations, ground_truth, expected_citations, judge_llm):
    """Ask ``judge_llm`` to grade one system answer against its ground truth.

    The judge is instructed to score accuracy, coverage and citation match
    (each 0 to 1) and to reply in JSON only.

    Returns:
        The judge's raw reply text (``.content`` of the ``invoke`` result).
        The reply is not parsed or validated here, so callers receive a string.
    """
    grading_request = f"""
    You are evaluating an AI system's answer.

    Question: {question}
    System Answer: {system_answer}
    Ground Truth Answer: {ground_truth}
    System Citations: {citations}
    Expected Citations: {expected_citations}

    Evaluate on:
    1. Accuracy (0 to 1): factual correctness compared to ground truth.
    2. Coverage (0 to 1): completeness of answer.
    3. Citation Match (0 to 1): citation relevance.

    Respond ONLY in JSON:
    {{ "accuracy": float, "coverage": float, "citation_match": float }}
    """

    # The dedicated judge model is used here, not the answering LLM.
    return judge_llm.invoke(grading_request).content

# --- Loop through gold dataset with hybrid retriever + judge ---
for item in gold_data:
    question = item["question"]
    ground_truth = item["ground_truth"]
    expected_citations = item["expected_citations"]

    # Run your hybrid retriever (make sure query_with_citations_hybrid is defined)
    result = query_with_citations_hybrid(question)
    answer = result["answer"]
    citations = result["sources"]

    # Judge evaluation
    scores = llm_judge(question, answer, citations, ground_truth, expected_citations, judge_llm)

    # Display results
    print(f"\nQ: {question}")
    print(f"Answer: {answer}")
    print(f"Citations: {citations}")
    print(f"Judge Scores: {scores}")

## LLM-as-a-Judge Evaluation

To assess the quality of answers generated by the hybrid retriever (BM25 + FAISS + Cross-Encoder Reranker), I integrated an LLM-as-a-judge evaluation layer. In this approach, a separate LLM (run at temperature 0) compares each generated answer and its citations with a hand-written ground-truth answer and the expected citations, and returns three scores as JSON:

- **Accuracy** – factual correctness of the answer compared with the ground-truth answer (0 to 1).

- **Coverage** – how completely the answer addresses the question (0 to 1).

- **Citation Match** – how relevant the system's citations are, judged against the expected citations (0 to 1).

For example, for the query "What is ColBERT and why late interaction matters?", the system generated the following metrics:

| Metric | Score |
| --- | --- |
| Accuracy | 0.95 |
| Coverage | 0.90 |
| Citation Match | 1.00 |

These scores indicate that, for this query, the system produced an accurate, well-covered, and perfectly cited answer. Running the same evaluation across a larger set of gold questions provides a quantitative, repeatable measure of retrieval and generation quality, which makes regressions easier to detect.

## For testing only the retriever part before LLM answering

In [None]:
test_queries = [
    {
        "question": "What is embedding based retieval?",
        "relevant_sources": [
            "https://huyenchip.com/2024/07/25/genai-platform.html"
        ]
    },
    {
        "question": "Explain ColBERT model.",
        "relevant_sources": [
            "https://jina.ai/news/what-is-colbert-and-late-interaction-and-why-they-matter-in-search/"
        ]
    },
    {
        "question": "What causes hallucination?",
        "relevant_sources": [
            "https://lilianweng.github.io/posts/2024-07-07-hallucination/"
        ]
    },
    {
        "question": "why qdrant",
        "relevant_sources": [
            "https://quoraengineering.quora.com/Building-Embedding-Search-at-Quora"
        ]
    }
]


In [None]:
def evaluate_retrieval(vectorstore, test_queries, k=5):
    """Compute macro-averaged, source-level precision and recall for the retriever.

    For each query the top-``k`` chunks are fetched from ``vectorstore``; their
    ``source`` metadata is mapped to canonical URLs and de-duplicated, and the
    resulting set of sources is compared with the query's ``relevant_sources``.

    Args:
        vectorstore: object exposing ``as_retriever(search_kwargs=...)``.
        test_queries: list of dicts with ``"question"`` and
            ``"relevant_sources"`` keys.
        k: number of chunks retrieved per query.

    Returns:
        ``(avg_precision, avg_recall)``. An empty ``test_queries`` yields
        ``(0.0, 0.0)``; a query with no relevant sources contributes a recall
        of ``0.0`` instead of raising ``ZeroDivisionError``.
    """
    # Mapping local file paths to canonical URLs
    source_mapping = {
        "/content/drive/MyDrive/RAG_demo/data/genai-platform.txt": "https://huyenchip.com/2024/07/25/genai-platform.html",
        "/content/drive/MyDrive/RAG_demo/data/hallucination.txt": "https://lilianweng.github.io/posts/2024-07-07-hallucination/",
        "/content/drive/MyDrive/RAG_demo/data/quora_engineering.txt": "https://quoraengineering.quora.com/Building-Embedding-Search-at-Quora"
    }

    retriever = vectorstore.as_retriever(search_kwargs={"k": k})

    all_precisions = []
    all_recalls = []

    for tq in test_queries:
        docs = retriever.get_relevant_documents(tq["question"])

        # Scores are computed over distinct sources, not over individual chunks.
        retrieved_sources = set()
        for doc in docs:
            raw_source = doc.metadata.get("source", "")
            retrieved_sources.add(source_mapping.get(raw_source, raw_source))
        relevant_sources = set(tq["relevant_sources"])

        true_positives = len(retrieved_sources & relevant_sources)

        # Guard both denominators: nothing retrieved / nothing marked relevant.
        precision = true_positives / len(retrieved_sources) if retrieved_sources else 0.0
        recall = true_positives / len(relevant_sources) if relevant_sources else 0.0

        all_precisions.append(precision)
        all_recalls.append(recall)

    if not all_precisions:
        # No queries to average over.
        return 0.0, 0.0

    avg_precision = sum(all_precisions) / len(all_precisions)
    avg_recall = sum(all_recalls) / len(all_recalls)

    return avg_precision, avg_recall


# Example usage:
avg_p, avg_r = evaluate_retrieval(vectorstore, test_queries, k=5)
print(f"Average Precision@5: {avg_p:.2f}")
print(f"Average Recall@5: {avg_r:.2f}")


In [None]:
def normalize_source(src):
    """Canonicalise a source identifier so retrieved and expected sources compare equal.

    Steps, in order:
    1. strip surrounding whitespace;
    2. replace a known local corpus path with its public URL (exact,
       case-sensitive match on the stripped value);
    3. lower-case the result.
    """
    local_to_url = {
        "/content/drive/MyDrive/RAG_demo/data/genai-platform.txt": "https://huyenchip.com/2024/07/25/genai-platform.html",
        "/content/drive/MyDrive/RAG_demo/data/hallucination.txt": "https://lilianweng.github.io/posts/2024-07-07-hallucination/",
        "/content/drive/MyDrive/RAG_demo/data/quora_engineering.txt": "https://quoraengineering.quora.com/Building-Embedding-Search-at-Quora"
    }
    cleaned = src.strip()
    if cleaned in local_to_url:
        cleaned = local_to_url[cleaned]
    return cleaned.lower()


def evaluate_retrieval_with_per_query(vectorstore, test_queries, k=5):
    """Print per-query source-level precision/recall and return their macro averages.

    Retrieved and expected sources are both passed through ``normalize_source``
    before comparison, and retrieved sources are de-duplicated, so the scores
    are computed over distinct sources rather than over chunks.

    Args:
        vectorstore: object exposing ``as_retriever(search_kwargs=...)``.
        test_queries: list of dicts with ``"question"`` and
            ``"relevant_sources"`` keys.
        k: number of chunks retrieved per query.

    Returns:
        ``(avg_precision, avg_recall)``. An empty ``test_queries`` yields
        ``(0.0, 0.0)``; a query with no relevant sources contributes a recall
        of ``0.0`` instead of raising ``ZeroDivisionError``.
    """
    retriever = vectorstore.as_retriever(search_kwargs={"k": k})

    all_precisions = []
    all_recalls = []

    print("\n=== Per-query Retrieval Evaluation (Normalized) ===")
    for tq in test_queries:
        docs = retriever.get_relevant_documents(tq["question"])

        retrieved_sources = list(set(normalize_source(doc.metadata.get("source", "")) for doc in docs))
        relevant_sources = [normalize_source(src) for src in tq["relevant_sources"]]

        # Compute precision & recall, guarding each denominator separately.
        tp = sum(1 for src in retrieved_sources if src in relevant_sources)
        precision = tp / len(retrieved_sources) if retrieved_sources else 0.0
        recall = tp / len(relevant_sources) if relevant_sources else 0.0

        all_precisions.append(precision)
        all_recalls.append(recall)

        print(f"Q: {tq['question']}")
        print(f"  Retrieved: {retrieved_sources}")
        print(f"  Relevant : {relevant_sources}")
        print(f"  Precision@{k}: {precision:.2f}, Recall@{k}: {recall:.2f}\n")

    # With no queries there is nothing to average; report zeros rather than crash.
    avg_precision = sum(all_precisions) / len(all_precisions) if all_precisions else 0.0
    avg_recall = sum(all_recalls) / len(all_recalls) if all_recalls else 0.0

    print("=== Overall Averages ===")
    print(f"Average Precision@{k}: {avg_precision:.2f}")
    print(f"Average Recall@{k}: {avg_recall:.2f}")

    return avg_precision, avg_recall


# Example usage:
avg_p, avg_r = evaluate_retrieval_with_per_query(vectorstore, test_queries, k=5)


# Evaluation Summary

To ensure the quality and reliability of the RAG-powered question-answering system, two complementary evaluation approaches were implemented:

1. Quantitative Metrics (Precision & Recall)
Method: Evaluated using normalized source URLs against a ground-truth mapping for each query.

Metrics:

Average Precision@5: 0.71

Average Recall@5: 1.00

Interpretation: Every relevant source was captured within the top-5 results for all test queries (100% recall). A precision of 0.71 means that, on average, about 71% of the distinct sources retrieved for a query were relevant; the remainder came from other documents.

2. Qualitative Judgment (LLM-as-a-Judge)

Method: Integrated a Large Language Model to automatically assess answers based on:

Accuracy – factual correctness of the generated answer.

Coverage – completeness of the answer with respect to the question.

Citation Match – alignment of the cited sources with the expected citations for the question.

Example Result:

Accuracy: 0.95

Coverage: 0.90

Citation Match: 1.00

Interpretation: The hybrid retrieval pipeline (BM25 + FAISS + reranker) produced highly accurate, complete, and source-grounded answers.

Overall Conclusion:

The system achieves strong performance in both retrieval relevance (quantitative) and answer quality (qualitative), making it suitable for real-world deployment where trustworthy and well-sourced responses are required.

# RAG API for single query with Dynamic Indexing

Hybrid Retrieval-Augmented Generation API with BM25 + FAISS and Dynamic Web Indexing

In [None]:
# Install packages
!pip install fastapi uvicorn pyngrok nest-asyncio --quiet
!pip install rank-bm25 sentence-transformers langchain-community langchain-google-genai faiss-cpu --quiet

In [None]:
# Setup imports and basic configuration
import os
import json
import time
import shutil
from datetime import datetime, timedelta
from typing import List, Optional, Dict, Any
from urllib.parse import urlparse, urljoin
import requests
from bs4 import BeautifulSoup
import nest_asyncio
from pyngrok import ngrok
import uvicorn
from fastapi import FastAPI, HTTPException, Depends, status, Body
from fastapi.security import HTTPBearer, HTTPAuthorizationCredentials
from fastapi.middleware.cors import CORSMiddleware
from pydantic import BaseModel, Field, validator
import logging
from getpass import getpass
from collections import OrderedDict
import hashlib
import re

# Import the RAG components
from rank_bm25 import BM25Okapi
from sentence_transformers import CrossEncoder
from langchain_community.embeddings import HuggingFaceEmbeddings
from langchain_community.vectorstores import FAISS
from langchain_google_genai import ChatGoogleGenerativeAI
from langchain.prompts import PromptTemplate
from langchain.docstore.document import Document
from langchain.text_splitter import RecursiveCharacterTextSplitter

# Enable asyncio in Colab
nest_asyncio.apply()

# Configure logging
logging.basicConfig(level=logging.INFO)
logger = logging.getLogger(__name__)

print("✅ All imports successful!")

# Set up API keys
# Set Google API key
if "GOOGLE_API_KEY" not in os.environ:
    os.environ["GOOGLE_API_KEY"] = getpass("Enter your Google API key: ")

# Set ngrok token
ngrok_token = getpass("Enter your ngrok token (get free at https://dashboard.ngrok.com/get-started/your-authtoken): ")
ngrok.set_auth_token(ngrok_token)

print("✅ API keys configured!")

# Define the FastAPI app
app = FastAPI(
    title="RAG API with Dynamic Indexing",
    description="Hybrid Retrieval-Augmented Generation API with BM25 + FAISS and Dynamic Web Indexing",
    version="2.0.0"
)

app.add_middleware(
    CORSMiddleware,
    allow_origins=["*"],
    allow_credentials=True,
    allow_methods=["*"],
    allow_headers=["*"],
)

security = HTTPBearer()

# Configuration
class Config:
    """Central settings for the RAG API: index locations, retrieval sizes, scraping and auth."""

    # --- Static index (existing, built earlier in the notebook) ---
    STATIC_FAISS_PATH = "/content/drive/MyDrive/RAG_demo/faiss_index"

    # --- Dynamic index (new): everything lives under one base directory ---
    DYNAMIC_BASE_PATH = "/content/drive/MyDrive/RAG_demo/dynamic_index"
    DYNAMIC_FAISS_PATH = DYNAMIC_BASE_PATH + "/faiss_index"
    METADATA_PATH = DYNAMIC_BASE_PATH + "/metadata"
    BACKUP_PATH = DYNAMIC_BASE_PATH + "/backups"

    # --- Retrieval / reranking ---
    USE_RERANKER = True
    RERANKER_MODEL = "cross-encoder/ms-marco-MiniLM-L-6-v2"
    BM25_TOP_K = 10   # lexical candidates per index
    FAISS_TOP_K = 10  # dense candidates per index
    FINAL_TOP_K = 3   # documents returned after merge / rerank

    # --- Web scraping ---
    REQUEST_TIMEOUT = 30  # passed as the `timeout` of each HTTP GET
    MAX_RETRIES = 3       # fetch attempts per URL
    RETRY_DELAY = 2       # base delay; doubled after each failed attempt
    CHUNK_SIZE = 1000     # text splitter chunk size
    CHUNK_OVERLAP = 100   # text splitter overlap
    RE_INDEX_DAYS = 7     # minimum age in days before a URL is re-indexed
    MAX_VERSIONS = 3      # number of most recent backup versions kept

    # --- Authentication ---
    # NOTE(review): demo credential committed in the notebook; load real keys
    # from the environment or a secrets store before exposing the API.
    VALID_API_KEYS = {
        "demo-api-key-123": dict(user="demo_user", permissions=["read", "query", "index"]),
    }

config = Config()

# Pydantic models
# Request body for the indexing endpoint: between 1 and 10 URLs to scrape and index.
class IndexRequest(BaseModel):
    url: List[str] = Field(..., min_items=1, max_items=10, description="URLs to index")

    @validator('url')
    def validate_urls(cls, v):
        """Accept only absolute http(s) URLs.

        The URLs are fetched server-side, so anything that is not a plain web
        address is rejected at validation time instead of failing later in the
        scraper after several retries.

        Raises:
            ValueError: if an entry lacks a scheme or host, or uses a scheme
                other than http/https.
        """
        for url in v:
            parsed = urlparse(url)
            if not parsed.scheme or not parsed.netloc:
                raise ValueError(f"Invalid URL format: {url}")
            # urlparse lower-cases the scheme, so a plain membership test is enough.
            if parsed.scheme not in ("http", "https"):
                raise ValueError(f"Unsupported URL scheme (only http/https are allowed): {url}")
            # NOTE(review): hosts are not restricted; internal/private addresses
            # are still accepted here.
        return v

# Response body of POST /api/v1/index (declared as that route's response_model).
class IndexResponse(BaseModel):
    status: str
    indexed_url: List[str]  # URLs reported as indexed
    failed_url: Optional[List[Dict[str, str]]] = None  # per-URL failure records; None when not supplied
    metadata: Dict[str, Any] = Field(default_factory=dict)  # free-form extra details; defaults to {}
    timestamp: datetime = Field(default_factory=datetime.now)  # naive local time taken when the model is created

# Request body for asking a question.
# NOTE(review): presumably consumed by a query endpoint defined later in the notebook — confirm.
class QueryRequest(BaseModel):
    question: str = Field(..., min_length=1, max_length=1000, description="The question to ask")  # required, 1-1000 characters
    top_k: Optional[int] = Field(default=3, ge=1, le=10, description="Number of results to return")  # bounded to 1..10
    use_reranker: Optional[bool] = Field(default=True, description="Whether to use cross-encoder reranking")
    use_dynamic_index: Optional[bool] = Field(default=True, description="Whether to search dynamic index")

# Response body carrying a generated answer and the sources cited for it.
# NOTE(review): presumably returned by a query endpoint defined later in the notebook — confirm.
class QueryResponse(BaseModel):
    answer: str
    sources: List[str]
    metadata: dict = Field(default_factory=dict)  # free-form extra details; defaults to {}
    timestamp: datetime = Field(default_factory=datetime.now)  # naive local time taken when the model is created

# Response body of GET /health.
class HealthResponse(BaseModel):
    status: str  # "healthy" or "unhealthy", as set by health_check
    timestamp: datetime = Field(default_factory=datetime.now)  # naive local time taken when the model is created
    components: dict  # component name -> loaded flag (the reranker entry may be the string "disabled")

# Global variables
embedding_model = None
static_vectorstore = None
dynamic_vectorstore = None
llm = None
bm25_static = None
bm25_dynamic = None
bm25_docs_static = None
bm25_docs_dynamic = None
bm25_sources_static = None
bm25_sources_dynamic = None
reranker = None
text_to_docs_static = None
text_to_docs_dynamic = None
prompt = None
text_splitter = None

print("✅ FastAPI app configured!")

# Authentication
async def verify_api_key(credentials: HTTPAuthorizationCredentials = Depends(security)):
    """Resolve the bearer token from the Authorization header to its user record.

    Returns:
        The dict stored for the key in ``config.VALID_API_KEYS``.

    Raises:
        HTTPException: 401 with a ``WWW-Authenticate: Bearer`` header when the
            presented key is not configured.
    """
    api_key = credentials.credentials
    if api_key not in config.VALID_API_KEYS:
        # Do not write key material to the logs: a rejected key is often a
        # mistyped valid one. A short hash still lets repeated attempts be
        # correlated without revealing any characters of the key.
        fingerprint = hashlib.sha256(api_key.encode("utf-8")).hexdigest()[:8]
        logger.warning(f"Invalid API key attempted (sha256 fingerprint: {fingerprint})")
        raise HTTPException(
            status_code=status.HTTP_401_UNAUTHORIZED,
            detail="Invalid API key",
            headers={"WWW-Authenticate": "Bearer"},
        )
    return config.VALID_API_KEYS[api_key]

# Utility functions
def ensure_directories():
    """Create the dynamic-index directory tree; directories that already exist are left alone."""
    for required_dir in (
        config.DYNAMIC_BASE_PATH,
        config.DYNAMIC_FAISS_PATH,
        config.METADATA_PATH,
        config.BACKUP_PATH,
    ):
        os.makedirs(required_dir, exist_ok=True)
    logger.info("✅ Directory structure created")

def get_url_hash(url: str) -> str:
    """Return the hex MD5 digest of ``url``, used as the URL's key in the indexing metadata."""
    digest = hashlib.md5()
    digest.update(url.encode())
    return digest.hexdigest()

def load_metadata(file_path: str) -> Dict:
    """Read a JSON metadata file.

    Returns the parsed JSON content, or an empty dict when the file does not
    exist or cannot be read/parsed (the failure is logged, not raised).
    """
    if not os.path.exists(file_path):
        return {}
    try:
        with open(file_path, 'r', encoding='utf-8') as handle:
            return json.load(handle)
    except Exception as exc:
        logger.error(f"Error loading metadata from {file_path}: {str(exc)}")
        return {}

def save_metadata(data: Dict, file_path: str):
    """Write ``data`` to ``file_path`` as indented UTF-8 JSON.

    Missing parent directories are created first. Values that JSON cannot
    encode natively (e.g. ``datetime``) are stored via ``str()``.

    Raises:
        Exception: any I/O or serialisation error is logged and re-raised.
    """
    try:
        # A bare filename has an empty dirname, and os.makedirs("") raises,
        # so only create the parent when there is one.
        parent_dir = os.path.dirname(file_path)
        if parent_dir:
            os.makedirs(parent_dir, exist_ok=True)
        with open(file_path, 'w', encoding='utf-8') as f:
            json.dump(data, f, indent=2, default=str)
    except Exception as e:
        logger.error(f"Error saving metadata to {file_path}: {str(e)}")
        raise

def should_reindex_url(url: str, metadata: Dict) -> bool:
    """Decide whether ``url`` needs (re-)indexing.

    Returns True when the URL has no entry in ``metadata`` (keyed by its URL
    hash) or when its stored ISO ``timestamp`` is at least
    ``config.RE_INDEX_DAYS`` whole days old.
    """
    key = get_url_hash(url)
    if key in metadata:
        indexed_at = datetime.fromisoformat(metadata[key]['timestamp'])
        elapsed = datetime.now() - indexed_at
        # timedelta.days counts whole days only.
        return elapsed.days >= config.RE_INDEX_DAYS
    return True

# Web scraping functions
class WebScraper:
    """Fetches a web page and reduces it to a title plus whitespace-normalised text."""

    def __init__(self):
        # One shared session so the browser-like User-Agent is sent on every request.
        self.session = requests.Session()
        self.session.headers.update({
            'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36'
        })

    def extract_content(self, url: str) -> Dict[str, Any]:
        """Extract content from URL with retry logic.

        Up to ``config.MAX_RETRIES`` attempts are made; after each failed
        attempt except the last, the method sleeps
        ``config.RETRY_DELAY * 2**attempt`` seconds.

        Returns:
            A dict with keys ``title``, ``content``, ``url``, ``success`` and
            ``error``. On success ``error`` is None; on failure ``title`` and
            ``content`` are None and ``error`` holds the last error message.
            A result dict is always returned, even when ``MAX_RETRIES`` is
            configured below 1 and no attempt is made.
        """
        last_error = "No fetch attempted (config.MAX_RETRIES < 1)"

        for attempt in range(config.MAX_RETRIES):
            try:
                response = self.session.get(url, timeout=config.REQUEST_TIMEOUT)
                response.raise_for_status()

                soup = BeautifulSoup(response.content, 'html.parser')

                # Remove unwanted elements
                for element in soup(['script', 'style', 'nav', 'header', 'footer', 'aside', 'advertisement']):
                    element.decompose()

                # Extract title (fall back to the URL when the page has none)
                title = soup.find('title')
                title_text = title.get_text().strip() if title else url

                # Extract main content: the first selector that matches wins
                content_selectors = [
                    'article', 'main', '[role="main"]',
                    '.content', '.post', '.article',
                    'div.container', 'div.wrapper'
                ]

                content_text = ""
                for selector in content_selectors:
                    content = soup.select_one(selector)
                    if content:
                        content_text = content.get_text()
                        break

                if not content_text:
                    # Fallback to body
                    body = soup.find('body')
                    content_text = body.get_text() if body else ""

                # Clean text: collapse every whitespace run into a single space
                content_text = re.sub(r'\s+', ' ', content_text).strip()

                if not content_text:
                    # Treated like any other failure, so it is retried as well.
                    raise ValueError("No content extracted")

                return {
                    'title': title_text,
                    'content': content_text,
                    'url': url,
                    'success': True,
                    'error': None
                }

            except Exception as e:
                last_error = str(e)
                logger.warning(f"Attempt {attempt + 1} failed for {url}: {str(e)}")
                if attempt < config.MAX_RETRIES - 1:
                    # Exponential backoff before the next attempt.
                    time.sleep(config.RETRY_DELAY * (2 ** attempt))

        # Every attempt failed (or none was allowed): report a failure record
        # rather than falling off the end and returning None.
        return {
            'title': None,
            'content': None,
            'url': url,
            'success': False,
            'error': last_error
        }

# FAISS management functions
def create_version_backup():
    """Copy the dynamic FAISS directory into the backup area, tagged with the current version.

    Does nothing when the dynamic index directory does not exist. An existing
    backup for the same version number is replaced.
    """
    live_index_dir = config.DYNAMIC_FAISS_PATH
    if not os.path.exists(live_index_dir):
        return

    version_info = load_metadata(os.path.join(config.METADATA_PATH, 'versions.json'))
    version_no = version_info.get('current_version', 0)
    target_dir = os.path.join(config.BACKUP_PATH, f'v{version_no}_backup')

    # The index directory is checked once more right before copying.
    if not os.path.exists(live_index_dir):
        return

    if os.path.exists(target_dir):
        shutil.rmtree(target_dir)
    shutil.copytree(live_index_dir, target_dir)
    logger.info(f"✅ Created backup: v{version_no}")

def cleanup_old_backups():
    """Delete backup directories for versions older than the last ``config.MAX_VERSIONS``.

    Versions ``0 .. current_version - MAX_VERSIONS - 1`` are removed when
    their ``v<N>_backup`` directory exists.
    """
    version_info = load_metadata(os.path.join(config.METADATA_PATH, 'versions.json'))
    newest = version_info.get('current_version', 0)
    first_kept = max(0, newest - config.MAX_VERSIONS)

    for version_no in range(first_kept):
        stale_dir = os.path.join(config.BACKUP_PATH, f'v{version_no}_backup')
        if not os.path.exists(stale_dir):
            continue
        shutil.rmtree(stale_dir)
        logger.info(f"🗑️ Removed old backup: v{version_no}")

def update_version():
    """Increment the stored index version and persist it.

    Reads ``versions.json`` from the metadata directory, bumps
    ``current_version`` and ``total_updates`` by one, stamps ``last_updated``
    with the current local time (ISO format), saves the file, and returns the
    new version number.
    """
    versions_path = os.path.join(config.METADATA_PATH, 'versions.json')
    record = load_metadata(versions_path)

    bumped = record.get('current_version', 0) + 1
    record['current_version'] = bumped
    record['last_updated'] = datetime.now().isoformat()
    record['total_updates'] = record.get('total_updates', 0) + 1

    save_metadata(record, versions_path)
    return bumped

def build_bm25_from_vectorstore(vectorstore):
    """Build a BM25 index over every chunk held in a FAISS vectorstore's docstore.

    Returns:
        ``(bm25, docs, sources, text_to_docs)`` where ``docs`` is the list of
        chunk texts, ``sources`` the parallel list of ``source`` metadata
        values ("Unknown" when absent), and ``text_to_docs`` maps each chunk
        text to the list of Document objects carrying that text.

    Raises:
        Exception: any failure is logged and re-raised.
    """
    try:
        docs, sources, text_to_docs = [], [], {}

        # Single pass over the in-memory docstore (private ``_dict`` attribute).
        for stored_doc in vectorstore.docstore._dict.values():
            chunk_text = stored_doc.page_content
            docs.append(chunk_text)
            sources.append(stored_doc.metadata.get("source", "Unknown"))
            text_to_docs.setdefault(chunk_text, []).append(stored_doc)

        # Whitespace tokenisation; queries are split the same way at search time.
        bm25 = BM25Okapi([chunk.split() for chunk in docs])

        logger.info(f"BM25 built over {len(docs)} chunks")
        return bm25, docs, sources, text_to_docs
    except Exception as e:
        logger.error(f"Error building BM25: {str(e)}")
        raise

# Source URL mapping
source_url_map = {
    "/content/drive/MyDrive/RAG_demo/data/genai-platform.txt": "https://huyenchip.com/2024/07/25/genai-platform.html",
    "/content/drive/MyDrive/RAG_demo/data/hallucination.txt": "https://lilianweng.github.io/posts/2024-07-07-hallucination/",
    "/content/drive/MyDrive/RAG_demo/data/quora_engineering.txt": "https://quoraengineering.quora.com/Building-Embedding-Search-at-Quora"
}

def format_sources(sources):
    """Return ``sources`` with local corpus file paths replaced by their URLs.

    Entries that already start with ``http`` are kept as-is; all others are
    looked up in ``source_url_map`` and left unchanged when no mapping exists.
    """
    return [
        entry if entry.startswith('http') else source_url_map.get(entry, entry)
        for entry in sources
    ]

def hybrid_search(query: str, use_dynamic: bool = True, bm_k: int = 10, faiss_k: int = 10, top_k: int = 3, use_reranker: bool = True):
    """Run BM25 + FAISS retrieval over the static and (optionally) dynamic index.

    Candidates from every searched index are merged and de-duplicated on their
    stripped text, then ordered either by cross-encoder score (when
    ``use_reranker`` is set and a reranker is loaded) or BM25-first by
    descending BM25 score.

    Returns:
        Up to ``top_k`` Document objects; an empty list when no index produced
        a candidate.

    Raises:
        Exception: any failure is logged and re-raised.
    """
    try:
        # Collect the indices to search, static first.
        index_specs = []
        if static_vectorstore:
            index_specs.append((static_vectorstore, bm25_static, bm25_docs_static,
                                bm25_sources_static, text_to_docs_static))
        if use_dynamic and dynamic_vectorstore:
            index_specs.append((dynamic_vectorstore, bm25_dynamic, bm25_docs_dynamic,
                                bm25_sources_dynamic, text_to_docs_dynamic))

        pool = []
        for store, bm25_index, corpus, corpus_sources, lookup in index_specs:
            pool.extend(_search_single_index(
                query, store, bm25_index, corpus, corpus_sources, lookup, bm_k, faiss_k
            ))

        if not pool:
            return []

        # De-duplicate on stripped text; a later duplicate replaces the stored
        # one only when its BM25 score is strictly higher (None counts as 0).
        unique = OrderedDict()
        for hit in pool:
            dedupe_key = hit["text"].strip()
            incumbent = unique.get(dedupe_key)
            if incumbent is None or (hit.get("bm25_score") or 0) > (incumbent.get("bm25_score") or 0):
                unique[dedupe_key] = hit

        ranked = list(unique.values())

        if use_reranker and reranker and ranked:
            # Cross-encoder scores each (query, text) pair; highest first.
            rerank_scores = reranker.predict([(query, hit["text"]) for hit in ranked])
            for hit, score in zip(ranked, rerank_scores):
                hit["rerank_score"] = float(score)
            ranked.sort(key=lambda hit: hit["rerank_score"], reverse=True)
        else:
            # BM25 hits first (descending score), FAISS-only hits afterwards.
            ranked.sort(key=lambda hit: (0 if hit["bm25_score"] is not None else 1, -(hit["bm25_score"] or 0)))

        # Map the winning texts back to stored Document objects: static mapping
        # first, then dynamic, and finally a synthetic Document.
        results = []
        for hit in ranked[:top_k]:
            chunk_text = hit["text"]
            matches = (text_to_docs_static or {}).get(chunk_text, [])
            if not matches and text_to_docs_dynamic:
                matches = text_to_docs_dynamic.get(chunk_text, [])

            if matches:
                results.append(matches[0])
            else:
                results.append(Document(page_content=chunk_text, metadata={"source": hit.get("source", "Unknown")}))

        return results

    except Exception as e:
        logger.error(f"Error in hybrid search: {str(e)}")
        raise

def _search_single_index(query: str, vectorstore, bm25, bm25_docs, bm25_sources, text_to_docs, bm_k: int, faiss_k: int):
    """Search a single index (static or dynamic)"""
    candidates = []

    # BM25 candidates
    if bm25 and bm25_docs:
        tokenized_q = query.split()
        bm25_scores = bm25.get_scores(tokenized_q)
        bm25_indices = sorted(range(len(bm25_scores)), key=lambda i: bm25_scores[i], reverse=True)[:bm_k]
        for idx in bm25_indices:
            txt = bm25_docs[idx]
            src = bm25_sources[idx]
            candidates.append({"text": txt, "source": src, "bm25_score": bm25_scores[idx]})

    # FAISS candidates
    if vectorstore:
        faiss_retriever = vectorstore.as_retriever(search_kwargs={"k": faiss_k})
        faiss_docs = faiss_retriever.get_relevant_documents(query)
        for d in faiss_docs:
            candidates.append({"text": d.page_content, "source": d.metadata.get("source", "Unknown"), "bm25_score": None})

    return candidates

print("✅ Helper functions defined!")

# Initialize models
def initialize_models():
    """Load every model and index the API needs and publish them as module globals.

    In order: creates the dynamic-index directories, loads the sentence
    embedding model, builds the text splitter, tries to load the static and
    dynamic FAISS indices (each with a matching BM25 index), creates the
    Gemini chat model and the answer prompt, and finally loads the
    cross-encoder reranker when ``config.USE_RERANKER`` is set.

    A failure to load either FAISS index is only printed; the corresponding
    globals are left unchanged and initialisation continues.

    Raises:
        Exception: any other failure (for example a missing ``GOOGLE_API_KEY``
            environment variable) is printed and re-raised.
    """
    global embedding_model, static_vectorstore, dynamic_vectorstore, llm
    global bm25_static, bm25_dynamic, bm25_docs_static, bm25_docs_dynamic
    global bm25_sources_static, bm25_sources_dynamic, reranker
    global text_to_docs_static, text_to_docs_dynamic, prompt, text_splitter

    try:
        print("🔄 Loading models...")
        ensure_directories()

        # Load embedding model
        # NOTE(review): the FAISS indices loaded below must have been built with
        # this same embedding model — confirm against the cell that created them.
        embedding_model = HuggingFaceEmbeddings(model_name="all-MiniLM-L6-v2")
        print("✅ Embedding model loaded")

        # Initialize text splitter
        text_splitter = RecursiveCharacterTextSplitter(
            chunk_size=config.CHUNK_SIZE,
            chunk_overlap=config.CHUNK_OVERLAP,
            length_function=len,
            separators=["\n\n", "\n", " ", ""]
        )
        print("✅ Text splitter initialized")

        # Load static FAISS vectorstore
        # allow_dangerous_deserialization=True permits loading the pickled
        # docstore; only appropriate for index files this project wrote itself.
        try:
            static_vectorstore = FAISS.load_local(
                config.STATIC_FAISS_PATH,
                embedding_model,
                allow_dangerous_deserialization=True
            )
            bm25_static, bm25_docs_static, bm25_sources_static, text_to_docs_static = build_bm25_from_vectorstore(static_vectorstore)
            print("✅ Static FAISS vectorstore loaded")
        except Exception as e:
            # Any load or BM25-build error lands here, not only a missing index.
            print(f"⚠️ Static FAISS not found: {str(e)}")

        # Load dynamic FAISS vectorstore (if exists)
        try:
            dynamic_vectorstore = FAISS.load_local(
                config.DYNAMIC_FAISS_PATH,
                embedding_model,
                allow_dangerous_deserialization=True
            )
            bm25_dynamic, bm25_docs_dynamic, bm25_sources_dynamic, text_to_docs_dynamic = build_bm25_from_vectorstore(dynamic_vectorstore)
            print("✅ Dynamic FAISS vectorstore loaded")
        except Exception as e:
            # Expected on a first run: ensure_directories() creates the folder
            # but no index files exist in it yet.
            print(f"ℹ️ Dynamic FAISS not found (will create on first index): {str(e)}")

        # Initialize LLM (temperature 0; API key is read from the environment)
        llm = ChatGoogleGenerativeAI(
            model="gemini-1.5-flash",
            temperature=0,
            google_api_key=os.environ["GOOGLE_API_KEY"]
        )
        print("✅ LLM initialized")

        # Create prompt template
        # The fallback sentence below is the reply requested for out-of-scope questions.
        prompt_template = """
        Use the provided context to answer the question.
        If the answer is not in the context, say: I couldn't find anything in my knowledge base about that topic. I can only answer questions related to AI, RAG, and the documents you provided.

        Context:
        {context}

        Question:
        {question}

        Answer:
        """
        prompt = PromptTemplate(input_variables=["context", "question"], template=prompt_template)
        print("✅ Prompt template created")

        # Load reranker (skipped when disabled; `reranker` then keeps its prior value)
        if config.USE_RERANKER:
            reranker = CrossEncoder(config.RERANKER_MODEL)
            print("✅ Reranker loaded")

        print("🎉 All models initialized successfully!")

    except Exception as e:
        print(f"❌ Error initializing models: {str(e)}")
        raise

# Run the initialization
initialize_models()

# API Endpoints
@app.get("/health", response_model=HealthResponse)
async def health_check():
    """Report which pipeline components are loaded.

    The overall status depends on the LLM alone: "healthy" when it is loaded,
    "unhealthy" otherwise. The reranker entry is the string "disabled" when
    reranking is turned off in the configuration.
    """
    if config.USE_RERANKER:
        reranker_state = reranker is not None
    else:
        reranker_state = "disabled"

    components = {
        "static_vectorstore": static_vectorstore is not None,
        "dynamic_vectorstore": dynamic_vectorstore is not None,
        "llm": llm is not None,
        "bm25_static": bm25_static is not None,
        "bm25_dynamic": bm25_dynamic is not None,
        "reranker": reranker_state,
    }
    overall = "healthy" if llm is not None else "unhealthy"
    return HealthResponse(status=overall, components=components)

@app.post("/api/v1/index", response_model=IndexResponse)
async def index_urls(
    request: IndexRequest,
    user_info: dict = Depends(verify_api_key)
):
    """Scrape, chunk and index URLs into the dynamic vector database.

    For every requested URL that is due for (re)indexing the page is scraped,
    split into chunks and embedded into the dynamic FAISS index; the BM25
    structures are rebuilt and the URL metadata file is updated afterwards.

    Args:
        request: Body carrying the list of URLs in ``request.url``.
        user_info: Caller record from ``verify_api_key``; must include the
            ``"index"`` permission.

    Returns:
        IndexResponse whose ``status`` is ``"success"``, ``"partial_success"``,
        ``"failed"`` or ``"skipped"`` (every URL was indexed recently enough,
        so nothing was attempted).

    Raises:
        HTTPException: 403 when the caller lacks the ``"index"`` permission,
            500 when the vector database update or the request as a whole fails.
    """
    global dynamic_vectorstore, bm25_dynamic, bm25_docs_dynamic, bm25_sources_dynamic, text_to_docs_dynamic

    # Check permissions
    if "index" not in user_info.get("permissions", []):
        raise HTTPException(
            status_code=status.HTTP_403_FORBIDDEN,
            detail="Insufficient permissions for indexing"
        )

    logger.info(f"Indexing request from {user_info['user']}: {len(request.url)} URLs")

    scraper = WebScraper()
    indexed_urls = []
    failed_urls = []
    skipped_urls = []

    # Load existing metadata
    urls_metadata_file = os.path.join(config.METADATA_PATH, 'indexed_urls.json')
    urls_metadata = load_metadata(urls_metadata_file)

    try:
        # Create backup before major changes
        create_version_backup()

        new_documents = []

        for url in request.url:
            try:
                # Check if we should reindex
                if not should_reindex_url(url, urls_metadata):
                    logger.info(f"Skipping {url} - recently indexed")
                    skipped_urls.append(url)
                    continue

                logger.info(f"Processing URL: {url}")

                # Extract content
                result = scraper.extract_content(url)

                if not result['success']:
                    failed_urls.append({
                        "url": url,
                        "error": result['error'],
                        "error_type": "EXTRACTION_FAILED"
                    })
                    continue

                # Split content into chunks
                chunks = text_splitter.split_text(result['content'])

                # One hash and one timestamp per URL, so every chunk of a page
                # and its metadata entry agree with each other.
                url_hash = get_url_hash(url)
                indexed_at = datetime.now().isoformat()

                # Create documents
                for i, chunk in enumerate(chunks):
                    new_documents.append(Document(
                        page_content=chunk,
                        metadata={
                            "source": url,
                            "title": result['title'],
                            "chunk_id": i,
                            "total_chunks": len(chunks),
                            "indexed_at": indexed_at,
                            "url_hash": url_hash
                        }
                    ))

                # Update URL metadata
                urls_metadata[url_hash] = {
                    "url": url,
                    "title": result['title'],
                    "timestamp": indexed_at,
                    "chunk_count": len(chunks),
                    "status": "indexed"
                }

                indexed_urls.append(url)
                logger.info(f"✅ Successfully processed {url} - {len(chunks)} chunks")

            except Exception as e:
                # A single bad URL must not abort the rest of the batch.
                logger.error(f"Error processing {url}: {str(e)}")
                failed_urls.append({
                    "url": url,
                    "error": str(e),
                    "error_type": "PROCESSING_ERROR"
                })

        # Update vector database if we have new documents
        if new_documents:
            try:
                if dynamic_vectorstore is None:
                    # Create new FAISS index
                    dynamic_vectorstore = FAISS.from_documents(new_documents, embedding_model)
                    logger.info("✅ Created new dynamic FAISS index")
                else:
                    # Re-indexing a URL must replace its chunks rather than append
                    # a second copy: drop stored chunks that share a url_hash with
                    # the freshly scraped documents before adding the new ones.
                    fresh_hashes = {doc.metadata["url_hash"] for doc in new_documents}
                    stale_ids = [
                        doc_id
                        for doc_id, doc in dynamic_vectorstore.docstore._dict.items()
                        if doc.metadata.get("url_hash") in fresh_hashes
                    ]
                    if stale_ids:
                        dynamic_vectorstore.delete(stale_ids)
                        logger.info(f"Removed {len(stale_ids)} stale chunks before re-adding")

                    # Add to existing index
                    dynamic_vectorstore.add_documents(new_documents)
                    logger.info(f"✅ Added {len(new_documents)} documents to existing index")

                # Save updated index
                dynamic_vectorstore.save_local(config.DYNAMIC_FAISS_PATH)

                # Rebuild BM25 and mappings
                bm25_dynamic, bm25_docs_dynamic, bm25_sources_dynamic, text_to_docs_dynamic = build_bm25_from_vectorstore(dynamic_vectorstore)

                # Update version
                new_version = update_version()

                # Cleanup old backups
                cleanup_old_backups()

                logger.info(f"✅ Dynamic index updated to version {new_version}")

            except Exception as e:
                logger.error(f"Error updating vector database: {str(e)}")
                raise HTTPException(
                    status_code=status.HTTP_500_INTERNAL_SERVER_ERROR,
                    detail=f"Failed to update vector database: {str(e)}"
                )

        # Save URL metadata (only reached when the vector store update succeeded)
        save_metadata(urls_metadata, urls_metadata_file)

        # Prepare response
        if indexed_urls and failed_urls:
            response_status = "partial_success"
        elif indexed_urls:
            response_status = "success"
        elif failed_urls:
            response_status = "failed"
        else:
            # Nothing was attempted because every URL is still fresh.
            response_status = "skipped"

        metadata = {
            "total_requested": len(request.url),
            "successfully_indexed": len(indexed_urls),
            "failed": len(failed_urls),
            "skipped": len(skipped_urls),
            "new_documents_added": len(new_documents),
            "user": user_info["user"]
        }

        return IndexResponse(
            status=response_status,
            indexed_url=indexed_urls,
            failed_url=failed_urls if failed_urls else None,
            metadata=metadata
        )

    except HTTPException:
        # Already a well-formed API error (e.g. the vector store failure above);
        # do not wrap it a second time.
        raise
    except Exception as e:
        logger.error(f"Critical error in indexing: {str(e)}")
        raise HTTPException(
            status_code=status.HTTP_500_INTERNAL_SERVER_ERROR,
            detail=f"Indexing failed: {str(e)}"
        )

@app.post("/query", response_model=QueryResponse)
async def query_rag(
    request: QueryRequest,
    user_info: dict = Depends(verify_api_key)
):
    """Answer a question from the retrieved knowledge-base context.

    Runs the hybrid retrieval, feeds the joined chunk texts and the question
    through the prompt into the LLM, and returns the answer with its sources.
    When the answer contains the prompt's "not found" sentence the source list
    is returned empty.

    Raises:
        HTTPException: 404 when retrieval yields no documents, 500 for any
            other failure.
    """
    try:
        logger.info(f"Query from user {user_info['user']}: {request.question[:50]}...")

        # The reranker runs only when both the request and the config allow it.
        reranker_enabled = request.use_reranker and config.USE_RERANKER

        # Perform hybrid search
        retrieved = hybrid_search(
            request.question,
            use_dynamic=request.use_dynamic_index,
            bm_k=config.BM25_TOP_K,
            faiss_k=config.FAISS_TOP_K,
            top_k=request.top_k,
            use_reranker=reranker_enabled
        )

        if not retrieved:
            raise HTTPException(
                status_code=status.HTTP_404_NOT_FOUND,
                detail="No relevant documents found"
            )

        # Create context and query LLM
        context = "\n\n".join(doc.page_content for doc in retrieved)
        llm_reply = llm.invoke(prompt.format(context=context, question=request.question))
        answer_text = llm_reply.content.strip()

        # Check for out-of-scope responses (case-insensitive substring match)
        fallback_msg = "i couldn't find anything in my knowledge base about that topic"
        out_of_scope = fallback_msg.lower() in answer_text.lower()
        if out_of_scope:
            sources = []
            logger.info(f"Out-of-scope query: {request.question}")
        else:
            # De-duplicate while keeping retrieval order.
            distinct = dict.fromkeys(doc.metadata.get("source", "Unknown") for doc in retrieved)
            sources = format_sources(list(distinct))

        return QueryResponse(
            answer=answer_text,
            sources=sources,
            metadata={
                "query_length": len(request.question),
                "num_docs_retrieved": len(retrieved),
                "reranker_used": reranker_enabled,
                "dynamic_index_used": request.use_dynamic_index,
                "static_index_available": static_vectorstore is not None,
                "dynamic_index_available": dynamic_vectorstore is not None,
                "user": user_info["user"]
            }
        )

    except HTTPException:
        raise
    except Exception as e:
        logger.error(f"Error processing query: {str(e)}")
        raise HTTPException(
            status_code=status.HTTP_500_INTERNAL_SERVER_ERROR,
            detail=f"Internal server error: {str(e)}"
        )

@app.get("/api/v1/sources")
async def list_sources(user_info: dict = Depends(verify_api_key)):
    """List available sources in both static and dynamic knowledge bases.

    Returns:
        dict with ``static_sources`` / ``dynamic_sources`` sections (each with
        a sorted list of distinct source labels and the number of stored
        documents), overall ``metadata`` totals and a ``timestamp``.
    """
    def _summarize(store):
        """Return (distinct source labels, stored document count); empty when no store is loaded."""
        if not store:
            return set(), 0
        stored_docs = store.docstore._dict
        labels = {doc.metadata.get("source", "Unknown") for doc in stored_docs.values()}
        return labels, len(stored_docs)

    static_sources, static_total = _summarize(static_vectorstore)
    dynamic_sources, dynamic_total = _summarize(dynamic_vectorstore)

    # Load indexed URLs metadata
    urls_metadata_file = os.path.join(config.METADATA_PATH, 'indexed_urls.json')
    urls_metadata = load_metadata(urls_metadata_file)

    # Load versions metadata
    versions_file = os.path.join(config.METADATA_PATH, 'versions.json')
    versions = load_metadata(versions_file)

    return {
        "static_sources": {
            # Sets iterate in arbitrary order; sort so repeated calls return the
            # same list (and collapse paths that map to the same URL).
            "sources": sorted(set(format_sources(list(static_sources)))),
            "total_documents": static_total
        },
        "dynamic_sources": {
            "sources": sorted(dynamic_sources, key=str),
            "total_documents": dynamic_total,
            "indexed_urls_count": len(urls_metadata),
            "current_version": versions.get("current_version", 0),
            "last_updated": versions.get("last_updated")
        },
        "metadata": {
            "total_urls_indexed": len(urls_metadata),
            "total_documents": static_total + dynamic_total
        },
        "timestamp": datetime.now()
    }

@app.get("/api/v1/index/status")
async def get_index_status(user_info: dict = Depends(verify_api_key)):
    """Get detailed status of the dynamic indexing system.

    Returns:
        dict with version information, URL freshness statistics (an entry is
        "recent" while younger than ``config.RE_INDEX_DAYS`` days), the 10 most
        recently indexed URLs, the 5 oldest outdated URLs, the available
        backups and the relevant configuration values.
    """
    # Load metadata files
    urls_metadata = load_metadata(os.path.join(config.METADATA_PATH, 'indexed_urls.json'))
    versions = load_metadata(os.path.join(config.METADATA_PATH, 'versions.json'))

    # Calculate statistics; keep the parsed time next to each entry for sorting.
    recent_entries = []
    old_entries = []
    now = datetime.now()

    for url_hash, url_data in urls_metadata.items():
        try:
            indexed_time = datetime.fromisoformat(url_data['timestamp'])
            age_days = (now - indexed_time).days
            url_info = {
                "url": url_data['url'],
                "title": url_data.get('title', ''),
                "indexed_at": url_data['timestamp'],
                "age_days": age_days,
                "chunk_count": url_data.get('chunk_count', 0)
            }
        except (AttributeError, KeyError, TypeError, ValueError) as e:
            # One damaged record should not turn the whole status call into a 500.
            logger.warning(f"Skipping malformed metadata entry {url_hash}: {str(e)}")
            continue

        bucket = recent_entries if age_days < config.RE_INDEX_DAYS else old_entries
        bucket.append((indexed_time, url_info))

    # Newest first for the "recent" preview, oldest first for the "outdated" one,
    # so the truncated lists below show the most relevant entries.
    recent_entries.sort(key=lambda entry: entry[0], reverse=True)
    old_entries.sort(key=lambda entry: entry[0])
    recent_urls = [info for _, info in recent_entries]
    old_urls = [info for _, info in old_entries]

    # Get backup information
    backup_versions = []
    if os.path.isdir(config.BACKUP_PATH):
        for item in sorted(os.listdir(config.BACKUP_PATH)):
            backup_path = os.path.join(config.BACKUP_PATH, item)
            # Only directories named "*_backup" are backups; a stray file with
            # that suffix would make os.listdir() below fail.
            if not item.endswith('_backup') or not os.path.isdir(backup_path):
                continue

            version_num = item.replace('_backup', '').replace('v', '')
            member_paths = (os.path.join(backup_path, f) for f in os.listdir(backup_path))
            backup_size = sum(os.path.getsize(p) for p in member_paths if os.path.isfile(p))
            backup_versions.append({
                "version": version_num,
                "size_bytes": backup_size,
                "created": datetime.fromtimestamp(os.path.getctime(backup_path)).isoformat()
            })

    return {
        "system_status": {
            "dynamic_index_exists": dynamic_vectorstore is not None,
            "current_version": versions.get("current_version", 0),
            "total_updates": versions.get("total_updates", 0),
            "last_updated": versions.get("last_updated")
        },
        "url_statistics": {
            "total_indexed": len(urls_metadata),
            "recent_urls": len(recent_urls),
            "outdated_urls": len(old_urls),
            "re_index_threshold_days": config.RE_INDEX_DAYS
        },
        "recent_urls": recent_urls[:10],  # 10 most recently indexed URLs
        "outdated_urls": old_urls[:5],    # 5 oldest outdated URLs
        "backup_info": {
            "available_backups": backup_versions,
            "max_versions_kept": config.MAX_VERSIONS
        },
        "configuration": {
            "chunk_size": config.CHUNK_SIZE,
            "chunk_overlap": config.CHUNK_OVERLAP,
            "max_retries": config.MAX_RETRIES,
            "request_timeout": config.REQUEST_TIMEOUT
        },
        "timestamp": datetime.now()
    }

@app.delete("/api/v1/index/{url_hash}")
async def remove_indexed_url(
    url_hash: str,
    user_info: dict = Depends(verify_api_key)
):
    """Remove a specific indexed URL from the dynamic database.

    Args:
        url_hash: Key of the URL in ``indexed_urls.json`` (also stored as
            ``url_hash`` in each chunk's metadata).
        user_info: Caller record from ``verify_api_key``; must include the
            ``"index"`` permission.

    Returns:
        dict with the removed URL, the number of chunks dropped from the vector
        store (0 when no dynamic store is loaded), the new index version and a
        timestamp.

    Raises:
        HTTPException: 403 without the ``"index"`` permission, 404 when the
            hash is unknown, 500 when the removal itself fails.
    """
    global dynamic_vectorstore, bm25_dynamic, bm25_docs_dynamic, bm25_sources_dynamic, text_to_docs_dynamic

    # Check permissions
    if "index" not in user_info.get("permissions", []):
        raise HTTPException(
            status_code=status.HTTP_403_FORBIDDEN,
            detail="Insufficient permissions for index modification"
        )

    # Load URL metadata
    urls_metadata_file = os.path.join(config.METADATA_PATH, 'indexed_urls.json')
    urls_metadata = load_metadata(urls_metadata_file)

    if url_hash not in urls_metadata:
        raise HTTPException(
            status_code=status.HTTP_404_NOT_FOUND,
            detail="URL hash not found in index"
        )

    # Defined up front: the response reports it even when no dynamic store is
    # loaded and the scan below never runs.
    removed_count = 0

    try:
        # Create backup before modification
        create_version_backup()

        # Find and remove documents with this URL hash
        if dynamic_vectorstore:
            # The index is rebuilt from the documents that are kept, which
            # re-embeds every one of them.
            remaining_docs = []

            for doc in dynamic_vectorstore.docstore._dict.values():
                if doc.metadata.get("url_hash") == url_hash:
                    removed_count += 1
                else:
                    remaining_docs.append(doc)

            if not remaining_docs:
                # No documents left, remove the vectorstore
                dynamic_vectorstore = None
                bm25_dynamic = None
                bm25_docs_dynamic = None
                bm25_sources_dynamic = None
                text_to_docs_dynamic = None

                # Remove the directory and leave an empty one in its place
                if os.path.exists(config.DYNAMIC_FAISS_PATH):
                    shutil.rmtree(config.DYNAMIC_FAISS_PATH)
                    os.makedirs(config.DYNAMIC_FAISS_PATH)
            elif removed_count:
                # Rebuild only when something was actually dropped; otherwise the
                # existing index is already correct and re-embedding is wasted work.
                dynamic_vectorstore = FAISS.from_documents(remaining_docs, embedding_model)
                dynamic_vectorstore.save_local(config.DYNAMIC_FAISS_PATH)

                # Rebuild BM25 and mappings
                bm25_dynamic, bm25_docs_dynamic, bm25_sources_dynamic, text_to_docs_dynamic = build_bm25_from_vectorstore(dynamic_vectorstore)

        # Remove from URL metadata
        url_info = urls_metadata.pop(url_hash)
        save_metadata(urls_metadata, urls_metadata_file)

        # Update version
        new_version = update_version()
        cleanup_old_backups()

        logger.info(f"✅ Removed URL {url_info['url']} from index")

        return {
            "status": "success",
            "removed_url": url_info['url'],
            "documents_removed": removed_count,
            "new_version": new_version,
            "timestamp": datetime.now()
        }

    except Exception as e:
        logger.error(f"Error removing URL: {str(e)}")
        raise HTTPException(
            status_code=status.HTTP_500_INTERNAL_SERVER_ERROR,
            detail=f"Failed to remove URL: {str(e)}"
        )

@app.post("/api/v1/index/reindex")
async def reindex_all_urls(user_info: dict = Depends(verify_api_key)):
    """Force reindex all URLs in the metadata.

    The stored URL metadata is cleared so that ``index_urls`` treats every URL
    as due, then restored for anything that could not be re-indexed, so a
    failed run never loses track of already indexed URLs.

    Returns:
        Whatever ``index_urls`` returns for the full URL list.

    Raises:
        HTTPException: 403 without the ``"index"`` permission, 404 when no URLs
            are recorded, 400 when the URL list is rejected by ``IndexRequest``
            validation, plus anything ``index_urls`` raises.
    """
    # Check permissions
    if "index" not in user_info.get("permissions", []):
        raise HTTPException(
            status_code=status.HTTP_403_FORBIDDEN,
            detail="Insufficient permissions for reindexing"
        )

    # Load URL metadata
    urls_metadata_file = os.path.join(config.METADATA_PATH, 'indexed_urls.json')
    urls_metadata = load_metadata(urls_metadata_file)

    if not urls_metadata:
        raise HTTPException(
            status_code=status.HTTP_404_NOT_FOUND,
            detail="No URLs found to reindex"
        )

    # Extract URLs and create reindex request
    urls_to_reindex = [data['url'] for data in urls_metadata.values()]

    # Build (and thereby validate) the request BEFORE touching the metadata file:
    # a rejected list must not leave the metadata wiped. Pydantic validation
    # errors derive from ValueError.
    try:
        reindex_request = IndexRequest(url=urls_to_reindex)
    except ValueError as e:
        raise HTTPException(
            status_code=status.HTTP_400_BAD_REQUEST,
            detail=f"Cannot reindex {len(urls_to_reindex)} URLs in one request: {str(e)}"
        )

    # Clear existing metadata to force reindexing
    save_metadata({}, urls_metadata_file)

    # Call the index endpoint; put the original metadata back if it blows up.
    try:
        result = await index_urls(reindex_request, user_info)
    except Exception:
        save_metadata(urls_metadata, urls_metadata_file)
        raise

    # URLs that failed this time are absent from the freshly written metadata.
    # Re-attach their previous entries so they stay listed and removable.
    refreshed_metadata = load_metadata(urls_metadata_file)
    lost_entries = {
        url_hash: entry
        for url_hash, entry in urls_metadata.items()
        if url_hash not in refreshed_metadata
    }
    if lost_entries:
        refreshed_metadata.update(lost_entries)
        save_metadata(refreshed_metadata, urls_metadata_file)
        logger.warning(f"Kept previous metadata for {len(lost_entries)} URLs that were not re-indexed")

    return result

print("✅ API endpoints defined!")

# Start the server
def start_server(port: int = 8000):
    """Start the FastAPI server with ngrok.

    Opens an ngrok tunnel to ``port``, prints the public endpoints with example
    curl commands, then runs uvicorn on that port (this call blocks until the
    server stops).

    Args:
        port: Local port shared by the ngrok tunnel and uvicorn.
    """
    # Create ngrok tunnel. Current pyngrok returns an NgrokTunnel object whose
    # string form is not a URL; take its public_url (older releases returned
    # the URL string directly, hence the getattr fallback).
    tunnel = ngrok.connect(port)
    public_url = getattr(tunnel, "public_url", tunnel)

    print(f"🌐 Public URL: {public_url}")
    print(f"📚 API Documentation: {public_url}/docs")
    print(f"🔑 API Key: demo-api-key-123")
    print("\n📋 Available Endpoints:")
    print(f"  • POST {public_url}/api/v1/index - Index new URLs")
    print(f"  • POST {public_url}/query - Query the RAG system")
    print(f"  • GET  {public_url}/api/v1/sources - List all sources")
    print(f"  • GET  {public_url}/api/v1/index/status - Get index status")
    print(f"  • DELETE {public_url}/api/v1/index/{{url_hash}} - Remove indexed URL")
    print(f"  • POST {public_url}/api/v1/index/reindex - Reindex all URLs")
    print(f"  • GET  {public_url}/health - Health check")
    print("=" * 50)

    # Example curl commands
    print("💡 Example Usage:")
    print(f"""
# Index a URL:
curl -X POST "{public_url}/api/v1/index" \\
  -H "Authorization: Bearer demo-api-key-123" \\
  -H "Content-Type: application/json" \\
  -d '{{"url": ["https://example.com"]}}'

# Query the system:
curl -X POST "{public_url}/query" \\
  -H "Authorization: Bearer demo-api-key-123" \\
  -H "Content-Type: application/json" \\
  -d '{{"question": "What is RAG?", "use_dynamic_index": true}}'

# Check index status:
curl -X GET "{public_url}/api/v1/index/status" \\
  -H "Authorization: Bearer demo-api-key-123"
    """)
    print("=" * 50)

    # Start the server
    uvicorn.run(app, host="0.0.0.0", port=port)

# Run the server.
# NOTE: start_server() ends in uvicorn.run(), which keeps this cell busy for as
# long as the server is up, so the cells below do not run until it is stopped.
start_server()

In [None]:
# ========================================
# STEP 1: SETUP AND IMPORTS
# ========================================

!pip install -q fastapi uvicorn pyngrok nest-asyncio --quiet
!pip install -q rank-bm25 sentence-transformers langchain-community langchain-google-genai --quiet
!pip install -q faiss-cpu beautifulsoup4 requests pydantic --quiet
!pip install -q langchain scikit-learn pandas numpy  --quiet


# Conversational RAG API with Dynamic Indexing and Sources

In [None]:
# Setup imports and basic configuration
import os
import json
import time
import shutil
from datetime import datetime, timedelta
from typing import List, Optional, Dict, Any, Union
from urllib.parse import urlparse, urljoin
import requests
from bs4 import BeautifulSoup
import nest_asyncio
from pyngrok import ngrok
import uvicorn
from fastapi import FastAPI, HTTPException, Depends, status, Body
from fastapi.security import HTTPBearer, HTTPAuthorizationCredentials
from fastapi.middleware.cors import CORSMiddleware
from pydantic import BaseModel, Field, validator
import logging
from getpass import getpass
from collections import OrderedDict
import hashlib
import re
import uuid
import pandas as pd
import numpy as np
from sklearn.metrics.pairwise import cosine_similarity
import asyncio
from threading import Thread

# Import the RAG components
from rank_bm25 import BM25Okapi
from sentence_transformers import CrossEncoder
from langchain_community.embeddings import HuggingFaceEmbeddings
from langchain_community.vectorstores import FAISS
from langchain_google_genai import ChatGoogleGenerativeAI
from langchain.prompts import PromptTemplate
from langchain.docstore.document import Document
from langchain.text_splitter import RecursiveCharacterTextSplitter

# Enable asyncio in Colab
nest_asyncio.apply()

# Configure logging
logging.basicConfig(level=logging.INFO)
logger = logging.getLogger(__name__)

print("✅ All imports successful!")

# ========================================
# STEP 2: CONFIGURATION AND API KEYS
# ========================================

# Set up API keys (an empty value counts as missing, so prompt in that case too)
if not os.environ.get("GOOGLE_API_KEY"):
    os.environ["GOOGLE_API_KEY"] = getpass("Enter your Google API key: ")

# Set ngrok token: reuse NGROK_AUTHTOKEN from the environment when present, and
# remember a typed token there so re-running this cell does not prompt again.
ngrok_token = os.environ.get("NGROK_AUTHTOKEN") or getpass("Enter your ngrok token (get free at https://dashboard.ngrok.com/get-started/your-authtoken): ")
os.environ["NGROK_AUTHTOKEN"] = ngrok_token
ngrok.set_auth_token(ngrok_token)

# Configuration
class Config:
    """Central settings for storage locations, retrieval, scraping and auth.

    Every storage path is derived from ``BASE_PATH`` so the project folder is
    spelled out once. ``BASE_PATH`` defaults to the Colab Drive folder and can
    be redirected with the ``RAG_DEMO_ROOT`` environment variable.
    """

    # Project root on disk (trailing slash removed so the joins below stay clean)
    BASE_PATH = os.environ.get("RAG_DEMO_ROOT", "/content/drive/MyDrive/RAG_demo").rstrip("/")

    # Static index (existing)
    STATIC_FAISS_PATH = f"{BASE_PATH}/faiss_index"

    # Dynamic index (new)
    DYNAMIC_BASE_PATH = f"{BASE_PATH}/dynamic_index"
    DYNAMIC_FAISS_PATH = f"{DYNAMIC_BASE_PATH}/faiss_index"
    METADATA_PATH = f"{DYNAMIC_BASE_PATH}/metadata"
    BACKUP_PATH = f"{DYNAMIC_BASE_PATH}/backups"

    # Conversation settings
    CONVERSATIONS_PATH = f"{BASE_PATH}/conversations"
    MAX_CONVERSATION_LENGTH = 10  # Maximum number of message pairs
    CONVERSATION_TIMEOUT = 1800   # 30 minutes in seconds
    MAX_CONTEXT_LENGTH = 3        # Last N message pairs to include in context

    # Indexing settings
    USE_RERANKER = True
    RERANKER_MODEL = "cross-encoder/ms-marco-MiniLM-L-6-v2"
    BM25_TOP_K = 10
    FAISS_TOP_K = 10
    FINAL_TOP_K = 3

    # Web scraping settings
    REQUEST_TIMEOUT = 30
    MAX_RETRIES = 3
    RETRY_DELAY = 2
    CHUNK_SIZE = 1000
    CHUNK_OVERLAP = 100
    RE_INDEX_DAYS = 7
    MAX_VERSIONS = 3

    # Evaluation settings
    EVALUATION_DATASET_PATH = f"{BASE_PATH}/evaluation"

    # Authentication
    # NOTE(review): demo credentials hard-coded in the notebook; replace with
    # keys loaded from a secret store before exposing this beyond a demo.
    VALID_API_KEYS = {
        "demo-api-key-123": {"user": "demo_user", "permissions": ["read", "query", "index", "chat"]},
        "eval-key-456": {"user": "evaluator", "permissions": ["read", "query", "chat", "eval"]},
    }

config = Config()
print("✅ Configuration loaded!")

# ========================================
# STEP 3: PYDANTIC MODELS
# ========================================

# Chat-specific models
class ChatMessage(BaseModel):
    """One turn of a chat conversation."""
    # Speaker of the turn; the pattern admits only "user" or "assistant".
    role: str = Field(..., pattern="^(user|assistant)$")
    # Message text, between 1 and 2000 characters.
    content: str = Field(..., min_length=1, max_length=2000)
    # Creation time; filled in with datetime.now() when the client omits it.
    timestamp: Optional[datetime] = Field(default_factory=datetime.now)

class ChatRequest(BaseModel):
    """Request body for a chat call: the messages plus retrieval options."""
    # Between 1 and 20 chat messages.
    messages: List[ChatMessage] = Field(..., min_items=1, max_items=20)
    # Optional identifier of a conversation session; None by default.
    session_id: Optional[str] = Field(default=None)
    # Retrieval switches, both on by default.
    use_dynamic_index: Optional[bool] = Field(default=True)
    use_reranker: Optional[bool] = Field(default=True)
    # Number of documents to use, limited to 1..5 (default 3).
    top_k: Optional[int] = Field(default=3, ge=1, le=5)

class ChatResponse(BaseModel):
    """Response body for a chat call."""
    # Session the reply belongs to.
    session_id: str
    # Free-form reply payload; its keys are decided by the endpoint that builds it.
    response: Dict[str, Any]
    # Conversation size as reported by the endpoint.
    conversation_length: int
    # Response creation time; defaults to datetime.now().
    timestamp: datetime = Field(default_factory=datetime.now)

# Original models (updated)
class IndexRequest(BaseModel):
    """Request body for indexing: a list of 1 to 5 absolute http(s) URLs."""
    url: List[str] = Field(..., min_items=1, max_items=5, description="URLs to index")

    @validator('url')
    def validate_urls(cls, v):
        """Reject entries that are not absolute http/https URLs.

        Raises:
            ValueError: for an entry without scheme or host, or with a scheme
                other than http/https (these URLs are fetched server-side, so
                other schemes are refused up front).
        """
        for url in v:
            parsed = urlparse(url)
            if not parsed.scheme or not parsed.netloc:
                raise ValueError(f"Invalid URL format: {url}")
            if parsed.scheme.lower() not in ("http", "https"):
                raise ValueError(f"Unsupported URL scheme (only http/https are allowed): {url}")
        return v

class IndexResponse(BaseModel):
    """Response body describing the outcome of an indexing request."""
    # Overall outcome label chosen by the endpoint.
    status: str
    # URLs that were indexed.
    indexed_url: List[str]
    # One string-to-string dict per URL that failed; None when not provided.
    failed_url: Optional[List[Dict[str, str]]] = None
    # Free-form counters and details; empty dict by default.
    metadata: Dict[str, Any] = Field(default_factory=dict)
    # Response creation time; defaults to datetime.now().
    timestamp: datetime = Field(default_factory=datetime.now)

class HealthResponse(BaseModel):
    """Response body of the health check."""
    # Overall health label.
    status: str
    # Response creation time; defaults to datetime.now().
    timestamp: datetime = Field(default_factory=datetime.now)
    # Per-component state, keyed by component name.
    components: dict
    # Required count of active conversations (no default, so it must be supplied).
    conversations_active: int

# NEW: Sources models
class SourceInfo(BaseModel):
    """Description of a single knowledge-base source."""
    # Location of the source.
    source_url: str
    # Optional title and indexing time of the source.
    title: Optional[str] = None
    indexed_at: Optional[datetime] = None
    # Number of documents attributed to the source (0 by default).
    document_count: int = 0
    # Required; described as "static or dynamic" (not enforced by a pattern).
    source_type: str = Field(..., description="static or dynamic")
    # Optional time of the last update.
    last_updated: Optional[datetime] = None

class SourcesResponse(BaseModel):
    """Response body listing the static and dynamic sources."""
    # Total number of sources, as computed by the endpoint.
    total_sources: int
    # Sources split by kind.
    static_sources: List[SourceInfo]
    dynamic_sources: List[SourceInfo]
    # Response creation time; defaults to datetime.now().
    timestamp: datetime = Field(default_factory=datetime.now)

# Evaluation models
class EvaluationRequest(BaseModel):
    """Request body for an evaluation run."""
    # Test cases as free-form dicts; the expected keys are not defined here.
    test_cases: List[Dict[str, Any]]
    # Optional conversation session to associate with the run.
    session_id: Optional[str] = None

class EvaluationResponse(BaseModel):
    """Response body of an evaluation run."""
    # Single aggregate score.
    overall_score: float
    # One free-form result dict per evaluated item.
    detailed_results: List[Dict[str, Any]]
    # Named numeric metrics.
    metrics: Dict[str, float]
    # Response creation time; defaults to datetime.now().
    timestamp: datetime = Field(default_factory=datetime.now)

print("✅ Pydantic models defined!")

# ========================================
# STEP 4: GLOBAL VARIABLES AND UTILITIES
# ========================================

# Global variables
# All start as None here; presumably assigned by a later initialization step
# (not visible in this section) — code that reads them should handle None.
embedding_model = None
# Vector stores for the pre-built (static) and URL-driven (dynamic) knowledge bases.
static_vectorstore = None
dynamic_vectorstore = None
llm = None
# BM25 indexes with their document and source lists, one set per knowledge base.
bm25_static = None
bm25_dynamic = None
bm25_docs_static = None
bm25_docs_dynamic = None
bm25_sources_static = None
bm25_sources_dynamic = None
reranker = None
text_to_docs_static = None
text_to_docs_dynamic = None
chat_prompt = None
text_splitter = None

# In-memory conversation storage
# Keyed by session id; shared with ConversationManager and pruned by
# clean_old_conversations(). Contents are lost when the kernel restarts.
active_conversations = {}

# Source URL mapping for static content
# Maps local text-file paths to the web pages they were taken from;
# format_sources() uses it to show URLs instead of file paths.
source_url_map = {
    "/content/drive/MyDrive/RAG_demo/data/genai-platform.txt": "https://huyenchip.com/2024/07/25/genai-platform.html",
    "/content/drive/MyDrive/RAG_demo/data/hallucination.txt": "https://lilianweng.github.io/posts/2024-07-07-hallucination/",
    "/content/drive/MyDrive/RAG_demo/data/quora_engineering.txt": "https://quoraengineering.quora.com/Building-Embedding-Search-at-Quora"
}

# Utility functions
def ensure_directories():
    """Create every storage directory named in ``config`` that does not exist yet.

    Existing directories are left untouched, so the call is safe to repeat.
    """
    required_paths = (
        config.DYNAMIC_BASE_PATH,
        config.DYNAMIC_FAISS_PATH,
        config.METADATA_PATH,
        config.BACKUP_PATH,
        config.CONVERSATIONS_PATH,
        config.EVALUATION_DATASET_PATH,
    )
    for path in required_paths:
        os.makedirs(path, exist_ok=True)
    logger.info("✅ Directory structure created")

def get_url_hash(url: str) -> str:
    """Return the hex MD5 digest of ``url``, used as its unique identifier."""
    digest = hashlib.md5()
    digest.update(url.encode())
    return digest.hexdigest()

def load_metadata(file_path: str) -> Dict:
    """Load a metadata mapping from a JSON file.

    Args:
        file_path: Path of the JSON file to read.

    Returns:
        The decoded JSON object. An empty dict is returned when the file does
        not exist, cannot be read or parsed, or does not hold a JSON object
        (callers use ``.get`` / ``.items`` on the result); the last three
        cases are logged.
    """
    if not os.path.exists(file_path):
        return {}

    try:
        with open(file_path, 'r', encoding='utf-8') as f:
            loaded = json.load(f)
    except (OSError, ValueError) as e:
        # ValueError covers both malformed JSON and undecodable bytes.
        logger.error(f"Error loading metadata from {file_path}: {str(e)}")
        return {}

    if not isinstance(loaded, dict):
        logger.error(
            f"Error loading metadata from {file_path}: expected a JSON object, got {type(loaded).__name__}"
        )
        return {}
    return loaded

def save_metadata(data: Dict, file_path: str):
    """Save metadata to a JSON file without risking the previous contents.

    The JSON is written to ``<file_path>.tmp`` and then moved over the target,
    so a failure while serializing or writing leaves the existing file intact.

    Args:
        data: Mapping to serialize; values JSON cannot encode natively are
            written via ``str()``.
        file_path: Destination path; its parent directory is created if needed.

    Raises:
        Exception: Whatever the filesystem or ``json.dump`` raised, after
            logging it.
    """
    tmp_path = f"{file_path}.tmp"
    try:
        parent_dir = os.path.dirname(file_path)
        if parent_dir:  # a bare filename has no directory to create
            os.makedirs(parent_dir, exist_ok=True)

        with open(tmp_path, 'w', encoding='utf-8') as f:
            json.dump(data, f, indent=2, default=str)
        os.replace(tmp_path, file_path)
    except Exception as e:
        logger.error(f"Error saving metadata to {file_path}: {str(e)}")
        # Best-effort removal of the partial temp file; the original error is
        # the one worth reporting, so a cleanup failure is ignored.
        try:
            if os.path.exists(tmp_path):
                os.remove(tmp_path)
        except OSError:
            pass
        raise

def format_sources(sources):
    """Translate source labels into URLs where a mapping is known.

    Labels that already start with ``http`` pass through unchanged; any other
    label is looked up in ``source_url_map`` and kept as-is when unmapped.
    Order and length of the input are preserved.
    """
    return [
        label if label.startswith('http') else source_url_map.get(label, label)
        for label in sources
    ]

def generate_session_id() -> str:
    """Return a fresh random (version 4) UUID in its canonical string form."""
    session_uuid = uuid.uuid4()
    return f"{session_uuid}"

def clean_old_conversations():
    """Remove conversations idle for longer than ``config.CONVERSATION_TIMEOUT``.

    ``active_conversations`` is pruned in place. A session without a
    ``last_active`` value is treated as active right now; ISO-format strings
    are accepted as well as ``datetime`` objects.
    """
    current_time = datetime.now()
    expired_sessions = []

    for session_id, conv_data in active_conversations.items():
        last_active = conv_data.get('last_active', current_time)
        if isinstance(last_active, str):
            last_active = datetime.fromisoformat(last_active)

        # total_seconds() covers the whole idle period. timedelta.seconds would
        # drop whole days, so a session idle for 24h05m would look 5 minutes old.
        idle_seconds = (current_time - last_active).total_seconds()
        if idle_seconds > config.CONVERSATION_TIMEOUT:
            expired_sessions.append(session_id)

    # Delete after the scan: the dict must not change size while iterated.
    for session_id in expired_sessions:
        del active_conversations[session_id]
        logger.info(f"Cleaned expired conversation: {session_id}")

print("✅ Utility functions defined!")

# ========================================
# STEP 5: CONVERSATION MANAGEMENT
# ========================================

class ConversationManager:
    """Session bookkeeping on top of the module-level ``active_conversations`` dict."""

    def __init__(self):
        # Shared reference (not a copy): sessions removed from
        # active_conversations elsewhere disappear here as well.
        self.conversations = active_conversations

    def get_or_create_session(self, session_id: Optional[str] = None) -> str:
        """Return ``session_id`` if it names a known session, else start a new one.

        A known session gets its ``last_active`` time refreshed. An unknown or
        missing id yields a brand-new session under a freshly generated id.
        """
        if not session_id or session_id not in self.conversations:
            # Create new session
            new_session_id = generate_session_id()
            self.conversations[new_session_id] = {
                'messages': [],
                'created_at': datetime.now(),
                'last_active': datetime.now(),
                'metadata': {}
            }
            logger.info(f"Created new conversation session: {new_session_id}")
            return new_session_id

        # Update last active time
        self.conversations[session_id]['last_active'] = datetime.now()
        return session_id

    def add_message(self, session_id: str, message: ChatMessage):
        """Append ``message`` to a session, keeping only the newest messages.

        Raises:
            ValueError: if ``session_id`` is not a known session.
        """
        if session_id not in self.conversations:
            raise ValueError(f"Session {session_id} not found")

        session = self.conversations[session_id]
        stamp = message.timestamp or datetime.now()
        session['messages'].append({
            'role': message.role,
            'content': message.content,
            'timestamp': stamp.isoformat()
        })
        session['last_active'] = datetime.now()

        # Trim conversation if too long (limit is counted in user+assistant pairs)
        message_cap = config.MAX_CONVERSATION_LENGTH * 2
        history = session['messages']
        if len(history) > message_cap:
            session['messages'] = history[-message_cap:]

    def get_conversation_context(self, session_id: str, max_pairs: int = None) -> str:
        """Render the most recent messages as ``Human:`` / ``Assistant:`` lines.

        Args:
            session_id: Session to read; unknown ids give an empty string.
            max_pairs: Number of user+assistant pairs to include; falls back to
                ``config.MAX_CONTEXT_LENGTH`` when None or 0.
        """
        if session_id not in self.conversations:
            return ""

        history = self.conversations[session_id]['messages']
        if not history:
            return ""

        # Get last N pairs (user + assistant messages)
        window = (max_pairs or config.MAX_CONTEXT_LENGTH) * 2
        rendered = []
        for turn in history[-window:]:
            speaker = "Assistant" if turn['role'] != 'user' else "Human"
            rendered.append(f"{speaker}: {turn['content']}")
        return "\n".join(rendered)

    def get_conversation_length(self, session_id: str) -> int:
        """Return how many messages the session holds (single messages, not pairs); 0 if unknown."""
        if session_id in self.conversations:
            return len(self.conversations[session_id]['messages'])
        return 0

    def save_conversation(self, session_id: str):
        """Write the session to ``<CONVERSATIONS_PATH>/<session_id>.json``; unknown ids are ignored."""
        if session_id in self.conversations:
            conv_file = os.path.join(config.CONVERSATIONS_PATH, f"{session_id}.json")
            save_metadata(self.conversations[session_id], conv_file)

conversation_manager = ConversationManager()
print("✅ Conversation manager initialized!")

# ========================================
# STEP 6: WEB SCRAPING AND INDEXING
# ========================================

class WebScraper:
    """Fetch web pages over a shared HTTP session and extract their readable text."""

    # Elements removed before text extraction because they carry boilerplate, not content.
    _NOISE_TAGS = ['script', 'style', 'nav', 'header', 'footer', 'aside', 'advertisement']

    # CSS selectors tried, in order, to locate the main content container.
    _CONTENT_SELECTORS = [
        'article', 'main', '[role="main"]',
        '.content', '.post', '.article',
        'div.container', 'div.wrapper'
    ]

    def __init__(self):
        """Create a reusable session that sends a desktop-browser User-Agent."""
        self.session = requests.Session()
        self.session.headers.update({
            'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36'
        })

    @staticmethod
    def _clean_text(node) -> str:
        """Return the node's text with every whitespace run collapsed to one space."""
        return re.sub(r'\s+', ' ', node.get_text()).strip()

    def _extract_main_text(self, soup) -> str:
        """Return text from the first content container that is not blank, else from <body>."""
        for selector in self._CONTENT_SELECTORS:
            node = soup.select_one(selector)
            if node is None:
                continue
            text = self._clean_text(node)
            # A matching but blank container must not stop the search: later
            # selectors or the <body> fallback may still hold the real content.
            if text:
                return text

        body = soup.find('body')
        return self._clean_text(body) if body is not None else ""

    def extract_content(self, url: str) -> Dict[str, Any]:
        """Download ``url`` and extract its title and main text, retrying on failure.

        Up to ``config.MAX_RETRIES`` attempts are made, sleeping
        ``config.RETRY_DELAY * 2 ** attempt`` seconds between them.

        Args:
            url: Address of the page to fetch.

        Returns:
            A dict with keys ``title``, ``content``, ``url``, ``success`` and
            ``error``. On success ``error`` is ``None``; on failure ``title`` and
            ``content`` are ``None`` and ``error`` holds the last error message.
            A dict is always returned, even when ``config.MAX_RETRIES`` is below 1.
        """
        last_error = None

        for attempt in range(config.MAX_RETRIES):
            try:
                response = self.session.get(url, timeout=config.REQUEST_TIMEOUT)
                response.raise_for_status()

                soup = BeautifulSoup(response.content, 'html.parser')

                # Remove unwanted elements
                for element in soup(self._NOISE_TAGS):
                    element.decompose()

                # Fall back to the URL when <title> is missing or blank.
                title = soup.find('title')
                title_text = (title.get_text().strip() if title else "") or url

                content_text = self._extract_main_text(soup)
                if not content_text:
                    raise ValueError("No content extracted")

                return {
                    'title': title_text,
                    'content': content_text,
                    'url': url,
                    'success': True,
                    'error': None
                }

            except Exception as e:
                last_error = str(e)
                logger.warning(f"Attempt {attempt + 1} failed for {url}: {last_error}")
                if attempt < config.MAX_RETRIES - 1:
                    time.sleep(config.RETRY_DELAY * (2 ** attempt))

        # Reached after the final failed attempt, or immediately when no attempt was allowed.
        return {
            'title': None,
            'content': None,
            'url': url,
            'success': False,
            'error': last_error or "No fetch attempts were made (MAX_RETRIES < 1)"
        }

def build_bm25_from_vectorstore(vectorstore):
    """Build a BM25 index over every chunk held in a FAISS vectorstore.

    Chunks are read from the vectorstore's in-memory docstore
    (``vectorstore.docstore._dict``, a private attribute) and tokenized by
    whitespace splitting.

    Args:
        vectorstore: Vectorstore whose ``docstore._dict`` maps ids to documents
            exposing ``page_content`` and ``metadata``.

    Returns:
        A tuple ``(bm25, docs, sources, text_to_docs)``:
        ``bm25`` is the BM25Okapi index, ``docs`` the chunk texts, ``sources``
        the matching ``metadata["source"]`` values (``"Unknown"`` when absent),
        and ``text_to_docs`` maps each chunk text to the documents carrying it.
        For an empty docstore the result is ``(None, [], [], {})``.

    Raises:
        Exception: Any error raised while building the index is logged and re-raised.
    """
    try:
        stored_docs = list(vectorstore.docstore._dict.values())
        if not stored_docs:
            # rank_bm25's BM25Okapi divides by the corpus size, so an empty
            # corpus would raise ZeroDivisionError; report "no index" instead.
            return None, [], [], {}

        docs = []
        sources = []
        text_to_docs = {}
        for doc in stored_docs:
            text = doc.page_content
            docs.append(text)
            sources.append(doc.metadata.get("source", "Unknown"))
            text_to_docs.setdefault(text, []).append(doc)

        bm25 = BM25Okapi([text.split() for text in docs])

        logger.info(f"BM25 built over {len(docs)} chunks")
        return bm25, docs, sources, text_to_docs
    except Exception as e:
        logger.error(f"Error building BM25: {str(e)}")
        raise

# Printed once the scraper class and BM25 builder above have been defined.
print("✅ Web scraping and indexing functions defined!")

# ========================================
# STEP 7: HYBRID SEARCH SYSTEM
# ========================================

def _dedupe_candidates(all_candidates):
    """Collapse candidates sharing a 100-character text prefix, keeping the higher BM25 score."""
    merged = OrderedDict()
    for candidate in all_candidates:
        key = candidate["text"].strip()[:100]  # Use first 100 chars as key
        current = merged.get(key)
        if current is None or (candidate.get("bm25_score") or 0) > (current.get("bm25_score") or 0):
            merged[key] = candidate
    return list(merged.values())


def _rank_candidates(query, candidates, use_reranker):
    """Order candidates by cross-encoder score, or by BM25 score when no reranker is used."""
    if use_reranker and reranker and candidates:
        # The reranker sees the bare question, not the context-enhanced query.
        pairs = [(query, c["text"]) for c in candidates]
        scores = reranker.predict(pairs)
        for c, s in zip(candidates, scores):
            c["rerank_score"] = float(s)
        return sorted(candidates, key=lambda x: x["rerank_score"], reverse=True)

    # BM25-scored candidates first (highest score first), then FAISS-only candidates.
    return sorted(candidates, key=lambda x: (0 if x["bm25_score"] is not None else 1, -(x["bm25_score"] or 0)))


def _resolve_candidate_document(candidate):
    """Map a candidate back to a stored Document, preferring one from the candidate's own source."""
    text = candidate["text"]
    source = candidate.get("source", "Unknown")

    matches = []
    for mapping in (text_to_docs_static, text_to_docs_dynamic):
        if mapping:
            matches.extend(mapping.get(text, []))

    # Identical text can exist under several sources; cite the one that was retrieved.
    for doc in matches:
        if doc.metadata.get("source", "Unknown") == source:
            return doc
    if matches:
        return matches[0]
    return Document(page_content=text, metadata={"source": source})


def hybrid_search(query: str, use_dynamic: bool = True, bm_k: int = 10, faiss_k: int = 10, top_k: int = 3, use_reranker: bool = True, conversation_context: str = ""):
    """Perform hybrid search on static and/or dynamic indices with conversation context.

    Args:
        query: The user's current question.
        use_dynamic: Also search the dynamic index when it is loaded.
        bm_k: Number of BM25 candidates requested per index.
        faiss_k: Number of FAISS candidates requested per index.
        top_k: Number of documents returned.
        use_reranker: Rerank merged candidates with the cross-encoder when one is loaded.
        conversation_context: Transcript added to the retrieval query so
            follow-up questions containing pronouns can still match.

    Returns:
        Up to ``top_k`` Document objects, best first; ``[]`` when no index
        produced a candidate.

    Raises:
        Exception: Any failure is logged and re-raised.
    """
    try:
        # Enhanced query with context
        enhanced_query = query
        if conversation_context:
            # Add context to help with pronouns and references
            enhanced_query = f"Context: {conversation_context}\n\nCurrent question: {query}"

        all_candidates = []

        # Search static index
        if static_vectorstore:
            all_candidates.extend(_search_single_index(
                enhanced_query, static_vectorstore, bm25_static, bm25_docs_static,
                bm25_sources_static, text_to_docs_static, bm_k, faiss_k
            ))

        # Search dynamic index
        if use_dynamic and dynamic_vectorstore:
            all_candidates.extend(_search_single_index(
                enhanced_query, dynamic_vectorstore, bm25_dynamic, bm25_docs_dynamic,
                bm25_sources_dynamic, text_to_docs_dynamic, bm_k, faiss_k
            ))

        if not all_candidates:
            return []

        candidates = _rank_candidates(query, _dedupe_candidates(all_candidates), use_reranker)

        # Return top_k Document objects
        return [_resolve_candidate_document(c) for c in candidates[:top_k]]

    except Exception as e:
        logger.error(f"Error in hybrid search: {str(e)}")
        raise

def _search_single_index(query: str, vectorstore, bm25, bm25_docs, bm25_sources, text_to_docs, bm_k: int, faiss_k: int):
    """Search a single index (static or dynamic)"""
    candidates = []

    # BM25 candidates
    if bm25 and bm25_docs:
        tokenized_q = query.split()
        bm25_scores = bm25.get_scores(tokenized_q)
        bm25_indices = sorted(range(len(bm25_scores)), key=lambda i: bm25_scores[i], reverse=True)[:bm_k]
        for idx in bm25_indices:
            txt = bm25_docs[idx]
            src = bm25_sources[idx]
            candidates.append({"text": txt, "source": src, "bm25_score": bm25_scores[idx]})

    # FAISS candidates
    if vectorstore:
        faiss_retriever = vectorstore.as_retriever(search_kwargs={"k": faiss_k})
        faiss_docs = faiss_retriever.get_relevant_documents(query)
        for d in faiss_docs:
            candidates.append({"text": d.page_content, "source": d.metadata.get("source", "Unknown"), "bm25_score": None})

    return candidates

# Printed once the hybrid search functions above have been defined.
print("✅ Hybrid search system defined!")

# ========================================
# STEP 8: EVALUATION SYSTEM
# ========================================

class RAGEvaluator:
    """Heuristic evaluator that replays scripted conversations through the RAG pipeline.

    Scores are simple word-overlap heuristics in the range 0.0-1.0; they are
    not a substitute for human or model-based judgement.
    """

    # Phrases treated as a sign that an answer makes a factual claim.
    _FACTUAL_INDICATORS = ("according to", "research shows", "studies indicate", "data reveals", "analysis found")

    # Words treated as references back to earlier conversation turns.
    _REFERENCE_WORDS = frozenset(["this", "that", "it", "they", "these", "those"])

    def __init__(self):
        # Populated by create_test_dataset() and run_evaluation() respectively.
        self.test_cases = []
        self.results = []

    def create_test_dataset(self):
        """Create the built-in test cases, save them as JSON and return them.

        The cases are written to ``test_cases.json`` under
        ``config.EVALUATION_DATASET_PATH`` and also kept on ``self.test_cases``.
        """
        test_cases = [
            # Context retention tests
            {
                "conversation": [
                    {"role": "user", "content": "What is attention mechanism in transformers?"},
                    {"role": "user", "content": "How does it help with long sequences?"}
                ],
                "expected_context": "attention mechanism",
                "test_type": "context_retention"
            },

            # Citation accuracy tests
            {
                "conversation": [
                    {"role": "user", "content": "Tell me about RAG systems"}
                ],
                "expected_citations": True,
                "test_type": "citation_accuracy"
            },

            # Topic switching tests
            {
                "conversation": [
                    {"role": "user", "content": "Explain neural networks"},
                    {"role": "user", "content": "Now tell me about cooking recipes"}
                ],
                "expected_behavior": "topic_switch",
                "test_type": "topic_switching"
            },

            # Multi-source tests
            {
                "conversation": [
                    {"role": "user", "content": "What are the latest trends in AI?"}
                ],
                "expected_sources": "mixed",
                "test_type": "multi_source"
            }
        ]

        # Save test cases
        test_file = os.path.join(config.EVALUATION_DATASET_PATH, "test_cases.json")
        save_metadata(test_cases, test_file)
        self.test_cases = test_cases
        return test_cases

    def evaluate_response_relevance(self, question: str, answer: str, context_docs: List[Document]) -> float:
        """Score how relevant the answer is to the question.

        The score is the mean of (a) the fraction of question words that also
        appear in the answer and (b) the fraction of retrieved-context words
        that appear in the answer, capped at 1.0. Returns 0.0 on any error.
        """
        try:
            # Simple keyword overlap score
            question_words = set(question.lower().split())
            answer_words = set(answer.lower().split())

            overlap = len(question_words.intersection(answer_words))
            relevance_score = overlap / len(question_words) if question_words else 0.0

            # Bonus for using context
            context_text = " ".join([doc.page_content for doc in context_docs])
            context_words = set(context_text.lower().split())
            context_usage = len(answer_words.intersection(context_words)) / len(context_words) if context_words else 0.0

            return min(1.0, (relevance_score + context_usage) / 2)

        except Exception as e:
            logger.error(f"Error evaluating relevance: {str(e)}")
            return 0.0

    def evaluate_citation_accuracy(self, answer: str, sources: List[str]) -> float:
        """Score whether an answer is backed by sources.

        Returns 0.0 when there are no sources, 1.0 when sources exist and the
        answer contains a factual-claim phrase, and 0.8 when sources exist but
        no such phrase is present.
        """
        if not sources:
            return 0.0

        # Check if answer contains factual claims (simple heuristic)
        lowered = answer.lower()
        has_factual_claims = any(indicator in lowered for indicator in self._FACTUAL_INDICATORS)

        # Sources are known to be present here, so only the claim check matters.
        return 1.0 if has_factual_claims else 0.8

    def evaluate_context_retention(self, conversation_history: List[Dict], current_answer: str) -> float:
        """Score how well the answer carries over earlier conversation content.

        Args:
            conversation_history: Messages as dicts with a ``content`` key; the
                last entry is excluded from the "previous" content.
            current_answer: The answer being scored.

        Returns:
            1.0 when there is at most one message; otherwise the fraction of
            previous words that appear in the answer, plus 0.3 when the answer
            contains a reference word, capped at 1.0.
        """
        if len(conversation_history) <= 1:
            return 1.0  # No previous context to retain

        # Look for references to previous topics
        previous_content = " ".join([msg["content"] for msg in conversation_history[:-1]])
        previous_words = set(previous_content.lower().split())
        lowered_answer = current_answer.lower()
        answer_words = set(lowered_answer.split())

        # Match whole words only: a substring test would find "it" inside
        # "with" and award the bonus to almost every answer.
        answer_tokens = set(re.findall(r"[a-z]+", lowered_answer))
        has_references = not self._REFERENCE_WORDS.isdisjoint(answer_tokens)

        # Calculate context retention score
        word_overlap = len(previous_words.intersection(answer_words)) / len(previous_words) if previous_words else 0.0
        reference_bonus = 0.3 if has_references else 0.0

        return min(1.0, word_overlap + reference_bonus)

    async def run_evaluation(self, test_cases: List[Dict]) -> Dict[str, Any]:
        """Replay each test conversation through retrieval and the LLM, then score it.

        Every user message is answered in turn; only the final answer of each
        conversation is scored. A test case that raises is logged and counted
        as failed. The detailed results are also kept on ``self.results``.

        Returns:
            A dict with ``overall_score``, ``detailed_results`` and ``metrics``
            (per-metric averages plus completed/failed counts).
        """
        results = []
        total_scores = {"relevance": [], "citation": [], "context": []}

        for i, test_case in enumerate(test_cases):
            try:
                logger.info(f"Running test case {i+1}/{len(test_cases)}")

                # Simulate conversation
                session_id = generate_session_id()
                conversation_history = []
                # Reset per test case so retrieval output never carries over from the previous one.
                docs = []
                sources = []

                for message in test_case["conversation"]:
                    # Add user message
                    user_msg = ChatMessage(role="user", content=message["content"])
                    conversation_manager.add_message(session_id, user_msg)
                    conversation_history.append({"role": "user", "content": message["content"]})

                    # Get bot response
                    context = conversation_manager.get_conversation_context(session_id)
                    docs = hybrid_search(
                        message["content"],
                        use_dynamic=True,
                        conversation_context=context
                    )

                    if docs:
                        doc_context = "\n\n".join([doc.page_content for doc in docs])
                        formatted_prompt = chat_prompt.format(
                            conversation_context=context,
                            context=doc_context,
                            question=message["content"]
                        )
                        response = llm.invoke(formatted_prompt)
                        answer = response.content.strip()
                        sources = list(set([doc.metadata.get("source", "Unknown") for doc in docs]))
                        sources = format_sources(sources)
                    else:
                        answer = "I couldn't find specific information about that in my knowledge base."
                        sources = []

                    # Add assistant message
                    assistant_message = ChatMessage(role="assistant", content=answer)
                    conversation_manager.add_message(session_id, assistant_message)
                    conversation_history.append({"role": "assistant", "content": answer})

                # Evaluate the last response
                last_question = test_case["conversation"][-1]["content"]
                last_answer = conversation_history[-1]["content"]

                # Calculate scores
                relevance_score = self.evaluate_response_relevance(last_question, last_answer, docs if docs else [])
                citation_score = self.evaluate_citation_accuracy(last_answer, sources if sources else [])
                context_score = self.evaluate_context_retention(conversation_history[:-1], last_answer)

                result = {
                    "test_case_id": i,
                    "test_type": test_case.get("test_type", "general"),
                    "question": last_question,
                    "answer": last_answer,
                    "sources": sources if sources else [],
                    "scores": {
                        "relevance": relevance_score,
                        "citation": citation_score,
                        "context_retention": context_score
                    },
                    "overall_score": (relevance_score + citation_score + context_score) / 3
                }

                results.append(result)
                total_scores["relevance"].append(relevance_score)
                total_scores["citation"].append(citation_score)
                total_scores["context"].append(context_score)

            except Exception as e:
                logger.error(f"Error in test case {i}: {str(e)}")
                continue

        # Calculate overall metrics; float() keeps numpy scalars out of the result.
        metrics = {
            "average_relevance": float(np.mean(total_scores["relevance"])) if total_scores["relevance"] else 0.0,
            "average_citation": float(np.mean(total_scores["citation"])) if total_scores["citation"] else 0.0,
            "average_context": float(np.mean(total_scores["context"])) if total_scores["context"] else 0.0,
            "overall_average": float(np.mean([np.mean(scores) for scores in total_scores.values()])) if any(total_scores.values()) else 0.0,
            "test_cases_completed": len(results),
            "test_cases_failed": len(test_cases) - len(results)
        }

        self.results = results
        return {
            "overall_score": metrics["overall_average"],
            "detailed_results": results,
            "metrics": metrics
        }

# Module-level RAGEvaluator instance, created when this cell runs.
evaluator = RAGEvaluator()
print("✅ Evaluation system initialized!")

# ========================================
# STEP 9: MODEL INITIALIZATION
# ========================================

def initialize_models():
    """Initialize all models and components and publish them as module globals.

    Loads the embedding model, text splitter, static and dynamic FAISS indices
    (each with a matching BM25 index), the Gemini chat model, the chat prompt
    template and, when ``config.USE_RERANKER`` is set, the cross-encoder reranker.

    A missing or unreadable FAISS index is not fatal: that index's globals are
    left as ``None`` and a notice is printed. The reranker global is ``None``
    when reranking is disabled.

    Raises:
        Exception: Any other failure (for example a missing ``GOOGLE_API_KEY``
            environment variable) is printed and re-raised.
    """
    global embedding_model, static_vectorstore, dynamic_vectorstore, llm
    global bm25_static, bm25_dynamic, bm25_docs_static, bm25_docs_dynamic
    global bm25_sources_static, bm25_sources_dynamic, reranker
    global text_to_docs_static, text_to_docs_dynamic, chat_prompt, text_splitter

    try:
        print("🔄 Loading models...")
        ensure_directories()
        clean_old_conversations()

        # Load embedding model
        embedding_model = HuggingFaceEmbeddings(model_name="all-MiniLM-L6-v2")
        print("✅ Embedding model loaded")

        # Initialize text splitter
        text_splitter = RecursiveCharacterTextSplitter(
            chunk_size=config.CHUNK_SIZE,
            chunk_overlap=config.CHUNK_OVERLAP,
            length_function=len,
            separators=["\n\n", "\n", " ", ""]
        )
        print("✅ Text splitter initialized")

        # Reset first so a failed load leaves defined, empty globals instead of
        # unassigned names or stale indices from an earlier call.
        static_vectorstore = None
        bm25_static = bm25_docs_static = bm25_sources_static = text_to_docs_static = None

        # Load static FAISS vectorstore
        # NOTE(review): allow_dangerous_deserialization=True unpickles the index
        # file; only load indices written by a trusted process.
        try:
            static_vectorstore = FAISS.load_local(
                config.STATIC_FAISS_PATH,
                embedding_model,
                allow_dangerous_deserialization=True
            )
            bm25_static, bm25_docs_static, bm25_sources_static, text_to_docs_static = build_bm25_from_vectorstore(static_vectorstore)
            print("✅ Static FAISS vectorstore loaded")
        except Exception as e:
            print(f"⚠️ Static FAISS not found: {str(e)}")

        dynamic_vectorstore = None
        bm25_dynamic = bm25_docs_dynamic = bm25_sources_dynamic = text_to_docs_dynamic = None

        # Load dynamic FAISS vectorstore (if exists)
        try:
            dynamic_vectorstore = FAISS.load_local(
                config.DYNAMIC_FAISS_PATH,
                embedding_model,
                allow_dangerous_deserialization=True
            )
            bm25_dynamic, bm25_docs_dynamic, bm25_sources_dynamic, text_to_docs_dynamic = build_bm25_from_vectorstore(dynamic_vectorstore)
            print("✅ Dynamic FAISS vectorstore loaded")
        except Exception as e:
            print(f"ℹ️ Dynamic FAISS not found (will create on first index): {str(e)}")

        # Initialize LLM
        llm = ChatGoogleGenerativeAI(
            model="gemini-1.5-flash",
            temperature=0,
            google_api_key=os.environ["GOOGLE_API_KEY"]
        )
        print("✅ LLM initialized")

        # Create chat prompt template
        chat_prompt_template = """
You are a helpful AI assistant with access to multiple knowledge sources. You can maintain context across conversations and provide accurate citations.

Previous conversation context:
{conversation_context}

Current context from knowledge base:
{context}

Current question:
{question}

Instructions:
1. Use the conversation context to understand references like "it", "this", "that", etc.
2. Provide accurate answers based on the knowledge base context
3. If you reference specific information, it should be from the provided context
4. If the answer is not in the knowledge base, say: "I couldn't find specific information about that in my knowledge base."
5. Be conversational and natural in your responses
6. Handle follow-up questions by connecting them to previous context when appropriate

Answer:
        """
        chat_prompt = PromptTemplate(
            input_variables=["conversation_context", "context", "question"],
            template=chat_prompt_template
        )
        print("✅ Chat prompt template created")

        # Load reranker; the global is always assigned so later checks never
        # meet an undefined name when reranking is disabled.
        reranker = None
        if config.USE_RERANKER:
            reranker = CrossEncoder(config.RERANKER_MODEL)
            print("✅ Reranker loaded")

        print("🎉 All models initialized successfully!")

    except Exception as e:
        print(f"❌ Error initializing models: {str(e)}")
        raise

# ========================================
# STEP 10: FASTAPI APPLICATION
# ========================================

# FastAPI application object; the route decorators below register endpoints on it.
app = FastAPI(
    title="Conversational RAG API with Dynamic Indexing and Sources",
    description="Hybrid Retrieval-Augmented Generation API with Conversation Support, BM25 + FAISS, Dynamic Web Indexing, and Source Management",
    version="3.1.0"
)

# NOTE(review): every origin, method and header is allowed while
# allow_credentials=True. That is very permissive for an API guarded by bearer
# keys; confirm it is intended before exposing this outside a demo.
app.add_middleware(
    CORSMiddleware,
    allow_origins=["*"],
    allow_credentials=True,
    allow_methods=["*"],
    allow_headers=["*"],
)

# Bearer-token scheme injected into verify_api_key through Depends(security).
security = HTTPBearer()

# Authentication
async def verify_api_key(credentials: HTTPAuthorizationCredentials = Depends(security)):
    """Verify the API key from the Authorization header.

    Args:
        credentials: Bearer credentials extracted by the ``security`` dependency.

    Returns:
        The entry stored for the key in ``config.VALID_API_KEYS``.

    Raises:
        HTTPException: 401 when the key is not in ``config.VALID_API_KEYS``.
    """
    import hashlib

    api_key = credentials.credentials
    if api_key not in config.VALID_API_KEYS:
        # Never write key material to the log: a rejected key may be a near-miss
        # of a real one. A short hash still lets repeated attempts be correlated.
        fingerprint = hashlib.sha256(api_key.encode("utf-8")).hexdigest()[:8]
        logger.warning(f"Invalid API key attempted (sha256 fingerprint: {fingerprint})")
        raise HTTPException(
            status_code=status.HTTP_401_UNAUTHORIZED,
            detail="Invalid API key",
            headers={"WWW-Authenticate": "Bearer"},
        )
    return config.VALID_API_KEYS[api_key]

# ========================================
# STEP 11: API ENDPOINTS
# ========================================

@app.get("/health", response_model=HealthResponse)
async def health_check():
    """Report which components are loaded and how many conversations are active.

    Old conversations are cleaned up as a side effect of every call. The
    service counts as healthy whenever the LLM is loaded.
    """
    clean_old_conversations()  # Cleanup old conversations on health check

    component_flags = {
        "static_vectorstore": static_vectorstore is not None,
        "dynamic_vectorstore": dynamic_vectorstore is not None,
        "llm": llm is not None,
        "bm25_static": bm25_static is not None,
        "bm25_dynamic": bm25_dynamic is not None,
        "reranker": (reranker is not None) if config.USE_RERANKER else "disabled",
        "conversation_manager": True,
    }

    # Named so it does not shadow the `status` module used by other endpoints.
    if llm is not None:
        overall_state = "healthy"
    else:
        overall_state = "unhealthy"

    return HealthResponse(
        status=overall_state,
        components=component_flags,
        conversations_active=len(active_conversations),
    )

@app.post("/api/v1/chat", response_model=ChatResponse)
async def chat_with_rag(
    request: ChatRequest,
    user_info: dict = Depends(verify_api_key)
):
    """Main conversational RAG endpoint.

    Validates the request, records the supplied messages in the session,
    retrieves supporting chunks with hybrid search, asks the LLM for an answer
    and returns it together with its sources.

    Raises:
        HTTPException: 403 when the caller lacks the ``chat`` permission, 400
            when ``messages`` is empty or its last entry is not a user message,
            500 for any unexpected failure.
    """
    try:
        # Check permissions
        if "chat" not in user_info.get("permissions", []):
            raise HTTPException(
                status_code=status.HTTP_403_FORBIDDEN,
                detail="Insufficient permissions for chat"
            )

        logger.info(f"Chat request from user {user_info['user']}")

        # Validate before touching any session state, so a rejected request
        # neither creates a session nor leaves partial history behind.
        if not request.messages:
            raise HTTPException(
                status_code=status.HTTP_400_BAD_REQUEST,
                detail="At least one message is required"
            )

        # Get current question
        current_message = request.messages[-1]
        if current_message.role != "user":
            raise HTTPException(
                status_code=status.HTTP_400_BAD_REQUEST,
                detail="Last message must be from user"
            )

        # Get or create session
        session_id = conversation_manager.get_or_create_session(request.session_id)

        # Add conversation history to session (except the last message which is the current question)
        # NOTE(review): a client that resends its full history with an existing
        # session_id appends that history again on every call - confirm the
        # intended client contract.
        for message in request.messages[:-1]:
            conversation_manager.add_message(session_id, message)

        # Add current question to conversation
        conversation_manager.add_message(session_id, current_message)

        # Get conversation context for retrieval and generation
        conversation_context = conversation_manager.get_conversation_context(session_id)

        # Perform hybrid search with conversation context
        docs = hybrid_search(
            current_message.content,
            use_dynamic=request.use_dynamic_index,
            bm_k=config.BM25_TOP_K,
            faiss_k=config.FAISS_TOP_K,
            top_k=request.top_k,
            use_reranker=request.use_reranker and config.USE_RERANKER,
            conversation_context=conversation_context
        )

        if not docs:
            answer = "I couldn't find specific information about that in my knowledge base."
            sources = []
        else:
            # Create context and query LLM
            doc_context = "\n\n".join([doc.page_content for doc in docs])
            formatted_prompt = chat_prompt.format(
                conversation_context=conversation_context,
                context=doc_context,
                question=current_message.content
            )

            response = llm.invoke(formatted_prompt)
            answer = response.content.strip()

            # Get unique sources, keeping retrieval order
            sources = list(OrderedDict.fromkeys(doc.metadata.get("source", "Unknown") for doc in docs))
            sources = format_sources(sources)

        # Add assistant response to conversation
        assistant_message = ChatMessage(role="assistant", content=answer)
        conversation_manager.add_message(session_id, assistant_message)

        # Save conversation periodically
        if conversation_manager.get_conversation_length(session_id) % 4 == 0:  # Every 4 messages
            conversation_manager.save_conversation(session_id)

        # UPDATED RESPONSE STRUCTURE WITH ENHANCED SOURCES
        response_data = {
            "answer": {
                "content": answer,
                "role": "assistant"
            },
            "sources": sources,  # Make sources more prominent
            "citations": sources,  # Keep backward compatibility
            "retrieved_documents": [
                {
                    "content": doc.page_content[:200] + "..." if len(doc.page_content) > 200 else doc.page_content,
                    "source": doc.metadata.get("source", "Unknown"),
                    "title": doc.metadata.get("title", ""),
                    "chunk_id": doc.metadata.get("chunk_id", 0)
                } for doc in docs
            ] if docs else [],
            "metadata": {
                "num_docs_retrieved": len(docs),
                "num_sources": len(sources),
                "reranker_used": request.use_reranker and config.USE_RERANKER,
                "dynamic_index_used": request.use_dynamic_index,
                "conversation_length": conversation_manager.get_conversation_length(session_id),
                "user": user_info["user"]
            }
        }

        return ChatResponse(
            session_id=session_id,
            response=response_data,
            conversation_length=conversation_manager.get_conversation_length(session_id)
        )

    except HTTPException:
        raise
    except Exception as e:
        logger.error(f"Error processing chat: {str(e)}")
        raise HTTPException(
            status_code=status.HTTP_500_INTERNAL_SERVER_ERROR,
            detail=f"Internal server error: {str(e)}"
        )

# NEW: Sources management endpoints
@app.get("/api/v1/sources", response_model=SourcesResponse)
async def get_sources(user_info: dict = Depends(verify_api_key)):
    """Get all available sources in the system.

    Chunks in the static and dynamic vectorstores are grouped by their
    ``source`` metadata; each group is returned with its chunk count, sorted
    by count in descending order.

    Raises:
        HTTPException: 403 when the caller lacks the ``read`` permission, 500
            for any unexpected failure.
    """
    try:
        # Check permissions
        if "read" not in user_info.get("permissions", []):
            raise HTTPException(
                status_code=status.HTTP_403_FORBIDDEN,
                detail="Insufficient permissions to view sources"
            )

        static_sources = []
        dynamic_sources = []

        # Get static sources
        if static_vectorstore:
            static_source_map = {}
            for doc in static_vectorstore.docstore._dict.values():
                source = doc.metadata.get("source", "Unknown")

                # The title of the first chunk seen for a source is kept.
                entry = static_source_map.setdefault(source, {
                    "count": 0,
                    "title": doc.metadata.get("title", "")
                })
                entry["count"] += 1

            for source, info in static_source_map.items():
                # Convert file paths to URLs using source_url_map
                display_source = source_url_map.get(source, source)
                static_sources.append(SourceInfo(
                    source_url=display_source,
                    title=info["title"],
                    document_count=info["count"],
                    source_type="static",
                    indexed_at=None,  # Static sources don't have index timestamp
                    last_updated=None
                ))

        # Get dynamic sources
        if dynamic_vectorstore:
            dynamic_source_map = {}
            for doc in dynamic_vectorstore.docstore._dict.values():
                source = doc.metadata.get("source", "Unknown")
                indexed_at = doc.metadata.get("indexed_at")

                entry = dynamic_source_map.setdefault(source, {
                    "count": 0,
                    "title": doc.metadata.get("title", ""),
                    "indexed_at": indexed_at
                })
                entry["count"] += 1

                # Keep the most recent indexed_at
                if indexed_at and (not entry["indexed_at"] or indexed_at > entry["indexed_at"]):
                    entry["indexed_at"] = indexed_at

            for source, info in dynamic_source_map.items():
                indexed_datetime = None
                if info["indexed_at"]:
                    try:
                        indexed_datetime = datetime.fromisoformat(info["indexed_at"]) if isinstance(info["indexed_at"], str) else info["indexed_at"]
                    except (TypeError, ValueError):
                        # An unparseable timestamp is reported as missing.
                        indexed_datetime = None

                dynamic_sources.append(SourceInfo(
                    source_url=source,
                    title=info["title"],
                    document_count=info["count"],
                    source_type="dynamic",
                    indexed_at=indexed_datetime,
                    last_updated=indexed_datetime
                ))

        # Sort sources by document count (descending)
        static_sources.sort(key=lambda x: x.document_count, reverse=True)
        dynamic_sources.sort(key=lambda x: x.document_count, reverse=True)

        return SourcesResponse(
            total_sources=len(static_sources) + len(dynamic_sources),
            static_sources=static_sources,
            dynamic_sources=dynamic_sources
        )

    except HTTPException:
        # Let deliberate HTTP errors (such as the 403 above) through unchanged;
        # the generic handler below would turn them into 500s.
        raise
    except Exception as e:
        logger.error(f"Error retrieving sources: {str(e)}")
        raise HTTPException(
            status_code=status.HTTP_500_INTERNAL_SERVER_ERROR,
            detail=f"Failed to retrieve sources: {str(e)}"
        )

@app.get("/api/v1/sources/{source_hash}")
async def get_source_details(
    source_hash: str,
    user_info: dict = Depends(verify_api_key)
):
    """Get detailed information about a specific source.

    ``source_hash`` is matched against ``get_url_hash(source)`` of every stored
    source, and otherwise against the end of the source string. It is resolved
    to exactly one source (an exact hash match wins over a suffix match) so the
    returned chunks never mix several sources.

    Returns:
        A dict with ``source_url``, ``title``, ``chunks``, ``source_type`` and
        ``indexed_at`` for the matched source.

    Raises:
        HTTPException: 403 when the caller lacks the ``read`` permission, 404
            when no source matches.
    """
    # Check permissions
    if "read" not in user_info.get("permissions", []):
        raise HTTPException(
            status_code=status.HTTP_403_FORBIDDEN,
            detail="Insufficient permissions to view source details"
        )

    stores = [(static_vectorstore, "static"), (dynamic_vectorstore, "dynamic")]

    # Pass 1: resolve the identifier to a single source string.
    exact_match = None
    suffix_match = None
    seen_sources = set()
    for vectorstore, _ in stores:
        if not vectorstore:
            continue
        for doc in vectorstore.docstore._dict.values():
            source = doc.metadata.get("source", "")
            if source in seen_sources:
                continue  # every chunk of a source gives the same answer
            seen_sources.add(source)

            if get_url_hash(source) == source_hash:
                exact_match = source
                break
            if suffix_match is None and source.endswith(source_hash):
                suffix_match = source
        if exact_match is not None:
            break

    target_source = exact_match if exact_match is not None else suffix_match
    if target_source is None:
        raise HTTPException(
            status_code=status.HTTP_404_NOT_FOUND,
            detail="Source not found"
        )

    # Pass 2: collect the chunks that belong to that one source.
    source_details = {
        "source_url": None,
        "title": None,
        "chunks": [],
        "source_type": None,
        "indexed_at": None
    }

    for vectorstore, source_type in stores:
        if not vectorstore:
            continue
        for doc in vectorstore.docstore._dict.values():
            if doc.metadata.get("source", "") != target_source:
                continue

            # Header fields take the values of the last matching chunk.
            source_details.update({
                "source_url": source_url_map.get(target_source, target_source),
                "title": doc.metadata.get("title", ""),
                "source_type": source_type,
                "indexed_at": doc.metadata.get("indexed_at")
            })

            source_details["chunks"].append({
                "chunk_id": doc.metadata.get("chunk_id", 0),
                "content_preview": doc.page_content[:300] + "..." if len(doc.page_content) > 300 else doc.page_content,
                "content_length": len(doc.page_content)
            })

    return source_details

@app.post("/api/v1/index", response_model=IndexResponse)
async def index_urls(
    request: IndexRequest,
    user_info: dict = Depends(verify_api_key)
):
    """Index URLs into the dynamic vector database.

    For every URL in ``request.url`` the page is scraped, split into chunks
    and wrapped in ``Document`` objects. All new documents are then added to
    (or used to create) the dynamic FAISS index, the index is saved to
    ``config.DYNAMIC_FAISS_PATH`` and the dynamic BM25 structures are rebuilt.

    Returns:
        IndexResponse whose ``status`` is ``"success"``, ``"partial_success"``
        or ``"failed"``, with per-URL failures listed in ``failed_url``.

    Raises:
        HTTPException: 403 without the ``index`` permission; 500 if the vector
            database update fails or an unexpected error occurs.
    """
    global dynamic_vectorstore, bm25_dynamic, bm25_docs_dynamic, bm25_sources_dynamic, text_to_docs_dynamic

    # Check permissions
    if "index" not in user_info.get("permissions", []):
        raise HTTPException(
            status_code=status.HTTP_403_FORBIDDEN,
            detail="Insufficient permissions for indexing"
        )

    logger.info(f"Indexing request from {user_info['user']}: {len(request.url)} URLs")

    # NOTE(review): the URLs are fetched server-side; confirm the request
    # validator blocks internal/private hosts if this API is exposed publicly.
    scraper = WebScraper()
    indexed_urls = []
    failed_urls = []

    try:
        new_documents = []

        for url in request.url:
            try:
                logger.info(f"Processing URL: {url}")

                # Extract content
                result = scraper.extract_content(url)

                if not result['success']:
                    failed_urls.append({
                        "url": url,
                        "error": result['error'],
                        "error_type": "EXTRACTION_FAILED"
                    })
                    continue

                # Split content into chunks
                chunks = text_splitter.split_text(result['content'])

                # One timestamp and one hash per URL so that every chunk of the
                # same page carries identical provenance metadata.
                indexed_at = datetime.now().isoformat()
                url_hash = get_url_hash(url)

                # Create documents
                for i, chunk in enumerate(chunks):
                    doc = Document(
                        page_content=chunk,
                        metadata={
                            "source": url,
                            "title": result['title'],
                            "chunk_id": i,
                            "total_chunks": len(chunks),
                            "indexed_at": indexed_at,
                            "url_hash": url_hash
                        }
                    )
                    new_documents.append(doc)

                indexed_urls.append(url)
                logger.info(f"✅ Successfully processed {url} - {len(chunks)} chunks")

            except Exception as e:
                # A failure on one URL must not abort the rest of the batch.
                logger.error(f"Error processing {url}: {str(e)}")
                failed_urls.append({
                    "url": url,
                    "error": str(e),
                    "error_type": "PROCESSING_ERROR"
                })

        # Update vector database if we have new documents
        if new_documents:
            try:
                if dynamic_vectorstore is None:
                    # Create new FAISS index
                    dynamic_vectorstore = FAISS.from_documents(new_documents, embedding_model)
                    logger.info("✅ Created new dynamic FAISS index")
                else:
                    # Add to existing index
                    dynamic_vectorstore.add_documents(new_documents)
                    logger.info(f"✅ Added {len(new_documents)} documents to existing index")

                # Save updated index
                dynamic_vectorstore.save_local(config.DYNAMIC_FAISS_PATH)

                # Rebuild BM25 and mappings
                bm25_dynamic, bm25_docs_dynamic, bm25_sources_dynamic, text_to_docs_dynamic = build_bm25_from_vectorstore(dynamic_vectorstore)

                logger.info(f"✅ Dynamic index updated")

            except Exception as e:
                logger.error(f"Error updating vector database: {str(e)}")
                raise HTTPException(
                    status_code=status.HTTP_500_INTERNAL_SERVER_ERROR,
                    detail=f"Failed to update vector database: {str(e)}"
                )

        # Prepare response
        response_status = "success" if indexed_urls else "failed"
        if indexed_urls and failed_urls:
            response_status = "partial_success"

        metadata = {
            "total_requested": len(request.url),
            "successfully_indexed": len(indexed_urls),
            "failed": len(failed_urls),
            "new_documents_added": len(new_documents),
            "user": user_info["user"]
        }

        return IndexResponse(
            status=response_status,
            indexed_url=indexed_urls,
            failed_url=failed_urls if failed_urls else None,
            metadata=metadata
        )

    except HTTPException:
        # Already a well-formed API error (e.g. the vector-database failure
        # above); pass it through instead of re-wrapping it below.
        raise
    except Exception as e:
        logger.error(f"Critical error in indexing: {str(e)}")
        raise HTTPException(
            status_code=status.HTTP_500_INTERNAL_SERVER_ERROR,
            detail=f"Indexing failed: {str(e)}"
        )

@app.post("/api/v1/evaluate", response_model=EvaluationResponse)
async def evaluate_system(
    request: EvaluationRequest,
    user_info: dict = Depends(verify_api_key)
):
    """Run automated evaluation of the RAG system.

    Uses the test cases supplied in the request; when that list is empty the
    evaluator's own default dataset is used instead.

    Raises:
        HTTPException: 403 without the ``eval`` permission, 500 if the
            evaluation run fails.
    """
    granted_permissions = user_info.get("permissions", [])
    if "eval" not in granted_permissions:
        raise HTTPException(
            status_code=status.HTTP_403_FORBIDDEN,
            detail="Insufficient permissions for evaluation"
        )

    logger.info(f"Evaluation request from user {user_info['user']}")

    try:
        # An empty/missing list falls back to the built-in dataset
        if request.test_cases:
            test_cases = request.test_cases
        else:
            test_cases = evaluator.create_test_dataset()

        outcome = await evaluator.run_evaluation(test_cases)

        return EvaluationResponse(
            overall_score=outcome["overall_score"],
            detailed_results=outcome["detailed_results"],
            metrics=outcome["metrics"],
        )

    except Exception as exc:
        logger.error(f"Error running evaluation: {str(exc)}")
        raise HTTPException(
            status_code=status.HTTP_500_INTERNAL_SERVER_ERROR,
            detail=f"Evaluation failed: {str(exc)}"
        )

@app.get("/api/v1/conversations/{session_id}")
async def get_conversation(
    session_id: str,
    user_info: dict = Depends(verify_api_key)
):
    """Return the stored history of one in-memory conversation.

    Raises:
        HTTPException: 404 when ``session_id`` is not an active conversation.
    """
    # NOTE(review): any valid API key can read any session; there is no
    # per-user ownership or permission check here.
    if session_id in active_conversations:
        record = active_conversations[session_id]
        history = record["messages"]
        return {
            "session_id": session_id,
            "messages": history,
            "created_at": record["created_at"],
            "last_active": record["last_active"],
            "message_count": len(history)
        }

    raise HTTPException(
        status_code=status.HTTP_404_NOT_FOUND,
        detail="Conversation not found"
    )

@app.delete("/api/v1/conversations/{session_id}")
async def delete_conversation(
    session_id: str,
    user_info: dict = Depends(verify_api_key)
):
    """Remove a conversation from the in-memory store.

    Raises:
        HTTPException: 404 when ``session_id`` is not an active conversation.
    """
    if session_id in active_conversations:
        del active_conversations[session_id]
        return {"status": "deleted", "session_id": session_id}

    raise HTTPException(
        status_code=status.HTTP_404_NOT_FOUND,
        detail="Conversation not found"
    )

@app.get("/api/v1/conversations")
async def list_conversations(user_info: dict = Depends(verify_api_key)):
    """Summarise every active conversation after purging expired ones."""
    clean_old_conversations()

    def _last_message_preview(messages):
        # First 100 characters of the newest message, or "" for an empty history
        return messages[-1]["content"][:100] if messages else ""

    summaries = [
        {
            "session_id": session_id,
            "message_count": len(conv_data["messages"]),
            "created_at": conv_data["created_at"],
            "last_active": conv_data["last_active"],
            "last_message_preview": _last_message_preview(conv_data["messages"])
        }
        for session_id, conv_data in active_conversations.items()
    ]

    return {
        "active_conversations": len(summaries),
        "conversations": summaries
    }

print("✅ API endpoints defined!")

# ========================================
# STEP 12: DEMO AND TESTING FUNCTIONS
# ========================================

def create_demo_test_cases():
    """Build the demo scenarios, write them to disk as JSON and return them.

    Each scenario is a dict with ``name``, ``conversation`` (a list of user
    messages) and ``expected_behavior``. The file is written to
    ``demo_test_cases.json`` under ``config.EVALUATION_DATASET_PATH``.
    """
    def user_turns(*prompts):
        # Every demo message is sent with the "user" role
        return [{"role": "user", "content": prompt} for prompt in prompts]

    scenarios = (
        (
            "Context Retention Test",
            user_turns(
                "What is a transformer in machine learning?",
                "How does its attention mechanism work?",
                "What are the advantages of this approach?",
            ),
            "Should understand that 'its' refers to transformer and 'this approach' refers to attention mechanism",
        ),
        (
            "Multi-Source Retrieval Test",
            user_turns(
                "Tell me about recent developments in AI and their impact on software engineering",
            ),
            "Should pull from both static and dynamic sources, provide citations",
        ),
        (
            "Topic Switching Test",
            user_turns(
                "Explain neural networks",
                "Actually, let's talk about database indexing instead",
            ),
            "Should cleanly switch topics without carrying over irrelevant context",
        ),
        (
            "Citation Accuracy Test",
            user_turns(
                "What does research say about hallucinations in language models?",
            ),
            "Should provide specific citations for research claims",
        ),
    )

    demo_cases = [
        {"name": name, "conversation": conversation, "expected_behavior": expected}
        for name, conversation, expected in scenarios
    ]

    # Persist the scenarios next to the other evaluation data
    demo_file = os.path.join(config.EVALUATION_DATASET_PATH, "demo_test_cases.json")
    save_metadata(demo_cases, demo_file)

    print("✅ Demo test cases created!")
    print("📁 Location:", demo_file)
    return demo_cases

# ========================================
# STEP 13: SERVER STARTUP
# ========================================

def start_server(port: int = 8000):
    """Start the FastAPI server behind an ngrok tunnel.

    Opens an ngrok tunnel to ``port``, prints the public URL together with
    example requests, then runs uvicorn on ``0.0.0.0:port``. The call to
    ``uvicorn.run`` is the last statement, so this function only returns once
    the server stops.

    Args:
        port: Local TCP port to serve on and to tunnel (default 8000).
    """
    # Create ngrok tunnel
    tunnel = ngrok.connect(port)
    # Depending on the ngrok client version, connect() yields either a plain
    # URL string or a tunnel object exposing `.public_url`. Formatting the
    # object directly would not produce a usable link, so normalise it here.
    public_url = getattr(tunnel, "public_url", tunnel)

    print("=" * 60)
    print("🚀 CONVERSATIONAL RAG SYSTEM WITH SOURCES API LAUNCHED!")
    print("=" * 60)
    print(f"🌐 Public URL: {public_url}")
    print(f"📚 API Documentation: {public_url}/docs")
    # NOTE(review): the keys below are hard-coded in this banner and will not
    # reflect changes made to the configured API keys.
    print(f"🔑 API Keys:")
    print(f"   • demo-api-key-123 (full access)")
    print(f"   • eval-key-456 (evaluation access)")
    print("")
    print("🎯 MAIN ENDPOINTS:")
    print(f"  • POST {public_url}/api/v1/chat - Chat with the system")
    print(f"  • POST {public_url}/api/v1/index - Index new URLs")
    print(f"  • GET  {public_url}/api/v1/sources - List all sources")
    print(f"  • GET  {public_url}/api/v1/sources/{{hash}} - Get source details")
    print(f"  • POST {public_url}/api/v1/evaluate - Run automated evaluation")
    print(f"  • GET  {public_url}/api/v1/conversations - List conversations")
    print(f"  • GET  {public_url}/health - Health check")
    print("")
    print("💡 EXAMPLE USAGE:")
    print(f"""
# Start a conversation:
curl -X POST "{public_url}/api/v1/chat" \\
  -H "Authorization: Bearer demo-api-key-123" \\
  -H "Content-Type: application/json" \\
  -d '{{
    "messages": [
      {{"role": "user", "content": "What is attention mechanism?"}}
    ]
  }}'

# Get all sources:
curl -X GET "{public_url}/api/v1/sources" \\
  -H "Authorization: Bearer demo-api-key-123"

# Index new content:
curl -X POST "{public_url}/api/v1/index" \\
  -H "Authorization: Bearer demo-api-key-123" \\
  -H "Content-Type: application/json" \\
  -d '{{"url": ["https://example.com/ai-article"]}}'

# Run evaluation:
curl -X POST "{public_url}/api/v1/evaluate" \\
  -H "Authorization: Bearer eval-key-456" \\
  -H "Content-Type: application/json" \\
  -d '{{"test_cases": []}}'
    """)
    print("=" * 60)
    print("🔥 Your conversational RAG system with sources API is ready!")
    print("✨ New Features: Complete source management and tracking")
    print("🎓 Perfect for your final data science project!")
    print("=" * 60)

    # Start the server
    uvicorn.run(app, host="0.0.0.0", port=port)

# ========================================
# STEP 14: LAUNCH THE SYSTEM
# ========================================

# Models and indexes must be initialised before the server can answer requests
print("🔄 Initializing models and components...")
initialize_models()

# Write the demo scenarios to disk
print("\n📝 Creating demo test cases...")
create_demo_test_cases()

# Announce the launch, then start serving; start_server() ends in uvicorn.run(),
# so nothing placed after it runs until the server stops.
print(
    "\n🚀 Starting the conversational RAG system...",
    "⏳ This will open ngrok tunnel and start the FastAPI server...",
    sep="\n",
)
start_server()

## AWS Deployment of the RAG Pipeline

In [None]:
# Setup imports and basic configuration
import os
import json
import time
import shutil
from datetime import datetime, timedelta
from typing import List, Optional, Dict, Any, Union
from urllib.parse import urlparse, urljoin
import requests
from bs4 import BeautifulSoup
import uvicorn
from fastapi import FastAPI, HTTPException, Depends, status, Body
from fastapi.security import HTTPBearer, HTTPAuthorizationCredentials
from fastapi.middleware.cors import CORSMiddleware
from pydantic import BaseModel, Field, validator
import logging
from collections import OrderedDict
import hashlib
import re
import uuid
import pandas as pd
import numpy as np
from sklearn.metrics.pairwise import cosine_similarity
import asyncio
from threading import Thread

# Import the RAG components
from rank_bm25 import BM25Okapi
from sentence_transformers import CrossEncoder
from langchain_community.embeddings import HuggingFaceEmbeddings
from langchain_community.vectorstores import FAISS
from langchain_google_genai import ChatGoogleGenerativeAI
from langchain.prompts import PromptTemplate
from langchain.docstore.document import Document
from langchain.text_splitter import RecursiveCharacterTextSplitter

# Configure logging
logging.basicConfig(level=logging.INFO)
logger = logging.getLogger(__name__)

print("✅ All imports successful!")

# ========================================
# STEP 2: CONFIGURATION AND API KEYS
# ========================================

# Configuration for AWS deployment
class Config:
    """Static settings for the AWS deployment of the RAG service.

    Every filesystem location is derived from ``BASE_PATH`` so the whole tree
    can be relocated by changing a single value. ``BASE_PATH`` defaults to the
    EC2 location and can be overridden with the ``RAG_BASE_PATH`` environment
    variable.
    """

    # Base path for AWS EC2 (override with RAG_BASE_PATH)
    BASE_PATH = os.environ.get("RAG_BASE_PATH", "/home/ubuntu/rag-knowledge-base/rag_demo")

    # Static index (existing)
    STATIC_FAISS_PATH = f"{BASE_PATH}/faiss_index"

    # Dynamic index (new)
    DYNAMIC_BASE_PATH = f"{BASE_PATH}/dynamic_index"
    DYNAMIC_FAISS_PATH = f"{DYNAMIC_BASE_PATH}/faiss_index"
    METADATA_PATH = f"{DYNAMIC_BASE_PATH}/metadata"
    BACKUP_PATH = f"{DYNAMIC_BASE_PATH}/backups"

    # Conversation settings
    CONVERSATIONS_PATH = f"{BASE_PATH}/conversations"
    MAX_CONVERSATION_LENGTH = 10  # Maximum number of message pairs
    CONVERSATION_TIMEOUT = 1800   # 30 minutes in seconds
    MAX_CONTEXT_LENGTH = 3        # Last N message pairs to include in context

    # Indexing settings
    USE_RERANKER = True
    RERANKER_MODEL = "cross-encoder/ms-marco-MiniLM-L-6-v2"
    BM25_TOP_K = 10
    FAISS_TOP_K = 10
    FINAL_TOP_K = 3

    # Web scraping settings
    REQUEST_TIMEOUT = 30  # passed as the `timeout` of each HTTP request
    MAX_RETRIES = 3       # fetch attempts per URL
    RETRY_DELAY = 2       # base delay in seconds; doubled after each failed attempt
    CHUNK_SIZE = 1000
    CHUNK_OVERLAP = 100
    RE_INDEX_DAYS = 7
    MAX_VERSIONS = 3

    # Evaluation settings
    EVALUATION_DATASET_PATH = f"{BASE_PATH}/evaluation"

    # Authentication - YOU CAN CHANGE THESE KEYS
    # NOTE(review): demo credentials are committed in source; load real keys
    # from the environment or a secrets store before any public deployment.
    VALID_API_KEYS = {
        "demo-api-key-123": {"user": "demo_user", "permissions": ["read", "query", "index", "chat"]},
        "eval-key-456": {"user": "evaluator", "permissions": ["read", "query", "chat", "eval"]},
    }

config = Config()

# The Google API key must come from the environment; abort early without it
GOOGLE_API_KEY = os.getenv("GOOGLE_API_KEY")
if not GOOGLE_API_KEY:
    logger.error("GOOGLE_API_KEY environment variable not set!")
    raise ValueError("Please set GOOGLE_API_KEY environment variable")

# Re-export the value read above back into the process environment
os.environ["GOOGLE_API_KEY"] = GOOGLE_API_KEY
print("✅ Configuration loaded!")

# ========================================
# STEP 3: PYDANTIC MODELS (Same as before)
# ========================================

# Chat-specific models
class ChatMessage(BaseModel):
    """One turn of a chat conversation."""
    # Declared with a pattern allowing only "user" or "assistant".
    # NOTE(review): `pattern=` is the Pydantic v2 keyword; confirm the installed version enforces it.
    role: str = Field(..., pattern="^(user|assistant)$")
    # Required text, 1 to 2000 characters
    content: str = Field(..., min_length=1, max_length=2000)
    # Filled with datetime.now() at model creation when the client omits it
    timestamp: Optional[datetime] = Field(default_factory=datetime.now)

class ChatRequest(BaseModel):
    """Request body for the chat endpoint."""
    # Declared with min_items=1 / max_items=20.
    # NOTE(review): min_items/max_items are the Pydantic v1 spellings; v2 prefers min_length/max_length.
    messages: List[ChatMessage] = Field(..., min_items=1, max_items=20)
    # Optional session identifier; defaults to None
    session_id: Optional[str] = Field(default=None)
    # Retrieval switches, both enabled by default
    use_dynamic_index: Optional[bool] = Field(default=True)
    use_reranker: Optional[bool] = Field(default=True)
    # Integer between 1 and 5 inclusive, default 3
    top_k: Optional[int] = Field(default=3, ge=1, le=5)

class ChatResponse(BaseModel):
    """Response body for the chat endpoint."""
    session_id: str
    # Free-form payload; its keys are not constrained by this model
    response: Dict[str, Any]
    conversation_length: int
    # Defaults to datetime.now() when the response object is created
    timestamp: datetime = Field(default_factory=datetime.now)

# Original models (updated)
class IndexRequest(BaseModel):
    """Request body for the indexing endpoint: between 1 and 5 URLs."""
    url: List[str] = Field(..., min_items=1, max_items=5, description="URLs to index")

    @validator('url')
    def validate_urls(cls, v):
        """Reject entries that are not absolute http(s) URLs.

        The scraper fetches pages through a ``requests`` session, which only
        handles ``http`` and ``https``. Other schemes (``ftp://``, ...) are
        refused here so they fail fast instead of exhausting the scraper's
        retry loop.

        Raises:
            ValueError: for the first URL without an http/https scheme or
                without a network location.
        """
        for url in v:
            parsed = urlparse(url)
            # urlparse lower-cases the scheme, so "HTTP://..." is accepted too
            if parsed.scheme not in ("http", "https") or not parsed.netloc:
                raise ValueError(f"Invalid URL format: {url} (expected an http:// or https:// URL)")
        return v

class IndexResponse(BaseModel):
    """Response body for the indexing endpoint."""
    # Overall outcome label chosen by the endpoint
    status: str
    # URLs that were processed successfully
    indexed_url: List[str]
    # One string-to-string dict per failed URL; None when nothing failed
    failed_url: Optional[List[Dict[str, str]]] = None
    # Free-form summary of the run; empty dict by default
    metadata: Dict[str, Any] = Field(default_factory=dict)
    # Defaults to datetime.now() when the response object is created
    timestamp: datetime = Field(default_factory=datetime.now)

class HealthResponse(BaseModel):
    """Response body for the health-check endpoint."""
    status: str
    # Defaults to datetime.now() when the response object is created
    timestamp: datetime = Field(default_factory=datetime.now)
    # Untyped dict; presumably per-component state - TODO confirm against the health endpoint
    components: dict
    conversations_active: int

# Sources models
class SourceInfo(BaseModel):
    """Summary of one indexed source."""
    source_url: str
    title: Optional[str] = None
    indexed_at: Optional[datetime] = None
    # Number of documents counted for this source; defaults to 0
    document_count: int = 0
    # Required; described as "static or dynamic" but not validated against those values
    source_type: str = Field(..., description="static or dynamic")
    last_updated: Optional[datetime] = None

class SourcesResponse(BaseModel):
    """Response body listing the known sources, split by index type."""
    total_sources: int
    static_sources: List[SourceInfo]
    dynamic_sources: List[SourceInfo]
    # Defaults to datetime.now() when the response object is created
    timestamp: datetime = Field(default_factory=datetime.now)

# Evaluation models
class EvaluationRequest(BaseModel):
    """Request body for the evaluation endpoint."""
    # Required field (no default); each test case is a free-form dict
    test_cases: List[Dict[str, Any]]
    session_id: Optional[str] = None

class EvaluationResponse(BaseModel):
    """Response body for the evaluation endpoint."""
    overall_score: float
    # One free-form dict per evaluated item
    detailed_results: List[Dict[str, Any]]
    # Metric name -> numeric value
    metrics: Dict[str, float]
    # Defaults to datetime.now() when the response object is created
    timestamp: datetime = Field(default_factory=datetime.now)

print("✅ Pydantic models defined!")

# ========================================
# STEP 4: GLOBAL VARIABLES AND UTILITIES
# ========================================

# Global variables
# All of these start as None and are assigned later at runtime; functions in
# this file read them as module globals (the search code checks the
# vectorstores and reranker for truthiness before using them).
embedding_model = None
static_vectorstore = None
dynamic_vectorstore = None
llm = None
# BM25 index, chunk texts and chunk sources for each vectorstore
bm25_static = None
bm25_dynamic = None
bm25_docs_static = None
bm25_docs_dynamic = None
bm25_sources_static = None
bm25_sources_dynamic = None
reranker = None
# chunk text -> list of documents with that text, per vectorstore
text_to_docs_static = None
text_to_docs_dynamic = None
chat_prompt = None
text_splitter = None

# In-memory conversation storage (will use file-based backup)
active_conversations = {}

# Maps the `source` value of a static document to its public URL.
# NOTE(review): the keys are /content/drive/... paths rather than paths under
# Config.BASE_PATH; they must equal the `source` metadata stored in the static
# index for the lookup to hit - TODO confirm after the move to AWS.
source_url_map = {
    "/content/drive/MyDrive/RAG_demo/data/genai-platform.txt": "https://huyenchip.com/2024/07/25/genai-platform.html",
    "/content/drive/MyDrive/RAG_demo/data/hallucination.txt": "https://lilianweng.github.io/posts/2024-07-07-hallucination/",
    "/content/drive/MyDrive/RAG_demo/data/quora_engineering.txt": "https://quoraengineering.quora.com/Building-Embedding-Search-at-Quora"
}

# Utility functions
def ensure_directories():
    """Create every directory the service writes to, skipping existing ones."""
    required_paths = (
        config.BASE_PATH,
        config.DYNAMIC_BASE_PATH,
        config.DYNAMIC_FAISS_PATH,
        config.METADATA_PATH,
        config.BACKUP_PATH,
        config.CONVERSATIONS_PATH,
        config.EVALUATION_DATASET_PATH,
    )
    for required_path in required_paths:
        # exist_ok makes repeated calls harmless
        os.makedirs(required_path, exist_ok=True)
    logger.info("✅ Directory structure created")

def get_url_hash(url: str) -> str:
    """Return the hexadecimal MD5 digest of ``url``, used as its identifier."""
    hasher = hashlib.md5()
    hasher.update(url.encode())
    return hasher.hexdigest()

def load_metadata(file_path: str) -> Dict:
    """Read a JSON file and return its parsed content.

    Returns an empty dict when the file does not exist or cannot be read or
    parsed; in the latter case the error is logged rather than raised.
    """
    if not os.path.exists(file_path):
        return {}

    try:
        with open(file_path, 'r', encoding='utf-8') as handle:
            return json.load(handle)
    except Exception as exc:
        logger.error(f"Error loading metadata from {file_path}: {str(exc)}")
        return {}

def save_metadata(data: Dict, file_path: str):
    """Write ``data`` to ``file_path`` as indented UTF-8 JSON.

    Missing parent directories are created first. Values that JSON cannot
    encode natively (for example ``datetime``) are written via ``str()``.

    Args:
        data: JSON-serialisable object to store.
        file_path: Destination file; a bare filename is written to the
            current working directory.

    Raises:
        Exception: any I/O or serialisation error, re-raised after logging.
    """
    try:
        parent_dir = os.path.dirname(file_path)
        # dirname is "" for a bare filename and os.makedirs("") raises
        if parent_dir:
            os.makedirs(parent_dir, exist_ok=True)
        with open(file_path, 'w', encoding='utf-8') as f:
            json.dump(data, f, indent=2, default=str)
    except Exception as e:
        logger.error(f"Error saving metadata to {file_path}: {str(e)}")
        raise

def format_sources(sources):
    """Return ``sources`` with known local file paths replaced by public URLs.

    Entries starting with ``http`` are kept unchanged; any other entry is
    looked up in ``source_url_map`` and left as-is when it has no mapping.
    """
    return [
        source if source.startswith('http') else source_url_map.get(source, source)
        for source in sources
    ]

def generate_session_id() -> str:
    """Return a new random UUID (version 4) in its string form."""
    return f"{uuid.uuid4()}"

def clean_old_conversations():
    """Evict conversations idle for longer than ``config.CONVERSATION_TIMEOUT``.

    Each expired conversation is written to
    ``<config.CONVERSATIONS_PATH>/<session_id>.json`` before it is removed
    from ``active_conversations``. A failure to save is logged and does not
    prevent the eviction.
    """
    current_time = datetime.now()
    expired_sessions = []

    for session_id, conv_data in active_conversations.items():
        # A missing or None timestamp is treated as "active right now"
        last_active = conv_data.get('last_active') or current_time
        if isinstance(last_active, str):
            last_active = datetime.fromisoformat(last_active)

        # total_seconds() counts whole days too; `.seconds` alone wraps every
        # 24 hours and would keep day-old sessions alive.
        idle_seconds = (current_time - last_active).total_seconds()
        if idle_seconds > config.CONVERSATION_TIMEOUT:
            # Save conversation before deletion
            try:
                conv_file = os.path.join(config.CONVERSATIONS_PATH, f"{session_id}.json")
                save_metadata(conv_data, conv_file)
            except Exception as e:
                logger.error(f"Error saving conversation {session_id}: {e}")

            expired_sessions.append(session_id)

    # Delete after the scan so the dict is not mutated while being iterated
    for session_id in expired_sessions:
        del active_conversations[session_id]
        logger.info(f"Cleaned expired conversation: {session_id}")

print("✅ Utility functions defined!")

# ========================================
# STEP 5: CONVERSATION MANAGEMENT
# ========================================

class ConversationManager:
    """Session bookkeeping on top of the shared ``active_conversations`` dict."""

    def __init__(self):
        # Same dict object as the module-level store, not a copy
        self.conversations = active_conversations

    def get_or_create_session(self, session_id: Optional[str] = None) -> str:
        """Return ``session_id`` if it is active, otherwise start a new session.

        A known session has its ``last_active`` time refreshed. An unknown or
        missing id results in a freshly generated session id being returned.
        """
        is_known = bool(session_id) and session_id in self.conversations
        if not is_known:
            new_session_id = generate_session_id()
            self.conversations[new_session_id] = {
                'messages': [],
                'created_at': datetime.now(),
                'last_active': datetime.now(),
                'metadata': {}
            }
            logger.info(f"Created new conversation session: {new_session_id}")
            return new_session_id

        self.conversations[session_id]['last_active'] = datetime.now()
        return session_id

    def add_message(self, session_id: str, message: ChatMessage):
        """Append ``message`` to the session and trim the history if needed.

        Raises:
            ValueError: if ``session_id`` is not an active session.
        """
        if session_id not in self.conversations:
            raise ValueError(f"Session {session_id} not found")

        conversation = self.conversations[session_id]
        stamped_at = message.timestamp or datetime.now()
        conversation['messages'].append({
            'role': message.role,
            'content': message.content,
            'timestamp': stamped_at.isoformat()
        })
        conversation['last_active'] = datetime.now()

        # Keep at most MAX_CONVERSATION_LENGTH pairs, i.e. twice as many messages
        max_messages = config.MAX_CONVERSATION_LENGTH * 2
        history = conversation['messages']
        if len(history) > max_messages:
            conversation['messages'] = history[-max_messages:]

    def get_conversation_context(self, session_id: str, max_pairs: int = None) -> str:
        """Render the most recent messages as ``Human:`` / ``Assistant:`` lines.

        ``max_pairs`` falls back to ``config.MAX_CONTEXT_LENGTH`` when it is
        None or 0. Returns "" for an unknown session or an empty history.
        """
        if session_id not in self.conversations:
            return ""

        history = self.conversations[session_id]['messages']
        if not history:
            return ""

        pair_limit = max_pairs or config.MAX_CONTEXT_LENGTH
        speaker_labels = {'user': "Human"}

        rendered = []
        for entry in history[-(pair_limit * 2):]:
            # Any role other than 'user' is labelled as the assistant
            speaker = speaker_labels.get(entry['role'], "Assistant")
            rendered.append(f"{speaker}: {entry['content']}")

        return "\n".join(rendered)

    def get_conversation_length(self, session_id: str) -> int:
        """Return the number of stored messages (0 for an unknown session)."""
        if session_id in self.conversations:
            return len(self.conversations[session_id]['messages'])
        return 0

    def save_conversation(self, session_id: str):
        """Write the session to ``<CONVERSATIONS_PATH>/<session_id>.json``; no-op if unknown."""
        if session_id in self.conversations:
            target_file = os.path.join(config.CONVERSATIONS_PATH, f"{session_id}.json")
            save_metadata(self.conversations[session_id], target_file)
conversation_manager = ConversationManager()
print("✅ Conversation manager initialized!")

# ========================================
# STEP 6: WEB SCRAPING AND INDEXING
# ========================================

class WebScraper:
    """Fetches a web page and extracts its title and main text."""

    # 4xx statuses worth another attempt; every other client error is treated
    # as permanent and fails immediately.
    _RETRYABLE_CLIENT_STATUSES = frozenset({408, 425, 429})

    def __init__(self):
        # One session so the browser-like User-Agent applies to every request
        self.session = requests.Session()
        self.session.headers.update({
            'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36'
        })

    def extract_content(self, url: str) -> Dict[str, Any]:
        """Extract content from URL with retry logic.

        Makes up to ``config.MAX_RETRIES`` attempts, sleeping
        ``config.RETRY_DELAY * 2**attempt`` seconds between them. Errors that
        a retry cannot fix (most 4xx responses, malformed URLs, pages with no
        extractable text) end the loop at once.

        Returns:
            Always a dict with keys ``title``, ``content``, ``url``,
            ``success`` and ``error``. On failure ``success`` is False,
            ``title``/``content`` are None and ``error`` holds the last error
            message. No exception is propagated to the caller.
        """
        # Reported if the loop below never runs
        last_error = "No fetch attempted: config.MAX_RETRIES is less than 1"

        for attempt in range(config.MAX_RETRIES):
            try:
                response = self.session.get(url, timeout=config.REQUEST_TIMEOUT)
                response.raise_for_status()

                title_text, content_text = self._parse_html(response.content, url)
                if not content_text:
                    raise ValueError("No content extracted")

                return {
                    'title': title_text,
                    'content': content_text,
                    'url': url,
                    'success': True,
                    'error': None
                }

            except Exception as e:
                last_error = str(e)
                logger.warning(f"Attempt {attempt + 1} failed for {url}: {str(e)}")
                out_of_attempts = attempt >= config.MAX_RETRIES - 1
                if out_of_attempts or not self._is_retryable(e):
                    break
                time.sleep(config.RETRY_DELAY * (2 ** attempt))

        return {
            'title': None,
            'content': None,
            'url': url,
            'success': False,
            'error': last_error
        }

    @staticmethod
    def _parse_html(html, fallback_title):
        """Return ``(title, text)`` parsed from raw HTML; text may be empty."""
        soup = BeautifulSoup(html, 'html.parser')

        # Remove unwanted elements
        for element in soup(['script', 'style', 'nav', 'header', 'footer', 'aside', 'advertisement']):
            element.decompose()

        # Extract title; a missing or blank <title> falls back to the URL
        title = soup.find('title')
        title_text = (title.get_text().strip() if title else "") or fallback_title

        # Extract main content: the first selector that matches wins
        content_selectors = [
            'article', 'main', '[role="main"]',
            '.content', '.post', '.article',
            'div.container', 'div.wrapper'
        ]

        content_text = ""
        for selector in content_selectors:
            content = soup.select_one(selector)
            if content:
                content_text = content.get_text()
                break

        if not content_text:
            # Fallback to body
            body = soup.find('body')
            content_text = body.get_text() if body else ""

        # Collapse all runs of whitespace into single spaces
        return title_text, re.sub(r'\s+', ' ', content_text).strip()

    @classmethod
    def _is_retryable(cls, error):
        """Return False for failures that another attempt cannot fix."""
        # Covers "No content extracted" and requests' malformed-URL errors,
        # which also derive from ValueError.
        if isinstance(error, ValueError):
            return False

        # Compare against None explicitly: an error Response is falsy.
        response = getattr(error, "response", None)
        status_code = getattr(response, "status_code", None)
        if isinstance(status_code, int) and 400 <= status_code < 500:
            return status_code in cls._RETRYABLE_CLIENT_STATUSES

        # Timeouts, connection errors and 5xx responses may be transient
        return True

def build_bm25_from_vectorstore(vectorstore):
    """Build a BM25 index over every document stored in a FAISS vectorstore.

    Returns:
        Tuple ``(bm25, docs, sources, text_to_docs)``: the ``BM25Okapi`` index
        over whitespace-tokenised texts, the chunk texts, the parallel list of
        ``source`` metadata values ("Unknown" when absent), and a mapping from
        chunk text to the documents that contain exactly that text.

    Raises:
        Exception: any failure is logged and re-raised unchanged.
    """
    try:
        docs, sources, text_to_docs = [], [], {}

        # Single pass over the docstore fills all three structures
        for stored_doc in vectorstore.docstore._dict.values():
            chunk_text = stored_doc.page_content
            docs.append(chunk_text)
            sources.append(stored_doc.metadata.get("source", "Unknown"))
            text_to_docs.setdefault(chunk_text, []).append(stored_doc)

        bm25 = BM25Okapi([chunk_text.split() for chunk_text in docs])

        logger.info(f"BM25 built over {len(docs)} chunks")
        return bm25, docs, sources, text_to_docs
    except Exception as exc:
        logger.error(f"Error building BM25: {str(exc)}")
        raise

print("✅ Web scraping and indexing functions defined!")

# ========================================
# STEP 7: HYBRID SEARCH SYSTEM
# ========================================

def hybrid_search(query: str, use_dynamic: bool = True, bm_k: int = 10, faiss_k: int = 10, top_k: int = 3, use_reranker: bool = True, conversation_context: str = ""):
    """Retrieve the best-matching chunks for a query from the static and/or dynamic index.

    Every loaded index is queried through ``_search_single_index``; the
    candidates are merged, de-duplicated on their full text, optionally
    re-scored with the module-level ``reranker``, and the first ``top_k`` are
    returned as document objects.

    Reads the module-level ``static_vectorstore`` / ``dynamic_vectorstore``,
    their ``bm25_*`` companions, the ``text_to_docs_*`` mappings and
    ``reranker``.

    Args:
        query: The user's current question. Reranking always uses this
            original text.
        use_dynamic: Also search the dynamic index when it is loaded.
        bm_k: BM25 hits requested per index.
        faiss_k: FAISS hits requested per index.
        top_k: Maximum number of documents returned. A non-positive value
            yields an empty list.
        use_reranker: Re-score candidates with ``reranker`` when one is loaded.
        conversation_context: Prior conversation text. When non-empty it is
            prepended to the query that is sent to the indices.

    Returns:
        A list of at most ``top_k`` documents. A candidate whose text is found
        in ``text_to_docs_static`` (checked first) or ``text_to_docs_dynamic``
        is returned as that stored document; otherwise a new ``Document`` is
        built from the candidate's text and source.

    Raises:
        Exception: Any failure is logged and re-raised unchanged.
    """
    try:
        # A negative top_k would make the final slice drop items from the END
        # of the ranking instead of limiting it, so treat it like "nothing".
        if top_k is not None and top_k <= 0:
            return []

        # Enhanced query with context
        enhanced_query = query
        if conversation_context:
            # Add context to help with pronouns and references
            enhanced_query = f"Context: {conversation_context}\n\nCurrent question: {query}"

        all_candidates = []

        # Search static index
        if static_vectorstore:
            all_candidates.extend(_search_single_index(
                enhanced_query, static_vectorstore, bm25_static, bm25_docs_static,
                bm25_sources_static, text_to_docs_static, bm_k, faiss_k
            ))

        # Search dynamic index
        if use_dynamic and dynamic_vectorstore:
            all_candidates.extend(_search_single_index(
                enhanced_query, dynamic_vectorstore, bm25_dynamic, bm25_docs_dynamic,
                bm25_sources_dynamic, text_to_docs_dynamic, bm_k, faiss_k
            ))

        if not all_candidates:
            return []

        # Merge and deduplicate candidates. The key is the whole stripped
        # text: keying on a short prefix would collapse different chunks that
        # merely start with the same boilerplate. Among duplicates the one
        # with the higher BM25 score wins (a missing score counts as 0), and
        # the first occurrence keeps its position in the ordering.
        merged = OrderedDict()
        for candidate in all_candidates:
            key = candidate["text"].strip()
            kept = merged.get(key)
            if kept is None or (candidate.get("bm25_score") or 0) > (kept.get("bm25_score") or 0):
                merged[key] = candidate

        candidates = list(merged.values())

        # Optional reranking - use original query for reranking
        if use_reranker and reranker and candidates:
            pairs = [(query, c["text"]) for c in candidates]
            scores = reranker.predict(pairs)
            for c, s in zip(candidates, scores):
                c["rerank_score"] = float(s)
            candidates = sorted(candidates, key=lambda x: x["rerank_score"], reverse=True)
        else:
            # Without a reranker: BM25-scored candidates first (best score
            # first), followed by FAISS-only candidates in their merge order.
            candidates = sorted(candidates, key=lambda x: (0 if x["bm25_score"] is not None else 1, -(x["bm25_score"] or 0)))

        # Return top_k Document objects
        final_docs = []
        for c in candidates[:top_k]:
            text = c["text"]
            # Try to find in both mappings
            docs_list = text_to_docs_static.get(text, []) if text_to_docs_static else []
            if not docs_list and text_to_docs_dynamic:
                docs_list = text_to_docs_dynamic.get(text, [])

            if docs_list:
                final_docs.append(docs_list[0])
            else:
                final_docs.append(Document(page_content=text, metadata={"source": c.get("source", "Unknown")}))

        return final_docs

    except Exception as e:
        logger.error(f"Error in hybrid search: {str(e)}")
        raise

def _search_single_index(query: str, vectorstore, bm25, bm25_docs, bm25_sources, text_to_docs, bm_k: int, faiss_k: int):
    """Search a single index (static or dynamic)"""
    candidates = []

    # BM25 candidates
    if bm25 and bm25_docs:
        tokenized_q = query.split()
        bm25_scores = bm25.get_scores(tokenized_q)
        bm25_indices = sorted(range(len(bm25_scores)), key=lambda i: bm25_scores[i], reverse=True)[:bm_k]
        for idx in bm25_indices:
            txt = bm25_docs[idx]
            src = bm25_sources[idx]
            candidates.append({"text": txt, "source": src, "bm25_score": bm25_scores[idx]})

    # FAISS candidates
    if vectorstore:
        faiss_retriever = vectorstore.as_retriever(search_kwargs={"k": faiss_k})
        faiss_docs = faiss_retriever.get_relevant_documents(query)
        for d in faiss_docs:
            candidates.append({"text": d.page_content, "source": d.metadata.get("source", "Unknown"), "bm25_score": None})

    return candidates

# Progress marker printed once the hybrid search functions have been defined.
print("✅ Hybrid search system defined!")

# ========================================
# STEP 8: EVALUATION SYSTEM
# ========================================

class RAGEvaluator:
    """Heuristic evaluator for the conversational RAG pipeline.

    Provides three lexical scoring helpers (relevance, citation accuracy,
    context retention), a built-in set of test conversations, and
    ``run_evaluation`` which replays conversations through retrieval and the
    LLM and aggregates the scores. All scores are floats in ``[0.0, 1.0]``.
    """

    # Words treated as evidence that an answer refers back to earlier turns.
    _REFERENCE_WORDS = frozenset({"this", "that", "it", "they", "these", "those"})
    # Characters removed from both ends of a token before reference matching.
    _EDGE_PUNCTUATION = ".,;:!?\"'()[]{}"

    def __init__(self):
        # Latest output of create_test_dataset() and run_evaluation().
        self.test_cases = []
        self.results = []

    @classmethod
    def _has_back_reference(cls, text: str) -> bool:
        """Return True when ``text`` contains a reference word as a whole word.

        Tokens are lower-cased, stripped of surrounding punctuation, and cut at
        the first apostrophe so that "it's" counts as "it" while "with" does not.
        """
        for token in text.lower().split():
            word = token.strip(cls._EDGE_PUNCTUATION).replace("’", "'").split("'")[0]
            if word in cls._REFERENCE_WORDS:
                return True
        return False

    def create_test_dataset(self):
        """Create the built-in evaluation conversations and persist them.

        The cases are written with ``save_metadata`` to ``test_cases.json``
        under ``config.EVALUATION_DATASET_PATH`` and remembered on
        ``self.test_cases``.

        Returns:
            The list of test-case dicts. Each has a ``conversation`` (list of
            role/content messages), a ``test_type`` and one expectation key.
        """
        test_cases = [
            # Context retention tests
            {
                "conversation": [
                    {"role": "user", "content": "What is attention mechanism in transformers?"},
                    {"role": "user", "content": "How does it help with long sequences?"}
                ],
                "expected_context": "attention mechanism",
                "test_type": "context_retention"
            },

            # Citation accuracy tests
            {
                "conversation": [
                    {"role": "user", "content": "Tell me about RAG systems"}
                ],
                "expected_citations": True,
                "test_type": "citation_accuracy"
            },

            # Topic switching tests
            {
                "conversation": [
                    {"role": "user", "content": "Explain neural networks"},
                    {"role": "user", "content": "Now tell me about cooking recipes"}
                ],
                "expected_behavior": "topic_switch",
                "test_type": "topic_switching"
            },

            # Multi-source tests
            {
                "conversation": [
                    {"role": "user", "content": "What are the latest trends in AI?"}
                ],
                "expected_sources": "mixed",
                "test_type": "multi_source"
            }
        ]

        # Save test cases
        test_file = os.path.join(config.EVALUATION_DATASET_PATH, "test_cases.json")
        save_metadata(test_cases, test_file)
        self.test_cases = test_cases
        return test_cases

    def evaluate_response_relevance(self, question: str, answer: str, context_docs: List[Document]) -> float:
        """Score how relevant an answer is to its question using word overlap.

        The score is the mean of two ratios over lower-cased, whitespace-split
        word sets: the share of question words that appear in the answer, and
        the share of context words that appear in the answer.

        Args:
            question: The question that was asked.
            answer: The generated answer.
            context_docs: Retrieved documents; only ``page_content`` is read.

        Returns:
            A score capped at 1.0, or 0.0 if scoring raised an exception
            (the error is logged).
        """
        try:
            # Simple keyword overlap score
            question_words = set(question.lower().split())
            answer_words = set(answer.lower().split())

            overlap = len(question_words.intersection(answer_words))
            relevance_score = overlap / len(question_words) if question_words else 0.0

            # Bonus for using context
            context_text = " ".join([doc.page_content for doc in context_docs])
            context_words = set(context_text.lower().split())
            context_usage = len(answer_words.intersection(context_words)) / len(context_words) if context_words else 0.0

            final_score = min(1.0, (relevance_score + context_usage) / 2)
            return final_score

        except Exception as e:
            logger.error(f"Error evaluating relevance: {str(e)}")
            return 0.0

    def evaluate_citation_accuracy(self, answer: str, sources: List[str]) -> float:
        """Score whether an answer is backed by sources.

        Args:
            answer: The generated answer.
            sources: Sources attached to the answer.

        Returns:
            0.0 when there are no sources; otherwise 1.0 when the answer
            contains one of the factual-claim phrases and 0.8 when it does not
            (no factual claims, so no citations were strictly needed).
        """
        if not sources:
            return 0.0

        # Check if answer contains factual claims (simple heuristic)
        factual_indicators = ["according to", "research shows", "studies indicate", "data reveals", "analysis found"]
        answer_lower = answer.lower()
        has_factual_claims = any(indicator in answer_lower for indicator in factual_indicators)

        # Sources are guaranteed non-empty here, so only two outcomes remain.
        return 1.0 if has_factual_claims else 0.8

    def evaluate_context_retention(self, conversation_history: List[Dict], current_answer: str) -> float:
        """Score how well an answer retains context from earlier messages.

        Args:
            conversation_history: Message dicts with a ``content`` key. The
                last entry is excluded from the "previous" context.
            current_answer: The answer being scored.

        Returns:
            1.0 when the history has at most one message. Otherwise the share
            of previous-message words found in the answer, plus 0.3 when the
            answer contains a reference word ("this", "that", "it", "they",
            "these", "those") as a whole word, capped at 1.0.
        """
        if len(conversation_history) <= 1:
            return 1.0  # No previous context to retain

        # Look for references to previous topics
        previous_content = " ".join([msg["content"] for msg in conversation_history[:-1]])
        previous_words = set(previous_content.lower().split())
        answer_words = set(current_answer.lower().split())

        # Whole-word match: a substring test would fire on words such as
        # "with" or "item" and hand out the bonus almost unconditionally.
        has_references = self._has_back_reference(current_answer)

        # Calculate context retention score
        word_overlap = len(previous_words.intersection(answer_words)) / len(previous_words) if previous_words else 0.0
        reference_bonus = 0.3 if has_references else 0.0

        context_score = min(1.0, word_overlap + reference_bonus)
        return context_score

    async def run_evaluation(self, test_cases: List[Dict]) -> Dict[str, Any]:
        """Replay test conversations through the pipeline and score the last answer.

        For every test case a new session is created, each message is sent
        through ``hybrid_search`` and the LLM, and the final answer is scored
        for relevance, citation accuracy and context retention. A test case
        that raises is logged and counted as failed.

        Args:
            test_cases: Dicts with a ``conversation`` list of messages (each
                with a ``content`` key) and an optional ``test_type``.

        Returns:
            A dict with ``overall_score``, ``detailed_results`` (one entry per
            completed test case) and ``metrics`` (per-dimension averages plus
            completed/failed counts). The detailed results are also kept on
            ``self.results``.
        """
        results = []
        total_scores = {"relevance": [], "citation": [], "context": []}

        for i, test_case in enumerate(test_cases):
            try:
                logger.info(f"Running test case {i+1}/{len(test_cases)}")

                # Simulate conversation
                session_id = generate_session_id()
                conversation_history = []
                # Retrieval output of the most recent turn; scored after the loop.
                docs = []
                sources = []

                for message in test_case["conversation"]:
                    # Add user message
                    user_msg = ChatMessage(role="user", content=message["content"])
                    conversation_manager.add_message(session_id, user_msg)
                    conversation_history.append({"role": "user", "content": message["content"]})

                    # Get bot response
                    context = conversation_manager.get_conversation_context(session_id)
                    docs = hybrid_search(
                        message["content"],
                        use_dynamic=True,
                        conversation_context=context
                    )

                    if docs:
                        doc_context = "\n\n".join([doc.page_content for doc in docs])
                        formatted_prompt = chat_prompt.format(
                            conversation_context=context,
                            context=doc_context,
                            question=message["content"]
                        )
                        response = llm.invoke(formatted_prompt)
                        answer = response.content.strip()
                        # Order-preserving de-duplication keeps the source
                        # list deterministic, matching the chat endpoint.
                        sources = list(dict.fromkeys(doc.metadata.get("source", "Unknown") for doc in docs))
                        sources = format_sources(sources)
                    else:
                        answer = "I couldn't find specific information about that in my knowledge base."
                        sources = []

                    # Add assistant message
                    assistant_message = ChatMessage(role="assistant", content=answer)
                    conversation_manager.add_message(session_id, assistant_message)
                    conversation_history.append({"role": "assistant", "content": answer})

                # Evaluate the last response
                last_question = test_case["conversation"][-1]["content"]
                last_answer = conversation_history[-1]["content"]

                # Calculate scores
                relevance_score = self.evaluate_response_relevance(last_question, last_answer, docs if docs else [])
                citation_score = self.evaluate_citation_accuracy(last_answer, sources if sources else [])
                # The history passed on excludes the final assistant answer.
                context_score = self.evaluate_context_retention(conversation_history[:-1], last_answer)

                result = {
                    "test_case_id": i,
                    "test_type": test_case.get("test_type", "general"),
                    "question": last_question,
                    "answer": last_answer,
                    "sources": sources if sources else [],
                    "scores": {
                        "relevance": relevance_score,
                        "citation": citation_score,
                        "context_retention": context_score
                    },
                    "overall_score": (relevance_score + citation_score + context_score) / 3
                }

                results.append(result)
                total_scores["relevance"].append(relevance_score)
                total_scores["citation"].append(citation_score)
                total_scores["context"].append(context_score)

            except Exception as e:
                logger.error(f"Error in test case {i}: {str(e)}")
                continue

        # Calculate overall metrics
        metrics = {
            "average_relevance": np.mean(total_scores["relevance"]) if total_scores["relevance"] else 0.0,
            "average_citation": np.mean(total_scores["citation"]) if total_scores["citation"] else 0.0,
            "average_context": np.mean(total_scores["context"]) if total_scores["context"] else 0.0,
            "overall_average": np.mean([np.mean(scores) for scores in total_scores.values()]) if any(total_scores.values()) else 0.0,
            "test_cases_completed": len(results),
            "test_cases_failed": len(test_cases) - len(results)
        }

        self.results = results
        return {
            "overall_score": metrics["overall_average"],
            "detailed_results": results,
            "metrics": metrics
        }

# Module-level evaluator instance, created once when this code runs.
evaluator = RAGEvaluator()
print("✅ Evaluation system initialized!")

# ========================================
# MODIFIED: MODEL INITIALIZATION
# ========================================

def initialize_models():
    """Populate the module-level model, index, prompt and reranker globals.

    In order: prepares directories and prunes old conversations, loads the
    embedding model, builds the text splitter, tries to load the static and
    dynamic FAISS indices together with their BM25 companions, creates the
    LLM client, builds the chat prompt template and, when
    ``config.USE_RERANKER`` is set, loads the cross-encoder reranker.

    A missing or unreadable FAISS index is reported on stdout and skipped.

    Raises:
        Exception: Any other failure (for example a missing
            ``GOOGLE_API_KEY`` environment variable) is printed and re-raised.
    """
    global embedding_model, static_vectorstore, dynamic_vectorstore, llm
    global bm25_static, bm25_dynamic, bm25_docs_static, bm25_docs_dynamic
    global bm25_sources_static, bm25_sources_dynamic, reranker
    global text_to_docs_static, text_to_docs_dynamic, chat_prompt, text_splitter

    try:
        print("🔄 Loading models for AWS deployment...")
        ensure_directories()
        clean_old_conversations()

        # Embedding model shared by both FAISS indices
        embedding_model = HuggingFaceEmbeddings(model_name="all-MiniLM-L6-v2")
        print("✅ Embedding model loaded")

        # Splitter that cuts page text into chunks; sizes come from config
        splitter_options = dict(
            chunk_size=config.CHUNK_SIZE,
            chunk_overlap=config.CHUNK_OVERLAP,
            length_function=len,
            separators=["\n\n", "\n", " ", ""],
        )
        text_splitter = RecursiveCharacterTextSplitter(**splitter_options)
        print("✅ Text splitter initialized")

        # Static index: optional, the system can run on the dynamic index alone
        try:
            if not os.path.exists(config.STATIC_FAISS_PATH):
                print("ℹ️ No static FAISS index found - will work with dynamic index only")
            else:
                static_vectorstore = FAISS.load_local(
                    config.STATIC_FAISS_PATH,
                    embedding_model,
                    allow_dangerous_deserialization=True,
                )
                static_parts = build_bm25_from_vectorstore(static_vectorstore)
                bm25_static, bm25_docs_static, bm25_sources_static, text_to_docs_static = static_parts
                print("✅ Static FAISS vectorstore loaded")
        except Exception as exc:
            print(f"⚠️ Could not load static FAISS: {exc!s}")

        # Dynamic index: optional as well
        try:
            if not os.path.exists(config.DYNAMIC_FAISS_PATH):
                print("ℹ️ No dynamic FAISS index found - will create on first index")
            else:
                dynamic_vectorstore = FAISS.load_local(
                    config.DYNAMIC_FAISS_PATH,
                    embedding_model,
                    allow_dangerous_deserialization=True,
                )
                dynamic_parts = build_bm25_from_vectorstore(dynamic_vectorstore)
                bm25_dynamic, bm25_docs_dynamic, bm25_sources_dynamic, text_to_docs_dynamic = dynamic_parts
                print("✅ Dynamic FAISS vectorstore loaded")
        except Exception as exc:
            print(f"ℹ️ Dynamic FAISS not found (will create on first index): {exc!s}")

        # LLM client; the API key must be present in the environment
        google_api_key = os.environ["GOOGLE_API_KEY"]
        llm = ChatGoogleGenerativeAI(
            model="gemini-1.5-flash",
            temperature=0,
            google_api_key=google_api_key,
        )
        print("✅ LLM initialized")

        # Prompt with three placeholders: conversation_context, context, question
        prompt_text = """
You are a helpful AI assistant with access to multiple knowledge sources. You can maintain context across conversations and provide accurate citations.

Previous conversation context:
{conversation_context}

Current context from knowledge base:
{context}

Current question:
{question}

Instructions:
1. Use the conversation context to understand references like "it", "this", "that", etc.
2. Provide accurate answers based on the knowledge base context
3. If you reference specific information, it should be from the provided context
4. If the answer is not in the knowledge base, say: "I couldn't find specific information about that in my knowledge base."
5. Be conversational and natural in your responses
6. Handle follow-up questions by connecting them to previous context when appropriate

Answer:
        """
        chat_prompt = PromptTemplate(
            input_variables=["conversation_context", "context", "question"],
            template=prompt_text,
        )
        print("✅ Chat prompt template created")

        # Cross-encoder reranker, only when enabled in config
        if config.USE_RERANKER:
            reranker = CrossEncoder(config.RERANKER_MODEL)
            print("✅ Reranker loaded")

        print("🎉 All models initialized successfully!")

    except Exception as exc:
        print(f"❌ Error initializing models: {exc!s}")
        raise

# ========================================
# MODIFIED: SERVER STARTUP FOR AWS
# ========================================

def start_aws_server(host: str = "0.0.0.0", port: int = 8000):
    """Print a startup banner and run the FastAPI app with uvicorn.

    Args:
        host: Interface to bind. The default binds all interfaces.
        port: TCP port to listen on.

    The banner reports how many API keys are configured but never prints the
    key values: console output usually ends up in logs, and literals repeated
    here could drift from ``config.VALID_API_KEYS``.

    This call blocks inside ``uvicorn.run``.
    """
    print("=" * 60)
    print("🚀 CONVERSATIONAL RAG SYSTEM - AWS DEPLOYMENT")
    print("=" * 60)
    print("🔑 API Keys:")
    print(f"   • {len(config.VALID_API_KEYS)} key(s) configured (values are not printed)")
    print("   • Send one as: Authorization: Bearer <api-key>")
    print("")
    print("🎯 ENDPOINTS will be available at:")
    print("  • POST /api/v1/chat - Chat with the system")
    print("  • POST /api/v1/index - Index new URLs")
    print("  • GET  /api/v1/sources - List all sources")
    print("  • GET  /api/v1/sources/{hash} - Get source details")
    print("  • POST /api/v1/evaluate - Run automated evaluation")
    print("  • GET  /api/v1/conversations - List conversations")
    print("  • GET  /health - Health check")
    print("")
    print(f"🌐 Server starting on {host}:{port}...")
    print(f"📚 API docs will be at: http://your-ec2-ip:{port}/docs")
    print("=" * 60)

    # Start the server without ngrok
    uvicorn.run(app, host=host, port=port)

# ========================================
# FASTAPI APPLICATION (Same as before)
# ========================================

# FastAPI application object that the route decorators below attach to.
app = FastAPI(
    title="Conversational RAG API with Dynamic Indexing and Sources",
    description="Hybrid Retrieval-Augmented Generation API with Conversation Support, BM25 + FAISS, Dynamic Web Indexing, and Source Management",
    version="3.1.0"
)

# CORS: any origin may call the API. Credentialed mode (cookies / browser-managed
# auth) stays off because it must not be combined with wildcard origins, and the
# API authenticates with an explicit Authorization: Bearer header instead.
# NOTE(review): if a browser client really needs cookies, list its origins
# explicitly and re-enable allow_credentials.
app.add_middleware(
    CORSMiddleware,
    allow_origins=["*"],
    allow_credentials=False,
    allow_methods=["*"],
    allow_headers=["*"],
)

# Bearer-token extractor used as a dependency by verify_api_key.
security = HTTPBearer()

# Authentication (same as before)
async def verify_api_key(credentials: HTTPAuthorizationCredentials = Depends(security)):
    """Verify the API key sent in the Authorization header.

    Args:
        credentials: Bearer credentials extracted by the ``security`` dependency.

    Returns:
        The entry stored for the key in ``config.VALID_API_KEYS``.

    Raises:
        HTTPException: 401 with a ``WWW-Authenticate: Bearer`` header when the
            key is not in ``config.VALID_API_KEYS``.
    """
    api_key = credentials.credentials
    if api_key not in config.VALID_API_KEYS:
        # Log only the length: a rejected key is often a mistyped real key, so
        # even a prefix of it must not reach the logs.
        logger.warning(f"Invalid API key attempted (length {len(api_key)})")
        raise HTTPException(
            status_code=status.HTTP_401_UNAUTHORIZED,
            detail="Invalid API key",
            headers={"WWW-Authenticate": "Bearer"},
        )
    return config.VALID_API_KEYS[api_key]

# ========================================
# ========================================
# STEP 11: API ENDPOINTS
# ========================================

@app.get("/health", response_model=HealthResponse)
async def health_check():
    """Report which components are loaded and how many conversations are active.

    The service is "healthy" exactly when the LLM client is set. Old
    conversations are pruned as a side effect of every call.
    """
    # Housekeeping piggybacks on the health probe
    clean_old_conversations()

    component_flags = {
        "static_vectorstore": static_vectorstore is not None,
        "dynamic_vectorstore": dynamic_vectorstore is not None,
        "llm": llm is not None,
        "bm25_static": bm25_static is not None,
        "bm25_dynamic": bm25_dynamic is not None,
        # A boolean when the reranker is enabled, the string "disabled" otherwise
        "reranker": reranker is not None if config.USE_RERANKER else "disabled",
        "conversation_manager": True,
    }

    if llm is not None:
        overall_state = "healthy"
    else:
        overall_state = "unhealthy"

    return HealthResponse(
        status=overall_state,
        components=component_flags,
        conversations_active=len(active_conversations),
    )

@app.post("/api/v1/chat", response_model=ChatResponse)
async def chat_with_rag(
    request: ChatRequest,
    user_info: dict = Depends(verify_api_key)
):
    """Main conversational RAG endpoint.

    Validates the request, records the messages in the session, retrieves
    supporting chunks with ``hybrid_search``, asks the LLM for an answer and
    returns it together with its sources.

    Args:
        request: Chat payload. The last entry of ``messages`` is the current
            question and must have role ``"user"``; earlier entries are stored
            as conversation history.
        user_info: Entry resolved from the caller's API key. Must list the
            ``"chat"`` permission.

    Returns:
        ``ChatResponse`` with the session id, the response payload (answer,
        sources, retrieved document previews, metadata) and the conversation
        length.

    Raises:
        HTTPException: 403 without the ``"chat"`` permission; 400 when
            ``messages`` is empty or its last message is not from the user;
            500 for any other failure.
    """
    try:
        # Check permissions
        if "chat" not in user_info.get("permissions", []):
            raise HTTPException(
                status_code=status.HTTP_403_FORBIDDEN,
                detail="Insufficient permissions for chat"
            )

        logger.info(f"Chat request from user {user_info['user']}")

        # Validate the payload BEFORE touching any session state, so that a
        # rejected request neither creates a session nor leaves history behind.
        if not request.messages:
            raise HTTPException(
                status_code=status.HTTP_400_BAD_REQUEST,
                detail="At least one message is required"
            )
        current_message = request.messages[-1]
        if current_message.role != "user":
            raise HTTPException(
                status_code=status.HTTP_400_BAD_REQUEST,
                detail="Last message must be from user"
            )

        # Get or create session
        session_id = conversation_manager.get_or_create_session(request.session_id)

        # Add conversation history to session (except the last message which is the current question)
        for message in request.messages[:-1]:
            conversation_manager.add_message(session_id, message)

        # Add current question to conversation
        conversation_manager.add_message(session_id, current_message)

        # Get conversation context for retrieval and generation
        conversation_context = conversation_manager.get_conversation_context(session_id)

        # Perform hybrid search with conversation context
        docs = hybrid_search(
            current_message.content,
            use_dynamic=request.use_dynamic_index,
            bm_k=config.BM25_TOP_K,
            faiss_k=config.FAISS_TOP_K,
            top_k=request.top_k,
            use_reranker=request.use_reranker and config.USE_RERANKER,
            conversation_context=conversation_context
        )

        if not docs:
            answer = "I couldn't find specific information about that in my knowledge base."
            sources = []
        else:
            # Create context and query LLM
            doc_context = "\n\n".join([doc.page_content for doc in docs])
            formatted_prompt = chat_prompt.format(
                conversation_context=conversation_context,
                context=doc_context,
                question=current_message.content
            )

            # NOTE(review): synchronous call inside an async handler; it holds
            # the event loop for the duration of the LLM request.
            response = llm.invoke(formatted_prompt)
            answer = response.content.strip()

            # Get unique sources (first-seen order)
            sources = list(OrderedDict.fromkeys(doc.metadata.get("source", "Unknown") for doc in docs))
            sources = format_sources(sources)

        # Add assistant response to conversation
        assistant_message = ChatMessage(role="assistant", content=answer)
        conversation_manager.add_message(session_id, assistant_message)

        # Save conversation periodically
        if conversation_manager.get_conversation_length(session_id) % 4 == 0:  # Every 4 messages
            conversation_manager.save_conversation(session_id)

        # UPDATED RESPONSE STRUCTURE WITH ENHANCED SOURCES
        response_data = {
            "answer": {
                "content": answer,
                "role": "assistant"
            },
            "sources": sources,  # Make sources more prominent
            "citations": sources,  # Keep backward compatibility
            "retrieved_documents": [
                {
                    # Preview only: the first 200 characters plus an ellipsis
                    "content": doc.page_content[:200] + "..." if len(doc.page_content) > 200 else doc.page_content,
                    "source": doc.metadata.get("source", "Unknown"),
                    "title": doc.metadata.get("title", ""),
                    "chunk_id": doc.metadata.get("chunk_id", 0)
                } for doc in docs
            ] if docs else [],
            "metadata": {
                "num_docs_retrieved": len(docs),
                "num_sources": len(sources),
                "reranker_used": request.use_reranker and config.USE_RERANKER,
                "dynamic_index_used": request.use_dynamic_index,
                "conversation_length": conversation_manager.get_conversation_length(session_id),
                "user": user_info["user"]
            }
        }

        return ChatResponse(
            session_id=session_id,
            response=response_data,
            conversation_length=conversation_manager.get_conversation_length(session_id)
        )

    except HTTPException:
        # Deliberate 4xx responses pass through untouched
        raise
    except Exception as e:
        logger.error(f"Error processing chat: {str(e)}")
        raise HTTPException(
            status_code=status.HTTP_500_INTERNAL_SERVER_ERROR,
            detail=f"Internal server error: {str(e)}"
        )

# NEW: Sources management endpoints
@app.get("/api/v1/sources", response_model=SourcesResponse)
async def get_sources(user_info: dict = Depends(verify_api_key)):
    """List every source present in the static and dynamic indices.

    Chunks are grouped by their ``source`` metadata (``"Unknown"`` when
    absent). Each group reports its chunk count and the title of the first
    chunk seen. Dynamic sources also report the most recent ``indexed_at``
    value among their chunks. Both lists are sorted by chunk count,
    descending.

    Args:
        user_info: Entry resolved from the caller's API key. Must list the
            ``"read"`` permission.

    Returns:
        ``SourcesResponse`` with the static list, the dynamic list and the
        combined number of sources.

    Raises:
        HTTPException: 403 without the ``"read"`` permission; 500 for any
            other failure.
    """
    try:
        # Check permissions
        if "read" not in user_info.get("permissions", []):
            raise HTTPException(
                status_code=status.HTTP_403_FORBIDDEN,
                detail="Insufficient permissions to view sources"
            )

        static_sources = []
        dynamic_sources = []

        # Get static sources
        if static_vectorstore:
            static_source_map = {}
            for doc in static_vectorstore.docstore._dict.values():
                source = doc.metadata.get("source", "Unknown")
                entry = static_source_map.setdefault(source, {
                    "count": 0,
                    "title": doc.metadata.get("title", "")
                })
                entry["count"] += 1

            for source, info in static_source_map.items():
                # Convert file paths to URLs using source_url_map
                display_source = source_url_map.get(source, source)
                static_sources.append(SourceInfo(
                    source_url=display_source,
                    title=info["title"],
                    document_count=info["count"],
                    source_type="static",
                    indexed_at=None,  # Static sources don't have index timestamp
                    last_updated=None
                ))

        # Get dynamic sources
        if dynamic_vectorstore:
            dynamic_source_map = {}
            for doc in dynamic_vectorstore.docstore._dict.values():
                source = doc.metadata.get("source", "Unknown")
                indexed_at = doc.metadata.get("indexed_at")
                entry = dynamic_source_map.setdefault(source, {
                    "count": 0,
                    "title": doc.metadata.get("title", ""),
                    "indexed_at": indexed_at
                })
                entry["count"] += 1

                # Keep the most recent indexed_at
                if indexed_at and (not entry["indexed_at"] or indexed_at > entry["indexed_at"]):
                    entry["indexed_at"] = indexed_at

            for source, info in dynamic_source_map.items():
                indexed_datetime = None
                if info["indexed_at"]:
                    try:
                        indexed_datetime = datetime.fromisoformat(info["indexed_at"]) if isinstance(info["indexed_at"], str) else info["indexed_at"]
                    except (ValueError, TypeError):
                        # Unparseable timestamp: report the source without one
                        indexed_datetime = None

                dynamic_sources.append(SourceInfo(
                    source_url=source,
                    title=info["title"],
                    document_count=info["count"],
                    source_type="dynamic",
                    indexed_at=indexed_datetime,
                    last_updated=indexed_datetime
                ))

        # Sort sources by document count (descending)
        static_sources.sort(key=lambda x: x.document_count, reverse=True)
        dynamic_sources.sort(key=lambda x: x.document_count, reverse=True)

        return SourcesResponse(
            total_sources=len(static_sources) + len(dynamic_sources),
            static_sources=static_sources,
            dynamic_sources=dynamic_sources
        )

    except HTTPException:
        # Without this clause the 403 raised above would be caught by the
        # generic handler below and reported to the client as a 500.
        raise
    except Exception as e:
        logger.error(f"Error retrieving sources: {str(e)}")
        raise HTTPException(
            status_code=status.HTTP_500_INTERNAL_SERVER_ERROR,
            detail=f"Failed to retrieve sources: {str(e)}"
        )

@app.get("/api/v1/sources/{source_hash}")
async def get_source_details(
    source_hash: str,
    user_info: dict = Depends(verify_api_key)
):
    """Get detailed information about one specific source.

    A source matches when ``get_url_hash(source)`` equals ``source_hash`` or
    when the source string ends with ``source_hash``. Both indices are
    searched. If several different sources match, exactly one is returned:
    the first hash match, otherwise the first suffix match. Chunks of
    different sources are never mixed in one response.

    Args:
        source_hash: URL hash, or a trailing part of the source string.
        user_info: Entry resolved from the caller's API key. Must list the
            ``"read"`` permission.

    Returns:
        A dict with ``source_url``, ``title``, ``chunks`` (id, preview of up
        to 300 characters, content length), ``source_type`` and
        ``indexed_at``.

    Raises:
        HTTPException: 403 without the ``"read"`` permission; 404 when no
            source matches.
    """
    # Check permissions
    if "read" not in user_info.get("permissions", []):
        raise HTTPException(
            status_code=status.HTTP_403_FORBIDDEN,
            detail="Insufficient permissions to view source details"
        )

    # source -> "hash" | "suffix" | None; decided once per distinct source so
    # the hash is not recomputed for every chunk.
    match_kind = {}
    # source -> details dict, in first-seen order
    collected = {}

    # Search in both vectorstores
    for vectorstore, source_type in [(static_vectorstore, "static"), (dynamic_vectorstore, "dynamic")]:
        if not vectorstore:
            continue
        for doc in vectorstore.docstore._dict.values():
            source = doc.metadata.get("source", "")

            if source not in match_kind:
                if get_url_hash(source) == source_hash:
                    match_kind[source] = "hash"
                elif source.endswith(source_hash):
                    match_kind[source] = "suffix"
                else:
                    match_kind[source] = None
            if match_kind[source] is None:
                continue

            details = collected.setdefault(source, {
                "source_url": None,
                "title": None,
                "chunks": [],
                "source_type": None,
                "indexed_at": None
            })
            # The metadata of the last chunk seen for this source wins.
            details.update({
                "source_url": source_url_map.get(source, source),
                "title": doc.metadata.get("title", ""),
                "source_type": source_type,
                "indexed_at": doc.metadata.get("indexed_at")
            })
            details["chunks"].append({
                "chunk_id": doc.metadata.get("chunk_id", 0),
                "content_preview": doc.page_content[:300] + "..." if len(doc.page_content) > 300 else doc.page_content,
                "content_length": len(doc.page_content)
            })

    if not collected:
        raise HTTPException(
            status_code=status.HTTP_404_NOT_FOUND,
            detail="Source not found"
        )

    # An exact hash match is more specific than a suffix match.
    chosen = next(
        (source for source in collected if match_kind[source] == "hash"),
        next(iter(collected))
    )
    return collected[chosen]

@app.post("/api/v1/index", response_model=IndexResponse)
async def index_urls(
    request: IndexRequest,
    user_info: dict = Depends(verify_api_key)
):
    """Index URLs into the dynamic vector database.

    For every URL in ``request.url`` the content is extracted with
    ``WebScraper``, split with ``text_splitter`` and wrapped into ``Document``
    objects. All new documents are then added to the dynamic FAISS index
    (created on first use), the index is saved to
    ``config.DYNAMIC_FAISS_PATH`` and the BM25 structures are rebuilt.

    A failure on a single URL is recorded in ``failed_url`` and does not stop
    the remaining URLs from being processed.

    Args:
        request: Indexing request; ``request.url`` is iterated as the list of
            URLs to index.
        user_info: Value produced by the ``verify_api_key`` dependency; its
            ``permissions`` list must contain ``"index"``.

    Returns:
        IndexResponse whose ``status`` is ``"success"``, ``"partial_success"``
        or ``"failed"``, together with the indexed URLs, the failed URLs
        (``None`` when there are none) and summary metadata.

    Raises:
        HTTPException: 403 when the ``"index"`` permission is missing,
            500 when updating the vector database fails or an unexpected
            error occurs.
    """
    global dynamic_vectorstore, bm25_dynamic, bm25_docs_dynamic, bm25_sources_dynamic, text_to_docs_dynamic

    # Check permissions
    if "index" not in user_info.get("permissions", []):
        raise HTTPException(
            status_code=status.HTTP_403_FORBIDDEN,
            detail="Insufficient permissions for indexing"
        )

    logger.info(f"Indexing request from {user_info['user']}: {len(request.url)} URLs")

    scraper = WebScraper()
    indexed_urls = []
    failed_urls = []

    try:
        new_documents = []

        for url in request.url:
            try:
                logger.info(f"Processing URL: {url}")

                # Extract content
                result = scraper.extract_content(url)

                if not result['success']:
                    failed_urls.append({
                        "url": url,
                        "error": result['error'],
                        "error_type": "EXTRACTION_FAILED"
                    })
                    continue

                # Split content into chunks
                chunks = text_splitter.split_text(result['content'])

                # One timestamp and one hash per URL, so every chunk of the
                # same source carries identical values.
                indexed_at = datetime.now().isoformat()
                url_hash = get_url_hash(url)

                # Create documents
                for i, chunk in enumerate(chunks):
                    new_documents.append(Document(
                        page_content=chunk,
                        metadata={
                            "source": url,
                            "title": result['title'],
                            "chunk_id": i,
                            "total_chunks": len(chunks),
                            "indexed_at": indexed_at,
                            "url_hash": url_hash
                        }
                    ))

                indexed_urls.append(url)
                logger.info(f"✅ Successfully processed {url} - {len(chunks)} chunks")

            except Exception as e:
                # Per-URL failures are reported in the response, not raised.
                logger.error(f"Error processing {url}: {str(e)}")
                failed_urls.append({
                    "url": url,
                    "error": str(e),
                    "error_type": "PROCESSING_ERROR"
                })

        # Update vector database if we have new documents
        if new_documents:
            try:
                if dynamic_vectorstore is None:
                    # Create new FAISS index
                    dynamic_vectorstore = FAISS.from_documents(new_documents, embedding_model)
                    logger.info("✅ Created new dynamic FAISS index")
                else:
                    # Add to existing index
                    dynamic_vectorstore.add_documents(new_documents)
                    logger.info(f"✅ Added {len(new_documents)} documents to existing index")

                # Save updated index
                dynamic_vectorstore.save_local(config.DYNAMIC_FAISS_PATH)

                # Rebuild BM25 and mappings
                bm25_dynamic, bm25_docs_dynamic, bm25_sources_dynamic, text_to_docs_dynamic = build_bm25_from_vectorstore(dynamic_vectorstore)

                logger.info("✅ Dynamic index updated")

            except Exception as e:
                logger.error(f"Error updating vector database: {str(e)}")
                raise HTTPException(
                    status_code=status.HTTP_500_INTERNAL_SERVER_ERROR,
                    detail=f"Failed to update vector database: {str(e)}"
                ) from e

        # Prepare response
        response_status = "success" if indexed_urls else "failed"
        if indexed_urls and failed_urls:
            response_status = "partial_success"

        metadata = {
            "total_requested": len(request.url),
            "successfully_indexed": len(indexed_urls),
            "failed": len(failed_urls),
            "new_documents_added": len(new_documents),
            "user": user_info["user"]
        }

        return IndexResponse(
            status=response_status,
            indexed_url=indexed_urls,
            failed_url=failed_urls if failed_urls else None,
            metadata=metadata
        )

    except HTTPException:
        # HTTPException is itself an Exception subclass. Without this
        # passthrough the handler below would catch the error raised for a
        # vector-database failure and rewrap it with a garbled detail.
        raise
    except Exception as e:
        logger.error(f"Critical error in indexing: {str(e)}")
        raise HTTPException(
            status_code=status.HTTP_500_INTERNAL_SERVER_ERROR,
            detail=f"Indexing failed: {str(e)}"
        ) from e

@app.post("/api/v1/evaluate", response_model=EvaluationResponse)
async def evaluate_system(
    request: EvaluationRequest,
    user_info: dict = Depends(verify_api_key)
):
    """Run automated evaluation of the RAG system.

    Uses the test cases supplied in the request, or the evaluator's default
    dataset when none are given, and returns the evaluator's overall score,
    detailed results and metrics.

    Raises:
        HTTPException: 403 when the ``"eval"`` permission is missing,
            500 when the evaluation run raises.
    """
    granted = user_info.get("permissions", [])
    if "eval" not in granted:
        raise HTTPException(
            status_code=status.HTTP_403_FORBIDDEN,
            detail="Insufficient permissions for evaluation"
        )

    logger.info(f"Evaluation request from user {user_info['user']}")

    try:
        # Caller-provided cases take precedence; otherwise use the defaults.
        cases = request.test_cases or evaluator.create_test_dataset()

        outcome = await evaluator.run_evaluation(cases)

        return EvaluationResponse(
            overall_score=outcome["overall_score"],
            detailed_results=outcome["detailed_results"],
            metrics=outcome["metrics"]
        )

    except Exception as exc:
        logger.error(f"Error running evaluation: {str(exc)}")
        raise HTTPException(
            status_code=status.HTTP_500_INTERNAL_SERVER_ERROR,
            detail=f"Evaluation failed: {str(exc)}"
        )

@app.get("/api/v1/conversations/{session_id}")
async def get_conversation(
    session_id: str,
    user_info: dict = Depends(verify_api_key)
):
    """Return the stored history of one conversation.

    Looks ``session_id`` up in ``active_conversations`` and returns its
    messages, timestamps and message count.

    Raises:
        HTTPException: 404 when ``session_id`` is not an active conversation.
    """
    # NOTE(review): no permission or ownership check is visible here beyond
    # the API-key dependency; confirm that is intended.
    if session_id in active_conversations:
        record = active_conversations[session_id]
        history = record["messages"]
        return {
            "session_id": session_id,
            "messages": history,
            "created_at": record["created_at"],
            "last_active": record["last_active"],
            "message_count": len(history)
        }

    raise HTTPException(
        status_code=status.HTTP_404_NOT_FOUND,
        detail="Conversation not found"
    )

@app.delete("/api/v1/conversations/{session_id}")
async def delete_conversation(
    session_id: str,
    user_info: dict = Depends(verify_api_key)
):
    """Remove one conversation from ``active_conversations``.

    Returns:
        dict with ``status`` set to ``"deleted"`` and the removed
        ``session_id``.

    Raises:
        HTTPException: 404 when ``session_id`` is not an active conversation.
    """
    # NOTE(review): no permission or ownership check is visible here beyond
    # the API-key dependency; confirm that is intended.
    if session_id in active_conversations:
        del active_conversations[session_id]
        return {"status": "deleted", "session_id": session_id}

    raise HTTPException(
        status_code=status.HTTP_404_NOT_FOUND,
        detail="Conversation not found"
    )

@app.get("/api/v1/conversations")
async def list_conversations(user_info: dict = Depends(verify_api_key)):
    """List every active conversation with a short summary.

    Calls ``clean_old_conversations()`` first, then summarises each entry of
    ``active_conversations``: message count, timestamps and the first 100
    characters of the most recent message (empty string when there are no
    messages).
    """
    # NOTE(review): no permission or ownership check is visible here beyond
    # the API-key dependency; confirm that is intended.
    clean_old_conversations()

    def _summarize(sid, record):
        # Build the summary entry for a single stored conversation.
        history = record["messages"]
        return {
            "session_id": sid,
            "message_count": len(history),
            "created_at": record["created_at"],
            "last_active": record["last_active"],
            "last_message_preview": history[-1]["content"][:100] if history else ""
        }

    summaries = [
        _summarize(sid, record)
        for sid, record in active_conversations.items()
    ]

    return {
        "active_conversations": len(summaries),
        "conversations": summaries
    }

# Progress marker: printed once the route definitions above have executed.
print("✅ API endpoints defined!")
# ========================================
# MAIN EXECUTION
# ========================================

if __name__ == "__main__":
    # Entry point: load the models, print a startup summary, then hand
    # control to the server via start_aws_server().
    print("🔄 Initializing conversational RAG system for AWS...")

    # Initialize models first
    initialize_models()

    # TODO: call create_demo_test_cases() here once that function is included
    # in this file. Until then, say so instead of claiming the test cases
    # are being created.
    print("\n📝 Demo test cases skipped (create_demo_test_cases() is not wired in)")

    # Print startup info
    print("\n🎓 SYSTEM READY FOR DEPLOYMENT!")
    print("✅ All models loaded")
    print("✅ API endpoints configured")
    print("✅ Ready for recruiter demos")

    # Start the AWS server
    start_aws_server()