In [0]:
from pyspark.sql import SparkSession

# Obtain the SparkSession. Databricks already provides one, so getOrCreate()
# is expected to hand back the existing session rather than build a new one.
# NOTE(review): `spark` is not referenced by any other cell visible in this
# notebook — confirm it is needed before keeping this cell.
spark = SparkSession.builder.getOrCreate()
spark

<pyspark.sql.connect.session.SparkSession at 0xffe1d064b470>

In [0]:
%pip install -qU \
    langchain \
    langchain-community \
    langchain-google-genai \
    langchain-chroma \
    chromadb \
    pymupdf \
    sentence-transformers

# Restart Python so the newly installed packages are importable.
# NOTE(review): no versions are pinned and -U upgrades to whatever is latest
# on each run, so results may differ between runs — consider pinning.
dbutils.library.restartPython()


Note: you may need to restart the kernel using %restart_python or dbutils.library.restartPython() to use updated packages.


In [0]:
%pip install -U langchain-chroma
# NOTE(review): langchain-chroma is already listed in the earlier
# `%pip install -qU ...` cell, so this install looks redundant — confirm
# whether this cell is still needed.
dbutils.library.restartPython()


Collecting langchain-chroma
  Downloading langchain_chroma-1.1.0-py3-none-any.whl.metadata (1.9 kB)
Downloading langchain_chroma-1.1.0-py3-none-any.whl (12 kB)
Installing collected packages: langchain-chroma
Successfully installed langchain-chroma-1.1.0
Note: you may need to restart the kernel using %restart_python or dbutils.library.restartPython() to use updated packages.


In [0]:
# NOTE(review): both install cells earlier in the notebook already end with
# restartPython(); this additional restart looks redundant — confirm before
# removing it.
dbutils.library.restartPython()

In [0]:
import os
import getpass

# Securely set the Gemini API key without echoing it or storing it in the
# notebook source.
# - Only prompt when the key is not already set, so re-running this cell
#   (e.g. during "Run All") does not ask again.
# - Strip surrounding whitespace that copy-paste tends to add.
# - Fail here on an empty key instead of later with an opaque API error.
# NOTE(review): on Databricks a secret scope (dbutils.secrets.get) may be a
# better fit than an interactive prompt — confirm what is available.
if not os.environ.get("GOOGLE_API_KEY", "").strip():
    _gemini_api_key = getpass.getpass(
        "Enter your Google Gemini API Key: "
    ).strip()
    if not _gemini_api_key:
        raise ValueError("GOOGLE_API_KEY is empty; a Gemini API key is required.")
    os.environ["GOOGLE_API_KEY"] = _gemini_api_key
    # Do not leave the secret behind as a notebook-global variable.
    del _gemini_api_key


Enter your Google Gemini API Key:  [REDACTED]

In [0]:
from langchain_community.document_loaders import PyMuPDFLoader
from langchain_text_splitters import RecursiveCharacterTextSplitter

# Path to the PDF (Databricks workspace path).
# NOTE(review): this is an absolute, user-specific path — point it at your own
# copy of the file before running.
PDF_PATH = "/Workspace/Users/chaitanyapp03@gmail.com/Bill sample breakout.pdf"

print(f"📄 Loading PDF: {PDF_PATH}")

# Load the PDF into LangChain documents (reported as "pages" below).
loader = PyMuPDFLoader(PDF_PATH)
docs = loader.load()

# Split text into overlapping chunks for better retrieval.
text_splitter = RecursiveCharacterTextSplitter(
    chunk_size=1000,
    chunk_overlap=200
)

splits = text_splitter.split_documents(docs)

# Fail fast when nothing could be extracted (for example an image-only PDF):
# indexing an empty chunk list downstream would only produce a confusing error.
if not splits:
    raise ValueError(f"No extractable text found in PDF: {PDF_PATH}")

# Add metadata for traceability: a running chunk index and the source path.
for i, doc in enumerate(splits):
    doc.metadata["chunk_id"] = i
    doc.metadata["source"] = PDF_PATH

print(f"✅ Loaded {len(docs)} pages → {len(splits)} chunks created")


📄 Loading PDF: /Workspace/Users/chaitanyapp03@gmail.com/Bill sample breakout.pdf
✅ Loaded 1 pages → 4 chunks created


In [0]:
from langchain_community.embeddings import HuggingFaceEmbeddings

# Sentence-embedding model used to vectorise the chunks (and, through the
# retriever built on the vector store, the queries).
# NOTE(review): the captured output under this cell looks like the tail of a
# deprecation warning for this constructor — confirm and migrate if so.
embeddings = HuggingFaceEmbeddings(model_name="all-MiniLM-L6-v2")


  embeddings = HuggingFaceEmbeddings(


In [0]:
from langchain_chroma import Chroma

# Directory where Chroma writes its on-disk index.
# NOTE(review): /tmp is local driver storage, not DBFS, so it is presumably
# lost when the cluster terminates — use a durable location (for example a
# Unity Catalog volume path) if the index must survive restarts.
PERSIST_DIR = "/tmp/chroma/simple_gemini_rag"

print("Creating persistent Chroma vector store...")

# Deterministic ids make this cell safe to re-run: without explicit ids every
# run would append the same chunks again under fresh random ids, and the
# retriever would start returning duplicates.
# NOTE(review): chunks left over from an earlier run with more chunks are not
# removed — delete the collection first if the document changes.
chunk_ids = [
    f"{doc.metadata.get('source', 'unknown')}::chunk-{doc.metadata.get('chunk_id', i)}"
    for i, doc in enumerate(splits)
]

vectorstore = Chroma.from_documents(
    documents=splits,
    embedding=embeddings,
    ids=chunk_ids,
    collection_name="invoice_rag",
    persist_directory=PERSIST_DIR
)

print("✅ ChromaDB indexed and persisted")


Creating persistent Chroma vector store...
✅ ChromaDB indexed and persisted


In [0]:
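# Alternative to re-indexing: reopen the collection already persisted in
# PERSIST_DIR. Uncomment to use this instead of the Chroma.from_documents
# call in the previous cell.
# vectorstore = Chroma(
#     collection_name="invoice_rag",
#     embedding_function=embeddings,
#     persist_directory=PERSIST_DIR
# )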


In [0]:
import google.generativeai as genai
from langchain_google_genai import ChatGoogleGenerativeAI

# Hand the API key from the environment to the google.generativeai client.
genai.configure(api_key=os.environ["GOOGLE_API_KEY"])

# Optional: uncomment to print the models that support generateContent.
# print("Available Gemini models:")
# for m in genai.list_models():
#     if "generateContent" in m.supported_generation_methods:
#         print(m.name)

# Chat model used by the RAG chain, with temperature set to 0.
llm = ChatGoogleGenerativeAI(model="gemini-2.5-flash", temperature=0)


Available Gemini models:
models/gemini-2.5-flash
models/gemini-2.5-pro
models/gemini-2.0-flash-exp
models/gemini-2.0-flash
models/gemini-2.0-flash-001
models/gemini-2.0-flash-exp-image-generation
models/gemini-2.0-flash-lite-001
models/gemini-2.0-flash-lite
models/gemini-2.0-flash-lite-preview-02-05
models/gemini-2.0-flash-lite-preview
models/gemini-exp-1206
models/gemini-2.5-flash-preview-tts
models/gemini-2.5-pro-preview-tts
models/gemma-3-1b-it
models/gemma-3-4b-it
models/gemma-3-12b-it
models/gemma-3-27b-it
models/gemma-3n-e4b-it
models/gemma-3n-e2b-it
models/gemini-flash-latest
models/gemini-flash-lite-latest
models/gemini-pro-latest
models/gemini-2.5-flash-lite
models/gemini-2.5-flash-image-preview
models/gemini-2.5-flash-image
models/gemini-2.5-flash-preview-09-2025
models/gemini-2.5-flash-lite-preview-09-2025
models/gemini-3-pro-preview
models/gemini-3-pro-image-preview
models/nano-banana-pro-preview
models/gemini-robotics-er-1.5-preview
models/gemini-2.5-computer-use-preview-1

In [0]:
from langchain_core.prompts import ChatPromptTemplate
from langchain_core.output_parsers import StrOutputParser
from langchain_core.runnables import RunnableParallel, RunnablePassthrough

# Retriever over the vector store; each query asks for up to 5 chunks.
retriever = vectorstore.as_retriever(search_kwargs=dict(k=5))

# Render retrieved documents as one context string, each tagged with its page.
def format_docs(docs):
    """Join documents into a single prompt-context string.

    Each document becomes "[Page <page>] <page_content>", where <page> is the
    document's "page" metadata value or "N/A" when that key is absent. The
    entries are separated by a blank line.
    """
    rendered = []
    for doc in docs:
        page_label = doc.metadata.get("page", "N/A")
        rendered.append(f"[Page {page_label}] {doc.page_content}")
    return "\n\n".join(rendered)

# RAG prompt (strict grounding). The template has two placeholders,
# {context} and {question}, and instructs the model to answer only from the
# supplied context and to reply "I don't know." when the answer is not there.
prompt = ChatPromptTemplate.from_template("""
You are a helpful AI assistant.
Answer the question using ONLY the provided context. Reply like you are having a conversation with the user.
If the answer is not in the context, say "I don't know."

Context:
{context}

Question:
{question}
""")

# RAG pipeline. The incoming question is fanned out to two branches:
#   context  -> retriever hits rendered to text by format_docs
#   question -> the raw question, passed through unchanged
# Both fill the prompt, which is sent to the LLM; the reply is parsed to a
# plain string.
rag_chain = (
    RunnableParallel(
        context=retriever | format_docs,
        question=RunnablePassthrough(),
    )
    | prompt
    | llm
    | StrOutputParser()
)


In [0]:
def ask_with_citations(question: str):
    """
    Answer a question from the indexed document and cite the chunks used.

    Args:
        question: Natural-language question to answer.

    Returns:
        A ``(answer, sources)`` tuple. ``answer`` is the model's reply as a
        string. ``sources`` is a text block that starts with a "Sources:"
        header followed by one "- Page ..." line per retrieved chunk.
    """
    # Retrieve once and reuse the same documents for both the prompt context
    # and the citations, so the listed sources are exactly what the model saw
    # (and the query is not embedded and searched twice).
    docs = retriever.invoke(question)
    answer = (prompt | llm | StrOutputParser()).invoke(
        {"context": format_docs(docs), "question": question}
    )

    citation_lines = [
        f"- Page {d.metadata.get('page')} "
        f"(Chunk ID: {d.metadata.get('chunk_id')}) "
        f"from {d.metadata.get('source')}"
        for d in docs
    ]
    if not citation_lines:
        citation_lines = ["- (no chunks retrieved)"]

    # "Sources:" is a header; the entries themselves are newline-separated.
    # (Using the header as the join separator dropped it before the first
    # entry and repeated it between the others.)
    sources_text = "Sources:\n" + "\n".join(citation_lines)

    return answer, sources_text


In [0]:
# Query: total amount due. `response` is the (answer, sources) tuple.
response = ask_with_citations(question="What is the total bill amount?")
response


'The total bill amount is $121.23.'

In [0]:
# Query: payment deadline and late-payment charges.
response = ask_with_citations(
    question="What is the last date to pay the bill and what are the charges for late payment?"
)
response

'The last date to pay your bill is December 21, 2020. If payment is not received by this date, a late payment charge of 1.5% compounded monthly (which is 19.56% per year) will be calculated from the statement date and applied to your account.'

In [0]:
# Query: breakdown of the charges that make up the total.
response = ask_with_citations(question="Can you break down the total bill charges?")
response

"Looking at your statement, the total amount you owe is $121.23. This is entirely made up of your current electricity charges, which are also $121.23.\n\nIt also shows that there was a balance of $99.41 from your previous period, but this amount was received on November 20, 2020, so it's not part of the current amount due."