In [25]:
from langchain_ollama import OllamaLLM, OllamaEmbeddings
from langchain_community.document_loaders import DirectoryLoader, PyPDFLoader
from langchain_text_splitters import RecursiveCharacterTextSplitter
from langchain_chroma import Chroma
from langchain_core.prompts import ChatPromptTemplate
from langchain_core.output_parsers import StrOutputParser
import os

print("Modern LangChain v1+ - Union Budget RAG ready!")

print("Union Budget 2025-26 RAG Analysis")
print("Loading ALL PDFs from data/pdfs/")

Modern LangChain v1+ - Union Budget RAG ready!
Union Budget 2025-26 RAG Analysis
Loading ALL PDFs from data/pdfs/


In [27]:
from langchain_community.document_loaders import PyPDFLoader

# Load just 3 key budget files to understand
pdf_dir = "../data/pdfs/"
key_files = ['Budget_Speech.pdf', 'budget_at_a_glance.pdf', 'allafs.pdf']

docs = []
for pdf_file in key_files:
    pdf_path = os.path.join(pdf_dir, pdf_file)
    if os.path.exists(pdf_path):
        loader = PyPDFLoader(pdf_path)
        file_docs = loader.load()
        docs.extend(file_docs)
        print(f"{pdf_file}: {len(file_docs)} pages loaded")
    else:
        print(f"{pdf_file} not found")

print(f"\nTOTAL: {len(docs)} documents loaded")
print(f"First doc source: {os.path.basename(docs[0].metadata['source']) if docs else 'None'}")
print(f"First doc preview: {docs[0].page_content[:200]}...")

Budget_Speech.pdf: 60 pages loaded
budget_at_a_glance.pdf: 23 pages loaded
allafs.pdf: 18 pages loaded

TOTAL: 101 documents loaded
First doc source: Budget_Speech.pdf
First doc preview: GOVERNMENT OF INDIA
BUDGET 2025-2026
SPEECH
OF
NIRMALA SITHARAMAN
MINISTER OF FINANCE
February 1,  2025...


In [28]:
# breakdown by document

sources = {}
for doc in docs:
    source = os.path.basename(doc.metadata['source'])
    sources[source] = sources.get(source, 0) + 1

print("Pages by document:")
for source, count in sources.items():
    print(f"  {source}: {count} pages")

print(f"\nFirst page length: {len(docs[0].page_content)} characters")
print(f"Sample from page 2:")
print(docs[1].page_content[:200] + "...")

Pages by document:
  Budget_Speech.pdf: 60 pages
  budget_at_a_glance.pdf: 23 pages
  allafs.pdf: 18 pages

First page length: 103 characters
Sample from page 2:
...


In [19]:
# Analyze what we loaded
sources = {}
page_lengths = []
for doc in docs:
    source = os.path.basename(doc.metadata['source'])
    sources[source] = sources.get(source, 0) + 1
    page_lengths.append(len(doc.page_content))

print("Pages by document:")
for source, count in sources.items():
    print(f"  {source}: {count} pages")

print(f"\nStats:")
print(f"  Total pages: {len(docs)}")
print(f"  Avg page length: {sum(page_lengths)/len(page_lengths):.0f} chars")
print(f"  Longest page: {max(page_lengths)} chars")

print(f"\nPage 2 sample (Budget Speech):")
print(docs[1].page_content[:300] + "...")

print(f"\nMetadata example:")
print(f"  Source: {docs[0].metadata['source']}")
print(f"  Keys: {list(docs[0].metadata.keys())}")

Pages by document:
  Budget_Speech.pdf: 60 pages
  budget_at_a_glance.pdf: 23 pages
  allafs.pdf: 18 pages

Stats:
  Total pages: 101
  Avg page length: 1959 chars
  Longest page: 5388 chars

Page 2 sample (Budget Speech):
...

Metadata example:
  Source: ../data/pdfs/Budget_Speech.pdf
  Keys: ['producer', 'creator', 'creationdate', 'author', 'moddate', 'title', 'source', 'total_pages', 'page', 'page_label']


### **Splitting the Text**

In [31]:
from langchain_text_splitters import RecursiveCharacterTextSplitter

print("BEFORE splitting: 101 long pages")
print(f"Average page: {sum(len(d.page_content) for d in docs)//len(docs):,} chars")
print(f"Longest page: {max(len(d.page_content) for d in docs):,} chars")

# Create the splitter (tries to keep paragraphs together)
splitter = RecursiveCharacterTextSplitter(
    chunk_size=1500,        # Max 1500 chars per chunk
    chunk_overlap=200,      # 200 char overlap (context preservation)
    length_function=len,
    separators=["\n\n", "\n", " ", ""]  # Try: paragraphs → sentences → words
)

# SPLIT!
chunks = splitter.split_documents(docs)

print(f"\nAFTER splitting: {len(chunks)} small chunks")
print(f" Average chunk: {sum(len(c.page_content) for c in chunks)//len(chunks):,} chars")
print(f" Max chunk: {max(len(c.page_content) for c in chunks):,} chars")

print(f"\nFIRST CHUNK PREVIEW:")
print(chunks[0].page_content[:400] + "...")
print(f" From: {os.path.basename(chunks[0].metadata['source'])} (page {chunks[0].metadata.get('page', 'N/A')})")

BEFORE splitting: 101 long pages
Average page: 1,959 chars
Longest page: 5,388 chars

AFTER splitting: 191 small chunks
 Average chunk: 1,121 chars
 Max chunk: 1,500 chars

FIRST CHUNK PREVIEW:
GOVERNMENT OF INDIA
BUDGET 2025-2026
SPEECH
OF
NIRMALA SITHARAMAN
MINISTER OF FINANCE
February 1,  2025...
 From: Budget_Speech.pdf (page 0)


### **Generating Embeddings**

Model - *nomic-embed-text*

Context limit - 2k

nomic-embed-text is a large context length text encoder that surpasses OpenAI text-embedding-ada-002 and text-embedding-3-small performance on short and long context tasks.

Source: *https://ollama.com/library/nomic-embed-text*

In [35]:
print(" Creating VECTOR DATABASE from chunks...")
print(f" {len(chunks)} chunks → numerical embeddings")

# Create embeddings using Ollama (local!)
embeddings = OllamaEmbeddings(model="nomic-embed-text")

# Save to your db/ folder
vectorstore = Chroma.from_documents(
    documents=chunks,
    embedding=embeddings,
    persist_directory="../db/union_budget_vectors"  # From notebooks/ → ../db/
)

print(" VECTOR DATABASE SAVED!")
print(f" Location: ../db/union_budget_vectors/")
retriever = vectorstore.as_retriever(search_kwargs={"k": 4})  # Top 4 relevant chunks
print(" Retriever ready - finds most relevant budget chunks!")

 Creating VECTOR DATABASE from chunks...
 191 chunks → numerical embeddings
 VECTOR DATABASE SAVED!
 Location: ../db/union_budget_vectors/
 Retriever ready - finds most relevant budget chunks!


### **Testing with Llama2 Model**

Llama2 - 4k Context Limit

```
TOTAL BUCKET = 4096 tokens (input + output combined

llama2 TOTAL CAPACITY = 4096 tokens (input + output combined)

RAG INPUT consumes:
├── Question:           ~50 tokens    ("What is fiscal deficit?")
├── Prompt:            ~200 tokens    ("Use this context...")
└── 4 retrieved chunks: 1500 tokens  (375 tokens × k=4)
                             
TOTAL INPUT = ~1750 tokens

RAG Input:     1750 tokens (question + 4 chunks + prompt)
Available Output: 2346 tokens MAX  
TOTAL:         4096 tokens ✓
```

In [36]:
from langchain_ollama import OllamaLLM
from langchain_core.prompts import ChatPromptTemplate
from langchain_core.output_parsers import StrOutputParser

# Your llama2 model for answering
llm = OllamaLLM(model="llama2")

# RAG prompt template
template = """You are analyzing India's Union Budget 2025-26. 
Use ONLY the following budget document context to answer.

Context from Budget documents:
{context}

Question: {question}

Answer:"""
prompt = ChatPromptTemplate.from_template(template)

# Build RAG chain (Modern LCEL)
rag_chain = (
    {"context": retriever, "question": lambda x: x}
    | prompt
    | llm
    | StrOutputParser()
)

print("RAG CHAIN BUILT!")
print("Ready to answer Union Budget questions!")
print("\n Examples to try:")
print('  "What is the fiscal deficit target?"')
print('  "What are the 4 engines of development?"') 
print('  "Total expenditure estimate?"')

RAG CHAIN BUILT!
Ready to answer Union Budget questions!

 Examples to try:
  "What is the fiscal deficit target?"
  "What are the 4 engines of development?"
  "Total expenditure estimate?"


In [38]:
# Test your RAG system!
questions = [
    "What is the fiscal deficit target for 2025-26?",
    "What is the total expenditure estimate?",
    "What are the four engines of development?",
    "Key agriculture initiatives mentioned?",
    "Who presented the budget?"
]

print("="*70)
print("UNION BUDGET 2025-26 RAG RESULTS")
print("="*70)

for question in questions:
    answer = rag_chain.invoke(question)
    print(f"\nQ: {question}")
    print(f"{answer[:400]}...")
    print("-" * 50)

UNION BUDGET 2025-26 RAG RESULTS

Q: What is the fiscal deficit target for 2025-26?
The fiscal deficit target for 2025-26 is estimated to be 4.4% of GDP....
--------------------------------------------------

Q: What is the total expenditure estimate?
The total expenditure estimate for the Union Budget 2025-26 is ₹8,607.46 billion (₹8,600 billion + ₹7.46 billion)....
--------------------------------------------------

Q: What are the four engines of development?
Based on the budget documents provided, the four engines of development in India's Union Budget 2025-26 are:

1. Agriculture as the first engine
2. MSMEs as the second engine
3. Investment as the third engine
4. Exports as the fourth engine

Reforms are mentioned as the fuel for these engines, and fiscal policy is also discussed as a key aspect of the budget....
--------------------------------------------------

Q: Key agriculture initiatives mentioned?
The budget document mentions several key agriculture initiatives, includin

In [46]:
from langchain_ollama import OllamaLLM
from langchain_core.prompts import ChatPromptTemplate
from langchain_core.output_parsers import StrOutputParser

# ALL YOUR INSTALLED MODELS
models = {
    "llama2": OllamaLLM(model="llama2", temperature=0.1),
    "gemma3:270m": OllamaLLM(model="gemma3:270m", temperature=0.1),
    "gemma3:1b": OllamaLLM(model="gemma3:1b", temperature=0.1), 
    "qwen2:1.5b": OllamaLLM(model="qwen2:1.5b", temperature=0.1),
    "qwen:1.8b": OllamaLLM(model="qwen:1.8b", temperature=0.1),
    "qwen3-coder:480b-cloud": OllamaLLM(model="qwen3-coder:480b-cloud", temperature=0.1),
    "nomic-embed-text": OllamaLLM(model="nomic-embed-text", temperature=0.1)  # Testing embedding as LLM too
}

# Budget RAG template (same for all)
template = """You are analyzing India's Union Budget 2025-26. 
Use ONLY the following budget document context to answer.

Context from Budget documents:
{context}

Question: {question}

Answer:"""
prompt = ChatPromptTemplate.from_template(template)

# Build ALL chains
chains = {}
for name, llm in models.items():
    chains[name] = (
        {"context": retriever, "question": lambda x: x}
        | prompt
        | llm
        | StrOutputParser()
    )

print("--> ALL 7 MODELS LOADED - FULL RAG COMPARISON!")
print("Models ready:", list(chains.keys()))

--> ALL 7 MODELS LOADED - FULL RAG COMPARISON!
Models ready: ['llama2', 'gemma3:270m', 'gemma3:1b', 'qwen2:1.5b', 'qwen:1.8b', 'qwen3-coder:480b-cloud', 'nomic-embed-text']


In [48]:
import time

# TESTING Models
question = "What is the fiscal deficit target for 2025-26?"
print("TIMED MODEL (Fiscal Deficit):\n")

results = {}
for name, chain in chains.items():
    start_time = time.time()
    print(f"---> Testing {name}...", end=" ")
    
    try:
        answer = chain.invoke(question)
        elapsed = time.time() - start_time
        results[name] = {"answer": answer, "time": f"{elapsed:.1f}s"}
        print(f"{elapsed:.1f}s")
        print("Response")  # ← YOUR EXACT WORDING
        print(answer[:250] + "..." if len(answer)>250 else answer)
        print()  # Empty line
    except Exception as e:
        elapsed = time.time() - start_time
        results[name] = {"answer": f"ERROR: {str(e)[:50]}", "time": f"{elapsed:.1f}s"}
        print(f"{elapsed:.1f}s")
        print("Response")
        print(f"ERROR: {str(e)[:50]}")
        print()

# RESULTS TABLE (still there for summary)
print("═"*100)
print("FINAL SUMMARY TABLE:")
print("═"*100)
print(f"{'Model':<20} | {'Time':<6} | {'Status'}")
print("─"*100)
for name, data in results.items():
    print(f"{name:<20} | {data['time']:<6} | {'OK' if 'ERROR' not in data['answer'] else '❌ ERROR'}")

TIMED MODEL (Fiscal Deficit):

---> Testing llama2... 12.0s
Response
The fiscal deficit target for 2025-26 is estimated to be 4.4% of GDP.

---> Testing gemma3:270m... 4.8s
Response
113.


---> Testing gemma3:1b... 3.3s
Response
4.4 per cent of GDP.

---> Testing qwen2:1.5b... 4.3s
Response
The fiscal deficit target for 2025-26 is estimated to be 4.4 per cent of GDP.

---> Testing qwen:1.8b... 5.3s
Response
The fiscal deficit target for 2025-26 is estimated at ` 34.96 lakh crore`. This target indicates that the government aims to reduce its fiscal deficit by a significant amount, which in this case is ` 34.96 lakh crore`.

Note: The given information pe...

---> Testing qwen3-coder:480b-cloud... 1.0s
Response
The fiscal deficit target for 2025-26 is estimated to be **4.4 per cent of GDP**.

---> Testing nomic-embed-text... 0.1s
Response
ERROR: "nomic-embed-text" does not support generate (stat

════════════════════════════════════════════════════════════════════════════════════════════