In [1]:
from langchain_ollama import OllamaLLM, OllamaEmbeddings
from langchain_community.document_loaders import DirectoryLoader, PyPDFLoader
from langchain_text_splitters import RecursiveCharacterTextSplitter
from langchain_chroma import Chroma
from langchain_core.prompts import ChatPromptTemplate
from langchain_core.output_parsers import StrOutputParser
import os

print("Setup Ollama LLM and Embeddings")
print("Loading ALL PDFs")

  from .autonotebook import tqdm as notebook_tqdm


Setup Ollama LLM and Embeddings
Loading ALL PDFs


In [None]:
# Identify ALL corrupted PDFs and calculate percentages
import os
from pathlib import Path
from pypdf import PdfReader
import pandas as pd

pdf_dir = Path("../data/pdf")
all_pdfs = list(pdf_dir.glob("*.pdf"))
print(f"Total PDF files found: {len(all_pdfs)}")

corrupted_files = []
valid_files = []

for pdf_file in all_pdfs:
    try:
        with open(pdf_file, 'rb') as f:
            pdf_reader = PdfReader(f)
            # If we reach here, PDF is readable
            valid_files.append(pdf_file.name)
    except Exception:
        corrupted_files.append(pdf_file.name)

# Create results
print("\nCORRUPTED FILES LIST:")
for i, corrupt_file in enumerate(corrupted_files, 1):
    print(f"{i:3d}. {corrupt_file}")

print(f"\nSUMMARY:")
print(f"Total PDFs:         {len(all_pdfs):4d}")
print(f"Valid PDFs:         {len(valid_files):4d}")
print(f"Corrupted PDFs:     {len(corrupted_files):4d}")
print(f"Corruption rate:    {len(corrupted_files)/len(all_pdfs)*100:.1f}%")

# Save corrupted list to file
with open("corrupted_pdfs.txt", "w") as f:
    f.write("CORRUPTED PDF FILES:\n")
    for corrupt_file in corrupted_files:
        f.write(f"{corrupt_file}\n")
    
print("\nCorrupted files saved to: corrupted_pdfs.txt")

# Load only VALID PDFs
print("\nLoading VALID PDFs only...")
from langchain_community.document_loaders import DirectoryLoader, PyPDFLoader

loader = DirectoryLoader(
    path="../data/pdf",
    glob="**/*.pdf",
    loader_cls=PyPDFLoader,
    show_progress=True,
    silent_errors=True  # Skip corrupted files automatically
)

documents = loader.load()
print(f"Successfully loaded: {len(documents)} pages from valid PDFs")

Total PDF files found: 1076


parsing for Object Streams
incorrect startxref pointer(1)
parsing for Object Streams
parsing for Object Streams
invalid pdf header: b'\r\n\r\n\r'
CAUTION: startxref found while searching for %%EOF. The file might be truncated and some data might not be read.
EOF marker not found
Ignoring wrong pointing object 1 65536 (offset 0)
Ignoring wrong pointing object 15 65536 (offset 0)
incorrect startxref pointer(3)
incorrect startxref pointer(3)
parsing for Object Streams
incorrect startxref pointer(3)
incorrect startxref pointer(4)
parsing for Object Streams
invalid pdf header: b'\n<!DO'
EOF marker not found
Ignoring wrong pointing object 2 65536 (offset 0)
Ignoring wrong pointing object 35 65536 (offset 0)
Ignoring wrong pointing object 55 65536 (offset 0)
Ignoring wrong pointing object 74 65536 (offset 0)
Ignoring wrong pointing object 92 65536 (offset 0)
Ignoring wrong pointing object 112 65536 (offset 0)
Ignoring wrong pointing object 139 65536 (offset 0)
Ignoring wrong pointing object 1


CORRUPTED FILES LIST:
  1. 3P5D3UKXU2R6I2TK4OJSLL6LGIQJ4NY5.pdf
  2. 6HTC5FVAQW3DVHYRD7PVJGBBQS7GRZTL.pdf
  3. DEAHZFTA4CQDFDYMRX2NPJCKEHYPIK2Z.pdf
  4. MDQ4BAARW6OTVBNBQE7BACYBNCCTWQDO.pdf
  5. SW62D5RJMAPDJWHDMA5DLJWWLMSYZE26.pdf

SUMMARY:
Total PDFs:         1076
Valid PDFs:         1071
Corrupted PDFs:        5
Corruption rate:    0.5%

Corrupted files saved to: corrupted_pdfs.txt

Loading VALID PDFs only...


  1%|          | 12/1076 [00:04<08:43,  2.03it/s]parsing for Object Streams
  4%|▍         | 43/1076 [01:06<09:21,  1.84it/s]  incorrect startxref pointer(1)
parsing for Object Streams
Object 47 0 found
  5%|▌         | 54/1076 [01:09<05:53,  2.89it/s]parsing for Object Streams
  6%|▌         | 61/1076 [01:14<13:45,  1.23it/s]Unexpected escaped string: C
Unexpected escaped string: C
  6%|▌         | 63/1076 [01:15<10:28,  1.61it/s]invalid pdf header: b'\r\n\r\n\r'
CAUTION: startxref found while searching for %%EOF. The file might be truncated and some data might not be read.
EOF marker not found
Error loading file ..\data\pdf\3P5D3UKXU2R6I2TK4OJSLL6LGIQJ4NY5.pdf: Stream has ended unexpectedly
  7%|▋         | 75/1076 [02:03<1:17:00,  4.62s/it]Ignoring wrong pointing object 1 65536 (offset 0)
Ignoring wrong pointing object 15 65536 (offset 0)
  8%|▊         | 87/1076 [02:07<16:38,  1.01s/it]  Error loading file ..\data\pdf\4GJGAIUVBMLM3W7O5SV4EKDNKC4DVOCL.pdf: File has not been decrypte

Successfully loaded: 13380 pages from valid PDFs





* Total PDFs:     1076 files (100.0%)
* Fully Valid:    1062 files (98.7%) 
* Problematic:     14 files ( 1.3%)
* Successfully loaded pages: 13011 pages

In [4]:
# Split documents with complete metadata preservation

print("Processing 13,011 pages into smart chunks...")

# Ensure all documents have proper metadata
for doc in documents:
    # PyPDFLoader usually adds these, but ensure they're present
    if 'page' not in doc.metadata:
        doc.metadata['page'] = 1
    if 'source' not in doc.metadata:
        doc.metadata['source'] = 'unknown.pdf'

# Intelligent text splitter with document structure awareness
text_splitter = RecursiveCharacterTextSplitter(
    chunk_size=1000,           # Optimal size for RAG
    chunk_overlap=200,         # Context preservation
    length_function=len,
    add_start_index=True,      # Track exact position in original doc
    separators=["\n\n", "\n", ". ", " ", ""],  # Paragraphs > Sentences > Words
)

chunks = text_splitter.split_documents(documents)
print(f"Total chunks created: {len(chunks)}")

# Verify metadata preservation (first 3 chunks)
print("\nMetadata verification (sample chunks):")
for i in range(min(3, len(chunks))):
    chunk = chunks[i]
    print(f"Chunk {i+1}:")
    print(f"  Filename: {Path(chunk.metadata['source']).name}")
    print(f"  Page: {chunk.metadata['page']}")
    print(f"  Start position: {chunk.metadata.get('start_index', 'N/A')}")
    print(f"  Preview: {chunk.page_content[:150]}...")
    print()

# Create persistent Chroma vector database
print("Generating embeddings and indexing...")
embeddings = OllamaEmbeddings(model="nomic-embed-text")

vectorstore = Chroma.from_documents(
    documents=chunks,
    embedding=embeddings,
    persist_directory="./chroma_db",
    collection_name="pdf_documents"
)

print(f"Vector database complete:")
print(f"- {len(chunks):,} total chunks indexed")
print(f"- Average chunk size: {sum(len(c.page_content) for c in chunks)//len(chunks)} characters")
print(f"- Storage location: ./chroma_db/")
print(f"- Ready for RAG queries")

Processing 13,011 pages into smart chunks...
Total chunks created: 41781

Metadata verification (sample chunks):
Chunk 1:
  Filename: 08036c5a50a93da84c5c45ba468c58159d75281e.pdf
  Page: 0
  Start position: 0
  Preview: THE CENTRE FOR HUMANITARIAN DATA  
 1
DECEMBER 2020
1     Because there are well-established and accepted standards and mechanisms for sharing financi...

Chunk 2:
  Filename: 08036c5a50a93da84c5c45ba468c58159d75281e.pdf
  Page: 0
  Start position: 775
  Preview: risks for crisis-affected people, humanitarian organizations and donors.
• Donors regularly request data from the organizations they fund in order to ...

Chunk 3:
  Filename: 08036c5a50a93da84c5c45ba468c58159d75281e.pdf
  Page: 0
  Start position: 1480
  Preview: limitation.
• Donors and humanitarian organizations can take the following steps to minimize risks while 
maximizing benefits when sharing sensitive d...

Generating embeddings and indexing...
Vector database complete:
- 41,781 total chunks indexed
- 

In [5]:
vectorstore = Chroma.from_documents(
    documents=chunks,
    embedding=embeddings,
    persist_directory="./chroma_db",
    collection_name="pdf_documents"
)

KeyboardInterrupt: 

In [6]:
# Load ChromaDB and test retrieval
print("Loading vector database for queries...")

# Load your created database
embeddings = OllamaEmbeddings(model="nomic-embed-text")
vectorstore = Chroma(
    persist_directory="./chroma_db",
    embedding_function=embeddings,
    collection_name="pdf_documents"
)

# Test retrieval 
retriever = vectorstore.as_retriever(
    search_kwargs={"k": 5}
)

# Test query
query = "Graduation rates of Students with disabilities in higher education institutions"
relevant_docs = retriever.invoke(query)

print(f"Found {len(relevant_docs)} relevant chunks:")
for i, doc in enumerate(relevant_docs):
    print(f"\nChunk {i+1}:")
    print(f"  Source: {Path(doc.metadata['source']).name}")
    print(f"  Page: {doc.metadata['page']}")
    print(f"  Content: {doc.page_content[:200]}...")

print("Ollama LLM and Embeddings setup complete.")
print("Initializing Ollama LLM and Embeddings...")

Loading vector database for queries...
Found 5 relevant chunks:

Chunk 1:
  Source: GGLQFBLQF5BLQKGOXFHVYVU6CRHUFVLU.pdf
  Page: 0
  Content: District enrollment (public and nonpublic school-age students ? with and without disabilities) on the first Wednesday 
in October 1,179
Special education classification rate 11%
Enrollment of preschoo...

Chunk 2:
  Source: GGLQFBLQF5BLQKGOXFHVYVU6CRHUFVLU.pdf
  Page: 0
  Content: District enrollment (public and nonpublic school-age students ? with and without disabilities) on the first Wednesday 
in October 1,179
Special education classification rate 11%
Enrollment of preschoo...

Chunk 3:
  Source: MNMNOYBE6LSE2QDETWHNV3TQWBUOYADX.pdf
  Page: 20
  Content: |                                                                                                                                                   |
| With a disability:                              ...

Chunk 4:
  Source: MNMNOYBE6LSE2QDETWHNV3TQWBUOYADX.pdf
  Page: 20
  Content: |         

In [4]:
# Test RAG with specific document questions
test_questions = [
    "What is the total Farm Bill investment amount mentioned for New Mexico projects?",
    "What irrigation improvements does Mike Sporcic recommend in the Holistic Irrigation Technology program?",
    "How many wild chinook salmon parr were PIT tagged in Idaho streams during July and August 2001?",
    "What was the average parr-to-smolt survival rate to Lower Granite Dam for Idaho and Oregon streams in 2002?",
    "What organization did Rep. John P. Murtha praise for economic development in Cambria and Somerset counties?"
    "How many people attended the JARI annual report meeting on March 27, 2008?",
    "How many organizations populate the HDX Data Grids with reference to 125 data sources?",
    "What percentage of relevant, complete crisis data is available across 27 humanitarian operations locations?",
    "What is the data completeness percentage for Afghanistan and Central African Republic (CAR)?",
    "How many times more are Data Grids datasets downloaded compared to the average HDX dataset?"
]

**LLama Model Testing**

In [None]:
# context_4k_limit_prompt - Perplexity-style for 4K Llama

from langchain_ollama import OllamaLLM
from langchain_core.prompts import ChatPromptTemplate
from langchain_core.output_parsers import StrOutputParser

# Your llama2 model for answering
llm = OllamaLLM(model="llama2")

context_4k_limit_prompt = """
You are analyzing a dataset of multiple PDF documents.

INSTRUCTIONS:
1. Answer using ONLY the provided context - NO external knowledge  
2. Cite sources inline with [1], [2], etc. after each sentence/paragraph
3. List ONLY relevant source filenames at bottom

CONTEXT (from PDF documents):
{context}

QUESTION: {question}

RULES:
- Use [1], [2] etc. immediately after sentences using that chunk
- Extract exact numbers/dates/names from context only
- If not in context: "Not found in provided documents"
- Keep answer concise (under 200 words)

FORMAT REQUIRED:
Answer using ONLY the context above.

Sources:
[1] filename1.pdf
[2] filename2.pdf

Answer:"""

# Format function for citations
def format_docs(docs):
    formatted = []
    for i, doc in enumerate(docs, 1):
        source_name = Path(doc.metadata['source']).name
        formatted.append(f"[{i}] {doc.page_content}\nSource: {source_name} (page {doc.metadata['page']})")
    return "\n\n".join(formatted)

# RAG chain with your prompt
rag_chain_4k = (
    {"context": retriever | format_docs, "question": lambda x: x}
    | ChatPromptTemplate.from_template(context_4k_limit_prompt)
    | llm
    | StrOutputParser()
)

print("Testing RAG with document-specific questions: Llama 4K context limit model")

  from .autonotebook import tqdm as notebook_tqdm


ModuleNotFoundError: No module named 'langchain_core.chains'

In [None]:
# Testing RAG with timing + results storage
import time
from pathlib import Path

# Dictionary to store results for model comparison
model_results = {
    "model": "llama3.2_4k",
    "test_questions": test_questions.copy(),
    "times": [],
    "answers": [],
    "avg_time": None
}

print("Testing RAG performance (Llama 4K context limit)...")
total_time = 0

for i, q in enumerate(test_questions, 1):
    start_time = time.perf_counter()
    
    answer = rag_chain_4k.invoke(q)
    
    end_time = time.perf_counter()
    question_time = end_time - start_time
    
    total_time += question_time
    
    model_results["times"].append(question_time)
    model_results["answers"].append(answer)
    
    print(f"\nQ{i}: {q}")
    print(f"Time: {question_time:.3f}s")
    print(f"Answer: {answer}")
    print("-" * 70)

# Calculate average
avg_time = total_time / len(test_questions)
model_results["avg_time"] = avg_time

print(f"\n---> SUMMARY (Llama 4K):")
print(f"Average response time: {avg_time:.3f} seconds")
print(f"Total time for {len(test_questions)} questions: {total_time:.3f} seconds")
print(f"Individual times: {[f'{t:.3f}s' for t in model_results['times']]}")

# Save for later model comparison
print(f"\nResults saved to model_results dictionary")
print(f"Ready for 32K model comparison...")

Testing RAG performance (Llama 4K context limit)...

Q1: What is the total Farm Bill investment amount mentioned for New Mexico projects?
Time: 6.605s
Answer: The total Farm Bill investment amount mentioned for New Mexico projects is not found in the provided documents.
----------------------------------------------------------------------

Q2: What irrigation improvements does Mike Sporcic recommend in the Holistic Irrigation Technology program?
Time: 13.868s
Answer: Mike Sporcic recommends several irrigation improvements in the Holistic Irrigation Technology program, including:

* Installing high flow structures to improve irrigation efficiency by uniformly applying water with less time and labor (page 3)
* Laser leveling to increase irrigation application uniformity and overall efficiency (page 3)
* Measuring soil moisture to identify appropriate times for irrigation, preventing plant stress conditions, and improving crop yields and profit margins (page 3)

These improvements are ex

In [None]:
# Your llama2 model for answering
qwen_llm = OllamaLLM(model="qwen2.5:1.5b")

In [18]:
print(model_results)

{'model': 'llama3.2_4k', 'test_questions': ['What is the total Farm Bill investment amount mentioned for New Mexico projects?', 'What irrigation improvements does Mike Sporcic recommend in the Holistic Irrigation Technology program?', 'How many wild chinook salmon parr were PIT tagged in Idaho streams during July and August 2001?', 'What was the average parr-to-smolt survival rate to Lower Granite Dam for Idaho and Oregon streams in 2002?', 'What organization did Rep. John P. Murtha praise for economic development in Cambria and Somerset counties?How many people attended the JARI annual report meeting on March 27, 2008?', 'How many organizations populate the HDX Data Grids with reference to 125 data sources?', 'What percentage of relevant, complete crisis data is available across 27 humanitarian operations locations?', 'What is the data completeness percentage for Afghanistan and Central African Republic (CAR)?', 'How many times more are Data Grids datasets downloaded compared to the a

In [10]:
# context_32k_limit_prompt - Enhanced for 32K context window

# Qwen model for answering
qwen_llm = OllamaLLM(model="qwen2.5:1.5b")

context_32k_limit_prompt = """
You are an expert research assistant analyzing a comprehensive dataset of multiple PDF documents.

IMPORTANT INSTRUCTIONS:
1. Answer using ONLY the provided context below - NO external knowledge or training data
2. This dataset contains diverse technical, scientific, and professional PDF documents
3. Cite sources inline with [1], [2], etc. after each relevant sentence/paragraph
4. At the end, list ONLY the source filenames used (no full paths)

EXTENDED CONTEXT (from multiple PDF documents):
{context}

QUESTION: {question}

ANSWERING RULES:
- Use [1], [2], etc. immediately after sentences referencing specific chunks
- Extract precise numbers, dates, names, percentages from context only
- Provide comprehensive answers using all relevant context available
- If information not found: "Not found in provided documents"
- Maintain factual accuracy - quote directly when possible

REQUIRED OUTPUT FORMAT:

Answer your question using ONLY the context above. Be thorough but concise.

Sources used:
[1] filename1.pdf
[2] filename2.pdf
[3] filename3.pdf
etc.

Answer:"""

# RAG chain for 32K model
rag_chain_32k = (
    {"context": retriever | format_docs, "question": lambda x: x}
    | ChatPromptTemplate.from_template(context_32k_limit_prompt)
    | qwen_llm  # Your 32K model
    | StrOutputParser()
)
print("Testing RAG with document-specific questions: Qwen 32K context limit model")

Testing RAG with document-specific questions: Qwen 32K context limit model


In [None]:
# Dictionary to store results for model comparison
model_results = {
    "model": "qwen2.5:1.5b",
    "test_questions": test_questions.copy(),
    "times": [],
    "answers": [],
    "avg_time": None
}

print("Testing RAG performance (qwen2.5:1.5b 32K context limit)...")
total_time = 0

for i, q in enumerate(test_questions, 1):
    start_time = time.perf_counter()
    
    answer = rag_chain_32k.invoke(q)
    
    end_time = time.perf_counter()
    question_time = end_time - start_time
    
    total_time += question_time
    
    model_results["times"].append(question_time)
    model_results["answers"].append(answer)
    
    print(f"\nQ{i}: {q}")
    print(f"Time: {question_time:.3f}s")
    print(f"Answer: {answer}")
    print("-" * 70)

# Calculate average
avg_time = total_time / len(test_questions)
model_results["avg_time"] = avg_time

print(f"\n---> SUMMARY (qwen2.5:1.5b 3K):")
print(f"Average response time: {avg_time:.3f} seconds")
print(f"Total time for {len(test_questions)} questions: {total_time:.3f} seconds")
print(f"Individual times: {[f'{t:.3f}s' for t in model_results['times']]}")

# Save for later model comparison
print(f"\nResults saved to model_results dictionary")
print(f"Ready for 32K model comparison...")

Testing RAG performance (qwen2.5:1.5b 32K context limit)...

Q1: What is the total Farm Bill investment amount mentioned for New Mexico projects?
Time: 4.333s
Answer: Not found in provided documents.
----------------------------------------------------------------------

Q2: What irrigation improvements does Mike Sporcic recommend in the Holistic Irrigation Technology program?
Time: 1.669s
Answer: Mike Sporcic, a conservation advocate with ideas, recommends several irrigation improvements in the Holistic Irrigation Technology (HIT) program. The recommended solutions include high flow turnouts, laser leveling, and measuring soil moisture to identify appropriate times for irrigating crops to prevent plant stress conditions and improve crop yields. These practices reduce evaporation of water, thereby conserving resources.

Sources used:
[1] 2WE4HZPD57KVPL4RQKCB4PUG42SQ2RMJ.pdf
[2] 2WE4HZPD57KVPL4RQKCB4PUG42SQ2RMJ.pdf
[3] 2WE4HZPD57KVPL4RQKCB4PUG42SQ2RMJ.pdf

NOT FOUND IN PROVIDED DOCUMENT

In [27]:
# Testing RAG with Gemma 32K model

gemma_llm = OllamaLLM(model="gemma3:1b")

# RAG chain for 32K gemma model
rag_chain_gemma_32k = (
    {"context": retriever | format_docs, "question": lambda x: x}
    | ChatPromptTemplate.from_template(context_32k_limit_prompt)
    | gemma_llm  # Your 32K model
    | StrOutputParser()
)
print("Testing RAG with document-specific questions: Gemma 32K context limit model")

Testing RAG with document-specific questions: Gemma 32K context limit model


In [29]:
# Dictionary to store results for model comparison
model_results = {
    "model": "gemma3:1b",
    "test_questions": test_questions.copy(),
    "times": [],
    "answers": [],
    "avg_time": None
}

# Cell: RAG with Retrieved Documents Display (Production Version)
def test_rag_with_sources(model_name, llm_model, rag_chain):
    print(f"\n{'='*80}")
    print(f"Testing: {model_name}")
    print(f"{'='*80}")
    
    model_results = {
        "model": model_name,
        "times": [],
        "answers": [],
        "retrieved_docs": [],
        "avg_time": None
    }
    
    total_time = 0
    
    for i, question in enumerate(test_questions, 1):
        print(f"\nQ{i}: {question}")
        
        # Retrieve documents FIRST (before LLM)
        start_retrieve = time.perf_counter()
        retrieved_docs = retriever.invoke(question)
        retrieve_time = time.perf_counter() - start_retrieve
        
        # Show retrieved documents with FULL metadata
        print(f"\nRETRIEVED DOCUMENTS ({len(retrieved_docs)}):")
        for j, doc in enumerate(retrieved_docs[:3], 1):  # Top 3 docs
            print(f"  [{j}] {Path(doc.metadata['source']).name}")
            print(f"     Page: {doc.metadata['page']}")
            print(f"     Preview: {doc.page_content[:150]}...")
            print()
        
        # Generate answer
        start_answer = time.perf_counter()
        answer = rag_chain.invoke(question)
        answer_time = time.perf_counter() - start_answer
        
        total_time += (retrieve_time + answer_time)
        model_results["times"].append(retrieve_time + answer_time)
        model_results["answers"].append(answer)
        model_results["retrieved_docs"].append([
            {"filename": Path(d.metadata['source']).name, "page": d.metadata['page'], "preview": d.page_content[:100]} 
            for d in retrieved_docs[:3]
        ])
        
        print(f"Retrieve: {retrieve_time:.3f}s | Answer: {answer_time:.3f}s | Total: {retrieve_time+answer_time:.3f}s")
        print(f"Answer: {answer}")
        print("-" * 80)
    
    # Summary
    avg_time = total_time / len(test_questions)
    model_results["avg_time"] = avg_time
    print(f"\nSUMMARY {model_name}:")
    print(f"Avg total time: {avg_time:.3f}s | Retrieve+Answer breakdown")
    
    return model_results

# Test Gemma3:1b with full transparency
gemma_results = test_rag_with_sources("gemma3:1b", gemma_llm, rag_chain_gemma_32k)


Testing: gemma3:1b

Q1: What is the total Farm Bill investment amount mentioned for New Mexico projects?

RETRIEVED DOCUMENTS (5):
  [1] KV25SHM4P3UZHVNNEAE3OZSJ23D3BXZH.pdf
     Page: 0
     Preview: in the Enterprise Roadmap. Include a description of how the investment includes or
will achieve programmatic or technical innovation. 
 MIDAS is align...

  [2] V7S4LR4S6BF5I3AFUOXHPTZC4RMCD6ZE.pdf
     Page: 9
     Preview: Exhibit 300: FBI Data Integration and Visualization System (DIVS) (Revision 2) 
Page 10 of 13 
5. Technical Reference Model (TRM) Table: 
To demonstra...

  [3] V7S4LR4S6BF5I3AFUOXHPTZC4RMCD6ZE.pdf
     Page: 8
     Preview: percentages in the column can, but are not required to, add up to 100%. 
 
5. Technical Reference Model (TRM) Table: 
To demonstrate how this major IT...

Retrieve: 0.976s | Answer: 3.709s | Total: 4.685s
Answer: Not found in provided documents
--------------------------------------------------------------------------------

Q2: What irrigation i

### **Results**

| Model | Avg Time | Model Last Update |
|-------|----------|----------|
| Gemma3:1b | 2.019s | 9 Months Ago |
| Qwen2.5:1.5b | 1.735s | 1 Year Ago |
| Llama2 | 12.5s | 56% | 1 Year Ago

## LangChain ColPali Integration

In [None]:
import torch
from pdf2image import convert_from_path
from colpali_engine.models import ColQwen2, ColQwen2Processor
from chromadb import Client
import numpy as np

# Run on CPU (avoids offload issues)
model = ColQwen2.from_pretrained(
    "vidore/colqwen2-v0.1", 
    device_map="cpu",
    torch_dtype=torch.float32
)
processor = ColQwen2Processor.from_pretrained("vidore/colqwen2-v0.1")

  from .autonotebook import tqdm as notebook_tqdm
`torch_dtype` is deprecated! Use `dtype` instead!
Fetching 2 files: 100%|██████████| 2/2 [00:00<?, ?it/s]
Loading checkpoint shards: 100%|██████████| 2/2 [00:00<00:00,  3.74it/s]
The image processor of type `Qwen2VLImageProcessor` is now loaded as a fast processor by default, even if the model checkpoint was saved with a slow processor. This is a breaking change and may produce slightly different outputs. To continue using the slow processor, instantiate this class with `use_fast=False`. Note that this behavior will be extended to all models in a future release.


In [None]:
from chromadb import PersistentClient

# FIXED ChromaDB setup
chroma_client = PersistentClient(path="./chroma_colpali_db")
collection = chroma_client.get_or_create_collection("kaggle_colpali_pages")

In [8]:
from chromadb import PersistentClient
from tqdm import tqdm
from pathlib import Path
import numpy as np

pdf_dir = Path("../data/pdf")
all_pdf_paths = list(pdf_dir.glob("*.pdf"))
print(f"Found {len(all_pdf_paths)} Kaggle PDFs")

chroma_client = PersistentClient(path="./chroma_colpali_db")
collection = chroma_client.get_or_create_collection("visual_pages")

batch_size = 5
for batch_start in range(0, len(all_pdf_paths[:20]), batch_size):
    batch_paths = all_pdf_paths[batch_start:batch_start+batch_size]
    batch_embeddings = []
    batch_ids = []
    batch_metadata = []
    
    print(f"\nBatch {batch_start//batch_size + 1}/4")
    
    for pdf_path in batch_paths:
        print(f"Processing {pdf_path.name}")
        pages = convert_from_path(
            str(pdf_path), 
            dpi=224,
            poppler_path=r"C:\poppler\poppler-25.12.0\Library\bin"
        )
        
        for i, page_img in enumerate(pages[:2]):
            print(f"  Page {i+1}", end=" ")
            batch = processor.process_images([page_img]).to("cpu")
            
            with torch.no_grad():
                outputs = model(**batch)
                emb = outputs.mean(dim=1).cpu().numpy()[0]  # FIXED
                batch_embeddings.append(emb.tolist())
            
            print("OK")
            batch_ids.append(f"{pdf_path.stem}_p{i+1}")
            batch_metadata.append({
                "pdf": pdf_path.name,
                "page": i+1
            })
    
    if batch_embeddings:
        collection.add(embeddings=batch_embeddings, ids=batch_ids, metadatas=batch_metadata)
        print(f"Batch complete: {len(batch_embeddings)} pages indexed")

print(f"Final: {collection.count()} pages in ChromaDB")

Found 1076 Kaggle PDFs

Batch 1/4
Processing 08036c5a50a93da84c5c45ba468c58159d75281e.pdf
  Page 1 OK
  Page 2 OK
Processing 0a29925ccc5e6299e132a73325956a3abef6dd26.pdf
  Page 1 OK
  Page 2 OK
Processing 0e21835a42a6df2405496f62647058ff855743c1.pdf
  Page 1 OK
  Page 2 OK
Processing 11613a97cef51ad28635fdd86915e74d94cff227.pdf
  Page 1 OK
  Page 2 OK
Processing 12851f0053449570257ff3dfe552621a8dd63d53.pdf
  Page 1 OK
  Page 2 OK
Batch complete: 10 pages indexed

Batch 2/4
Processing 17815074de3c9f8af9a5051978a72e2a83f1d18d.pdf
  Page 1 OK
  Page 2 OK
Processing 183c0f6a2a452dabc05e8eeba962c7b49ad10a62.pdf
  Page 1 OK
  Page 2 OK
Processing 1dcf57a5007b56254583423ba31107d22459bccf.pdf
  Page 1 OK
  Page 2 OK
Processing 1e8c2332a461a3a142840fa477fa907c66c35dac.pdf
  Page 1 OK
  Page 2 OK
Processing 201ab28823c4a10ca391e800642e9cf381737ed2.pdf
  Page 1 OK
  Page 2 OK
Batch complete: 10 pages indexed

Batch 3/4
Processing 22ZOCVPAF2GSGXR357RM7UT4Z22RS2LH.pdf
  Page 1 OK
Processing 23JCEIK

In [10]:
import torch
print(f"PyTorch: {torch.__version__}")
print(f"CUDA: {torch.cuda.is_available()}")
print(f"GPU: {torch.cuda.get_device_name(0) if torch.cuda.is_available() else 'None'}")

PyTorch: 2.5.1+cu121
CUDA: True
GPU: NVIDIA GeForce RTX 4050 Laptop GPU


## Indexing the Pages as Images with GPU.

In [None]:
print(f"Total VRAM: {torch.cuda.get_device_properties(0).total_memory/1e9:.1f}GB")
print(f"Used: {torch.cuda.memory_allocated()/1e9:.1f}GB")

Total VRAM: 6.4GB
Used: 8.6GB


: 

In [1]:
import torch
print(f"Total VRAM: {torch.cuda.get_device_properties(0).total_memory/1e9:.1f}GB")
print(f"Used: {torch.cuda.memory_allocated()/1e9:.1f}GB")  # MUST be ~0.0GB
print(f"GPU: {torch.cuda.get_device_name(0)}")

Total VRAM: 6.4GB
Used: 0.0GB
GPU: NVIDIA GeForce RTX 4050 Laptop GPU


# Colpali Testing

In [2]:
from colpali_engine.models import ColQwen2, ColQwen2Processor
import torch

# Load ColQwen2 for querying (same as indexing)
model = ColQwen2.from_pretrained(
    "vidore/colqwen2-v0.1",
    device_map="cpu",
    torch_dtype=torch.float16
)
processor = ColQwen2Processor.from_pretrained("vidore/colqwen2-v0.1")

# Create custom embedding function
class ColQwen2Embeddings:
    def embed_query(self, text):
        # Convert text to image (or use text encoder)
        inputs = processor.process_images([text_to_image(text)])  # Adjust
        with torch.no_grad():
            emb = model(**inputs).mean(dim=1).numpy()[0]
        return emb.tolist()
    
    def embed_documents(self, texts):
        return [self.embed_query(t) for t in texts]

# Use with ChromaDB
embeddings = ColQwen2Embeddings()
vectorstore = Chroma(
    persist_directory="chroma_colqwen2_db",
    embedding_function=embeddings,
    collection_name="visual_pages"
)

`torch_dtype` is deprecated! Use `dtype` instead!
Fetching 2 files: 100%|██████████| 2/2 [00:00<?, ?it/s]
Loading checkpoint shards: 100%|██████████| 2/2 [00:07<00:00,  3.77s/it]
The image processor of type `Qwen2VLImageProcessor` is now loaded as a fast processor by default, even if the model checkpoint was saved with a slow processor. This is a breaking change and may produce slightly different outputs. To continue using the slow processor, instantiate this class with `use_fast=False`. Note that this behavior will be extended to all models in a future release.


In [4]:
import torch
from colpali_engine.models import ColQwen2, ColQwen2Processor

# Load model
model = ColQwen2.from_pretrained(
    "vidore/colqwen2-v0.1",
    device_map="cpu",
    torch_dtype=torch.float16
)
processor = ColQwen2Processor.from_pretrained("vidore/colqwen2-v0.1")

class ColQwen2Embeddings:
    def embed_query(self, text):
        """Embed text query using ColQwen2's text encoder"""
        # Use processor to encode text (not images)
        inputs = processor.process_queries([text]).to('cpu')
        
        with torch.no_grad():
            emb = model(**inputs).mean(dim=1).cpu().numpy()[0]
        
        return emb.tolist()
    
    def embed_documents(self, texts):
        return [self.embed_query(t) for t in texts]

# Use with ChromaDB
embeddings = ColQwen2Embeddings()
vectorstore = Chroma(
    persist_directory="chroma_colqwen2_db",
    embedding_function=embeddings,
    collection_name="visual_pages"
)

retriever = vectorstore.as_retriever(search_kwargs={"k": 5})

Fetching 2 files: 100%|██████████| 2/2 [00:00<?, ?it/s]
Loading checkpoint shards: 100%|██████████| 2/2 [00:06<00:00,  3.29s/it]


In [8]:
# context_32k_limit_prompt - Enhanced for 32K context window

# Qwen model for answering
qwen_llm = OllamaLLM(model="qwen2.5:1.5b")

context_32k_limit_prompt = """
You are an expert research assistant analyzing a comprehensive dataset of multiple PDF documents.

IMPORTANT INSTRUCTIONS:
1. Answer using ONLY the provided context below - NO external knowledge or training data
2. This dataset contains diverse technical, scientific, and professional PDF documents
3. Cite sources inline with [1], [2], etc. after each relevant sentence/paragraph
4. At the end, list ONLY the source filenames used (no full paths)

EXTENDED CONTEXT (from multiple PDF documents):
{context}

QUESTION: {question}

ANSWERING RULES:
- Use [1], [2], etc. immediately after sentences referencing specific chunks
- Extract precise numbers, dates, names, percentages from context only
- Provide comprehensive answers using all relevant context available
- If information not found: "Not found in provided documents"
- Maintain factual accuracy - quote directly when possible

REQUIRED OUTPUT FORMAT:

Answer your question using ONLY the context above. Be thorough but concise.

Sources used:
[1] filename1.pdf
[2] filename2.pdf
[3] filename3.pdf
etc.

Answer:"""

# RAG chain for 32K model
rag_chain_32k = (
    {"context": retriever | format_docs, "question": lambda x: x}
    | ChatPromptTemplate.from_template(context_32k_limit_prompt)
    | qwen_llm  # Your 32K model
    | StrOutputParser()
)
print("Testing RAG with document-specific questions: Qwen 32K context limit model")

Testing RAG with document-specific questions: Qwen 32K context limit model


In [11]:
test_questions = [
    '''
        BOSTON - APPLES 
        RED DELICIOUS 
        cartons cell pack 
        U.S. Fcy
    what is this mean?
    ''',
    "How does the Medicare-Approved Drug Discount Card program help people with Medicare save on prescription drug costs, and what key steps should be followed to compare and enroll in the most suitable card?",
    "What is the recommended dosage, route of administration, and indication for NAXCEL Sterile Powder in cattle?",
    "What is the established pre-slaughter withdrawal period for NAXCEL Sterile Powder in cattle, and what are the safe residue concentrations for human food safety?",
    "(***). Who is the filing officer responsible for the Statement of Economic Interests Form 700 Non-Filer Enforcement Referral, and which agency do they represent?",
    "Based only on the figure legends in Figures 1–4, which colors or visual markers correspond to GHO countries and which correspond to HRP countries?"
]

In [13]:
import time
# Dictionary to store results for model comparison
model_results = {
    "model": "qwen2.5:1.5b",
    "test_questions": test_questions.copy(),
    "times": [],
    "answers": [],
    "avg_time": None
}

print("Testing RAG performance (qwen2.5:1.5b 32K context limit)...")
total_time = 0

for i, q in enumerate(test_questions, 1):
    start_time = time.perf_counter()
    
    answer = rag_chain_32k.invoke(q)
    
    end_time = time.perf_counter()
    question_time = end_time - start_time
    
    total_time += question_time
    
    model_results["times"].append(question_time)
    model_results["answers"].append(answer)
    
    print(f"\nQ{i}: {q}")
    print(f"Time: {question_time:.3f}s")
    print(f"Answer: {answer}")
    print("-" * 70)

# Calculate average
avg_time = total_time / len(test_questions)
model_results["avg_time"] = avg_time

print(f"\n---> SUMMARY (qwen2.5:1.5b 3K):")
print(f"Average response time: {avg_time:.3f} seconds")
print(f"Total time for {len(test_questions)} questions: {total_time:.3f} seconds")
print(f"Individual times: {[f'{t:.3f}s' for t in model_results['times']]}")

# Save for later model comparison
print(f"\nResults saved to model_results dictionary")
print(f"Ready for 32K model comparison...")

Testing RAG performance (qwen2.5:1.5b 32K context limit)...

Q1: 
        BOSTON - APPLES 
        RED DELICIOUS 
        cartons cell pack 
        U.S. Fcy
    what is this mean?
    
Time: 69.372s
Answer: In the provided context, there is no clear definition or explanation of what "BOSTON - APPLES - RED DELICIOUS - cartons cell pack U. S. Fcy" means. The sentence appears to be a list without additional information that would clarify its meaning or significance.

Sources used:
[1] filename1.pdf
[2] filename2.pdf
[3] filename3.pdf

Note: The source filenames provided in the instruction did not match any of the PDF documents mentioned, so no actual sources were cited.
----------------------------------------------------------------------

Q2: How does the Medicare-Approved Drug Discount Card program help people with Medicare save on prescription drug costs, and what key steps should be followed to compare and enroll in the most suitable card?
Time: 65.097s
Answer: The Medicare-Approved