In [1]:
!pip3 install --upgrade --quiet langchain langchain-community langchain-openai chromadb 
!pip3 install --upgrade --quiet pypdf pandas streamlit python-dotenv

In [None]:
# Import Langchain modules
from langchain.document_loaders import PyPDFLoader
from langchain_community.embeddings import OllamaEmbeddings
from langchain_community.llms import Ollama
from langchain.text_splitter import RecursiveCharacterTextSplitter
#from langchain_openai import OpenAIEmbeddings, ChatOpenAI
from langchain.vectorstores import Chroma
from langchain_core.runnables import RunnablePassthrough
from langchain_core.prompts import ChatPromptTemplate
from langchain_core.pydantic_v1 import BaseModel, Field

# Other modules and packages
import os
import tempfile
import streamlit as st  
import pandas as pd
from dotenv import load_dotenv

In [150]:
# Replace Cell 2's Ollama with:
'''from langchain_openai import ChatOpenAI
os.environ['OPENAI_API_KEY'] = os.getenv('OPENAI_API_KEY')
llm = ChatOpenAI(model="gpt-3.5-turbo")  # Requires OPENAI_API_KEY from Cell 2'''

'from langchain_openai import ChatOpenAI\nos.environ[\'OPENAI_API_KEY\'] = os.getenv(\'OPENAI_API_KEY\')\nllm = ChatOpenAI(model="gpt-3.5-turbo")  # Requires OPENAI_API_KEY from Cell 2'

## Define our LLM

In [151]:
load_dotenv()
os.environ['OPENAI_API_KEY'] = os.getenv('OPENAI_API_KEY')
llm = Ollama(model="gemma3:1b")
llm.invoke("Tell me a joke about cats")

'Why did the cat sit on the computer? \n\nBecause it wanted to keep an eye on the mouse! 😄 \n\n---\n\nWould you like to hear another one? 😊'

## Process PDF document

### Load PDF document

In [152]:
loader = PyPDFLoader("sample_policy.pdf")
pages = loader.load()
pages

[Document(metadata={'producer': 'Microsoft® Word 2021', 'creator': 'Microsoft® Word 2021', 'creationdate': '2025-07-30T18:34:17+05:30', 'author': 'yathi yathish', 'moddate': '2025-07-30T18:34:17+05:30', 'source': 'sample_policy.pdf', 'total_pages': 3, 'page': 0, 'page_label': '1'}, page_content='1. Car Insurance Policy (Sample) \nPolicy Number: CAR2023XYZ123 \nInsured: John Doe \nCoverage Period: 01-Jan-2024 to 31-Dec-2024 \nVehicle Details: \n• Make: Toyota \n• Model: Camry 2022 \n• Registration: DL01AB1234 \nCoverage: \n• Third-Party Liability: ₹15,00,000 \n• Own Damage: ₹8,00,000 \n• Personal Accident Cover: ₹10,00,000 \nExclusions: \n• Damage due to war/nuclear risks \n• Driving under influence of alcohol \n \n2. Health Insurance Policy (Sample) \nPolicy Number: HEALTH2023XYZ456 \nInsured: Jane Smith \nCoverage Period: 01-Feb-2024 to 31-Jan-2025 \nSum Insured: ₹5,00,000 \nCoverage Includes: \n• Hospitalization expenses \n• Pre/post-hospitalization (30/60 days) \n• Day-care procedur

### Split document

In [172]:
text_splitter = RecursiveCharacterTextSplitter(chunk_size=1500,
                                            chunk_overlap=200,
                                            length_function=len,
                                            separators=["\n\n", "\n", " "])
chunks = text_splitter.split_documents(pages)

chunks

[Document(metadata={'producer': 'Microsoft® Word 2021', 'creator': 'Microsoft® Word 2021', 'creationdate': '2025-07-30T18:34:17+05:30', 'author': 'yathi yathish', 'moddate': '2025-07-30T18:34:17+05:30', 'source': 'sample_policy.pdf', 'total_pages': 3, 'page': 0, 'page_label': '1'}, page_content='1. Car Insurance Policy (Sample) \nPolicy Number: CAR2023XYZ123 \nInsured: John Doe \nCoverage Period: 01-Jan-2024 to 31-Dec-2024 \nVehicle Details: \n• Make: Toyota \n• Model: Camry 2022 \n• Registration: DL01AB1234 \nCoverage: \n• Third-Party Liability: ₹15,00,000 \n• Own Damage: ₹8,00,000 \n• Personal Accident Cover: ₹10,00,000 \nExclusions: \n• Damage due to war/nuclear risks \n• Driving under influence of alcohol \n \n2. Health Insurance Policy (Sample) \nPolicy Number: HEALTH2023XYZ456 \nInsured: Jane Smith \nCoverage Period: 01-Feb-2024 to 31-Jan-2025 \nSum Insured: ₹5,00,000 \nCoverage Includes: \n• Hospitalization expenses \n• Pre/post-hospitalization (30/60 days) \n• Day-care procedur

### Create embeddings

In [257]:
def get_embedding_function():
    embeddings = OllamaEmbeddings(model="nomic-embed-text")
    return embeddings

embedding_function = get_embedding_function()
#print(embedding_function)
test_vector = embedding_function.embed_query("cat")
print(len(test_vector))

768


In [258]:
from langchain.evaluation import load_evaluator

evaluator = load_evaluator(evaluator="embedding_distance", 
                            embeddings=embedding_function)

evaluator.evaluate_strings(prediction="Amsterdam", reference="coffeeshop")

{'score': 0.3485219058930902}

In [259]:
evaluator.evaluate_strings(prediction="Paris", reference="coffeeshop")

{'score': 0.34429569728684584}

### Create vector database

In [260]:
import uuid

def create_vectorstore(chunks, embedding_function, vectorstore_path):

    # Create a list of unique ids for each document based on the content
    ids = [str(uuid.uuid5(uuid.NAMESPACE_DNS, doc.page_content)) for doc in chunks]
    
    #print(ids[:5])
    
    # Ensure that only unique docs with unique ids are kept
    unique_ids = set()
    unique_chunks = []
    
    unique_chunks = [] 
    for chunk, id in zip(chunks, ids):     
        if id not in unique_ids:
            print(unique_chunks)       
            unique_ids.add(id)
            unique_chunks.append(chunk) 

    # Create a new Chroma database from the documents
    vectorstore = Chroma.from_documents(documents=unique_chunks, 
                                        ids=list(unique_ids),
                                        embedding=embedding_function, 
                                        persist_directory = vectorstore_path)

    vectorstore.persist()
    
    return vectorstore

In [261]:
# Create vectorstore
vectorstore = create_vectorstore(chunks=chunks, 
                                 embedding_function=embedding_function, 
                                 vectorstore_path="vectorstore_test")

[]
[Document(metadata={'producer': 'Microsoft® Word 2021', 'creator': 'Microsoft® Word 2021', 'creationdate': '2025-07-30T18:34:17+05:30', 'author': 'yathi yathish', 'moddate': '2025-07-30T18:34:17+05:30', 'source': 'sample_policy.pdf', 'total_pages': 3, 'page': 0, 'page_label': '1'}, page_content='1. Car Insurance Policy (Sample) \nPolicy Number: CAR2023XYZ123 \nInsured: John Doe \nCoverage Period: 01-Jan-2024 to 31-Dec-2024 \nVehicle Details: \n• Make: Toyota \n• Model: Camry 2022 \n• Registration: DL01AB1234 \nCoverage: \n• Third-Party Liability: ₹15,00,000 \n• Own Damage: ₹8,00,000 \n• Personal Accident Cover: ₹10,00,000 \nExclusions: \n• Damage due to war/nuclear risks \n• Driving under influence of alcohol \n \n2. Health Insurance Policy (Sample) \nPolicy Number: HEALTH2023XYZ456 \nInsured: Jane Smith \nCoverage Period: 01-Feb-2024 to 31-Jan-2025 \nSum Insured: ₹5,00,000 \nCoverage Includes: \n• Hospitalization expenses \n• Pre/post-hospitalization (30/60 days) \n• Day-care proce

## 2. Query for relevant data

In [285]:
# Load vectorstore
vectorstore = Chroma(persist_directory="vectorstore_test", embedding_function=embedding_function)
# Add after Cell 8
print(f"Vectorstore contains {vectorstore._collection.count()} documents")  # Should be > 0

Vectorstore contains 3 documents


In [288]:
# Create retriever and get relevant chunks
retriever = vectorstore.as_retriever(search_type="similarity")
relevant_chunks = retriever.invoke("what is car insurance policy ?")
relevant_chunks[:1]

[Document(metadata={'producer': 'Microsoft® Word 2021', 'page_label': '1', 'total_pages': 3, 'creator': 'Microsoft® Word 2021', 'page': 0, 'author': 'yathi yathish', 'creationdate': '2025-07-30T18:34:17+05:30', 'source': 'sample_policy.pdf', 'moddate': '2025-07-30T18:34:17+05:30'}, page_content='1. Car Insurance Policy (Sample) \nPolicy Number: CAR2023XYZ123 \nInsured: John Doe \nCoverage Period: 01-Jan-2024 to 31-Dec-2024 \nVehicle Details: \n• Make: Toyota \n• Model: Camry 2022 \n• Registration: DL01AB1234 \nCoverage: \n• Third-Party Liability: ₹15,00,000 \n• Own Damage: ₹8,00,000 \n• Personal Accident Cover: ₹10,00,000 \nExclusions: \n• Damage due to war/nuclear risks \n• Driving under influence of alcohol \n \n2. Health Insurance Policy (Sample) \nPolicy Number: HEALTH2023XYZ456 \nInsured: Jane Smith \nCoverage Period: 01-Feb-2024 to 31-Jan-2025 \nSum Insured: ₹5,00,000 \nCoverage Includes: \n• Hospitalization expenses \n• Pre/post-hospitalization (30/60 days) \n• Day-care procedur

In [294]:
import json

# Get relevant chunks
relevant_chunks = retriever.invoke("car insurance policy coverage details")

# Convert to JSON format
results = []
for chunk in relevant_chunks[:3]:  # Limit to 3 chunks
    results.append({
        "content": chunk.page_content,
        "metadata": {
            "page": chunk.metadata.get("page", None),
            "source": chunk.metadata.get("source", "unknown")
        }
    })

# Pretty print JSON
print(json.dumps(results, indent=2))

[
  {
    "content": "1. Car Insurance Policy (Sample) \nPolicy Number: CAR2023XYZ123 \nInsured: John Doe \nCoverage Period: 01-Jan-2024 to 31-Dec-2024 \nVehicle Details: \n\u2022 Make: Toyota \n\u2022 Model: Camry 2022 \n\u2022 Registration: DL01AB1234 \nCoverage: \n\u2022 Third-Party Liability: \u20b915,00,000 \n\u2022 Own Damage: \u20b98,00,000 \n\u2022 Personal Accident Cover: \u20b910,00,000 \nExclusions: \n\u2022 Damage due to war/nuclear risks \n\u2022 Driving under influence of alcohol \n \n2. Health Insurance Policy (Sample) \nPolicy Number: HEALTH2023XYZ456 \nInsured: Jane Smith \nCoverage Period: 01-Feb-2024 to 31-Jan-2025 \nSum Insured: \u20b95,00,000 \nCoverage Includes: \n\u2022 Hospitalization expenses \n\u2022 Pre/post-hospitalization (30/60 days) \n\u2022 Day-care procedures \nExclusions: \n\u2022 Cosmetic treatments \n\u2022 Pre-existing diseases (for first 2 years) \n \n3. Term Life Insurance Policy (Sample) \nPolicy Number: LIFE2023XYZ789 \nInsured: Robert Johnson \

In [295]:
# Prompt template
PROMPT_TEMPLATE = """
You are an assistant for question-answering tasks.
Use the following pieces of retrieved context to answer
the question. 

{context}

---

Answer the question based on the above context: {question}
"""

## 3. Generate responses

In [296]:
# Concatenate context text
context_text = "\n\n---\n\n".join([doc.page_content for doc in relevant_chunks])

# Create prompt
prompt_template = ChatPromptTemplate.from_template(PROMPT_TEMPLATE)
prompt = prompt_template.format(context=context_text, 
                                question="what is main toopic of the document? ")
print(prompt)

Human: 
You are an assistant for question-answering tasks.
Use the following pieces of retrieved context to answer
the question. 

1. Car Insurance Policy (Sample) 
Policy Number: CAR2023XYZ123 
Insured: John Doe 
Coverage Period: 01-Jan-2024 to 31-Dec-2024 
Vehicle Details: 
• Make: Toyota 
• Model: Camry 2022 
• Registration: DL01AB1234 
Coverage: 
• Third-Party Liability: ₹15,00,000 
• Own Damage: ₹8,00,000 
• Personal Accident Cover: ₹10,00,000 
Exclusions: 
• Damage due to war/nuclear risks 
• Driving under influence of alcohol 
 
2. Health Insurance Policy (Sample) 
Policy Number: HEALTH2023XYZ456 
Insured: Jane Smith 
Coverage Period: 01-Feb-2024 to 31-Jan-2025 
Sum Insured: ₹5,00,000 
Coverage Includes: 
• Hospitalization expenses 
• Pre/post-hospitalization (30/60 days) 
• Day-care procedures 
Exclusions: 
• Cosmetic treatments 
• Pre-existing diseases (for first 2 years) 
 
3. Term Life Insurance Policy (Sample) 
Policy Number: LIFE2023XYZ789 
Insured: Robert Johnson 
Coverag

In [297]:
llm.invoke(prompt)

'According to the provided samples, the main topic of the documents is **Insurance Policies**. \n\nThe context explicitly states: “The documents cover various types of insurance policies, including Car Insurance, Health Insurance, Term Life Insurance, Travel Insurance, and Home Insurance.”'

### Using Langchain Expression Language

In [298]:
def format_docs(docs):
    return "\n\n".join(doc.page_content for doc in docs)

rag_chain = (
            {"context": retriever | format_docs, "question": RunnablePassthrough()}
            | prompt_template
            | llm
        )
rag_chain.invoke("tell about car insurence policy")

'Based on the provided context, here’s the information about the car insurance policy:\n\n**Car Insurance Policy**\n\n*   **Policy Number:** CAR2023XYZ123\n*   **Insured:** John Doe\n*   **Coverage:**\n    *   Third-Party Liability: ₹15,00,000\n    *   Own Damage: ₹8,00,000\n    *   Personal Accident Cover: ₹10,00,000\n\nThe policy covers ₹15,00,000 for third-party liability, ₹8,00,000 for own damage, and ₹10,00,000 for personal accident cover.'

### Generate structured responses

In [299]:
class AnswerWithSources(BaseModel):
    """An answer to the question, with sources and reasoning."""
    answer: str = Field(description="Answer to question")
    sources: str = Field(description="Full direct text chunk from the context used to answer the question")
    reasoning: str = Field(description="Explain the reasoning of the answer based on the sources")
    
class ExtractedInfo(BaseModel):
    """Extracted information about the research article"""
    paper_title: AnswerWithSources
    paper_summary: AnswerWithSources
    publication_year: AnswerWithSources
    paper_authors: AnswerWithSources

### Transform response into a dataframe