In [1]:
import os
from PyPDF2 import PdfReader

def extract_pdf_text(pdf_path):
    reader = PdfReader(pdf_path)
    num_pages = len(reader.pages)
    text_by_page = []

    for i in range(num_pages):
        page = reader.pages[i]
        text = page.extract_text()
        if text:
            text_by_page.append({
                "pdf_name": os.path.basename(pdf_path),
                "page_number": i + 1,
                "text": text
            })
    
    return text_by_page

# Paths to your 5 datasets
pdf_files = [
    r'C:\Users\HONER\Downloads\Dataset1.pdf',
    r'C:\Users\HONER\Downloads\Document2.pdf',
    r'C:\Users\HONER\Downloads\Document3.pdf',
    r'C:\Users\HONER\Downloads\Document4.pdf',
    r'C:\Users\HONER\Downloads\Document5.pdf',
    r'C:\Users\HONER\Downloads\Arogya Sanjeevani Policy - CIN - U10200WB1906GOI001713 1.pdf'
]

# Extract text from all five PDFs
all_text = []
for pdf_path in pdf_files:
    extracted = extract_pdf_text(pdf_path)
    all_text.extend(extracted)

# ✅ Display a preview of each page's extracted text
for doc in all_text[:6]:  # Show only first 5 pages for preview
    print(f"{doc['pdf_name']} | Page {doc['page_number']}")
    print(doc['text'][:500])  # Show first 500 characters
    print("-" * 50)


Dataset1.pdf | Page 1
    
 
   
 
UIN- BAJHLIP23020V012223                                  Global Health Care/ Policy Wordings/Page 1 
 
 
Bajaj Allianz General Insurance C o. Ltd.                       
Bajaj Allianz House, Airport Road, Yerawada, Pune - 411 006. Reg. No.: 113  
For more details, log on to: www.bajajallianz.com | E -mail: bagichelp@bajajallianz.co.in or  
Call at: Sales - 1800 209 0144 / Service - 1800 209 5858 (T oll Free No.)  
Issuing Office:  
 GLOBAL HEALTH CARE  
  
Policy Wordings  
 
UIN- 
--------------------------------------------------
Dataset1.pdf | Page 2
    
 
   
 
UIN- BAJHLIP23020V012223                                  Global Health Care/ Policy Wordings/Page 2 
 
 
Bajaj Allianz General Insurance C o. Ltd.                       
Bajaj Allianz House, Airport Road, Yerawada, Pune - 411 006. Reg. No.: 113  
For more details, log on to: www.bajajallianz.com | E -mail: bagichelp@bajajallianz.co.in or  
Call at: Sales - 1800 209 0144 / Service - 1800 

In [2]:
import re

def clean_text(text):
    """
    Clean the extracted text:
    - Normalize spaces and bullets
    - Remove unnecessary newlines
    """
    text = re.sub(r'\n+', ' ', text)                      # Remove extra line breaks
    text = re.sub(r'\s+', ' ', text)                      # Normalize whitespaces
    text = re.sub(r'[•▪‣-]', '-', text)                   # Normalize bullets
    text = text.strip()                                   # Trim leading/trailing spaces
    return text

def preprocess_documents(documents):
    """
    Applies `clean_text` to every document entry.
    Input: List of dicts from PDF extraction
    Output: List of cleaned text with metadata
    """
    cleaned_docs = []
    for doc in documents:
        cleaned = clean_text(doc["text"])
        cleaned_docs.append({
            "pdf_name": doc["pdf_name"],
            "page_number": doc["page_number"],
            "cleaned_text": cleaned
        })
    return cleaned_docs


In [3]:
preprocessed_data = preprocess_documents(all_text)

# Preview first 3 cleaned documents
for doc in preprocessed_data[:3]:
    print(f"{doc['pdf_name']} | Page {doc['page_number']}")
    print(doc['cleaned_text'][:500])  # Show first 500 characters
    print("-" * 50)


Dataset1.pdf | Page 1
UIN- BAJHLIP23020V012223 Global Health Care/ Policy Wordings/Page 1 Bajaj Allianz General Insurance C o. Ltd. Bajaj Allianz House, Airport Road, Yerawada, Pune - 411 006. Reg. No.: 113 For more details, log on to: www.bajajallianz.com | E -mail: bagichelp@bajajallianz.co.in or Call at: Sales - 1800 209 0144 / Service - 1800 209 5858 (T oll Free No.) Issuing Office: GLOBAL HEALTH CARE Policy Wordings UIN- BAJHLIP23020V012223 SECTION A) PREAMBLE Whereas the Insured described in the Policy Schedule
--------------------------------------------------
Dataset1.pdf | Page 2
UIN- BAJHLIP23020V012223 Global Health Care/ Policy Wordings/Page 2 Bajaj Allianz General Insurance C o. Ltd. Bajaj Allianz House, Airport Road, Yerawada, Pune - 411 006. Reg. No.: 113 For more details, log on to: www.bajajallianz.com | E -mail: bagichelp@bajajallianz.co.in or Call at: Sales - 1800 209 0144 / Service - 1800 209 5858 (T oll Free No.) Issuing Office: GLOBAL HEALTH CARE 5. Cashless Facil

In [17]:
def chunk_text(text, chunk_size=500, overlap=50):
    """
    Splits text into overlapping chunks of tokens (words).
    """
    tokens = text.split()  # split by words
    chunks = []
    start = 0

    while start < len(tokens):
        end = min(start + chunk_size, len(tokens))
        chunk = " ".join(tokens[start:end])
        chunks.append(chunk)
        start += chunk_size - overlap  # step forward with overlap

    return chunks

def chunk_documents(preprocessed_docs):
    """
    Chunks each cleaned document into smaller pieces.
    """
    chunked_data = []

    for doc in preprocessed_docs:
        chunks = chunk_text(doc["cleaned_text"])
        for i, chunk in enumerate(chunks):
            chunked_data.append({
                "pdf_name": doc["pdf_name"],
                "page_number": doc["page_number"],
                "chunk_index": i,
                "text": chunk
            })


In [18]:
def chunk_documents(preprocessed_docs):
    chunked_data = []
    for doc in preprocessed_docs:
        if not doc["cleaned_text"]:
            print(f"⚠️ Skipping empty text from: {doc['pdf_name']} Page: {doc['page_number']}")
            continue
        chunks = chunk_text(doc["cleaned_text"])
        for i, chunk in enumerate(chunks):
            chunked_data.append({
                "pdf_name": doc["pdf_name"],
                "page_number": doc["page_number"],
                "chunk_index": i,
                "text": chunk
            })
    return chunked_data


In [19]:
chunked_data = chunk_documents(preprocessed_data)

# Preview some chunks
for chunk in chunked_data[:3]:
    print(f"{chunk['pdf_name']} | Page {chunk['page_number']} | Chunk {chunk['chunk_index']}")
    print(chunk["text"][:300])  # show first 300 characters
    print("-" * 50)


Dataset1.pdf | Page 1 | Chunk 0
UIN- BAJHLIP23020V012223 Global Health Care/ Policy Wordings/Page 1 Bajaj Allianz General Insurance C o. Ltd. Bajaj Allianz House, Airport Road, Yerawada, Pune - 411 006. Reg. No.: 113 For more details, log on to: www.bajajallianz.com | E -mail: bagichelp@bajajallianz.co.in or Call at: Sales - 1800 
--------------------------------------------------
Dataset1.pdf | Page 1 | Chunk 1
local authorities, wherever applicable, and is under the supervision of a qualified registered AYUSH Medical Practitioner and must comply with all the following criterion: i. Having at least 5 Inpatient beds; ii. Having qualified AYUSH Medical Practitioner in charge round the clock; iii. Having dedi
--------------------------------------------------
Dataset1.pdf | Page 2 | Chunk 0
UIN- BAJHLIP23020V012223 Global Health Care/ Policy Wordings/Page 2 Bajaj Allianz General Insurance C o. Ltd. Bajaj Allianz House, Airport Road, Yerawada, Pune - 411 006. Reg. No.: 113 For more detail

In [20]:
import spacy
from sklearn.preprocessing import normalize

# Load medium-sized spaCy model
nlp = spacy.load("en_core_web_md")

def embed_chunks_local(chunked_data):
    embedded_chunks = []
    for entry in chunked_data:
        doc = nlp(entry["text"])
        vector = doc.vector
        vector = normalize([vector])[0]  # normalize for cosine similarity

        embedded_chunks.append({
            "pdf_name": entry["pdf_name"],
            "page_number": entry["page_number"],
            "chunk_index": entry["chunk_index"],
            "text": entry["text"],
            "embedding": vector.tolist()
        })
    return embedded_chunks


In [21]:
embedded_chunks = embed_chunks_local(chunked_data)

# Show sample embedded chunk info
for chunk in embedded_chunks[:1]:
    print(f"{chunk['pdf_name']} | Page {chunk['page_number']} | Chunk {chunk['chunk_index']}")
    print(f"Vector shape: {len(chunk['embedding'])} values")


Dataset1.pdf | Page 1 | Chunk 0
Vector shape: 300 values


In [22]:
from sklearn.metrics.pairwise import cosine_similarity
import numpy as np

def find_relevant_chunks(query, embedded_chunks, top_k=5):
    query_vector = nlp(query).vector.reshape(1, -1)  # Embed the query
    chunk_vectors = np.array([chunk['embedding'] for chunk in embedded_chunks])  # All chunk vectors

    similarities = cosine_similarity(query_vector, chunk_vectors)[0]  # Cosine similarity

    # Top-k highest similarity indices
    top_indices = similarities.argsort()[-top_k:][::-1]

    results = []
    for idx in top_indices:
        chunk = embedded_chunks[idx]
        results.append({
            "pdf_name": chunk["pdf_name"],
            "page_number": chunk["page_number"],
            "chunk_index": chunk["chunk_index"],
            "similarity": round(similarities[idx], 3),
            "text": chunk["text"]
        })

    return results


In [23]:
query = "Is knee surgery covered under a 3-month-old policy?"

relevant_chunks = find_relevant_chunks(query, embedded_chunks, top_k=5)

for chunk in relevant_chunks:
    print(f"{chunk['pdf_name']} | Page {chunk['page_number']} | Similarity: {chunk['similarity']}")
    print(chunk["text"][:300])
    print("-" * 50)


Document5.pdf | Page 10 | Similarity: 0.877
- The Network Provider /Health Service Provider shall be assigned by Us post receiving Insured Person’s request to avail a health check-up under this Benefit. - Utilisation of this preventive health check-up will not impact the Annual Sum Insured or eligibility for additional sum insured /cumulativ 
--------------------------------------------------
Dataset1.pdf | Page 21 | Similarity: 0.875
UIN- BAJHLIP23020V012223 Global Health Care/ Policy Wordings/Page 21 Bajaj Allianz General Insurance C o. Ltd. Bajaj Allianz House, Airport Road, Yerawada, Pune - 411 006. Reg. No.: 113 For more details, log on to: www.bajajallianz.com | E -mail: bagichelp@bajajallianz.co.in or Call at: Sales - 1800
--------------------------------------------------
Document3.pdf | Page 2 | Similarity: 0.875
Well Baby Well Mother - Add On Wordings Add On Wordings - Well Baby Well Mother Base Product UIN: EDLHLGP21462V032021 Add On UIN: EDLHLGA23009V012223 Edelweiss Genera

In [24]:
from llama_cpp import Llama

# ✅ Update this to your actual full path to the .gguf file
model_path = r"C:\Users\HONER\Models\tinyllama-1.1b-chat-v1.0.Q4_K_M.gguf"


# ✅ Load the model
llm = Llama(model_path=model_path, n_ctx=2048)


ValueError: Model path does not exist: C:\Users\HONER\Models\tinyllama-1.1b-chat-v1.0.Q4_K_M.gguf

In [25]:
def build_llama_prompt(query, relevant_chunks):
    context = "\n\n".join([chunk["text"] for chunk in relevant_chunks])
    prompt = f"""You are an expert assistant. Using the following context, answer the question accurately.

Context:
{context}

Question:
{query}

Answer:"""
    return prompt


In [26]:
def build_prompt(query, relevant_chunks):
    context = "\n\n".join([chunk["text"] for chunk in relevant_chunks])
    prompt = f"""You are an expert assistant. Using the following context, answer the question accurately.

Context:
{context}

Question:
{query}

Answer:"""
    return prompt


In [27]:
response = llm.create(...)

from llama_cpp import Llama

# Update this with your actual .gguf model file path
model_path = r"C:\Users\HONER\Downloads\llama-2-7b.Q2_K.gguf"

llm = Llama(model_path=model_path)

def build_prompt(query, relevant_chunks):
    context = "\n\n".join([chunk["text"] for chunk in relevant_chunks])
    prompt = f"""You are an expert assistant. Using the following context, answer the question accurately.

Context:
{context}

Question:
{query}

Answer:"""
    return prompt

def generate_answer_llama_cpp(prompt, max_tokens=256):
    response = llm.create(
        prompt=prompt,
        max_tokens=max_tokens,
        temperature=0.7,
        top_p=0.9,
        stop=["\n\n"]
    )
    return response['choices'][0]['text'].strip()

# Example usage:
query = "Is knee surgery covered under a 3-month-old policy?"
prompt = build_prompt(query, relevant_chunks)  # Make sure relevant_chunks is already defined
answer = generate_answer_llama_cpp(prompt)
print("Answer:", answer)



AttributeError: 'Llama' object has no attribute 'create'

In [28]:
def build_prompt(query, relevant_chunks, max_chunks=2):
    # Use only the top-N relevant chunks to reduce prompt size
    limited_chunks = relevant_chunks[:max_chunks]
    context = "\n\n".join([chunk["text"] for chunk in limited_chunks])
    prompt = f"""You are an expert assistant. Using the following context, answer the question accurately.

Context:
{context}

Question:
{query}

Answer:"""
    return prompt


In [29]:
prompt = build_prompt(query, relevant_chunks[:1])  # Just 1 chunk


In [30]:
from llama_cpp import Llama

# Path to your downloaded .gguf model
model_path = r"C:\Users\HONER\Downloads\llama-2-7b.Q2_K.gguf"

# Load the model
llm = Llama(model_path=model_path)

# Function to count tokens in a prompt
def count_tokens(prompt):
    return len(llm.tokenize(prompt.encode("utf-8")))

# Function to build a prompt with token safety
def build_prompt(query, relevant_chunks, max_tokens_allowed=500):
    context = ""
    used_chunks = []
    
    for chunk in relevant_chunks:
        candidate = context + "\n\n" + chunk["text"]
        prompt_candidate = f"""You are an expert assistant. Using the following context, answer the question accurately.

Context:
{candidate}

Question:
{query}

Answer:"""
        if count_tokens(prompt_candidate) <= max_tokens_allowed:
            context = candidate
            used_chunks.append(chunk)
        else:
            break

    prompt = f"""You are an expert assistant. Using the following context, answer the question accurately.

Context:
{context}

Question:
{query}

Answer:"""
    return prompt

# Function to generate the answer
def generate_answer_llama_cpp(prompt, max_tokens=256):
    response = llm(
        prompt=prompt,
        max_tokens=max_tokens,
        temperature=0.7,
        top_p=0.9,
        stop=["\n\n"]
    )
    return response['choices'][0]['text'].strip()

# Example usage:
query = "Is knee surgery covered under a 3-month-old policy?"

# `relevant_chunks` should come from your vector search
# Example dummy content (replace with real relevant_chunks from find_relevant_chunks)
relevant_chunks = [
    {"text": "Knee surgery is typically covered under health policies after a waiting period of 2 to 3 years."},
    {"text": "However, if the injury is accidental and not related to pre-existing conditions, it may be covered earlier."},
    {"text": "Policies under 3 months usually don’t cover planned surgeries unless explicitly mentioned in the policy."}
]

# Build safe prompt
prompt = build_prompt(query, relevant_chunks)

# Get model answer
answer = generate_answer_llama_cpp(prompt)
print("Answer:", answer)


llama_model_loader: loaded meta data with 19 key-value pairs and 291 tensors from C:\Users\HONER\Downloads\llama-2-7b.Q2_K.gguf (version GGUF V2)
llama_model_loader: Dumping metadata keys/values. Note: KV overrides do not apply in this output.
llama_model_loader: - kv   0:                       general.architecture str              = llama
llama_model_loader: - kv   1:                               general.name str              = LLaMA v2
llama_model_loader: - kv   2:                       llama.context_length u32              = 4096
llama_model_loader: - kv   3:                     llama.embedding_length u32              = 4096
llama_model_loader: - kv   4:                          llama.block_count u32              = 32
llama_model_loader: - kv   5:                  llama.feed_forward_length u32              = 11008
llama_model_loader: - kv   6:                 llama.rope.dimension_count u32              = 128
llama_model_loader: - kv   7:                 llama.attention.head_count u

Answer: No, knee surgery is not covered under a 3-month-old policy.


In [31]:
query = "Is knee surgery covered under a 3-month-old policy?"
answer = generate_answer_llama_cpp(prompt)
print("Answer:", answer)

Llama.generate: 123 prefix-match hit, remaining 1 prompt tokens to eval
llama_perf_context_print:        load time =   61267.65 ms
llama_perf_context_print: prompt eval time =       0.00 ms /     1 tokens (    0.00 ms per token,      inf tokens per second)
llama_perf_context_print:        eval time =   10604.51 ms /    17 runs   (  623.79 ms per token,     1.60 tokens per second)
llama_perf_context_print:       total time =   10619.86 ms /    18 tokens


Answer: It is not covered under a 3-month-old policy.


In [35]:
query = "46M, knee surgery, Pune, 3-month policy"

answer = generate_answer_llama_cpp(prompt)
print("Answer:", answer)

Llama.generate: 123 prefix-match hit, remaining 1 prompt tokens to eval
llama_perf_context_print:        load time =   61267.65 ms
llama_perf_context_print: prompt eval time =       0.00 ms /     1 tokens (    0.00 ms per token,      inf tokens per second)
llama_perf_context_print:        eval time =   10978.17 ms /    22 runs   (  499.01 ms per token,     2.00 tokens per second)
llama_perf_context_print:       total time =   10992.46 ms /    23 tokens


Answer: No, knee surgery is not covered under a 3-month-old policy.


In [33]:
import json

# Save embedded chunks to a JSON file
with open("embedded_chunks.json", "w", encoding="utf-8") as f:
    json.dump(embedded_chunks, f, ensure_ascii=False, indent=2)
