In [2]:
import os
for dirname, _, filenames in os.walk('/kaggle/input'):
    for filename in filenames:
        print(os.path.join(dirname, filename))

/kaggle/input/prod-rag/Guyton and Hall Textbook of Medical Physiology ( PDFDrive ).pdf


In [None]:
!pip install PyMuPDF 
!pip install tqdm  
!pip install accelerate
!pip install bitsandbytes 
!pip install flash-attn --no-build-isolation 

In [4]:
import fitz
from tqdm.auto import tqdm 


def text_formatter(text:str)->str:
    clean_text = text.replace("\n" , " ").strip()
    return clean_text
def open_read_pdf(pdf_path:str) -> str : 
    doc = fitz.open(pdf_path)
    pages_and_texts = []
    for page_num , page in tqdm(enumerate(doc)) : 
        text = page.get_text()
        text = text_formatter(text)
        pages_and_texts.append({"page_number":page_num-20,
                                "page_char_count":len(text),
                                "page_word_count":len(text.split(" ")),
                                "page_sentence_count_raw":len(text.split('. ')),
                                "page_token_count":len(text)/4 ,# 1 token ~ 4 chars
                                "text":text})
        
    return pages_and_texts    

In [5]:
import random
pages_and_texts = open_read_pdf("/kaggle/input/prod-rag/Guyton and Hall Textbook of Medical Physiology ( PDFDrive ).pdf")
random.sample(pages_and_texts,2)

0it [00:00, ?it/s]

[{'page_number': 582,
  'page_char_count': 3843,
  'page_word_count': 626,
  'page_sentence_count_raw': 33,
  'page_token_count': 960.75,
  'text': 'Unit IX\u2003 The Nervous System: A. General Principles and Sensory Physiology 582 the relay nuclei. This does two things: First, it decreases  lateral spread of the sensory signals into adjacent neurons  and, therefore, increases the degree of sharpness in the sig- nal pattern. Second, it keeps the sensory system operating  in a range of sensitivity that is not so low that the signals  are ineffectual nor so high that the system is swamped  beyond its capacity to differentiate sensory patterns. This  principle of corticofugal sensory control is used by all sen- sory systems, not only the somatic system, as explained in  \xadsubsequent chapters. Segmental Fields of Sensation—Dermatomes Each spinal nerve innervates a “segmental field” of the skin  called a dermatome. The different dermatomes are shown in  Figure 47-14. They are shown in the

## Chunking ==> LLM based chunking

In [None]:
import requests
from typing import List, Dict

HF_API_URL = "https://api-inference.huggingface.co/models/mistralai/Mistral-7B-Instruct"
HF_HEADERS = {"Authorization": " ##### Access Token ##### "}  # paste your token here

def llm_based_chunking(text: str, chunk_size: int = 1000) -> List[Dict]:
    """
    Using Hugging Face hosted LLM to find semantically coherent chunk boundaries
    given a target chunk size.
    """
    
    def get_chunk_boundary(text_segment:str) -> int : 
        prompt = f"""
            Analyze the following text and identify the best point to split it 
            into two semantically coherent parts.
            The split should occur near {chunk_size} characters 
            Text:
            \"\"\"{text_segment}"
            Return only the integer index (character position) within this text
            where the split should occur.Do not return any explanation.
            """
            
        payload = {"inputs": prompt, "parameters": {"max_new_tokens": 20}}
        response = requests.post(HF_API_URL, headers=HF_HEADERS, json=payload)
        result = response.json()

        if isinstance(result, list) and len(result) > 0 and "generated_text" in result[0]:
            split_str = result[0]["generated_text"].strip()
        else:
            split_str = str(chunk_size)

        try:
            split_point = int(split_str)
        except ValueError:
            split_point = chunk_size

        return split_point
    chunks = []
    remaining_text = text 
    while len(remaining_text) > chunk_size : 
        text_window = remaining_text[:chunk_size]
        split_point  = get_chunk_boundary(text_window)
        if split_point < 100 or split_point> len(text_window)-100 : 
            split_point = chunk_size
        chunks.append(remaining_text[:split_point].strip())
        remaining_text = remaining_text[split_point:].strip()
    if remaining_text:
        chunks.append(remaining_text)
    return chunks

def llm_based_chunk_pdf_pages(pages_and_texts:List[Dict],chunk_size:int=1000) -> List[Dict] : 
    all_chunks = []
    for page in tqdm(pages_and_texts,desc="LLM based chunking pages") : 
        page_number = page["page_number"]
        page_text = page["text"]
        
        chunks = llm_based_chunking(page_text,chunk_size)
        for i,chunk in enumerate(chunks) : 
            all_chunks.append({
                "page_number":page_number,
                "chunk_index" : i, 
                "chunk_char_count":  len(chunk),
                "chunk_word_count" : len(chunk.split()),
                "chunk_token_count": len(chunk)/4,
                "chunk_text":chunk
            })
    return all_chunks
llm_chunked_pages = llm_based_chunk_pdf_pages(pages_and_texts,1000)

LLM based chunking pages:   0%|          | 0/1112 [00:00<?, ?it/s]

In [7]:
import pandas as pd
df = pd.DataFrame(llm_chunked_pages)
df.describe().round(2)

Unnamed: 0,page_number,chunk_index,chunk_char_count,chunk_word_count,chunk_token_count
count,5733.0,5733.0,5733.0,5733.0,5733.0
mean,556.98,2.33,896.46,140.7,224.11
std,320.5,1.76,248.84,40.55,62.21
min,-19.0,0.0,1.0,1.0,0.25
25%,279.0,1.0,999.0,144.0,249.75
50%,561.0,2.0,1000.0,154.0,250.0
75%,836.0,4.0,1000.0,162.0,250.0
max,1091.0,7.0,1000.0,263.0,250.0


In [8]:
min_token_len = 30
llm_chunked_pages_min_token_len = df[df["chunk_token_count"]>min_token_len]
llm_chunked_pages_min_token_len.describe().round(2)

Unnamed: 0,page_number,chunk_index,chunk_char_count,chunk_word_count,chunk_token_count
count,5538.0,5538.0,5538.0,5538.0,5538.0
mean,558.77,2.29,926.05,145.34,231.51
std,321.06,1.71,195.72,32.64,48.93
min,-17.0,0.0,122.0,16.0,30.5
25%,280.0,1.0,999.0,145.25,249.75
50%,563.0,2.0,1000.0,155.0,250.0
75%,839.0,4.0,1000.0,162.0,250.0
max,1091.0,7.0,1000.0,263.0,250.0


In [9]:
llm_chunked_pages_min_token_len = llm_chunked_pages_min_token_len.to_dict(orient="records")
llm_chunked_pages_min_token_len[:2]

[{'page_number': -17,
  'chunk_index': 0,
  'chunk_char_count': 275,
  'chunk_word_count': 48,
  'chunk_token_count': 68.75,
  'chunk_text': 'Guyton and Hall  Textbook of Medical Physiology John E. Hall, Ph.D. Arthur C. Guyton Professor and Chair Department of Physiology and Biophysics Associate Vice Chancellor for Research  University of Mississippi Medical Center Jackson, Mississippi T w e l f t h  E d i t i o n'},
 {'page_number': -16,
  'chunk_index': 0,
  'chunk_char_count': 999,
  'chunk_word_count': 138,
  'chunk_token_count': 249.75,
  'chunk_text': '1600 John F. Kennedy Blvd. Ste 1800 Philadelphia, PA 19103-2899 TEXTBOOK OF MEDICAL PHYSIOLOGY\t ISBN: 978-1-4160-4574-8 \x08 International Edition: 978-0-8089-2400-5 Copyright © 2011, 2006, 2000, 1996, 1991, 1986, 1981, 1976, 1966,   1961, 1956 by Saunders, an imprint of Elsevier Inc. All rights reserved. No part of this publication may be reproduced or transmitted in any form   or by any means, electronic or mechanical, including

## Chunks Embedding

In [12]:
from sentence_transformers import SentenceTransformer
embedding_model = SentenceTransformer(model_name_or_path="all-mpnet-base-v2",device="cpu")

2025-09-27 21:10:06.910422: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:477] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
E0000 00:00:1759007407.106664      36 cuda_dnn.cc:8310] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
E0000 00:00:1759007407.158282      36 cuda_blas.cc:1418] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered


modules.json:   0%|          | 0.00/349 [00:00<?, ?B/s]

config_sentence_transformers.json:   0%|          | 0.00/116 [00:00<?, ?B/s]

README.md: 0.00B [00:00, ?B/s]

sentence_bert_config.json:   0%|          | 0.00/53.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/571 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/438M [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/363 [00:00<?, ?B/s]

vocab.txt: 0.00B [00:00, ?B/s]

tokenizer.json: 0.00B [00:00, ?B/s]

special_tokens_map.json:   0%|          | 0.00/239 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/190 [00:00<?, ?B/s]

In [13]:
embeddings = embedding_model.encode(
    [chunk["chunk_text"] for chunk in llm_chunked_pages_min_token_len[:3]]
    )

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

In [14]:
embeddings_dict = dict(zip([chunk["chunk_text"] for chunk in llm_chunked_pages_min_token_len[:3]],embeddings))
for k,v in embeddings_dict.items() : 
    print(f"Text: {k}... \nEmbedding: {v[:20]} ... \n")

Text: Guyton and Hall  Textbook of Medical Physiology John E. Hall, Ph.D. Arthur C. Guyton Professor and Chair Department of Physiology and Biophysics Associate Vice Chancellor for Research  University of Mississippi Medical Center Jackson, Mississippi T w e l f t h  E d i t i o n... 
Embedding: [ 0.01668348 -0.10960329  0.01991736 -0.00495167 -0.01372136  0.00017458
  0.01182713  0.02109351 -0.0028191  -0.00118768 -0.00013009 -0.04651693
  0.07204641 -0.06539473  0.03219455  0.0594519  -0.04491937 -0.00309234
 -0.05016524 -0.02944753] ... 

Text: 1600 John F. Kennedy Blvd. Ste 1800 Philadelphia, PA 19103-2899 TEXTBOOK OF MEDICAL PHYSIOLOGY	 ISBN: 978-1-4160-4574-8 International Edition: 978-0-8089-2400-5 Copyright © 2011, 2006, 2000, 1996, 1991, 1986, 1981, 1976, 1966,   1961, 1956 by Saunders, an imprint of Elsevier Inc. All rights reserved. No part of this publication may be reproduced or transmitted in any form   or by any means, electronic or mechanical, including photocopying, re

In [15]:
from tqdm.auto import tqdm
embedding_model.to("cuda")
for item in tqdm(llm_chunked_pages_min_token_len) : 
    item["embedding"] = embedding_model.encode(item["chunk_text"],show_progress_bar=False)

  0%|          | 0/5538 [00:00<?, ?it/s]

In [16]:
### batched operations for chunks embedding
text_chunks = [item["chunk_text"] for item in llm_chunked_pages_min_token_len]

text_chunk_embeddings = embedding_model.encode(text_chunks,
                                              batch_size=32,
                                              convert_to_tensor=True,
                                              show_progress_bar=False)
text_chunk_embeddings

tensor([[ 1.6683e-02, -1.0960e-01,  1.9917e-02,  ..., -5.3247e-02,
         -3.3039e-02, -1.0273e-02],
        [ 9.2483e-03, -1.0002e-01,  3.0459e-02,  ..., -4.6283e-02,
         -4.5977e-02, -1.9497e-02],
        [ 3.0354e-02, -8.3092e-02,  2.0526e-02,  ..., -1.1177e-02,
         -5.7592e-02, -2.0364e-02],
        ...,
        [ 1.5164e-02,  4.1928e-03,  1.4332e-02,  ..., -2.3856e-02,
         -5.2739e-02, -4.3403e-02],
        [-4.4826e-06,  1.0175e-02, -1.2562e-02,  ..., -3.5670e-02,
          2.6308e-02, -2.3272e-02],
        [ 3.6429e-04, -4.6786e-02,  1.8919e-02,  ..., -1.2068e-02,
         -2.4359e-02, -4.0288e-02]], device='cuda:0')

In [17]:
text_chunk_embeddings.shape

torch.Size([5538, 768])

In [18]:
text_chunks_and_embeddings_df = pd.DataFrame(llm_chunked_pages_min_token_len)
path = "chunks_and_embeddings_df.csv"
text_chunks_and_embeddings_df.to_csv(path,index=False)

In [19]:
## load the csv file 
df = pd.read_csv(path)
df[50:55]

Unnamed: 0,page_number,chunk_index,chunk_char_count,chunk_word_count,chunk_token_count,chunk_text,embedding
50,3,3,999,148,249.75,n constant motion throughout the body. It is t...,[ 4.34761569e-02 -1.06346898e-01 5.12604602e-...
51,3,4,131,22,32.75,"ions plus nutrients for the cells, such as ox...",[ 3.15305516e-02 -8.65034908e-02 1.66664161e-...
52,4,0,1000,143,250.0,Unit I Introduction to Physiology: The Cell a...,[-3.71109601e-03 -1.18160553e-01 7.76186958e-...
53,4,1,1000,157,250.0,"tions. For instance, the lungs provide oxygen ...",[ 1.12578906e-02 -7.30112419e-02 -3.59577709e-...
54,4,2,1000,167,250.0,shows the overall circulation of blood. All t...,[ 9.69573448e-04 -1.71220392e-01 1.23821674e-...


In [20]:
import random 
import numpy as np 
import torch
device = "cuda"
df["embedding"] = df["embedding"].apply(lambda x: np.fromstring(x.strip("[]"),sep=" ")) 
pages_and_chunks = df.to_dict(orient="records")
embeddings = torch.tensor(np.array(df["embedding"].tolist()),dtype=torch.float32).to(device)
embeddings.shape

torch.Size([5538, 768])

In [21]:
embeddings[0][:50]

tensor([ 1.6683e-02, -1.0960e-01,  1.9917e-02, -4.9517e-03, -1.3721e-02,
         1.7462e-04,  1.1827e-02,  2.1093e-02, -2.8191e-03, -1.1877e-03,
        -1.3004e-04, -4.6517e-02,  7.2046e-02, -6.5395e-02,  3.2195e-02,
         5.9452e-02, -4.4919e-02, -3.0924e-03, -5.0165e-02, -2.9447e-02,
         3.4552e-02,  2.9689e-02, -3.0637e-02,  4.3992e-02,  3.5926e-02,
         1.3699e-03,  3.1051e-02, -4.0769e-02, -2.4498e-02, -2.3367e-02,
        -3.0137e-02,  1.6295e-02, -4.6542e-03, -4.2574e-02,  2.1397e-06,
        -4.5017e-02,  1.7824e-02,  5.7495e-02, -1.0390e-02,  9.1406e-03,
         1.9712e-02, -5.3414e-02,  2.7400e-03, -1.1638e-02, -3.1642e-02,
        -2.2854e-02,  8.7050e-03, -7.8209e-03,  1.4728e-02,  1.5848e-02],
       device='cuda:0')

In [22]:
from sentence_transformers import util,SentenceTransformer

embedding_model = SentenceTransformer(model_name_or_path="all-mpnet-base-v2",device=device)

## Retreival Step

In [23]:
query = "What is the role of the sinoatrial node in the heart?"
print(f"Query : {query}")

## embedding the query 
query_emb = embedding_model.encode(query,convert_to_tensor=True,show_progress_bar=False)
embeddings.to(device)
## calculate similarity between the query and the book embeddings
from time import perf_counter as timer 
start_time = timer()
dot_products = util.dot_score(a=query_emb,b=embeddings)[0]
end_time = timer()

print(f"Time taken to get the dot products : {end_time-start_time:.5f} seconds")

## getting the top k results
top_results = torch.topk(dot_products,k=5)
top_results

Query : What is the role of the sinoatrial node in the heart?
Time taken to get the dot products : 0.00037 seconds


torch.return_types.topk(
values=tensor([0.6564, 0.6354, 0.6128, 0.6032, 0.5990], device='cuda:0'),
indices=tensor([576, 527, 577, 597, 603], device='cuda:0'))

In [24]:
import textwrap
def print_wrapped(text,wrap_length=80):
    wrapped_text = textwrap.fill(text,wrap_length)
    print(wrapped_text)

print(f"Query : {query}")
print(f"Results:\n")
for score,idx in zip(top_results[0],top_results[1]):
    print(f"Score :{score:.4f}")
    print(f"Text:")
    print_wrapped(pages_and_chunks[idx]['chunk_text'])
    print(f"page number : {pages_and_chunks[idx]['page_number']}")
    print(f"\n")

Query : What is the role of the sinoatrial node in the heart?
Results:

Score :0.6564
Text:
he heart cham- bers, and the pumping effectiveness of the heart often is
affected severely, even to the extent of causing death. Specialized Excitatory
and Conductive  System of the Heart Figure 10-1 shows the specialized excitatory
and conduc- tive system of the heart that controls cardiac contractions.  The
figure shows the sinus node (also called sinoatrial or  S-A node), in which the
normal rhythmical impulses are  generated; the internodal pathways that conduct
impulses  from the sinus node to the atrioventricular (A-V) node;  the A-V node,
in which impulses from the atria are delayed  before passing into the
ventricles; the A-V bundle, which  conducts impulses from the atria into the
ventricles; and  the left and right bundle branches of Purkinje fibers, which
conduct the cardiac impulses to all parts of the ventricles. Sinus (Sinoatrial)
Node The sinus node (also called sinoatrial node) i

In [27]:
def retreive_relevant_resources(query:str,embeddings:torch.tensor,model:SentenceTransformer=embedding_model,k:int=5,print_time:bool=True) : 
    ## embedding the query
    query_emb = model.encode(query,convert_to_tensor=True,show_progress_bar=False)
    start_time = timer()
    dot_products = util.dot_score(a=query_emb,b=embeddings)[0]
    end_time = timer()
    if print_time : 
        print(f"[INFO] Time taken to get the dot products : {end_time-start_time:.5f} seconds")
    scores,idx = torch.topk(dot_products,k=k)
    return scores,idx

def print_top_results_and_scores(query:str,embeddings:torch.tensor,pages_and_chunks:list[dict]=pages_and_chunks,k:int=5):
    scores,idx = retreive_relevant_resources(query,embeddings,k=k)
    print(f"Query : {query}")
    print(f"Results:\n")
    for score,idx in zip(scores,idx):
        print(f"Score :{score:.4f}")
        print(f"Text:")
        print_wrapped(pages_and_chunks[idx]['chunk_text'])
        print(f"page number : {pages_and_chunks[idx]['page_number']}")
        print(f"\n")

In [28]:
query = "What is the role of hemoglobin in oxygen transport?"
print_top_results_and_scores(query,embeddings,)

[INFO] Time taken to get the dot products : 0.00009 seconds
Query : What is the role of hemoglobin in oxygen transport?
Results:

Score :0.7342
Text:
oxygen as it passes through  the tissue capillaries is called the utilization
coefficient. The  normal value for this is about 25 percent, as is evident from
the preceding discussion—that is, 25 percent of the oxy- genated hemoglobin
gives its oxygen to the tissues. During  strenuous exercise, the utilization
coefficient in the entire  body can increase to 75 to 85 percent. And in local
tissue  areas where blood flow is extremely slow or the metabolic  rate is very
high, utilization coefficients approaching 100  percent have been recorded—that
is, essentially all the oxy- gen is given to the tissues. Effect of Hemoglobin
to “Buffer”  the Tissue Po2 Although hemoglobin is necessary for the  transport
of oxygen to the tissues, it performs another  function essential to life. This
is its function as a “tissue  oxygen buffer” system. That is,

## Generation Step

In [29]:
gpu_memory_gb = round(torch.cuda.get_device_properties(0).total_memory / (2**30))
print(f"Available GPU memory : {gpu_memory_gb} GB")

Available GPU memory : 16 GB


In [30]:
from huggingface_hub import login 
login(token="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")


In [36]:
from transformers import AutoTokenizer, AutoModelForCausalLM
from transformers.utils import is_flash_attn_2_available

from transformers import BitsAndBytesConfig
quantization_config = BitsAndBytesConfig(load_in_4bit=True,bnb_4bit_compute_dtype=torch.float16)
model_id = "google/gemma-2b-it"
use_quantization_config = False
attn_impl = "sdpa" # scaled dot product attention
tokenizer = AutoTokenizer.from_pretrained(model_id)
llm_model = AutoModelForCausalLM.from_pretrained(model_id,
                                                 torch_dtype=torch.float16,
                                                 quantization_config=quantization_config if use_quantization_config else None,
                                                 low_cpu_mem_usage=False,
                                                attn_implementation=attn_impl) 
if not use_quantization_config : 
    llm_model.to(device)

config.json:   0%|          | 0.00/627 [00:00<?, ?B/s]

model.safetensors.index.json:   0%|          | 0.00/13.5k [00:00<?, ?B/s]

Fetching 2 files:   0%|          | 0/2 [00:00<?, ?it/s]

model-00001-of-00002.safetensors:   0%|          | 0.00/4.95G [00:00<?, ?B/s]

model-00002-of-00002.safetensors:   0%|          | 0.00/67.1M [00:00<?, ?B/s]

Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

generation_config.json:   0%|          | 0.00/137 [00:00<?, ?B/s]

In [39]:
print(f"Model device : ",llm_model.device)
llm_model

Model device :  cuda:0


GemmaForCausalLM(
  (model): GemmaModel(
    (embed_tokens): Embedding(256000, 2048, padding_idx=0)
    (layers): ModuleList(
      (0-17): 18 x GemmaDecoderLayer(
        (self_attn): GemmaAttention(
          (q_proj): Linear(in_features=2048, out_features=2048, bias=False)
          (k_proj): Linear(in_features=2048, out_features=256, bias=False)
          (v_proj): Linear(in_features=2048, out_features=256, bias=False)
          (o_proj): Linear(in_features=2048, out_features=2048, bias=False)
        )
        (mlp): GemmaMLP(
          (gate_proj): Linear(in_features=2048, out_features=16384, bias=False)
          (up_proj): Linear(in_features=2048, out_features=16384, bias=False)
          (down_proj): Linear(in_features=16384, out_features=2048, bias=False)
          (act_fn): GELUActivation()
        )
        (input_layernorm): GemmaRMSNorm((2048,), eps=1e-06)
        (post_attention_layernorm): GemmaRMSNorm((2048,), eps=1e-06)
      )
    )
    (norm): GemmaRMSNorm((2048,), 

In [48]:
print(f"Number of params : {sum([param.numel() for param in llm_model.parameters()])}")

Number of params : 2506172416


In [53]:
## testing the model
input_text = "What is the role of hemoglobin in oxygen transport?"
print(f"Input text : {input_text}\n")
dialogue_template = [{
    "role" : "user",
    "content" : input_text
}]

prompt = tokenizer.apply_chat_template(conversation=dialogue_template,
                                      tokenize=False,
                                      add_generation_prompt=True)
print(f"Fromatted prompt : {prompt}")


Input text : What is the role of hemoglobin in oxygen transport?

Fromatted prompt : <bos><start_of_turn>user
What is the role of hemoglobin in oxygen transport?<end_of_turn>
<start_of_turn>model



In [56]:
input_ids = tokenizer(prompt,return_tensors="pt").to("cuda")
print(f"Model input : {input_ids}")

Model input : {'input_ids': tensor([[     2,      2,    106,   1645,    108,   1841,    603,    573,   4731,
            576, 105358,    575,  16175,   6676, 235336,    107,    108,    106,
           2516,    108]], device='cuda:0'), 'attention_mask': tensor([[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]],
       device='cuda:0')}


In [61]:
outputs = llm_model.generate(**input_ids,max_new_tokens=256)
print(f"Model output :\n{outputs[0]}")

Model output :
tensor([     2,      2,    106,   1645,    108,   1841,    603,    573,   4731,
           576, 105358,    575,  16175,   6676, 235336,    107,    108,    106,
          2516,    108,  21404, 235269,   1517, 235303, 235256,    573,   4731,
           576, 105358,    575,  16175,   6676, 235292,    109,    688, 235274,
        235265,  55847,    577,  56134,  66058,  21072, 110865,    603,    476,
          9646,   1942,    575,   3118,   5330,   5999,    674,  76686,    577,
         16175,  24582, 235265,  21072, 110865,    919,   2785, 152987,  25086,
        235269,   1853,    576,    948,    603,  42554,   1280,    476,   7419,
          3049,  11988, 235265,   3766,   7419,   3049,  37879,    708,  20677,
           575,    476,  58228,  23184,   7479, 235269,    675,    573,   2785,
         25086,  43768,   2449,   1853,   1156,    577,   1736,    476,    700,
        106914,   5449, 235265,    109,    688, 235284, 235265,  21072, 110865,
        235290, 104580,  

In [62]:
print(f"Raw text outputs : {tokenizer.decode(outputs[0])}")

Raw text outputs : <bos><bos><start_of_turn>user
What is the role of hemoglobin in oxygen transport?<end_of_turn>
<start_of_turn>model
Sure, here's the role of hemoglobin in oxygen transport:

**1. Binding to Oxygen:** Hemoglobin is a protein found in red blood cells that binds to oxygen molecules. Hemoglobin has four polypeptide chains, each of which is folded into a globin domain. These globin domains are arranged in a tetrahedral shape, with the four chains twisted around each other to form a quaternary structure.

**2. Hemoglobin-Oxygen Complex:** When hemoglobin binds to oxygen, the four polypeptide chains rearrange to form a more compact structure called the hemoglobin-oxygen complex. This complex has a higher affinity for oxygen than the deoxy-form of hemoglobin, which is found in the lungs.

**3. Oxygen Transport:** When the hemoglobin-oxygen complex is bound to oxygen, it is carried from the lungs to the tissues. The oxygen molecules bind to the hemoglobin molecules in the red

In [77]:
from typing import List, Dict, Optional

def prompt_formatter(
    query: str,
    context_items: List[Dict],
    examples: Optional[List[Dict[str, str]]] = None,
    include_metadata: bool = False
) -> str:
    """
    Augments query with text-based context from context_items.

    Args:
        query (str): User query
        context_items (List[Dict]): List of dicts with at least a "sentence_chunk" key
        examples (List[Dict[str, str]], optional): Few-shot examples with "query" and "answer"
        include_metadata (bool, optional): If True, append metadata from context items

    Returns:
        str: Formatted prompt string
    """

    #  Handle case of empty context
    if not context_items:
        context = "No additional context available."
    else:
        formatted_chunks = []
        for item in context_items:
            chunk = item.get("chunk_text", "").strip()
            if include_metadata:
                metadata = []
                if "source" in item:
                    metadata.append(f"Source: {item['source']}")
                if "page" in item:
                    metadata.append(f"Page: {item['page']}")
                meta_str = " | ".join(metadata)
                formatted_chunks.append(f"- {chunk} ({meta_str})" if meta_str else f"- {chunk}")
            else:
                formatted_chunks.append(f"- {chunk}")
        context = "\n".join(formatted_chunks)

    # Default examples (if none provided)
    if examples is None:
        examples = [ 
            { 
                "query": "What are the primary functions of the kidney?", 
                "answer": "The kidneys regulate fluid and electrolyte balance, remove metabolic waste products, control blood pressure via the renin-angiotensin system, and produce hormones such as erythropoietin." 
            }, 
            { 
                "query": "How does the Frank-Starling law regulate cardiac output?", 
                "answer": "The Frank-Starling law states that the stroke volume of the heart increases in response to an increase in venous return (end-diastolic volume). This ensures balance between the output of the right and left ventricles." 
            },
            { 
                "query": "What is the role of surfactant in the lungs?", 
                "answer": "Pulmonary surfactant reduces surface tension within the alveoli, preventing alveolar collapse during expiration and making breathing more efficient." 
            } 
        ]

    #  Format few-shot examples
    example_texts = []
    for i, ex in enumerate(examples, 1):
        example_texts.append(
            f"Example {i}:\n"
            f"Query: {ex['query']}\n"
            f"Answer: {ex['answer']}\n"
        )
    examples_block = "\n".join(example_texts)

    #  Final base prompt
    base_prompt = f"""
<bos><start_od_turn>user
You are an assistant that answers queries using the provided context.

Guidelines:
- First, extract relevant information from the context before answering.
- Be concise but explanatory.
- If the context does not contain the answer, explicitly say so.
- Always ground your response in the context.

Here are examples of the desired answer style:
{examples_block}

Now use the following context items to answer the user query:
{context}

Relevant passages: <extract relevant info from the context here>

User query: {query}
Answer:<end_of_turn>
<start_of_turn>model
"""
    return base_prompt.strip()

In [78]:
query = "What is the role of the sinoatrial (SA) node in cardiac physiology?"
print(f"Query : {query}")
scores,indices = retreive_relevant_resources(query,embeddings)
context_items = [pages_and_chunks[i] for i in indices]
prompt = prompt_formatter(query,context_items)
print(prompt)

Query : What is the role of the sinoatrial (SA) node in cardiac physiology?
[INFO] Time taken to get the dot products : 0.00007 seconds
<bos><start_od_turn>user
You are an assistant that answers queries using the provided context.

Guidelines:
- First, extract relevant information from the context before answering.
- Be concise but explanatory.
- If the context does not contain the answer, explicitly say so.
- Always ground your response in the context.

Here are examples of the desired answer style:
Example 1:
Query: What are the primary functions of the kidney?
Answer: The kidneys regulate fluid and electrolyte balance, remove metabolic waste products, control blood pressure via the renin-angiotensin system, and produce hormones such as erythropoietin.

Example 2:
Query: How does the Frank-Starling law regulate cardiac output?
Answer: The Frank-Starling law states that the stroke volume of the heart increases in response to an increase in venous return (end-diastolic volume). This en

In [79]:
input_ids = tokenizer(prompt,return_tensors="pt").to("cuda")
print(f"Model input : {input_ids}")

Model input : {'input_ids': tensor([[     2,      2, 235322,  ...,    108,    106,   2516]],
       device='cuda:0'), 'attention_mask': tensor([[1, 1, 1,  ..., 1, 1, 1]], device='cuda:0')}


In [86]:
outputs = llm_model.generate(**input_ids,
                             max_new_tokens=256)

In [95]:
out_text = tokenizer.decode(outputs[0])
print(out_text.replace(prompt,''))

<bos>Sure, here's the relevant information from the context about the role of the sinoatrial (SA) node in cardiac physiology:

The sinoatrial (SA) node is a small, flat-tened, ellipsoid strip of specialized cardiac muscle about 3 millimeters wide, 15 millimeters long, and 1 millimeter thick. It is located in the superior posterolateral wall of the right atrium immediately below and slightly lateral to the opening of the superior vena cava. The fibers of this node have almost no contractile muscle filaments and are each only 3 to 5 micrometers in diameter, in contrast to a diameter of 10 to 15 micrometers for the surrounding atrial muscle fibers. However, the SA nodal fibers connect directly with the atrial muscle fibers so that any action potential that begins in the SA node spreads immediately into the atrial muscle wall.<eos>


## All in oe function

In [100]:
def ask(query:str,
       max_new_tokens:int=512,
       format_answer:bool=True,
       return_answer_only:bool=True) : 
    scores,indices = retreive_relevant_resources(query,embeddings)
    context_items = [pages_and_chunks[i] for i in indices]
    prompt = prompt_formatter(query,context_items)
    input_ids = tokenizer(prompt,return_tensors="pt").to("cuda")
    outputs = llm_model.generate(**input_ids,
                             max_new_tokens=max_new_tokens)
    out_text = tokenizer.decode(outputs[0])
    if format_answer : 
        out_text = out_text.replace(prompt,'').replace("<bos>","").replace("<eos>","")
    if return_answer_only : 
        return out_text
    return out_text , context_items
    

In [101]:
print(ask("Describe the structure, location, and functional significance of the atrioventricular (AV) node in cardiac conduction."))

[INFO] Time taken to get the dot products : 0.00008 seconds
Sure, here's the answer to the user's query:

The atrioventricular (AV) node is a specialized structure located in the posterior wall of the right atrium. It is responsible for delaying the propagation of the cardiac impulse from the atria to the ventricles. This delay ensures that the atria have sufficient time to empty their blood into the ventricles before ventricular contraction begins.

The AV node is organized into several parts, including the A-V bundle, the atrial interodal pathway, and the ventricular septum. The A-V bundle is a specialized conductive system that carries the cardiac impulse from the atria to the ventricles. The atrial interodal pathway is a network of fibers that connect the atria to the AV node. The ventricular septum is a thin membrane that separates the atria from the ventricles.

The functional significance of the AV node is to ensure that the cardiac impulse is propagated from the atria to the ve