<a href="https://colab.research.google.com/github/Anil-Babu-Yadav-Jenige/RAG/blob/main/RAG_Virt.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
# perform google colab installs
import os

# The presence of the COLAB_GPU environment variable is used as the signal that the
# notebook is running in Google Colab; the installs below are skipped otherwise.
if "COLAB_GPU" in os.environ:
    print("[INFO] Running in Google Colab, installing requirements.")
    #!pip install -U torch  #requires torch 2.1.1+ (for efficient sdpa implementation)
    !pip install PyMuPDF # for reading pdfs with python
    !pip install tqdm #for progress bars
    !pip install accelerate # for quatization model loading
    !pip install bitsandbytes # for quantising models (less storage space)
    !pip install flash-attn --no-build-isolation # for faster attention mechanisms = faster LLM inference



In [None]:
!pip install -y torch torchvision torchaudio transformers sentence-transformers
!pip install torch torchvision torchaudio --index-url https://download.pytorch.org/whl/cu121
!pip install -U transformers sentence-transformers

1. Document/Text Processing and Embedding Creation

In [None]:
#Download pdf file
import os
import requests

# get pdf document
pdf_path = "human-nutrition-text.pdf"

# Download the PDF only when it is not already on disk.
if not os.path.exists(pdf_path):
  print("file doesn't exist, downloading...")

  # Direct PDF export of the Human Nutrition open textbook.
  url = "https://pressbooks.oer.hawaii.edu/humannutrition2/open/download?type=pdf"

  # The local file name to save the downloaded file.
  filename = pdf_path

  # A timeout keeps the cell from hanging forever on a stalled connection.
  response = requests.get(url, timeout=60)

  # Only write the file when the server answered with HTTP 200.
  if response.status_code == 200:
      with open(filename, "wb") as file:
          file.write(response.content)
      print(f"file downloaded successfully and saved as {filename}")
  else:
      # The f-prefix was missing here, so the literal "{response.status_code}" was printed.
      print(f"failed to download the file. Status code: {response.status_code}")
else:
  print(f"file {pdf_path} already exists.")



Read the pdf

In [None]:
# Requires !pip install PyMupdf, see pymupdf git hub
!pip install pymupdf
import fitz # (pymupdf, found this is better that pypdf for our use case) loads pymu package
from tqdm.auto import tqdm # for progress bars, requires !pip install tqdm

def text_formatter(text: str) -> str:
  """Flatten a block of extracted text onto a single line.

  Every newline character becomes a space, then leading and trailing
  whitespace is trimmed. Further cleaning steps can be added here.
  """
  single_line = " ".join(text.split("\n"))
  return single_line.strip()

#open pdf and get lines/pages
#note this only focuses on text, rather than images/figures etc
def open_and_read_pdf(pdf_path: str, page_offset: int = 41) -> list[dict]:
  """
  Open a PDF, extract its text page by page, and collect per-page statistics.

  Only plain text is extracted; images and figures are ignored.

  Parameters:
      pdf_path (str): Path to the PDF file to open and read.
      page_offset (int): Value subtracted from the zero-based page index to get the
          reported page number. The default of 41 is the offset this notebook has
          always applied so that numbering lines up with where the book starts.

  Returns:
      list[dict]: One dictionary per page with the keys "page_number",
      "page_char_count", "page_word_count", "page_sentence_count_raw",
      "page_token_count" and "text".
  """
  pages_and_texts = []
  # The context manager closes the document even if extraction fails part-way.
  with fitz.open(pdf_path) as doc:
      # enumerate() hides the length, so pass total= to get a real progress bar.
      for page_index, page in tqdm(enumerate(doc), total=len(doc)):
          text = text_formatter(page.get_text())  # plain text with newlines flattened
          pages_and_texts.append({"page_number": page_index - page_offset,
                                  "page_char_count": len(text),
                                  "page_word_count": len(text.split()),
                                  # Raw count: number of pieces after splitting on "."
                                  "page_sentence_count_raw": len(text.split(".")),
                                  "page_token_count": len(text) / 4,  # 1 token =~ 4 chars
                                  "text": text})
  return pages_and_texts

# Parse the downloaded PDF once; later cells reuse pages_and_texts.
pages_and_texts = open_and_read_pdf(pdf_path=pdf_path)
pages_and_texts[:2] # show the first two page dictionaries (a slice, not a random pick)

Now let's get a random sample of the pages.

In [None]:
import random

# No seed is set before this call, so a different trio of pages is shown on each run.
random.sample(pages_and_texts, k=3)

Get some statistics

In [None]:
import pandas as pd

# One row per page dictionary; columns are the per-page counts plus the text.
df = pd.DataFrame(pages_and_texts)
df.head()

Overall statistics

In [None]:
# Summary statistics for the numeric per-page columns.
df.describe().round(2)

# Why look at the number of tokens per page? To check page sizes against the
# context window size.

**RAG Chunking Strategies.**

Step 0: Load packages


Step 1:Document processing


Step 2:Reading the document.

Step 3: Testing 5 chunking strategies: fixed, recursive, semantic, structural, and LLM-based.

**Chunking strategy 1: Fixed-size chunking.**

In [None]:
def chunk_text(text: str, chunk_size: int = 500) -> list:
    """
    Split text into chunks of whole words, each at most ``chunk_size`` characters.

    Words are separated on whitespace and re-joined with single spaces. A single
    word longer than ``chunk_size`` is kept intact as its own chunk.

    Parameters:
        text: The text to split.
        chunk_size: Maximum number of characters per chunk.

    Returns:
        A list of chunk strings; empty when ``text`` contains no words.
    """
    chunks = []
    current_words = []
    current_len = 0  # length of " ".join(current_words)

    for word in text.split():
        # Length of the chunk if this word were appended (the +1 is the joining space).
        if current_words:
            candidate_len = current_len + 1 + len(word)
        else:
            candidate_len = len(word)

        # Flush only a non-empty chunk: the original appended an empty string when
        # the very first word was already longer than chunk_size.
        if current_words and candidate_len > chunk_size:
            chunks.append(" ".join(current_words))
            current_words = [word]
            current_len = len(word)
        else:
            # Measuring the joined text (no trailing space) lets a chunk of exactly
            # chunk_size characters fit, which the original rejected.
            current_words.append(word)
            current_len = candidate_len

    # Add the last, partially filled chunk.
    if current_words:
        chunks.append(" ".join(current_words))
    return chunks


def chunk_pdf_pages(pages_and_texts: list, chunk_size: int = 500) -> list[dict]:
    """
    Split the text of every page into fixed-size chunks using chunk_text.

    Returns one dict per chunk with page_number, chunk_index (position of the
    chunk within its page), chunk_char_count, chunk_word_count,
    chunk_token_count (characters / 4) and chunk_text.
    """
    records = []
    for page in pages_and_texts:
        body, number = page["text"], page["page_number"]
        records.extend(
            {
                "page_number": number,
                "chunk_index": position,
                "chunk_char_count": len(piece),
                "chunk_word_count": len(piece.split()),
                "chunk_token_count": len(piece) / 4,  # rough token estimate
                "chunk_text": piece,
            }
            for position, piece in enumerate(chunk_text(body, chunk_size=chunk_size))
        )
    return records

# Example usage: chunk every page at 500 characters and preview the first chunk.
# Note: chunked_pages[0] below raises IndexError if no chunks were produced.
chunked_pages = chunk_pdf_pages(pages_and_texts, chunk_size=500)
print(f"Total chunks: {len(chunked_pages)}")
print(f"First chunk (page {chunked_pages[0]['page_number']}): {chunked_pages[0]['chunk_text'][:200]}...")

Next, we visualize what the individual chunks actually look like.

In [None]:
import random, textwrap # Import necessary modules: 'random' for sampling, 'textwrap' for text formatting

#---Sampling & Pretty Prinnting---
def _scattered_indices(n: int, k: int, jitter_frac: float = 0.08) -> list[int]:
    """Evenly spaced anchors + random jitter -indices scattered across [0, n-1], """
    # Check if we need to select any indices. If k (number of indices) is zero or less, return an empty list.
    if k <= 0:
      return []
    # If we only need one index, pick a single random index from 0 to n-1.
    if k == 1:
      return [random.randrange(n)]
    # Calculate k evenly spaced anchor points across the range [0, n-1].
    anchors = [int(round(i * (n-1) / (k-1))) for i in range(k)]
    out, seen = [], set()
    radius = max(1, int(n * jitter_frac))

    # Iterate through each anchor point to add random jitter.
    for a in anchors:
        # Define the lower (lo) and upper (hi) bounds for the random jitter, constrained by [0, n-1].
        lo, hi = max(0, a- radius), min(n-1, a+ radius)
        j = random.randint(lo, hi)

        # If the jittered index 'j' hasn't been used yet, add it to the output and the 'seen' set.
        if j not in seen:
            out.append(j); seen.add(j)
    # If the process above resulted in fewer than k unique indices (due to jitter overlap), fill the rest randomly.
    while len(out) < k:
        r = random.randrange(n) # Pick a random index from the whole range [0, n-1].
        # If the index hasn't been used, add it to the output.
        if r not in seen:
            out.append(r); seen.add(r)
    return out

def _draw_boxed_chunk(c: dict, wrap_at: int = 96) -> str:
    header = (
        f"Page p{c['page_number']} . idx {c['chunk_index']} | "
        f"chunk {c['chunk_char_count']} . words {c['chunk_word_count']} . ~tokens {c['chunk_token_count']} "
    )
    #wrap body text, avoid breaking long words awkwardly
    wrapped_lines = textwrap.wrap(c["chunk_text"], width=wrap_at, break_long_words=False, replace_whitespace=False
                                  )
    context_width = max( [0, *map(len, wrapped_lines)])
    box_width = max(len(header), context_width + 2) # +2 for side padding

    # Transcribed code starts here:
    top = "╔" + "=" * box_width + "╗"
    hline = "║ " + header.ljust(box_width) + " ║"
    sep = "╠" + "-" * box_width + "╣"
    body = "\n".join(
        "║ " + line.ljust(box_width - 2) + " ║" for line in wrapped_lines) or ("║ " + "".ljust(box_width- 2) + " ║")
    bottom = "╚" + "=" * box_width + "╝"
    return "\n".join([top, hline, sep, body, bottom])
def show_random_chunks(pages_and_texts: list, chunk_size: int = 500, k: int = 5, seed: int | None = 42):
    """Chunk the pages with fixed-size chunking and print ``k`` boxed samples.

    The ``random`` module is seeded with ``seed`` (unless it is None) before the
    sample positions are drawn. Prints a notice and returns when there are no chunks.
    """
    if seed is not None:
        random.seed(seed)
    chunk_records = chunk_pdf_pages(pages_and_texts, chunk_size=chunk_size)
    if len(chunk_records) == 0:
        print("No chunks to display.")
        return
    chosen = _scattered_indices(len(chunk_records), k)
    print(f"Showing {len(chosen)} random chunks out of {len(chunk_records)} total:\n")
    for rank, position in enumerate(chosen, start=1):
        box = _draw_boxed_chunk(chunk_records[position])
        # One print emits the label, the box and the trailing blank line.
        print(f"#{rank}\n{box}\n")

# Run: fail early with a readable message if the PDF-reading cell was skipped,
# then print five sample chunks (seeded for repeatable output).
assert 'pages_and_texts' in globals(), "Run: pages_and_texts = open_and_read_pdf(pdf_path) first."
show_random_chunks(pages_and_texts, chunk_size=500, k=5, seed=42)

**Chunking Strategy 2: Semantic chunking**

In [None]:
!pip -q install --upgrade "sentence-transformers==3.0.1" "transformers<5,>4.41" scikit-learn nltk

In [None]:
from sentence_transformers import SentenceTransformer # Correct import for SentenceTransformer
from sklearn.metrics.pairwise import cosine_similarity
import numpy as np
import nltk
nltk.download('punkt', quiet=True)

#load once locally
semantic_model = SentenceTransformer("all-MiniLM-L6-v2")

def semantic_chunk_text(text: str, similarity_threshold: float = 0.8, max_tokens: int = 500) -> list:
    """
    Split text into chunks of consecutive, semantically similar sentences.

    A sentence joins the current chunk when its cosine similarity to the chunk's
    running embedding is at least ``similarity_threshold`` and the chunk, with the
    sentence added, still fits in ``max_tokens`` (estimated as characters // 4).
    Otherwise the chunk is closed and the sentence starts a new one.

    Parameters:
        text: The text to split (one page in this notebook).
        similarity_threshold: Minimum cosine similarity for a sentence to stay in
            the current chunk.
        max_tokens: Approximate token budget per chunk.

    Returns:
        A list of chunk strings; empty when no sentences are found.
    """
    sentences = nltk.sent_tokenize(text)  # break the text into sentences
    if not sentences:
        return []

    embeddings = semantic_model.encode(sentences)

    chunks = []
    # Sentences keep being appended while similarity stays >= the threshold.
    current_chunk = [sentences[0]]
    current_embedding = embeddings[0]

    for i in range(1, len(sentences)):
        sim = cosine_similarity([current_embedding], [embeddings[i]])[0][0]
        # Size the chunk as it would be AFTER adding this sentence; checking only the
        # existing chunk let the result overshoot max_tokens by a whole sentence.
        candidate_token_count = len(" ".join(current_chunk + [sentences[i]])) // 4

        if sim >= similarity_threshold and candidate_token_count <= max_tokens:
            current_chunk.append(sentences[i])
            # Average of the previous running embedding and the new sentence
            # (not a mean over every sentence in the chunk).
            current_embedding = np.mean([current_embedding, embeddings[i]], axis=0)
        else:
            # Close the chunk and start a new one from this sentence.
            chunks.append(" ".join(current_chunk))
            current_chunk = [sentences[i]]
            current_embedding = embeddings[i]

    if current_chunk:
        chunks.append(" ".join(current_chunk))

    return chunks


from tqdm.auto import tqdm

def semantic_chunk_pdf_pages(pages_and_texts: list, similarity_threshold: float = 0.8, max_tokens: int = 500) -> list[dict]:
  """Run semantic_chunk_text on every page and flatten the results.

  Returns a list of dicts, one per chunk, with page_number, chunk_index
  (position within the page), chunk_char_count, chunk_word_count,
  chunk_token_count (characters // 4) and chunk_text.
  """
  records = []
  for page in tqdm(pages_and_texts, desc="Semantic chunking pages"):
      body, number = page["text"], page["page_number"]
      pieces = semantic_chunk_text(body, similarity_threshold=similarity_threshold, max_tokens=max_tokens)
      records.extend(
          {
              "page_number": number,
              "chunk_index": position,
              "chunk_char_count": len(piece),
              "chunk_word_count": len(piece.split()),
              "chunk_token_count": len(piece) // 4,
              "chunk_text": piece,
          }
          for position, piece in enumerate(pieces)
      )
  return records

In [None]:
import nltk
# NOTE(review): sent_tokenize presumably needs the 'punkt_tab' resource on newer NLTK
# versions — confirm, and consider downloading it next to 'punkt' in the cell above.
nltk.download('punkt_tab')
# Chunk every page with a 0.75 threshold (the function default is 0.8).
semantic_chunked_pages = semantic_chunk_pdf_pages(pages_and_texts, similarity_threshold=0.75, max_tokens= 500)
print(f"Total semantic chunks: {len(semantic_chunked_pages)}")
# Indexing [0] raises IndexError if no chunks were produced.
print(f"First semantic chunk (page{semantic_chunked_pages[0]['page_number']}):")
print(semantic_chunked_pages[0]['chunk_text'][:200] + "...")

In [None]:
import random
import textwrap # Import necessary modules: 'random' for sampling, 'textwrap' for text formatting

#---Sampling & Pretty Prinnting---
def _scattered_indices(n: int, k: int, jitter_frac: float = 0.08) -> list[int]:
    """Evenly spaced anchors + random jitter -indices scattered across [0, n-1], """
    # Check if we need to select any indices. If k (number of indices) is zero or less, return an empty list.
    if k <= 0:
      return []
    # If we only need one index, pick a single random index from 0 to n-1.
    if k == 1:
      return [random.randrange(n)]
    # Calculate k evenly spaced anchor points across the range [0, n-1].
    anchors = [int(round(i * (n-1) / (k-1))) for i in range(k)]
    out, seen = [], set()
    radius = max(1, int(n * jitter_frac))

    # Iterate through each anchor point to add random jitter.
    for a in anchors:
        # Define the lower (lo) and upper (hi) bounds for the random jitter, constrained by [0, n-1].
        lo, hi = max(0, a- radius), min(n-1, a+ radius)
        j = random.randint(lo, hi)

        # If the jittered index 'j' hasn't been used yet, add it to the output and the 'seen' set.
        if j not in seen:
            out.append(j); seen.add(j)
    # If the process above resulted in fewer than k unique indices (due to jitter overlap), fill the rest randomly.
    while len(out) < k:
        r = random.randrange(n) # Pick a random index from the whole range [0, n-1].
        # If the index hasn't been used, add it to the output.
        if r not in seen:
            out.append(r); seen.add(r)
    return out

def _draw_boxed_chunk(c: dict, wrap_at: int = 96) -> str:
    header = (
        f"Page p{c['page_number']} . idx {c['chunk_index']} | "
        f"chunk {c['chunk_char_count']} . words {c['chunk_word_count']} . ~tokens {c['chunk_token_count']} "
    )
    #wrap body text, avoid breaking long words awkwardly
    wrapped_lines = textwrap.wrap(c["chunk_text"], width=wrap_at, break_long_words=False, replace_whitespace=False
                                  )
    context_width = max( [0, *map(len, wrapped_lines)])
    box_width = max(len(header), context_width + 2) # +2 for side padding

    # Transcribed code starts here:
    top = "╔" + "=" * box_width + "╗"
    hline = "║ " + header.ljust(box_width) + " ║"
    sep = "╠" + "-" * box_width + "╣"
    body = "\n".join(
        "║ " + line.ljust(box_width - 2) + " ║" for line in wrapped_lines) or ("║ " + "".ljust(box_width- 2) + " ║")
    bottom = "╚" + "=" * box_width + "╝"
    return "\n".join([top, hline, sep, body, bottom])
def show_random_semantic_chunks(semantic_chunked_pages: list[dict], k: int = 5, seed: int | None = 42):
    if seed is not None:
       random.seed(seed)
    n = len(semantic_chunked_pages)
    if n == 0:
        print("No semantic chunks to display.");
        return
    idxs = _scattered_indices(n, k)
    print(f"Showing {len(idxs)} scattered random SEMANTIC chunks out of {n} total:\n")
    for i, idx in enumerate(idxs, 1):
        print(f"#{i}")
        print(_draw_boxed_chunk(semantic_chunked_pages[idx]))
        print()

# Run (expects that semantic_chunked_pages has already been created and is non-empty).
assert 'semantic_chunked_pages' in globals() and len(semantic_chunked_pages) > 0, \
 "Run your semantic chunking code first to define 'semantic_chunked_pages'."
show_random_semantic_chunks(semantic_chunked_pages, k=5, seed=42)

**Chunking Strategy 3: Recursive chunking**
Key points:


*   Recursive chunking prioritizes natural text boundaries: section > paragraph > sentence.
*   It only splits further when necessary to respect the size constraints.
*   Compared to fixed-size chunking, it avoids breaking words mid-way and produces more coherent chunks.






In [None]:
import nltk
from tqdm.auto import tqdm
nltk.download("punkt")

def recursive_chunk_text(text: str, max_chunk_size: int = 1000, min_chunk_size: int = 100) -> list:
    """
    Recursively split text into chunks of at most ``max_chunk_size`` characters.

    Splitting is attempted on blank lines first, then on single newlines, and
    finally falls back to packing whole sentences (``nltk.sent_tokenize``).
    A single sentence longer than ``max_chunk_size`` is kept whole.

    Parameters:
        text: The text to split.
        max_chunk_size: Maximum number of characters per chunk.
        min_chunk_size: Accepted for interface compatibility but not enforced;
            small pieces are not merged.

    Returns:
        A list of chunk strings.
    """
    def split_chunk(chunk: str) -> list:
        # Base case: the piece already fits.
        if len(chunk) <= max_chunk_size:
            return [chunk]

        # Try the coarsest separator first (blank line), then single newlines.
        for separator in ("\n\n", "\n"):
            sections = chunk.split(separator)
            if len(sections) > 1:
                result = []
                for section in sections:
                    if section.strip():
                        result.extend(split_chunk(section.strip()))
                return result

        # Fallback: pack whole sentences greedily.
        sentences = nltk.sent_tokenize(chunk)
        chunks, current_chunk, current_size = [], [], 0

        for sentence in sentences:
            # Count the joining space too; ignoring it let joined chunks exceed
            # max_chunk_size by one character per extra sentence.
            added = len(sentence) + (1 if current_chunk else 0)
            if current_chunk and current_size + added > max_chunk_size:
                chunks.append(" ".join(current_chunk))
                current_chunk = [sentence]
                current_size = len(sentence)
            else:
                current_chunk.append(sentence)
                current_size += added
        if current_chunk:
            chunks.append(" ".join(current_chunk))

        return chunks

    return split_chunk(text)

def recursive_chunk_pdf_pages(pages_and_texts: list, max_chunk_size: int = 1000, min_chunk_size: int = 100) -> list[dict]:
    """
    Run recursive_chunk_text on every page and flatten the results.

    Returns a list of dicts, one per chunk, with page_number, chunk_index
    (position within the page), chunk_char_count, chunk_word_count,
    chunk_token_count (characters // 4) and chunk_text.
    """
    records = []
    for page in tqdm(pages_and_texts, desc="Recursive chunking pages"):
        body, number = page["text"], page["page_number"]
        pieces = recursive_chunk_text(body, max_chunk_size=max_chunk_size, min_chunk_size=min_chunk_size)
        records.extend(
            {
                "page_number": number,
                "chunk_index": position,
                "chunk_char_count": len(piece),
                "chunk_word_count": len(piece.split()),
                "chunk_token_count": len(piece) // 4,
                "chunk_text": piece,
            }
            for position, piece in enumerate(pieces)
        )
    return records

In [None]:
# Chunk every page with an 800-character cap (the function default is 1000).
recursive_chunked_pages = recursive_chunk_pdf_pages(pages_and_texts, max_chunk_size=800, min_chunk_size=100)
print(f"Total recursive chunks: {len(recursive_chunked_pages)}")
# Indexing [0] raises IndexError if no chunks were produced.
print(f"First recursive chunk (page {recursive_chunked_pages[0]['page_number']}):")
print(recursive_chunked_pages[0]['chunk_text'][:200] + "...")


In [None]:
# Pretty-print 5 random RECURSIVE chunks (uses 'recursive_chunked_pages)
import random
import textwrap # Import necessary modules: 'random' for sampling, 'textwrap' for text formatting

#---Sampling & Pretty Prinnting---
def _scattered_indices(n: int, k: int, jitter_frac: float = 0.08) -> list[int]:
    """Evenly spaced anchors + random jitter -indices scattered across [0, n-1], """
    # Check if we need to select any indices. If k (number of indices) is zero or less, return an empty list.
    if k <= 0:
      return []
    # If we only need one index, pick a single random index from 0 to n-1.
    if k == 1:
      return [random.randrange(n)]
    # Calculate k evenly spaced anchor points across the range [0, n-1].
    anchors = [int(round(i * (n-1) / (k-1))) for i in range(k)]
    out, seen = [], set()
    radius = max(1, int(n * jitter_frac))

    # Iterate through each anchor point to add random jitter.
    for a in anchors:
        # Define the lower (lo) and upper (hi) bounds for the random jitter, constrained by [0, n-1].
        lo, hi = max(0, a- radius), min(n-1, a+ radius)
        j = random.randint(lo, hi)

        # If the jittered index 'j' hasn't been used yet, add it to the output and the 'seen' set.
        if j not in seen:
            out.append(j); seen.add(j)
    # If the process above resulted in fewer than k unique indices (due to jitter overlap), fill the rest randomly.
    while len(out) < k:
        r = random.randrange(n) # Pick a random index from the whole range [0, n-1].
        # If the index hasn't been used, add it to the output.
        if r not in seen:
            out.append(r); seen.add(r)
    return out

def _draw_boxed_chunk(c: dict, wrap_at: int = 96) -> str:
    header = (
        f"Page p{c['page_number']} . idx {c['chunk_index']} | "
        f"chunk {c['chunk_char_count']} . words {c['chunk_word_count']} . ~tokens {c['chunk_token_count']} "
    )
    #wrap body text, avoid breaking long words awkwardly
    wrapped_lines = textwrap.wrap(c["chunk_text"], width=wrap_at, break_long_words=False, replace_whitespace=False
                                  )
    context_width = max( [0, *map(len, wrapped_lines)])
    box_width = max(len(header), context_width + 2) # +2 for side padding

    # Transcribed code starts here:
    top = "╔" + "=" * box_width + "╗"
    hline = "║ " + header.ljust(box_width) + " ║"
    sep = "╠" + "-" * box_width + "╣"
    body = "\n".join(
        "║ " + line.ljust(box_width - 2) + " ║" for line in wrapped_lines) or ("║ " + "".ljust(box_width- 2) + " ║")
    bottom = "╚" + "=" * box_width + "╝"
    return "\n".join([top, hline, sep, body, bottom])
def show_random_recursive_chunks(recursive_chunked_pages: list[dict], k: int = 5, seed: int | None = 42):
    if seed is not None:
       random.seed(seed)
    n = len(recursive_chunked_pages)
    #if n == 0:
        #print("No semantic chunks to display.");
        #return
    assert n > 0, "No recursive chunks to display. Did you run the recursive chunking cell?"
    idxs = _scattered_indices(n, k)
    print(f"Showing {len(idxs)} scattered random RECURSINE chunks out of {n} total:\n")
    for i, idx in enumerate(idxs, 1):
        print(f"#{i}")
        print(_draw_boxed_chunk(recursive_chunked_pages[idx]))
        print()

# Run (expects that recursive_chunked_pages has already been created and is non-empty).
assert 'recursive_chunked_pages' in globals() and len(recursive_chunked_pages) > 0, \
 "Run your recursive chunking code first to define 'recursive_chunked_pages'."
show_random_recursive_chunks(recursive_chunked_pages, k=5, seed=42)



**Chunking strategy 4: Document-structure-based chunking**
How does structure-based chunking work?


*   The function looks for headers such as chapter numbers (e.g. Chapter 1) and section headings (e.g. 1.1 Introduction).
*   Every time it finds a header, it starts a new chunk.


*   Text is grouped with its closest heading until the next heading is reached or the chunk size is exceeded.
*   This preserves the logical flow of a textbook, where content under each heading remains together.

**Structure-based chunking** is useful for documents with a clear hierarchy (chapters, sections, subsections). Unlike fixed-size chunking, it ensures that text remains tied to its heading, improving coherence and preserving semantic meaning.





In [None]:
#--- chapter -based chunking (simple & fast)----
# Assumes you have already run your base pdf so 'pages_and_texts' exists.
#we detect a new chapter whenever a page contains "University of Hawai" header.
# Each chapter = pages from one header until the p[age before the next header


import re  # regexr for text matching. ! which is regular expression
import random
import textwrap

# 1) helper to detect "chapter start" pages
def _is_chater_header_page(text: str) -> bool:
    # Robust to punctuations/diacritiics differences; matches the recurring header
    # e,g "University of Hawai I at Manda food science and human nutriyion program"
    return re.search(r"University\s+of\s+hawai", text, flags=re.IGNORECASE) is not None

def _guess_title_from_header(header: str) -> str:
    """
    Best-effort chapter title guess = the text before the 'University of Hawai' header line.
    Falls back to the first ~120 character.
    """
    m = re.search(r"University\s+of\s+hawai", header, flags=re.IGNORECASE)
    if m:
        title = header[:m.start()].strip()
        #keep it readable
        title = re.sub(r"\s+", " ", title).strip()
        if 10 <= len(title) <= 180:
            return title
    #fallback
    t = re.sub(r"\s+", " ", header).strip()
    return t[:120] if t else "Untitled chapter"

# 2) Biuld chapter chunks
def _build_chapter_record(chapter_index: int, pages: list[dict]) -> dict:
    """Concatenate a run of pages into one chapter-chunk dictionary."""
    text_concat = " ".join(p["text"] for p in pages).strip()
    return {
        "chapter_index": chapter_index,
        "title": _guess_title_from_header(pages[0]["text"]),
        "page_start": pages[0]["page_number"],
        "page_end": pages[-1]["page_number"],
        "chunk_char_count": len(text_concat),
        "chunk_word_count": len(text_concat.split()),
        "chunk_token_count": round(len(text_concat) / 4, 2),
        "chunk_text": text_concat,
    }


def chapter_chunk_pdf_pages(pages_and_texts: list[dict], min_end_index: int = 5) -> list[dict]:
    """
    Group pages into chapter chunks, starting a new chapter at every header page.

    A chapter runs from a page matched by _is_chater_header_page up to the page
    before the next match (the last chapter runs to the end of the document).
    Pages before the first match are not included in any chapter. When no page
    matches, the whole document is returned as a single chapter.

    Parameters:
        pages_and_texts: Page dicts with at least "text" and "page_number".
        min_end_index: Chapters whose last page sits at a list index below this
            value are skipped. Defaults to 5, the value that used to be
            hard-coded here.
            NOTE(review): presumably this drops header matches in the front
            matter — confirm the intent.

    Returns:
        A list of dicts, one per chapter:
        {
            'chapter_index': int,
            'title': str,
            'page_start': int,
            'page_end': int,
            'chunk_char_count': int,
            'chunk_word_count': int,
            'chunk_token_count': float,
            'chunk_text': str
        }
        chapter_index keeps the position among all detected headers, so skipped
        chapters leave gaps in the numbering.
    """
    if not pages_and_texts:
        return []

    # List indices of pages that look like the start of a chapter.
    chapter_starts = [
        i for i, p in enumerate(pages_and_texts) if _is_chater_header_page(p["text"])
    ]

    # Nothing detected: treat the entire document as one "chapter".
    if not chapter_starts:
        return [_build_chapter_record(0, pages_and_texts)]

    # Each chapter ends on the page before the next start; the last one runs to the end.
    chapter_ends = [s - 1 for s in chapter_starts[1:]] + [len(pages_and_texts) - 1]

    chapter_chunks = []
    for ci, (s, e) in enumerate(zip(chapter_starts, chapter_ends)):
        if e < min_end_index:
            continue
        chapter_chunks.append(_build_chapter_record(ci, pages_and_texts[s:e + 1]))
    return chapter_chunks


#

In [None]:
# Build the chapter-level chunks from the parsed pages.
structure_chunked_pages = chapter_chunk_pdf_pages(pages_and_texts)

print(f"Total chpater-based chunks: {len(structure_chunked_pages)}")
# Preview the first chapter only when at least one was produced.
if structure_chunked_pages:
    first = structure_chunked_pages[0]
    print(f"First chapter (pages {first['page_start']}-{first['page_end']}): {first['title']}")
    print(first['chunk_text'][:200] +"...")
else:
    print("No chapter-based chunks detected.")

In [None]:
def _draw_boxed_chunk(c: dict, wrap_at: int = 96) -> str:
    header = (
        f"Page p{c.get('page_start', 'N/A')}-{c.get('page_end', 'N/A')} . idx {c.get('chapter_index', 'N/A')} | " # Use get with default for robustness
        f"chunk {c['chunk_char_count']} . words {c['chunk_word_count']} . ~tokens {c['chunk_token_count']} "
    )
    #wrap body text, avoid breaking long words awkwardly
    wrapped_lines = textwrap.wrap(c["chunk_text"], width=wrap_at, break_long_words=False, replace_whitespace=False
                                  )
    context_width = max( [0, *map(len, wrapped_lines)])
    box_width = max(len(header), context_width + 2) # +2 for side padding

    # Transcribed code starts here:
    top = "╔" + "=" * box_width + "╗"
    hline = "║ " + header.ljust(box_width) + " ║"
    sep = "╠" + "-" * box_width + "╣"
    body = "\n".join(
        "║ " + line.ljust(box_width - 2) + " ║" for line in wrapped_lines) or ("║ " + "".ljust(box_width- 2) + " ║")
    bottom = "╚" + "=" * box_width + "╝"
    return "\n".join([top, hline, sep, body, bottom])
def show_random_chapter_chunks(chapter_chunks: list[dict], k: int = 5, seed: int | None = 42):
    if not chapter_chunks:
        print("No chapter-based chunks to display.");
        return
    if seed is not None:
       random.seed(seed)
    #n = len(recursive_chunked_pages)
    #if n == 0:
        #print("No semantic chunks to display.");
        #return
    #assert n > 0, "No recursive chunks to display. Did you run the recursive chunking cell?"
    k = min(k, len(chapter_chunks))
    idxs = random.sample(range(len(chapter_chunks)), k)
    print(f"Showing {len(idxs)} random chapter chunks out of {len(chapter_chunks)} total:\n")
    for i, idx in enumerate(idxs, 1):
        print(f"#{i}")
        print(_draw_boxed_chunk(chapter_chunks[idx]))
        print()

# 4) Run (expects that pages_and_texts has already been created by the PDF-reading cell).
assert 'pages_and_texts' in globals(), "Run your base pdf code first to define pages_and_texts"
chapter_chunks = chapter_chunk_pdf_pages(pages_and_texts)
print(f"Total chapters detected: {len(chapter_chunks)}")
if chapter_chunks:
    print(f"First chapter: {chapter_chunks[0]['title']} (p{chapter_chunks[0]['page_start']}-{chapter_chunks[0]['page_end']})")

# Inspect a few chapters (seeded for repeatable output).
show_random_chapter_chunks(chapter_chunks, k=5, seed=21)

**Chunking Strategy 5: LLM based chunking**
This chunking strategy uses an LLM to create semantically coherent chunks by understanding context and maintaining thematic consistency through natural language processing.
Here we use an API key.