In [33]:
import fitz
from tqdm.auto import tqdm 


def text_formatter(text:str)->str:
    clean_text = text.replace("\n" , " ").strip()
    return clean_text
def open_read_pdf(pdf_path:str) -> str : 
    doc = fitz.open(pdf_path)
    pages_and_texts = []
    for page_num , page in tqdm(enumerate(doc)) : 
        text = page.get_text()
        text = text_formatter(text)
        pages_and_texts.append({"page_number":page_num,
                                "page_char_count":len(text),
                                "page_word_count":len(text.split(" ")),
                                "page_sentence_count_raw":len(text.split('. ')),
                                "page_token_count":len(text)/4 ,# 1 token ~ 4 chars
                                "text":text})
        
    return pages_and_texts 

In [34]:
pdf_path="Boeing B737 Manual.pdf"

pages_and_texts = open_read_pdf(pdf_path=pdf_path)
pages_and_texts[50]

146it [00:00, 225.60it/s]


{'page_number': 50,
 'page_char_count': 994,
 'page_word_count': 195,
 'page_sentence_count_raw': 3,
 'page_token_count': 248.5,
 'text': 'Boeing 737 Operations Manual    Normal Procedures Chapter NP Flight Patterns Section 30     Copyright © The Boeing Company. See title page for details. D6-27370-TBC NP.30.1 NP.30 Normal Procedures-Flight Patterns Takeoff   VR • Rotate  3000 feet  • One or 2 engine   Thrust set • Manually advance Positive rate of climb  • Gear up flap retraction altitude  • Select flaps up maneuvering speed  • Set/verify climb thrust (2 engine) • • • • thrust to stabilize • Press TO/GA  Acceleration height  At 400 feet AGL • Select roll mode  • VNAV engaged • • • LNAV armed (as required) (normally 1000 ft.)  (as installed)  • Retract flaps on schedule  climb speed V1 • Takeoff thrust by 60 knots  • V2+15 to 25 knots (2 engine)  • V2 to V2+20 knots (1 engine) Flaps up • Maintain flaps up maneuvering speed  • Set max continuous thrust (1 engine) • After Takeoff checkli

In [22]:
import pandas as pd 
df  = pd.DataFrame(pages_and_texts)
df.head()

Unnamed: 0,page_number,page_char_count,page_word_count,page_sentence_count_raw,page_token_count,text
0,0,2655,883,655,663.75,Boeing 737 Operations Manual Normal Procedur...
1,1,1687,623,513,421.75,Boeing 737 Operations Manual Copyright © The...
2,2,2225,320,19,556.25,Boeing 737 Operations Manual Normal Procedu...
3,3,2269,350,15,567.25,Boeing 737 Operations Manual Normal Procedu...
4,4,1346,216,11,336.5,Boeing 737 Operations Manual Normal Procedu...


In [23]:
df.describe()

Unnamed: 0,page_number,page_char_count,page_word_count,page_sentence_count_raw,page_token_count
count,146.0,146.0,146.0,146.0,146.0
mean,72.5,1533.705479,260.753425,36.589041,383.42637
std,42.290661,654.736495,168.919428,125.344033,163.684124
min,0.0,181.0,28.0,3.0,45.25
25%,36.25,958.5,161.0,6.0,239.625
50%,72.5,1525.5,211.5,11.5,381.375
75%,108.75,2034.5,302.5,16.0,508.625
max,145.0,2916.0,963.0,729.0,729.0


In [24]:
def chunk_text(text:str,chunk_size:int=500) -> list:
    chunks = []
    current_chunk = ''
    words = text.split()
    for word in words : 
        if len(current_chunk) + len(word) + 1 <= chunk_size : 
            current_chunk += (word + " ")
        else : 
            chunks.append(current_chunk)
            current_chunk = word + " "
    ## adding the last chunk if it is not empty    
    if current_chunk : 
        chunks.append(current_chunk.strip())
    return chunks     


def chunks_pdf_pages(pages_and_texts:list,chunk_size:int=500)->list[dict]:
    all_chunks = []
    for page in pages_and_texts : 
        page_number = page["page_number"]
        page_text = page["text"]
        
        chunks = chunk_text(page_text,chunk_size=chunk_size)
        for i,chunk in enumerate(chunks) : 
            all_chunks.append({
                "page_number":page_number,
                "chunk_index" : i,
                "chunk_char_count":  len(chunk),
                "chunk_word_count" : len(chunk.split()),
                "chunk_token_count": len(chunk)/4,
                "chunk_text":chunk
            })
    return all_chunks

In [25]:
chunked_pages = chunks_pdf_pages(pages_and_texts,chunk_size=500)
print(f"Total chunks: {len(chunked_pages)}")
print(f"Chunk sample (page {chunked_pages[200]["page_number"]}) : {chunked_pages[200]["chunk_text"][:200]}")

Total chunks: 522
Chunk sample (page 57) : Boeing 737 Operations Manual Supplementary Procedures - Fuel Copyright © The Boeing Company. See title page for details. SP.12.2 D6-27370-TBC Refueling Fuel Load Distribution Main tanks No. 1 and No. 


## Fixed-Size Chunking 

In [32]:
import random,textwrap 
# -------------------------Sampling & Pretty Printing--------------------------
def _scattered_indices(n:int,k:int,jitter_frac:float=0.08) -> list[int] : 
    if k<=0 :
        return []
    if k==1 : 
        return [random.randrange(n)]
    anchors = [int(round(i*(n-1)/(k-1))) for i in range(k)]
    out,seen = [],set()
    radius = max(1,int(n*jitter_frac))
    for a in anchors : 
        lo,hi = max(0,a-radius) , min(n-1,a+radius)
        j = random.randint(lo,hi)
        if j not in seen : 
            out.append(j);seen.add(j)
    while len(out)<k : 
        r = random.randrange(n)
        if r not in seen :
            out.append(r),seen.add(r)
    return out

def _draw_boxed_chunk(c:dict,wrap_at:int=96) -> str : 
    header = (
        f"Chunk p{c['page_number']} -- idx {c['chunk_index']}  |  "
        f"Chars {c['chunk_char_count']} -- words {c['chunk_word_count']} -- ~tokens {c['chunk_token_count']} "
    )
    
    wrapped_lines = textwrap.wrap(
        c['chunk_text'],width=wrap_at,break_long_words=False,replace_whitespace=False
    )
    context_width = max([0,*map(len,wrapped_lines)])
    box_width = max(len(header),context_width+2)
    
    top    = "=" + (box_width+4)*"=" + "="
    hline  = "||" + header.ljust(box_width) + "  ||"
    sep    = "||-" + "-"*box_width + "-||"
    body   = "\n".join("||" + line.ljust(box_width--2)+'||' for line in wrapped_lines) or \
        ("||"+"".ljust(box_width-2) + "||")
    bottom = "=" + (box_width+4)*"=" + "="
    
    return "\n".join([top,hline,sep,body,bottom])

def show_random_chunks(pages_and_texts,chunk_size:int=500,k:int=5,seed:int | None = 42) : 
    if seed is not None : 
        random.seed(seed)
    all_chunks = chunks_pdf_pages(pages_and_texts,chunk_size)
    if not all_chunks : 
        print(f"No chunks are available.")
        return
    idxs = _scattered_indices(len(all_chunks),k)
    print(f"Showing {len(idxs)} scattered random chunks ou of {len(all_chunks)} in total:\n ")
    for i ,idx in enumerate(idxs,1):
        print(f"#{i}")
        print(_draw_boxed_chunk(all_chunks[idx]))
        print()

In [27]:
assert 'pages_and_texts' in globals() , "Run : pages_and_texts = open_and_read_pdf(odf_path) first."
show_random_chunks(pages_and_texts,chunk_size=500,k=2,seed=42)

Showing 2 scattered random chunks ou of 522 in total:
 
#1
||Chunk p11 -- idx 1  |  Chars 493 -- words 53 -- ~tokens 123.25                                     ||
||---------------------------------------------------------------------------------------------------||
||....................................................................................Test FLIGHT    ||
||RECORDER OFF light – Illuminated FLIGHT RECORDER test switch – TEST FLIGHT RECORDER OFF light –    ||
||................................................ Push Verify control column vibration when each    ||
||switch is pushed. Note: The stall                                                                  ||

#2
||Chunk p133 -- idx 4  |  Chars 212 -- words 35 -- ~tokens 53.0                                    ||
||-------------------------------------------------------------------------------------------------||
||ceiling to locate the exits and provide general illumination in the area of the exits.           ||
||Self–

## LLM-based Chunking

In [28]:
import requests
from typing import List, Dict

HF_API_URL = "https://api-inference.huggingface.co/models/mistralai/Mistral-7B-Instruct"
HF_HEADERS = {"Authorization": " ## Access Token ##"}  # paste your token here

def llm_based_chunking(text: str, chunk_size: int = 1000) -> List[Dict]:
    """
    Using Hugging Face hosted LLM to find semantically coherent chunk boundaries
    given a target chunk size.
    """
    
    def get_chunk_boundary(text_segment:str) -> int : 
        prompt = f"""
            Analyze the following text and identify the best point to split it 
            into two semantically coherent parts.
            The split should occur near {chunk_size} characters 
            Text:
            \"\"\"{text_segment}"
            Return only the integer index (character position) within this text
            where the split should occur.Do not return any explanation.
            """
            
        payload = {"inputs": prompt, "parameters": {"max_new_tokens": 20}}
        response = requests.post(HF_API_URL, headers=HF_HEADERS, json=payload)
        result = response.json()

        if isinstance(result, list) and len(result) > 0 and "generated_text" in result[0]:
            split_str = result[0]["generated_text"].strip()
        else:
            split_str = str(chunk_size)

        try:
            split_point = int(split_str)
        except ValueError:
            split_point = chunk_size

        return split_point
    chunks = []
    remaining_text = text 
    while len(remaining_text) > chunk_size : 
        text_window = remaining_text[:chunk_size]
        split_point  = get_chunk_boundary(text_window)
        if split_point < 100 or split_point> len(text_window)-100 : 
            split_point = chunk_size
        chunks.append(remaining_text[:split_point].strip())
        remaining_text = remaining_text[split_point:].strip()
    if remaining_text:
        chunks.append(remaining_text)
    return chunks

In [29]:
def llm_based_chunk_pdf_pages(pages_and_texts:List[Dict],chunk_size:int=1000) -> List[Dict] : 
    all_chunks = []
    for page in tqdm(pages_and_texts,desc="LLM based chunking pages") : 
        page_number = page["page_number"]
        page_text = page["text"]
        
        chunks = llm_based_chunking(page_text,chunk_size)
        for i,chunk in enumerate(chunks) : 
            all_chunks.append({
                "page_number":page_number,
                "chunk_index" : i, 
                "chunk_char_count":  len(chunk),
                "chunk_word_count" : len(chunk.split()),
                "chunk_token_count": len(chunk)/4,
                "chunk_text":chunk
            })
    return all_chunks

In [None]:
llm_chunked_pages = llm_based_chunk_pdf_pages(pages_and_texts,1000)

In [3]:
import fitz # PyMuPDF
import os

def extract_images_from_pdf(pdf_path, output_dir):
    os.makedirs(output_dir, exist_ok=True)
    pdf = fitz.open(pdf_path)
    for page_index, page in enumerate(pdf):
        image_list = page.get_images(full=True)
        for img_index, img in enumerate(image_list, start=1):
            xref = img[0]
            base_image = pdf.extract_image(xref)
            image_bytes = base_image["image"]
            image_ext = base_image["ext"]
            image_path = os.path.join(output_dir, f"page{page_index+1}_img{img_index}.{image_ext}")
            with open(image_path, "wb") as f:
                f.write(image_bytes)
    print("✅ Images extracted successfully.")

# Example:
extract_images_from_pdf("Boeing B737 Manual.pdf", "images/")


✅ Images extracted successfully.


In [12]:
import fitz  # PyMuPDF
import os

def render_page_as_image(pdf_path, output_dir, zoom=2.0):
    os.makedirs(output_dir, exist_ok=True)
    pdf = fitz.open(pdf_path)

    for page_number, page in enumerate(pdf, start=1):
        # render page at higher resolution
        matrix = fitz.Matrix(zoom, zoom)
        pix = page.get_pixmap(matrix=matrix, alpha=False)

        output_path = os.path.join(output_dir, f"page_{page_number}.png")
        pix.save(output_path)
    print(f"✅ Rendered {len(pdf)} pages as images.")

# Example
render_page_as_image("Boeing B737 Manual.pdf", "page_images")


✅ Rendered 146 pages as images.


In [5]:
from transformers import BlipProcessor, BlipForConditionalGeneration
from PIL import Image

processor = BlipProcessor.from_pretrained("Salesforce/blip-image-captioning-base")
model = BlipForConditionalGeneration.from_pretrained("Salesforce/blip-image-captioning-base")


  from .autonotebook import tqdm as notebook_tqdm
Using a slow image processor as `use_fast` is unset and a slow processor was saved with this model. `use_fast=True` will be the default behavior in v4.52, even if the model was saved with a slow processor. This will result in minor differences in outputs. You'll still be able to use a slow processor with `use_fast=False`.
To support symlinks on Windows, you either need to activate Developer Mode or to run Python as an administrator. In order to activate developer mode, see this article: https://docs.microsoft.com/en-us/windows/apps/get-started/enable-your-device-for-development
Xet Storage is enabled for this repo, but the 'hf_xet' package is not installed. Falling back to regular HTTP download. For better performance, install the package with: `pip install huggingface_hub[hf_xet]` or `pip install hf_xet`


FileNotFoundError: [Errno 2] No such file or directory: '/rendered_pages/page_110.png'

Xet Storage is enabled for this repo, but the 'hf_xet' package is not installed. Falling back to regular HTTP download. For better performance, install the package with: `pip install huggingface_hub[hf_xet]` or `pip install hf_xet`


In [9]:
image = Image.open("rendered_pages/page_135.png")
inputs = processor(image, return_tensors="pt")
caption = model.generate(**inputs)
print(processor.decode(caption[0], skip_special_tokens=True))

a page from the book, ' do not for light '
