# Chunking

flow
1. Retrieve PDFs
2. Chunk

## 1. Retrieve PDFs

In [None]:
import requests
from bs4 import BeautifulSoup
from urllib.parse import urljoin
import os

BASE_URL = "https://www.ph.emb-japan.go.jp/itpr_en/00_000035.html"
output_dir = "data/raw_pdfs"
os.makedirs(output_dir, exist_ok=True)

headers = {"User-Agent": "Mozilla/5.0 (Linux; Android 6.0; Nexus 5 Build/MRA58N) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/131.0.0.0 Mobile Safari/537.36"}

# Fetch the page
response = requests.get(BASE_URL, headers=headers)
soup = BeautifulSoup(response.text, "html.parser")


# Find all PDF links
pdf_links = [urljoin(BASE_URL, a['href']) 
             for a in soup.find_all("a", href=True) if a['href'].endswith(".pdf")]

print(f"Found {len(pdf_links)} PDFs")

# Download each PDF
for link in pdf_links:
    filename = link.split("/")[-1]
    filepath = os.path.join(output_dir, filename)
    print(f"Downloading {filename}")
    pdf_data = requests.get(link, headers=headers)
    with open(filepath, "wb") as f:
        f.write(pdf_data.content)

print("✅ Done!")


: 

In [None]:
import pymupdf
import os
import json

pdf_dir = "data/raw_pdfs"  # folder containing your PDFs
pdf_file = os.listdir(pdf_dir)[0]
print(pdf_file)
print()

pdf_path = os.path.join(pdf_dir, pdf_file)
with pymupdf.open(pdf_path) as doc:
    print(doc)
    print(f'#page: {doc.page_count}')
    print(f'metadata: {json.dumps(doc.metadata, indent=2)}')

In [None]:
import pymupdf

doc = pymupdf.open(pdf_path) # open a document
for page in doc:
    print(page.get_text())
    # print(page.get_links())


In [None]:
len(doc)

In [None]:
import fitz  # PyMuPDF
import os

pdf_dir = "data/raw_pdfs"  # folder containing your PDFs

# Loop through all PDFs
for pdf_file in os.listdir(pdf_dir):
    if pdf_file.endswith(".pdf"):
        pdf_path = os.path.join(pdf_dir, pdf_file)
        
        # Open PDF
        with fitz.open(pdf_path) as doc:
            # Try to read document metadata title first
            title = doc.metadata.get("title", "")
            
            if not title:
                # Fallback: read the first page text and take the first line as title
                first_page_text = doc[0].get_text("text").strip()
                title = first_page_text.split("\n")[0] if first_page_text else "No title found"
            
            print(f"📄 {pdf_file} → Title: {title}")


## 2. Chunking (Section-Based)

In [5]:
import fitz  # PyMuPDF
import os
import re
import json

pdf_dir = "data/raw_pdfs"
output_file = "pdf_chunks.json"

def section_based_chunking(text, max_items=10):
    """
    Groups related lines into bigger chunks (max_items = how many requirement items to group).
    """
    text = text.replace("\r", "\n")
    lines = [line.strip() for line in text.split("\n") if line.strip()]
    
    chunks = []
    current_chunk = ""
    item_count = 0
    
    for line in lines:
        # Start of major heading (A. PURPOSE, B. REQUIREMENTS)
        if re.match(r"^[A-Z]\.\s", line):
            if current_chunk:
                chunks.append(current_chunk.strip())
                current_chunk = ""
                item_count = 0
            current_chunk = line
        
        # Numbered item (1), (2), (3)
        elif re.match(r"^\(\d+\)", line):
            if item_count >= max_items:
                chunks.append(current_chunk.strip())
                current_chunk = ""
                item_count = 0
            
            current_chunk += "\n" + line
            item_count += 1
        
        # Special headings 【ADDITIONAL REQUIREMENTS】
        elif line.startswith("【") and line.endswith("】"):
            if current_chunk:
                chunks.append(current_chunk.strip())
                current_chunk = ""
                item_count = 0
            current_chunk = line
        
        else:
            # Add bullets, sub-text, or anything else
            current_chunk += "\n" + line
    
    if current_chunk:
        chunks.append(current_chunk.strip())
    
    return chunks


data = []

# Loop through PDFs
for pdf_file in os.listdir(pdf_dir):
    if pdf_file.endswith(".pdf"):
        pdf_path = os.path.join(pdf_dir, pdf_file)
        print(f"📄 Processing: {pdf_file}")
        
        with fitz.open(pdf_path) as doc:
            title = doc.metadata.get("title", "")
            
            text = ""
            for page in doc:
                text += page.get_text("text") + "\n"
            
            if not title and text.strip():
                title = text.split("\n")[0]
            
            # Chunk with wider grouping
            chunks = section_based_chunking(text, max_items=10)
            
            for idx, chunk in enumerate(chunks):
                data.append({
                    "file_name": pdf_file,
                    "title": title.strip(),
                    "chunk_id": f"{pdf_file}_chunk_{idx}",
                    "content": chunk
                })

# Save JSON
with open(output_file, "w", encoding="utf-8") as f:
    json.dump(data, f, indent=2, ensure_ascii=False)

print(f"✅ Created {len(data)} section-based chunks → saved to {output_file}")


📄 Processing: 100479463.pdf
📄 Processing: 100365628.pdf
📄 Processing: this-is-a-message-to-you-EN.pdf
📄 Processing: 100508288.pdf
📄 Processing: 100508289.pdf
📄 Processing: 100325630.pdf
📄 Processing: 100325631.pdf
📄 Processing: 000308386.pdf
📄 Processing: 100585068.pdf
📄 Processing: 100401397.pdf
📄 Processing: 100475176.pdf
📄 Processing: 100415047.pdf
📄 Processing: 100415046.pdf
📄 Processing: 100662512.pdf
📄 Processing: 100404404.pdf
📄 Processing: 100415048.pdf
📄 Processing: 100475146.pdf
📄 Processing: 100365641.pdf
📄 Processing: 100825869.pdf
📄 Processing: 100585059.pdf
📄 Processing: 100674192.pdf
📄 Processing: 100480646.pdf
📄 Processing: 100365634.pdf
📄 Processing: 100325628.pdf
📄 Processing: 100508287.pdf
📄 Processing: 100365635.pdf
📄 Processing: 100365623.pdf
📄 Processing: 100401465.pdf
📄 Processing: 100508284.pdf
📄 Processing: 100508285.pdf
📄 Processing: 100365636.pdf
📄 Processing: 100365632.pdf
📄 Processing: 100365626.pdf
📄 Processing: 100324691.pdf
📄 Processing: 100508281.pdf
📄 