# Chunking

## Section-Based Level Chunking

In [12]:
text = "Line1\nLine2\rLine3\r\nLine4"

# Normalise line endings: collapse CRLF first so the lone-CR pass that
# follows cannot turn one "\r\n" into two newlines.
unix_newlines = text.replace("\r\n", "\n")
text = unix_newlines.replace("\r", "\n")

# Trim every line and drop the ones that end up empty.
lines = list(filter(None, map(str.strip, text.split("\n"))))

In [13]:
text

'Line1\nLine2\nLine3\nLine4'

In [14]:
lines

['Line1', 'Line2', 'Line3', 'Line4']

In [15]:
import re
import pymupdf
import os
import json

# Directory whose entries are listed (non-recursively, via os.listdir) to
# find the input PDF files.
pdf_dir = "data/raw_pdfs"
# Path of the JSON file that receives the list of chunk records.
output_file = "pdf_chunks.json"

# Patterns are compiled once instead of being looked up for every line.
_MAJOR_HEADING_RE = re.compile(r"^[A-Z]\.\s")  # e.g. "A. PURPOSE"
_NUMBERED_ITEM_RE = re.compile(r"^\(\d+\)")  # e.g. "(1)", "(12)"


def section_based_chunking(text, max_items=10):
    """Split *text* into section-level chunks.

    Line endings are normalised to "\\n", every line is stripped, and blank
    lines are discarded. The remaining lines are grouped as follows:

    * A major heading (capital letter, dot, whitespace - "A. PURPOSE") or a
      bracketed heading ("【...】") closes the current chunk and starts a new
      one beginning with that heading.
    * A numbered item ("(1)", "(2)", ...) is added to the current chunk,
      unless the chunk already holds ``max_items`` numbered items, in which
      case the chunk is closed and the item starts a new one.
    * Any other line (bullets, continuation text) is appended to the
      current chunk.

    Args:
        text: Raw text to split.
        max_items: Maximum number of numbered items per chunk. Values <= 0
            make every numbered item start its own chunk.

    Returns:
        A list of non-empty chunk strings whose lines are joined by "\\n".
    """
    text = text.replace("\r\n", "\n").replace("\r", "\n")  # cleaning
    lines = [line.strip() for line in text.split("\n") if line.strip()]

    chunks = []
    current_lines = []  # lines of the chunk currently being built
    item_count = 0  # numbered items already in the current chunk

    def flush():
        # Never emit an empty chunk (this used to happen when max_items <= 0
        # and the very first line was a numbered item).
        if current_lines:
            chunks.append("\n".join(current_lines))
            current_lines.clear()

    for line in lines:
        is_heading = bool(_MAJOR_HEADING_RE.match(line)) or (
            line.startswith("【") and line.endswith("】")
        )
        if is_heading:
            # Headings always open a fresh chunk.
            flush()
            item_count = 0
        elif _NUMBERED_ITEM_RE.match(line):
            if item_count >= max_items:
                # Chunk is full: close it and let this item start the next.
                flush()
                item_count = 0
            item_count += 1
        # Every line, whatever its kind, belongs to the (possibly new) chunk.
        current_lines.append(line)

    flush()
    return chunks





In [16]:
# Build one record per chunk for every PDF found in pdf_dir, then dump all
# records to output_file as JSON.
data = []

# Loop through PDFs. os.listdir returns entries in arbitrary order, so sort
# them to make the order of records in the output reproducible.
for pdf_file in sorted(os.listdir(pdf_dir)):
    # Compare the extension case-insensitively so "REPORT.PDF" is not skipped.
    if not pdf_file.lower().endswith(".pdf"):
        continue

    pdf_path = os.path.join(pdf_dir, pdf_file)
    print(f"📄 Processing: {pdf_file}")

    with pymupdf.open(pdf_path) as doc:
        # A missing or None metadata title is treated as "no title".
        title = (doc.metadata.get("title") or "").strip()

        # Concatenate the plain text of all pages, one trailing newline each.
        text = "".join(page.get_text("text") + "\n" for page in doc)

    if not title:
        # Fall back to the first non-blank line of the extracted text; the
        # very first line may be empty even when the document has content.
        title = next(
            (ln.strip() for ln in text.splitlines() if ln.strip()),
            "",
        )

    # Chunk with wider grouping
    chunks = section_based_chunking(text, max_items=10)

    for idx, chunk in enumerate(chunks):
        data.append({
            "file_name": pdf_file,
            "title": title,
            "chunk_id": f"{pdf_file}_chunk_{idx}",
            "content": chunk
        })

# Save JSON (ensure_ascii=False keeps non-ASCII text readable in the file)
with open(output_file, "w", encoding="utf-8") as f:
    json.dump(data, f, indent=2, ensure_ascii=False)

print(f"✅ Created {len(data)} section-based chunks → saved to {output_file}")

📄 Processing: tourism.pdf
✅ Created 6 section-based chunks → saved to pdf_chunks.json
