## Extracting Text Pagewise 

In [5]:
import fitz  # PyMuPDF
import re

def extract_full_pdf_distinctly(pdf_path):
    """
    Extract the text of a PDF page by page.

    Handles Requirement 6 (multi-page documents) and Requirement 17
    (pre-processing).

    Args:
        pdf_path: Path of the PDF file, passed straight to ``fitz.open``.

    Returns:
        A list with one dict per page, in document order. Each dict has:
          - "page_number": 1-based page index.
          - "content": the page's text with every run of whitespace
            (including the newlines between blocks) collapsed to a single
            space and leading/trailing whitespace removed.

    Raises:
        Whatever ``fitz.open`` or the page extraction raises; the document
        is closed before the exception propagates.
    """
    pages_data = []
    whitespace_run = re.compile(r'\s+')

    # Using the document as a context manager guarantees it is closed even
    # when extraction fails part-way through.
    with fitz.open(pdf_path) as doc:
        for page_number, page in enumerate(doc, start=1):
            # "blocks" mode returns one tuple per layout block; the original
            # author chose it to keep the structural flow of messy tables
            # (Requirement 14).
            blocks = page.get_text("blocks")

            # Element 4 of each block tuple is the block's text.
            raw_text = "\n".join(block[4] for block in blocks)

            # --- Pre-processing Engine (Requirement 17) ---
            # Normalize whitespace as required (Requirement 16).
            clean_text = whitespace_run.sub(' ', raw_text).strip()

            # Store as a dictionary for easy access
            # (e.g., pages_data[0]['content']).
            pages_data.append({
                "page_number": page_number,
                "content": clean_text
            })

    return pages_data

# --- Implementation ---
pdf_path = "../Case diary - 999-2020.pdf"
full_case_data = extract_full_pdf_distinctly(pdf_path)

# A single page's text is available as, e.g., full_case_data[0]['content'].

# Dump every page to a text file, wrapped in START/END markers so page
# boundaries are easy to spot while debugging.
with open("full_case_distinct.txt", mode="w", encoding="utf-8") as f:
    for page in full_case_data:
        f.writelines([
            f"--- START PAGE {page['page_number']} ---\n",
            page['content'] + "\n",
            f"--- END PAGE {page['page_number']} ---\n\n",
        ])

print(f"Traversed {len(full_case_data)} pages successfully.")

Traversed 28 pages successfully.
