In [None]:
import os
os.environ['DISABLE_MODEL_SOURCE_CHECK'] = 'True'
 
import warnings
warnings.filterwarnings('ignore')
 
import json
import numpy as np
import pytesseract
import pdfplumber
from pdf2image import convert_from_path
from PIL import Image, ImageDraw
from paddleocr import PaddleOCR
 
 
 
# Location of the Tesseract executable that pytesseract should invoke.
pytesseract.pytesseract.tesseract_cmd = r"C:\Program Files\Tesseract-OCR\tesseract.exe"

# Input PDF, and the Poppler binaries directory handed to pdf2image.
PDF_PATH = r"C:\Users\Roshini.T\Downloads\0149_OV.pdf"
POPPLER_PATH = r"C:\poppler-22.12.0\Library\bin"

# OUTPUT_DIR is the root folder for generated artifacts; OUTPUT_TEXT_FILE
# receives the OCR text written by main().
OUTPUT_DIR = "outputimage"
OUTPUT_TEXT_FILE = "pagesfinal.txt"
DPI = 300  # resolution passed to pdf2image when rasterizing pages
TESS_CONFIG = r"--oem 3 --psm 6"  # Tesseract CLI options (engine mode 3, page segmentation mode 6)


# NOTE(review): this PaddleOCR instance is not referenced by any function
# visible in this section -- confirm whether it is still needed, since it is
# constructed at import time.
ocr = PaddleOCR(
    use_angle_cls=True,
    lang="en"
)
 
def pdf_to_images(pdf_path):
    """Rasterize the PDF at ``pdf_path`` into page images.

    Delegates to pdf2image's ``convert_from_path`` with the module-level
    ``DPI`` and ``POPPLER_PATH`` settings and returns its result unchanged.
    """
    render_options = {"dpi": DPI, "poppler_path": POPPLER_PATH}
    pages = convert_from_path(pdf_path, **render_options)
    return pages
 
 
def extract_ocr_words(image):
    """Run Tesseract on ``image`` and return its non-blank words with boxes.

    Each returned dict holds the stripped ``text``, the corner coordinates
    ``x0``/``y0``/``x1``/``y1`` derived from Tesseract's left/top/width/height
    values, and the ``block`` number Tesseract reported for the word.
    """
    ocr_data = pytesseract.image_to_data(
        image, output_type=pytesseract.Output.DICT
    )

    found = []
    for idx, raw_text in enumerate(ocr_data["text"]):
        cleaned = raw_text.strip()
        if not cleaned:
            # Entries whose text is blank carry no word; skip them.
            continue

        left = ocr_data["left"][idx]
        top = ocr_data["top"][idx]
        found.append({
            "text": cleaned,
            "x0": left,
            "y0": top,
            "x1": left + ocr_data["width"][idx],
            "y1": top + ocr_data["height"][idx],
            "block": ocr_data["block_num"][idx],
        })

    return found
 
 
def detect_columns(words, image_width):
    """Guess whether the page has a vertical gutter near its horizontal middle.

    All distinct left/right word edges are sorted, and the widest gap between
    two neighbouring edges whose midpoint lies strictly inside the central
    half of the page (25%-75% of ``image_width``) is selected.

    Returns:
        A ``(has_columns, boundary)`` tuple. ``has_columns`` is True when the
        selected gap is wider than 8% of ``image_width``; ``boundary`` is the
        midpoint of that gap, or ``image_width / 2`` when there are fewer than
        10 words or no gap qualifies.
    """
    midpoint = image_width / 2
    if len(words) < 10:
        return False, midpoint

    edges = {w["x0"] for w in words} | {w["x1"] for w in words}
    ordered = sorted(edges)

    lower, upper = image_width * 0.25, image_width * 0.75
    widest, split_at = 0, midpoint

    for left_edge, right_edge in zip(ordered, ordered[1:]):
        mid = (left_edge + right_edge) / 2
        if not (lower < mid < upper):
            continue
        width = right_edge - left_edge
        # Strict comparison: on ties the left-most gap is kept.
        if width > widest:
            widest, split_at = width, mid

    return widest > image_width * 0.08, split_at
 
 
def group_by_blocks(words):
    """Merge words that share a ``block`` id into one section each.

    Within a block the words are ordered by ``(y0, x0)`` and joined with
    single spaces; the section's ``bbox`` is the union of its word boxes.

    Returns:
        A list of ``{"text": ..., "bbox": {...}}`` dicts ordered by the
        top-left corner of each bbox (``y0`` first, then ``x0``).
    """
    by_block = {}
    for word in words:
        key = word["block"]
        if key not in by_block:
            by_block[key] = []
        by_block[key].append(word)

    def summarize(members):
        # Order by top edge, then left edge, before joining the text.
        ordered = sorted(members, key=lambda m: (m["y0"], m["x0"]))
        return {
            "text": " ".join(m["text"] for m in ordered),
            "bbox": {
                "x0": min(m["x0"] for m in ordered),
                "y0": min(m["y0"] for m in ordered),
                "x1": max(m["x1"] for m in ordered),
                "y1": max(m["y1"] for m in ordered),
            },
        }

    result = [summarize(members) for members in by_block.values()]
    result.sort(key=lambda sec: (sec["bbox"]["y0"], sec["bbox"]["x0"]))
    return result
 
 
def extract_sections(image, *, split_columns=False):
    """OCR ``image`` and return its text sections with ids and column labels.

    Args:
        image: Page image; must expose ``width`` and be accepted by
            ``extract_ocr_words``.
        split_columns: When True, ``detect_columns`` is consulted and, if it
            reports a gutter, words are divided into a left and a right group
            by the horizontal centre of their boxes before block grouping
            (sections are then labelled ``"left"`` / ``"right"``). The default
            False treats the page as one full-width column.

    Returns:
        A list of section dicts as produced by ``group_by_blocks``, each
        extended with ``"column"`` and a 1-based ``"section_id"``. Empty when
        OCR finds no words.
    """
    words = extract_ocr_words(image)
    if not words:
        return []

    # Column detection only runs when requested; its result is otherwise unused.
    has_cols = False
    boundary = None
    if split_columns:
        has_cols, boundary = detect_columns(words, image.width)

    sections = []

    if has_cols:
        left, right = [], []
        for w in words:
            center = (w["x0"] + w["x1"]) / 2
            (left if center < boundary else right).append(w)

        # Left column first so section ids follow reading order.
        for label, column_words in (("left", left), ("right", right)):
            for s in group_by_blocks(column_words):
                s["column"] = label
                sections.append(s)
    else:
        for s in group_by_blocks(words):
            s["column"] = "full"
            sections.append(s)

    for i, s in enumerate(sections, 1):
        s["section_id"] = i

    return sections
 
 
def draw_boxes(image, sections):
    """Outline every section's bounding box on ``image`` and label it.

    The outline and label colour depends on the section's ``"column"`` value
    (left=red, right=blue, full=green); the label ``S<section_id>`` is placed
    12 pixels above the box's top-left corner. Drawing happens on the image
    passed in, which is also returned.
    """
    palette = {"left": "red", "right": "blue", "full": "green"}
    canvas = ImageDraw.Draw(image)

    for section in sections:
        box = section["bbox"]
        shade = palette[section["column"]]

        top_left = (box["x0"], box["y0"])
        bottom_right = (box["x1"], box["y1"])
        canvas.rectangle([top_left, bottom_right], outline=shade, width=2)

        label = f"S{section['section_id']}"
        canvas.text((box["x0"], box["y0"] - 12), label, fill=shade)

    return image
 
 
def save_section_images(image, sections, page_num):
    """Crop each section out of ``image``, save the crop, and OCR it.

    Crops are written to ``<OUTPUT_DIR>/bound_images/page_<page_num>/`` as
    ``section_NNN.png``. Each section dict gains a ``"vision_text"`` key with
    the stripped Tesseract output for its crop.

    Args:
        image: Page image supporting ``crop``.
        sections: Section dicts carrying ``"bbox"`` and ``"section_id"``.
        page_num: Page number used in the folder name and the text header.

    Returns:
        A list of strings: a page header line followed by the OCR text of
        each section, in the order given.
    """
    base = os.path.join(OUTPUT_DIR, "bound_images", f"page_{page_num}")
    os.makedirs(base, exist_ok=True)

    all_text = [f"\n===== PAGE {page_num} =====\n"]

    for s in sections:
        b = s["bbox"]
        crop = image.crop((b["x0"], b["y0"], b["x1"], b["y1"]))
        crop.save(os.path.join(base, f"section_{s['section_id']:03d}.png"))

        # Use the shared module-level Tesseract options instead of a
        # duplicated literal so the configuration lives in one place.
        s["vision_text"] = pytesseract.image_to_string(
            crop, config=TESS_CONFIG
        ).strip()
        all_text.append(s["vision_text"])

    return all_text
 
 
def main():
    """Run the full pipeline over every page of ``PDF_PATH``.

    For each page: extract sections, draw their boxes on a copy of the page,
    save and OCR the section crops, and record the sections. Afterwards the
    OCR text of all pages is written to ``OUTPUT_TEXT_FILE``, the section
    data to ``<OUTPUT_DIR>/sections.json``, and the boxed pages to
    ``<OUTPUT_DIR>/bboxes_all.pdf`` (skipped when the PDF has no pages).
    """
    os.makedirs(OUTPUT_DIR, exist_ok=True)

    images = pdf_to_images(PDF_PATH)
    output = {}
    boxed_pages = []
    text_lines = []

    # All per-page work must sit inside the loop; otherwise only the last
    # page would be processed and an empty PDF would hit an unbound `image`.
    for page_num, image in enumerate(images, start=1):
        print(f"Processing page {page_num}...")
        sections = extract_sections(image)
        boxed_pages.append(draw_boxes(image.copy(), sections))
        text_lines.extend(save_section_images(image, sections, page_num))
        output[f"page_{page_num}"] = sections

    # Written once after the loop so later pages do not overwrite earlier ones.
    with open(OUTPUT_TEXT_FILE, "w", encoding="utf-8") as f:
        f.write("\n".join(text_lines))

    with open(os.path.join(OUTPUT_DIR, "sections.json"), "w", encoding="utf-8") as f:
        json.dump(output, f, indent=4, ensure_ascii=False)

    if boxed_pages:
        boxed_pages[0].save(
            os.path.join(OUTPUT_DIR, "bboxes_all.pdf"),
            save_all=True,
            append_images=boxed_pages[1:]
        )

    print("✅ ALL DONE SUCCESSFULLY")
 
 
# Run the pipeline only when this file is executed as a script.
if __name__ == "__main__":
    main()
 