# OCR PDF Extractor
This notebook extracts tables from PDF files using PaddleOCR (Hindi + English support).

## Workflow:
1. Convert PDF pages to images
2. Run OCR on each image
3. Reconstruct tables from OCR results
4. Export to Excel and CSV

## 1. Import Libraries

In [1]:
import os
import cv2
import numpy as np
import pandas as pd
from pdf2image import convert_from_path
from paddleocr import PaddleOCR
from tqdm import tqdm

  from .autonotebook import tqdm as notebook_tqdm
[33mChecking connectivity to the model hosters, this may take a while. To bypass this check, set `DISABLE_MODEL_SOURCE_CHECK` to `True`.[0m


## 2. Configuration
Adjust these settings as needed:

In [2]:
# ================== CONFIG ==================
PDF_PATH = "input.pdf"            # source PDF to extract tables from
IMG_DIR = "output/images"         # folder that receives one PNG per PDF page
EXCEL_OUT = "output/tables.xlsx"  # Excel export path
CSV_OUT = "output/tables.csv"     # CSV export path
DPI = 300                         # resolution used when rendering PDF pages
ROW_Y_THRESHOLD = 25   # controls row grouping (row height in pixels of the rendered page)
# ============================================

# Create every directory the paths above point into. The folders are derived
# from the configured paths (instead of a hard-coded "output") so the export
# cells keep working when EXCEL_OUT / CSV_OUT are moved somewhere else.
for out_dir in (IMG_DIR, os.path.dirname(EXCEL_OUT), os.path.dirname(CSV_OUT)):
    if out_dir:  # dirname is "" for a bare filename in the working directory
        os.makedirs(out_dir, exist_ok=True)

print("‚úÖ Configuration set and directories created")

‚úÖ Configuration set and directories created


## 3. Initialize OCR Model
This will download the Hindi + English models if not already cached.

In [3]:
# Build the OCR engine once; the resulting `ocr` object is reused by the
# processing cell further down.
print("üîπ Initializing OCR model (Hindi + English)...")
ocr_options = {
    "lang": "hi",                      # Hindi language pack
    "use_textline_orientation": True,  # also load the text-line orientation model
}
ocr = PaddleOCR(**ocr_options)
print("‚úÖ OCR model initialized")

üîπ Initializing OCR model (Hindi + English)...


[32mCreating model: ('PP-LCNet_x1_0_doc_ori', None)[0m
[32mModel files already exist. Using cached files. To redownload, please delete the directory manually: `C:\Users\anubh\.paddlex\official_models\PP-LCNet_x1_0_doc_ori`.[0m
[32mCreating model: ('UVDoc', None)[0m
[32mModel files already exist. Using cached files. To redownload, please delete the directory manually: `C:\Users\anubh\.paddlex\official_models\UVDoc`.[0m
[32mCreating model: ('PP-LCNet_x1_0_textline_ori', None)[0m
[32mModel files already exist. Using cached files. To redownload, please delete the directory manually: `C:\Users\anubh\.paddlex\official_models\PP-LCNet_x1_0_textline_ori`.[0m
[32mCreating model: ('PP-OCRv5_server_det', None)[0m
[32mModel files already exist. Using cached files. To redownload, please delete the directory manually: `C:\Users\anubh\.paddlex\official_models\PP-OCRv5_server_det`.[0m
[32mCreating model: ('devanagari_PP-OCRv5_mobile_rec', None)[0m
[32mModel files already exist. Usin

‚úÖ OCR model initialized


## 4. Convert PDF to Images

In [4]:
print("üìÑ Converting PDF to images...")

# Fail early with a clear message instead of an opaque Poppler error.
if not os.path.isfile(PDF_PATH):
    raise FileNotFoundError(f"PDF not found: {PDF_PATH}")

# Locate the Poppler binaries without tying the notebook to one machine:
#   1. the POPPLER_PATH environment variable, if set;
#   2. the original Windows install location, if it exists on this machine;
#   3. None, which lets pdf2image look Poppler up on the system PATH.
_default_poppler = r"C:\poppler-25.12.0\Library\bin"
poppler_path = os.environ.get("POPPLER_PATH") or (
    _default_poppler if os.path.isdir(_default_poppler) else None
)

# Make sure the image folder exists even if the config cell was skipped.
os.makedirs(IMG_DIR, exist_ok=True)

# Render every page of the PDF to an in-memory image at the configured DPI.
pages = convert_from_path(
    PDF_PATH,
    dpi=DPI,
    poppler_path=poppler_path
)

# Persist each page as page_<n>.png (1-based) and remember the file paths in
# page order; later cells read the images back from disk via `image_paths`.
image_paths = []
for page_number, page in enumerate(pages, start=1):
    path = f"{IMG_DIR}/page_{page_number}.png"
    page.save(path, "PNG")
    image_paths.append(path)

print(f"‚úÖ {len(image_paths)} pages converted")
print(f"üìÅ Images saved to: {IMG_DIR}")

üìÑ Converting PDF to images...


KeyboardInterrupt: 

## 5. Preview First Page (Optional)
Uncomment to display the first converted page:

In [None]:
# Optional preview, disabled by default: shows the first converted page inline.
# Relies on `image_paths` produced by the PDF conversion cell; the guard keeps
# the cell silent when no pages were converted.
# from IPython.display import Image, display
# if image_paths:
#     display(Image(filename=image_paths[0]))

## 6. OCR Processing & Table Reconstruction
Process each page (skipping the first page) and reconstruct tables:

In [2]:
def _iter_text_boxes(page_result):
    """Yield ``(x, y, text)`` for every recognized text line of one page.

    ``x`` / ``y`` are the integer pixel coordinates of the first polygon
    point of the text box (its top-left corner for upright text).

    Two result layouts are supported:

    * PaddleOCR 3.x ``predict()``: a dict-like result whose ``rec_texts`` and
      ``rec_polys`` entries hold the texts and their quadrilaterals.
    * Legacy ``ocr()`` output: ``[[box, (text, confidence)], ...]``.
    """
    if hasattr(page_result, "keys"):
        texts = page_result.get("rec_texts")
        polys = page_result.get("rec_polys")
        if texts is None or polys is None:
            return
        for poly, text in zip(polys, texts):
            yield int(poly[0][0]), int(poly[0][1]), text
        return

    for item in page_result:
        if not item or len(item) < 2:
            continue
        box, text_info = item[0], item[1]
        # text_info is normally (text, confidence); accept a bare string too.
        text = text_info[0] if isinstance(text_info, (tuple, list)) else text_info
        yield int(box[0][0]), int(box[0][1]), text


def _group_into_rows(text_boxes, y_threshold):
    """Cluster ``(x, y, text)`` boxes into table rows, top to bottom.

    Boxes are visited in ascending ``y``. A box joins the current row while
    its ``y`` is less than ``y_threshold`` pixels below the row's first box;
    otherwise it opens a new row. Unlike fixed ``y // threshold`` buckets,
    this never splits two neighbouring boxes just because they fall on either
    side of a bucket boundary. Each returned row is a list of texts ordered
    left to right.
    """
    rows = []
    anchor_y = None
    for x, y, text in sorted(text_boxes, key=lambda box: (box[1], box[0])):
        if anchor_y is None or y - anchor_y >= y_threshold:
            rows.append([])
            anchor_y = y
        rows[-1].append((x, text))
    return [
        [text for _, text in sorted(row, key=lambda cell: cell[0])]
        for row in rows
    ]


# Rows with fewer cells than this are discarded
# (presumably headings / stray text rather than table rows; confirm).
MIN_CELLS_PER_ROW = 4

all_rows = []

# The first page is skipped, as described in the section text above.
pages_to_process = image_paths[1:]

for img_path in tqdm(pages_to_process, desc="üîç OCR Processing"):
    result = ocr.predict(img_path)

    # Nothing detected on this page.
    if not result or not result[0]:
        continue

    # One image in, one result out: only the first entry is relevant.
    page_rows = _group_into_rows(_iter_text_boxes(result[0]), ROW_Y_THRESHOLD)

    for row_text in page_rows:
        if len(row_text) >= MIN_CELLS_PER_ROW:
            all_rows.append(row_text)

# len(pages_to_process) is 0 (not -1) when no pages were converted.
print(f"‚úÖ Extracted {len(all_rows)} rows from {len(pages_to_process)} pages")

NameError: name 'tqdm' is not defined

## 7. Normalize Column Count
Ensure all rows have the same number of columns:

In [None]:
# Pad every extracted row with empty strings so all rows share the width of
# the widest one, then load the result into a DataFrame.
if all_rows:
    max_cols = max(len(r) for r in all_rows)
    print(f"üìä Maximum columns detected: {max_cols}")

    # Build NEW padded lists: `row += [...]` would extend the lists stored in
    # `all_rows` in place and silently alter the raw OCR rows.
    normalized = [
        list(row) + [""] * (max_cols - len(row))
        for row in all_rows
    ]

    df = pd.DataFrame(normalized)
    print(f"‚úÖ Created DataFrame with {len(df)} rows and {len(df.columns)} columns")
else:
    # Keep `df` defined so the following cells can test `df.empty`.
    df = pd.DataFrame()
    print("‚ö†Ô∏è No rows extracted")

## 8. Preview Results
Display the first few rows of the extracted table:

In [None]:
# Show up to the first ten extracted rows, or a notice when nothing was found.
if df.empty:
    print("No data to display")
else:
    display(df.head(10))

## 9. Save Output Files
Export to Excel and CSV:

In [None]:
# Write the table to Excel and CSV without index or header rows (the columns
# are positional, so pandas' default integer header carries no information).
if not df.empty:
    df.to_excel(EXCEL_OUT, index=False, header=False)
    # "utf-8-sig" prepends a BOM so that Excel detects UTF-8 and shows the
    # Devanagari text correctly instead of mojibake; the file remains valid UTF-8.
    df.to_csv(CSV_OUT, index=False, header=False, encoding="utf-8-sig")

    print("\nüéâ DONE!")
    print(f"üìä Excel saved ‚Üí {EXCEL_OUT}")
    print(f"üìÑ CSV saved   ‚Üí {CSV_OUT}")
else:
    print("‚ö†Ô∏è No data to save")

## 10. Statistics & Summary

In [None]:
# Report a short summary of the extraction; skipped when nothing was extracted.
if not df.empty:
    pages_processed = len(image_paths) - 1  # the first page is not processed
    row_count = len(df)
    col_count = len(df.columns)
    filled_cells = df.astype(bool).sum().sum()  # "" is falsy, any text is truthy
    blank_cells = (df == '').sum().sum()

    summary_lines = [
        "üìà Summary Statistics:",
        f"   ‚Ä¢ Total pages processed: {pages_processed}",
        f"   ‚Ä¢ Total rows extracted: {row_count}",
        f"   ‚Ä¢ Total columns: {col_count}",
        f"   ‚Ä¢ Non-empty cells: {filled_cells}",
        f"   ‚Ä¢ Empty cells: {blank_cells}",
    ]
    for line in summary_lines:
        print(line)