In [1]:
# ! pip install transformers

In [4]:
import cv2
import pytesseract
import tabula
from pdf2image import convert_from_path
import pandas as pd
import numpy as np
import os

# Tell pytesseract where the Tesseract executable lives. The value below is a
# Windows-style path; change it to match the installation on your machine.
pytesseract.pytesseract.tesseract_cmd = r'C:\Program Files\Tesseract-OCR\tesseract.exe'

# Preprocess image for better OCR
def preprocess_image(image):
    """Return a binarised, single-channel version of ``image`` for OCR.

    Args:
        image: Image array exposing ``.shape``. A 3-dimensional array is
            converted to grayscale with ``cv2.COLOR_BGR2GRAY``; any other
            array is used as-is.

    Returns:
        The thresholded image produced by ``cv2.threshold`` with the
        ``THRESH_BINARY + THRESH_OTSU`` flags.
    """
    has_channel_axis = len(image.shape) == 3
    gray = cv2.cvtColor(image, cv2.COLOR_BGR2GRAY) if has_channel_axis else image

    # Stretch contrast (gain 1.5, no offset), then smooth with a 5x5 Gaussian
    # so the automatic threshold is less affected by pixel-level noise.
    boosted = cv2.convertScaleAbs(gray, alpha=1.5, beta=0)
    smoothed = cv2.GaussianBlur(boosted, (5, 5), 0)

    # With THRESH_OTSU the threshold argument (0) is ignored and chosen
    # automatically; only the resulting image is needed here.
    binary = cv2.threshold(smoothed, 0, 255, cv2.THRESH_BINARY + cv2.THRESH_OTSU)[1]
    return binary

# Detect table boundaries (improved with larger region and debug output)
def detect_table(image, padding=50):
    """Locate the largest contour in ``image`` and crop a padded box around it.

    Args:
        image: Image array to search (the callers in this file pass the
            output of ``preprocess_image``).
        padding: Number of pixels added on every side of the detected
            bounding box before cropping. Defaults to 50.

    Returns:
        A ``(table_region, (x, y, w, h))`` tuple. ``table_region`` is the
        cropped slice of ``image`` and ``(x, y, w, h)`` is the crop rectangle
        in ``image`` coordinates. When no contour is found the full image and
        its full extent are returned.

    Side effects:
        Writes the crop to ``table_region_debug.png`` in the current working
        directory and prints the detected rectangle.
    """
    img_h, img_w = image.shape[:2]

    # Edge detection with adjusted parameters
    edges = cv2.Canny(image, 30, 100)

    # Dilate so broken table rules merge into one connected outline
    kernel = np.ones((5, 5), np.uint8)
    dilated = cv2.dilate(edges, kernel, iterations=2)

    # findContours returns (contours, hierarchy) on OpenCV 2.x/4.x but
    # (image, contours, hierarchy) on 3.x; index -2 is the contour list on all.
    contours = cv2.findContours(dilated, cv2.RETR_EXTERNAL, cv2.CHAIN_APPROX_SIMPLE)[-2]
    if not contours:
        print("No contours found, using full image")
        return image, (0, 0, img_w, img_h)

    # The largest contour by area is taken to be the table
    bx, by, bw, bh = cv2.boundingRect(max(contours, key=cv2.contourArea))

    # Pad each edge independently and clamp to the image. Working with corner
    # coordinates keeps the padding symmetric even when the near edge is
    # clamped at 0 (growing w/h by 2*padding would over-extend the far edge).
    left = max(0, bx - padding)
    top = max(0, by - padding)
    right = min(img_w, bx + bw + padding)
    bottom = min(img_h, by + bh + padding)
    x, y, w, h = left, top, right - left, bottom - top

    table_region = image[y:y+h, x:x+w]

    # Save cropped region for debugging
    cv2.imwrite("table_region_debug.png", table_region)
    print(f"Table detected at (x={x}, y={y}, w={w}, h={h}). Saved as 'table_region_debug.png'")

    return table_region, (x, y, w, h)

# Extract text with OCR, tuned for tables
def ocr_table(table_image):
    """Run Tesseract OCR on ``table_image`` and return the recognised text.

    The configuration selects OCR engine mode 3, page segmentation mode 6
    (treat the image as a block of text) and keeps inter-word spacing so that
    column gaps survive in the output.
    """
    return pytesseract.image_to_string(
        table_image,
        config=r'--oem 3 --psm 6 -c preserve_interword_spaces=1',
    )

# Convert OCR text to structured table (improved parsing)
def text_to_csv(text, output_file, headers=None):
    """Parse OCR text of a numeric table and write it to ``output_file`` as CSV.

    Lines are kept only if they contain at least one digit and are not the
    table title. Each kept line is split on whitespace and ``|``; rows need at
    least two cells and a first cell containing a digit, which drops the
    OCR'd column-header line. Rows are padded with empty strings or truncated
    to the number of headers.

    Args:
        text: Raw text returned by the OCR step.
        output_file: Path of the CSV file to write.
        headers: Optional list of column names. Defaults to the standard
            normal table layout (``Z`` plus ``.00`` through ``.09``).

    Raises:
        ValueError: If no line contains a digit, or if no line yields a
            usable data row.
    """
    if headers is None:
        headers = ['Z', '.00', '.01', '.02', '.03', '.04', '.05', '.06', '.07', '.08', '.09']
    n_cols = len(headers)

    # Debug: Print raw OCR text
    print(f"Raw OCR Text:\n{text}\n{'-'*50}")

    found_numeric_line = False
    data = []
    for raw_line in text.split('\n'):
        line = raw_line.strip()
        if not line:
            continue
        # Skip the title; strip first so leading OCR whitespace cannot hide it
        if line.startswith('STANDARD NORMAL') or 'Z score' in line:
            continue
        if not any(ch.isdigit() for ch in line):
            continue
        found_numeric_line = True

        # Split by whitespace or '|'; cells that were pure markup are dropped
        cells = [_clean_cell(tok) for tok in line.replace('|', ' ').split()]
        row = [cell for cell in cells if cell]

        # Need the row label plus at least one value
        if len(row) < 2:
            continue
        # A data row starts with a numeric label; a header such as "Zz" does not
        if not any(ch.isdigit() for ch in row[0]):
            continue

        # Truncate or right-pad to exactly n_cols cells
        row = row[:n_cols] + [''] * (n_cols - len(row))
        data.append(row)

    if not found_numeric_line:
        raise ValueError("No table rows detected in OCR output")
    if not data:
        raise ValueError("No valid table data parsed")

    df = pd.DataFrame(data, columns=headers)
    df.to_csv(output_file, index=False)
    print(f"Saved {len(data)} rows to {output_file}")


def _clean_cell(cell):
    """Remove LaTeX-style bold markup and dollar signs wrapped around a cell."""
    # str.strip(chars) treats chars as a set of characters, so the prefix is
    # removed explicitly instead of being passed to strip().
    prefix = '\\mathbf{'
    cell = cell.strip('$')
    if cell.startswith(prefix):
        cell = cell[len(prefix):]
    return cell.strip('}')

# Handle multi-page PDF or image
def extract_table_to_csv(input_file, output_csv="output.csv",
                         poppler_path=r'C:\Program Files\poppler-23.11.0\Library\bin',
                         dpi=300):
    """Extract a table from a PDF or image file and write it to ``output_csv``.

    A path ending in ``.pdf`` (case-insensitive) is rendered page by page and
    the per-page tables are concatenated into one CSV. Any other path is read
    as a single image with ``cv2.imread``.

    Args:
        input_file: Path to the PDF or image to process.
        output_csv: Destination CSV path.
        poppler_path: Directory with the Poppler binaries, forwarded to
            ``convert_from_path``.
        dpi: Rendering resolution for PDF pages.

    Raises:
        FileNotFoundError: If an image path does not exist.
        ValueError: If an image cannot be decoded, if the PDF yields no
            pages, or if table parsing fails for a page.
    """
    if not input_file.lower().endswith('.pdf'):
        # Process single image. imread signals failure by returning None
        # rather than raising, so check before using the result.
        raw = cv2.imread(input_file)
        if raw is None:
            if not os.path.exists(input_file):
                raise FileNotFoundError(f"Image file not found: {input_file}")
            raise ValueError(f"Could not decode image file: {input_file}")
        table_img, _ = detect_table(preprocess_image(raw))
        text_to_csv(ocr_table(table_img), output_csv)
        return

    # Convert PDF to images
    images = convert_from_path(input_file, poppler_path=poppler_path, dpi=dpi)

    all_dfs = []
    for i, image in enumerate(images):
        print(f"Processing page {i+1}")
        # Pages arrive as PIL images (RGB order); the preprocessing step
        # converts from BGR, so reorder the channels first.
        img_array = cv2.cvtColor(np.array(image.convert("RGB")), cv2.COLOR_RGB2BGR)
        img_gray = preprocess_image(img_array)
        table_img, _ = detect_table(img_gray)
        table_text = ocr_table(table_img)

        temp_csv = f"temp_page_{i+1}.csv"
        try:
            text_to_csv(table_text, temp_csv)
            # Read everything back as text: letting pandas infer dtypes would
            # rewrite values such as ".00005" as "5e-05" in the combined CSV.
            all_dfs.append(pd.read_csv(temp_csv, dtype=str, keep_default_na=False))
        finally:
            # Never leave the scratch file behind, even if parsing failed
            if os.path.exists(temp_csv):
                os.remove(temp_csv)

    if not all_dfs:
        raise ValueError(f"No pages could be read from {input_file}")

    # Combine all pages into one CSV
    combined_df = pd.concat(all_dfs, ignore_index=True)
    combined_df.to_csv(output_csv, index=False)
    print(f"Combined CSV saved to {output_csv} with {len(combined_df)} rows")

# Script entry point: run the extraction on a sample PDF and write the result
# to "normal_table_output.csv".
if __name__ == "__main__":
    input_file = "table.pdf"  # Replace with your file path
    extract_table_to_csv(input_file, "normal_table_output.csv")

Processing page 1
Table detected at (x=315, y=422, w=2228, h=2465). Saved as 'table_region_debug.png'
Raw OCR Text:
STANDARD NORMAL DISTRIBUTION: Table Values Represent AREA to the LEFT of the Z score.

Zz       00          01          .02          .03          04          05          06          07          .08          .09
-3.9     .00005      .00005      .00004      .00004      .00004      .00004      .00004      .00004      .00003      .00003
-3.8     .00007      .00007      .00007      .00006      .00006      .00006      .00006      .00005      .00005      .00005
-3.7        .00011          .00010         .00010         .00010         .00009         .00009         .00008         .00008         .00008         .00008
-3.6     .00016      .00015      .00015      .00014      .00014      .00013      .00013      .00012      .00012      .00011
-3.5        .00023         .00022         .00022         .0002 1          .00020         .00019         .00019         .00018         .00017      