In [1]:
import os
import re
import tempfile

import cv2
import numpy as np
import pandas as pd
import pytesseract
from pdf2image import convert_from_path

# Set Tesseract path (adjust for your system)
# Only override pytesseract's command when this Windows install actually exists;
# on other machines the library's own default command setting is left untouched
# instead of being pointed at a path that is not there.
_tesseract_exe = r'C:\Program Files\Tesseract-OCR\tesseract.exe'  # Windows path example
if os.path.isfile(_tesseract_exe):
    pytesseract.pytesseract.tesseract_cmd = _tesseract_exe

# Preprocess image for better OCR
def preprocess_image(image):
    """Turn a page image into a binarized image for OCR.

    Args:
        image: NumPy image array. A 3-dimensional array is converted with
            cv2.COLOR_BGR2GRAY (so BGR channel order is expected); any other
            rank is used as-is.

    Returns:
        The image after contrast scaling, a 5x5 Gaussian blur and an Otsu
        binary threshold.
    """
    gray = cv2.cvtColor(image, cv2.COLOR_BGR2GRAY) if image.ndim == 3 else image

    # Increase contrast before smoothing
    boosted = cv2.convertScaleAbs(gray, alpha=1.5, beta=0)
    smoothed = cv2.GaussianBlur(boosted, (5, 5), 0)

    # cv2.threshold returns (threshold_value, image); only the image is needed
    binary = cv2.threshold(smoothed, 0, 255, cv2.THRESH_BINARY + cv2.THRESH_OTSU)[1]
    return binary

# Detect table boundaries
def detect_table(image, padding=50):
    """Crop the image to the padded bounding box of its largest outer contour.

    Args:
        image: NumPy image array passed to cv2.Canny (presumably the
            single-channel output of preprocess_image — confirm at call site).
        padding: Pixels added on every side of the detected box; the padded
            box is clipped to the image bounds.

    Returns:
        (table_region, (x, y, w, h)): the cropped view and its box in the
        coordinates of ``image``. When no contour is found, the full image and
        (0, 0, width, height) are returned.
    """
    edges = cv2.Canny(image, 30, 100)
    kernel = np.ones((5, 5), np.uint8)
    dilated = cv2.dilate(edges, kernel, iterations=2)

    # findContours returns (contours, hierarchy) in OpenCV 4.x but
    # (image, contours, hierarchy) in 3.x; [-2] selects contours in both.
    contours = cv2.findContours(dilated, cv2.RETR_EXTERNAL, cv2.CHAIN_APPROX_SIMPLE)[-2]

    img_h, img_w = image.shape[0], image.shape[1]

    if len(contours) == 0:
        print("No table detected, using full image")
        return image, (0, 0, img_w, img_h)

    max_contour = max(contours, key=cv2.contourArea)
    x, y, w, h = cv2.boundingRect(max_contour)

    # Pad each edge independently, then clip. Deriving the far edges from the
    # original box (not from the already-clamped origin) keeps the right and
    # bottom padding at exactly `padding` even when the box touches the
    # top/left border.
    left = max(0, x - padding)
    top = max(0, y - padding)
    right = min(img_w, x + w + padding)
    bottom = min(img_h, y + h + padding)

    table_region = image[top:bottom, left:right]
    return table_region, (left, top, right - left, bottom - top)

# Extract text with OCR
def ocr_table(table_image):
    """Run Tesseract on the table image and return the recognized text.

    The fixed configuration string requests ``--oem 3 --psm 6`` and sets
    ``preserve_interword_spaces=1``.
    """
    return pytesseract.image_to_string(
        table_image,
        config=r'--oem 3 --psm 6 -c preserve_interword_spaces=1',
    )

# Convert OCR text to structured table with dynamic headers
def text_to_csv(text, output_file, min_gap=1):
    """Parse whitespace-separated OCR text into a table and write it as CSV.

    The first line that splits into more than one cell becomes the header.
    Every later non-empty line is a data row, padded with '' or truncated so
    that it has exactly as many cells as the header.

    Args:
        text: OCR output, one table row per line.
        output_file: Destination path handed to ``DataFrame.to_csv``.
        min_gap: Minimum run of consecutive whitespace characters that counts
            as a column separator. The default of 1 splits on any whitespace.
            Pass 2 or more to keep single-spaced multi-word cells
            (e.g. "First Name") together when the OCR text preserves
            inter-word spacing.

    Raises:
        ValueError: if ``min_gap`` is below 1, fewer than two non-empty lines
            exist, no line has more than one cell, or no data rows follow the
            header.
    """
    if min_gap < 1:
        raise ValueError("min_gap must be at least 1")

    if min_gap == 1:
        # Plain whitespace splitting
        split_cells = str.split
    else:
        gap_pattern = re.compile(r'\s{%d,}' % min_gap)

        def split_cells(line):
            return [cell for cell in gap_pattern.split(line) if cell]

    lines = [line.strip() for line in text.split('\n') if line.strip()]

    if len(lines) < 2:  # Need at least header and one data row
        raise ValueError("Insufficient data for a table")

    # The first line with multiple columns is taken as the header
    header_line = None
    data_lines = []
    for i, line in enumerate(lines):
        cells = split_cells(line)
        if len(cells) > 1:
            header_line = cells
            data_lines = lines[i + 1:]
            break

    if not header_line:
        raise ValueError("No header row detected")

    # Pad short rows with '' and truncate long ones to the header width
    width = len(header_line)
    data = []
    for line in data_lines:
        row = split_cells(line)
        if row:
            data.append((row + [''] * (width - len(row)))[:width])

    if not data:
        raise ValueError("No valid data rows detected")

    # Create DataFrame with dynamic headers
    df = pd.DataFrame(data, columns=header_line)
    df.to_csv(output_file, index=False)
    print(f"Saved {len(data)} rows to {output_file} with headers: {header_line}")

# Handle multi-page PDF
def extract_table_to_csv(input_pdf, output_csv="output.csv",
                         poppler_path=r'C:\Program Files\poppler-23.11.0\Library\bin',
                         dpi=300):
    """OCR the table on every page of a PDF and write one combined CSV.

    Each page is rasterized, preprocessed, cropped by detect_table, OCR'd and
    parsed by text_to_csv. Pages that raise ValueError are reported and
    skipped. If no page yields a table, nothing is written.

    Args:
        input_pdf: Path of the PDF to read.
        output_csv: Path of the combined CSV to write.
        poppler_path: Directory containing the Poppler binaries, forwarded to
            pdf2image's ``convert_from_path``.
        dpi: Rasterization resolution forwarded to ``convert_from_path``.

    Returns:
        None.
    """
    all_dfs = []

    # Convert PDF to images
    images = convert_from_path(input_pdf, poppler_path=poppler_path, dpi=dpi)

    # Per-page CSVs live in a private temp directory so they cannot overwrite
    # files in the working directory and are removed even if a page fails.
    with tempfile.TemporaryDirectory() as temp_dir:
        for i, image in enumerate(images):
            print(f"Processing page {i + 1}")
            # The pages are PIL images (RGB channel order), while
            # preprocess_image converts with COLOR_BGR2GRAY, so reorder the
            # channels to BGR first.
            img_array = cv2.cvtColor(np.array(image.convert("RGB")), cv2.COLOR_RGB2BGR)
            img_gray = preprocess_image(img_array)
            table_img, _ = detect_table(img_gray)
            table_text = ocr_table(table_img)
            temp_csv = os.path.join(temp_dir, f"temp_page_{i + 1}.csv")

            try:
                text_to_csv(table_text, temp_csv)
                # Read cells back as plain text: type inference would turn
                # OCR strings such as "007" into 7 or "NA" into NaN.
                all_dfs.append(pd.read_csv(temp_csv, dtype=str, keep_default_na=False))
            except ValueError as e:
                print(f"Error on page {i + 1}: {e}")
                continue

    if not all_dfs:
        print("No tables extracted from the PDF")
        return

    # Combine all pages into one CSV
    combined_df = pd.concat(all_dfs, ignore_index=True)
    combined_df.to_csv(output_csv, index=False)
    print(f"Combined CSV saved to {output_csv} with {len(combined_df)} rows")

# Test the function
if __name__ == "__main__":
    # Input PDF and destination CSV for this run; replace with your own paths
    input_pdf, output_csv = "test_farhan.pdf", "table_output.csv"
    extract_table_to_csv(input_pdf=input_pdf, output_csv=output_csv)

Processing page 1
Error on page 1: No header row detected
Processing page 2
Error on page 2: No header row detected
Processing page 3
Error on page 3: No header row detected
No tables extracted from the PDF


In [2]:
# Show the contents of the current working directory
os.listdir(os.curdir)

['.git',
 '.ipynb_checkpoints',
 'new.ipynb',
 'pdf_csv_techniques (OpenCV) and OCR (Tesseract).ipynb',
 'README.md',
 'table.pdf',
 'table_region_debug.png',
 'test_farhan.pdf']