In [None]:
# You need to install the following package:
!pip install opencv-python PyMuPDF


In [27]:
import cv2
import numpy as np
from PIL import Image, ImageDraw, ImageColor
import fitz  # PyMuPDF

def hex_to_hsv(hex_color):
    """Convert a hex color string (e.g. "#D8EDCF") to OpenCV's 8-bit HSV.

    Returns:
        A uint8 NumPy array [H, S, V] as produced by cv2.COLOR_BGR2HSV for a
        single pixel of that color.
    """
    red, green, blue = ImageColor.getcolor(hex_color, "RGB")
    # Build a 1x1 image in BGR channel order, which is what COLOR_BGR2HSV expects.
    pixel_bgr = np.array([[[blue, green, red]]], dtype=np.uint8)
    return cv2.cvtColor(pixel_bgr, cv2.COLOR_BGR2HSV)[0, 0]

def enhance_table_lines_from_pdf_hq(pdf_path, output_path, page_number=0, dpi=300,
                                    line_top_offset=200, line_bottom_offset=175):
    """
    Render one PDF page, detect vertical column separators and redraw them.

    The table body is located through rows containing pixels close in hue to
    "#D8EDCF". Near-vertical segments found inside that region with a
    probabilistic Hough transform supply the x-positions; each one is redrawn
    as a thin gray line whose vertical extent is derived from the detected
    header bottom and the last green row.

    Args:
        pdf_path: Path to the PDF.
        output_path: Path to save the image (use .png).
        page_number: Page to process (0-indexed).
        dpi: Resolution for rendering the PDF page.
        line_top_offset: Pixels added to the detected header bottom to get the
            y-coordinate where drawn lines start (previously hard-coded 200).
        line_bottom_offset: Pixels subtracted from the last green row to get
            the y-coordinate where drawn lines end (previously hard-coded 175).

    Returns:
        None. The annotated image is written to output_path. If no green rows
        are found, a message is printed and nothing is written.
    """
    # Render at the requested DPI; the context manager closes the document
    # once the pixel data has been copied into a PIL image.
    with fitz.open(pdf_path) as doc:
        pix = doc[page_number].get_pixmap(dpi=dpi)
        img = Image.frombytes("RGB", [pix.width, pix.height], pix.samples)
    img = cv2.cvtColor(np.array(img), cv2.COLOR_RGB2BGR)  # PIL (RGB) -> OpenCV (BGR)
    hsv = cv2.cvtColor(img, cv2.COLOR_BGR2HSV)

    # --- 1. Green Row Detection ---
    # hex_to_hsv returns uint8 values; use a plain int so "hue - tolerance"
    # cannot wrap around below zero.
    target_hue = int(hex_to_hsv("#D8EDCF")[0])
    tolerance = 10
    # NOTE(review): only the hue comes from the reference color. Its own
    # saturation appears to be ~32 in 8-bit HSV, below the fixed lower bound
    # of 50 used here - confirm these S/V bounds against real pages.
    lower_green = np.array([max(0, target_hue - tolerance), 50, 50])
    upper_green = np.array([min(179, target_hue + tolerance), 255, 255])
    green_mask = cv2.inRange(hsv, lower_green, upper_green)
    green_row_indices = np.flatnonzero(green_mask.any(axis=1))

    if green_row_indices.size == 0:
        print("No green rows detected.")
        return
    top_boundary = int(green_row_indices[0])
    bottom_boundary = int(green_row_indices[-1])

    # --- 2. Header Row Detection ---
    gray = cv2.cvtColor(img, cv2.COLOR_BGR2GRAY)  # Convert once, reuse
    header_bottom = 0
    if top_boundary > 0:  # skip when there is no region above the table
        _, binary_header = cv2.threshold(gray[:top_boundary, :], 180, 255,
                                         cv2.THRESH_BINARY_INV)
        # Lowest row above the table that contains dark pixels.
        inked_rows = np.flatnonzero(np.sum(binary_header, axis=1) > 20)
        if inked_rows.size:
            header_bottom = int(inked_rows[-1])

    # --- 3./4. Adaptive Thresholding + Hough Lines (within table, on GRAYSCALE) ---
    table_region = gray[top_boundary:bottom_boundary, :]
    lines = None
    if table_region.shape[0] > 0:  # a single green row yields an empty crop
        thresh_table = cv2.adaptiveThreshold(table_region, 255, cv2.ADAPTIVE_THRESH_MEAN_C,
                                             cv2.THRESH_BINARY_INV, 11, 3)
        lines = cv2.HoughLinesP(thresh_table, 1, np.pi / 180, threshold=155,
                                minLineLength=20, maxLineGap=8)

    # --- 5. Filter Vertical Lines ---
    vertical_lines = []
    if lines is not None:
        # tolist() yields plain Python ints instead of numpy scalars.
        for x1, y1, x2, y2 in lines.reshape(-1, 4).tolist():
            if abs(x2 - x1) < 5:
                # The crop only removed rows, so only y needs shifting back
                # into full-image coordinates.
                vertical_lines.append((x1, y1 + top_boundary, x2, y2 + top_boundary))

    # --- 6. Draw Lines (Directly on the OpenCV Image) ---
    print(vertical_lines)
    y_start = header_bottom + line_top_offset
    y_end = bottom_boundary - line_bottom_offset
    for x1, _, x2, _ in vertical_lines:
        # Only the x-positions of the detected segments are used; every line
        # spans the same vertical range.
        cv2.line(img, (x1, y_start), (x2, y_end), (100, 100, 100), 1)

    # --- 7. Convert to PIL and Save (ONE conversion) ---
    img_pil = Image.fromarray(cv2.cvtColor(img, cv2.COLOR_BGR2RGB))  # back to RGB
    img_pil.save(output_path)


In [28]:
# Process the default page (page_number=0) of W1.pdf at 300 DPI.
enhance_table_lines_from_pdf_hq(
    pdf_path="W1.pdf",
    output_path="output_combined.png",
    dpi=300,
)

[(100, 2483, 100, 186), (2380, 2483, 2380, 186), (2271, 2291, 2271, 243), (101, 2483, 101, 186), (2379, 2483, 2379, 186), (206, 2291, 206, 392), (281, 2296, 281, 245), (2270, 2240, 2270, 245), (282, 2296, 282, 245), (2269, 2239, 2269, 245), (283, 2296, 283, 245), (207, 2291, 207, 245), (1439, 392, 1439, 250), (1414, 1460, 1414, 1418), (2009, 881, 2009, 836), (1440, 392, 1440, 245), (806, 1364, 806, 1321), (1438, 2138, 1438, 2094), (2010, 687, 2010, 643), (808, 1752, 808, 1708), (477, 1655, 477, 1611), (1415, 1267, 1415, 1224), (1146, 2042, 1146, 1997), (1437, 392, 1437, 245), (479, 977, 479, 933), (807, 1074, 807, 1030), (1606, 1460, 1606, 1418), (478, 2138, 478, 2094), (925, 491, 925, 448), (2008, 491, 2008, 448), (1605, 1655, 1605, 1611), (1147, 1460, 1147, 1418), (1779, 1848, 1779, 1804), (1048, 1171, 1048, 1128), (923, 784, 923, 740), (638, 2235, 638, 2191), (1047, 1559, 1047, 1514), (1915, 2042, 1915, 1997), (639, 2138, 639, 2094), (922, 589, 922, 545), (1777, 1460, 1777, 1418), (

In [None]:
import cv2
import numpy as np
from PIL import Image, ImageDraw, ImageColor
import fitz  # PyMuPDF
import json  # For saving the line positions

def hex_to_hsv(hex_color):
    """Return the OpenCV 8-bit HSV triple (uint8 array) for a hex color string."""
    rgb = ImageColor.getcolor(hex_color, "RGB")
    # Reverse RGB -> BGR, the channel order cv2.COLOR_BGR2HSV expects,
    # and wrap it as a 1x1 three-channel image.
    bgr_pixel = np.asarray([[rgb[::-1]]], dtype=np.uint8)
    hsv_image = cv2.cvtColor(bgr_pixel, cv2.COLOR_BGR2HSV)
    return hsv_image[0][0]

def enhance_table_lines_from_pdf_and_save(pdf_path, output_image_path, output_json_path, page_number=0, dpi=300):
    """
    Enhances vertical column separators, preserves image quality, and saves line positions.

    The table body is located through rows containing pixels close in hue to
    "#D8EDCF". Near-vertical Hough segments inside that region are reduced to
    a single x-coordinate each; a gray line is drawn at every x from the
    detected header bottom down to the last green row, and the list of
    x-coordinates is written as JSON.

    Args:
        pdf_path: Path to the PDF.
        output_image_path: Path to save the image (use .png).
        output_json_path: Path to save the line positions (JSON list of ints,
            x-coordinates in pixels of the rendered page).
        page_number: Page to process (0-indexed).
        dpi: Resolution for rendering the PDF page.

    Returns:
        None. If no green rows are found, a message is printed and neither
        output file is written.
    """
    # The context manager closes the document once the pixels are copied.
    with fitz.open(pdf_path) as doc:
        pix = doc[page_number].get_pixmap(dpi=dpi)
        img = Image.frombytes("RGB", [pix.width, pix.height], pix.samples)
    img = cv2.cvtColor(np.array(img), cv2.COLOR_RGB2BGR)
    hsv = cv2.cvtColor(img, cv2.COLOR_BGR2HSV)

    # --- 1. Green Row Detection ---
    # hex_to_hsv returns uint8 values; use a plain int so "hue - tolerance"
    # cannot wrap around below zero.
    target_hue = int(hex_to_hsv("#D8EDCF")[0])
    tolerance = 10
    lower_green = np.array([max(0, target_hue - tolerance), 50, 50])
    upper_green = np.array([min(179, target_hue + tolerance), 255, 255])
    green_mask = cv2.inRange(hsv, lower_green, upper_green)
    green_row_indices = np.flatnonzero(green_mask.any(axis=1))

    if green_row_indices.size == 0:
        print("No green rows detected.")
        return
    top_boundary = int(green_row_indices[0])
    bottom_boundary = int(green_row_indices[-1])

    # --- 2. Header Row Detection ---
    gray = cv2.cvtColor(img, cv2.COLOR_BGR2GRAY)
    header_bottom = 0
    if top_boundary > 0:  # skip when there is no region above the table
        _, binary_header = cv2.threshold(gray[:top_boundary, :], 180, 255,
                                         cv2.THRESH_BINARY_INV)
        inked_rows = np.flatnonzero(np.sum(binary_header, axis=1) > 20)
        if inked_rows.size:
            # The header crop starts at row 0, so this index is already in
            # full-image coordinates and must not be shifted by top_boundary.
            header_bottom = int(inked_rows[-1])

    # --- 3./4. Adaptive Thresholding + Hough Lines ---
    table_region = gray[top_boundary:bottom_boundary, :]
    lines = None
    if table_region.shape[0] > 0:  # a single green row yields an empty crop
        thresh_table = cv2.adaptiveThreshold(table_region, 255, cv2.ADAPTIVE_THRESH_MEAN_C,
                                             cv2.THRESH_BINARY_INV, 11, 3)
        lines = cv2.HoughLinesP(thresh_table, 1, np.pi / 180, threshold=155,
                                minLineLength=20, maxLineGap=8)

    # --- 5. Filter Vertical Lines and Store Positions ---
    vertical_line_positions = []  # x-coordinates as plain ints (JSON-serializable)
    if lines is not None:
        # tolist() yields plain Python ints instead of numpy scalars.
        for x1, _, x2, _ in lines.reshape(-1, 4).tolist():
            if abs(x2 - x1) < 5:
                # The table crop keeps every column, so x needs no offset.
                # Use the midpoint of both endpoints as the line's x.
                vertical_line_positions.append((x1 + x2) // 2)

    # --- 6. Draw Lines ---
    # Draw lines for visualization (optional, but good for checking)
    for x in vertical_line_positions:
        cv2.line(img, (x, header_bottom), (x, bottom_boundary), (100, 100, 100), 1)

    # --- 7. Convert to PIL and Save Image ---
    img_pil = Image.fromarray(cv2.cvtColor(img, cv2.COLOR_BGR2RGB))
    img_pil.save(output_image_path)

    # --- 8. Save Line Positions (JSON) ---
    with open(output_json_path, 'w') as f:
        json.dump(vertical_line_positions, f)

    print(f"Image saved to: {output_image_path}")
    print(f"Line positions saved to: {output_json_path}")
    print(vertical_line_positions)

# Example usage: annotate W1.pdf and store the detected line positions as JSON.
enhance_table_lines_from_pdf_and_save(
    pdf_path="W1.pdf",
    output_image_path="output_with_lines.png",
    output_json_path="line_positions.json",
)

# Saving JSON Line positions

In [17]:
import cv2
import numpy as np
from PIL import Image, ImageDraw, ImageColor
import fitz  # PyMuPDF
import json
import os

def hex_to_hsv(hex_color):
    """Translate a hex color string into the uint8 HSV triple OpenCV computes for it."""
    r, g, b = ImageColor.getcolor(hex_color, "RGB")
    # Note: OpenCV expects BGR order, so the channels are stored swapped
    # in a 1x1 three-channel image before conversion.
    single_pixel = np.zeros((1, 1, 3), dtype=np.uint8)
    single_pixel[0, 0] = (b, g, r)
    return cv2.cvtColor(single_pixel, cv2.COLOR_BGR2HSV)[0][0]

def enhance_table_lines_from_pdf_hq(pdf_path, output_path, page_number=0, dpi=300,
                                      vertical_lines_file=None, save_vertical_lines=False,
                                      line_top_offset=200, line_bottom_offset=175):
    """
    Enhances vertical column separators, preserving image quality.

    Optionally loads precomputed vertical lines from a JSON file, or saves the computed
    vertical lines for later reuse.

    If vertical_lines_file is given and already exists, its contents are used
    and no detection runs (and nothing is re-saved, even when
    save_vertical_lines is True). Otherwise lines are detected with a
    probabilistic Hough transform inside the green-row table region.

    Args:
        pdf_path (str): Path to the PDF.
        output_path (str): Path to save the image (e.g., .png).
        page_number (int): Page to process (0-indexed).
        dpi (int): Resolution for rendering the PDF page.
        vertical_lines_file (str): Path to a JSON file to load/save vertical lines
            (a list of [x1, y1, x2, y2] entries).
        save_vertical_lines (bool): If True, saves computed vertical lines to the file.
        line_top_offset (int): Pixels added to the detected header bottom to get
            the y-coordinate where drawn lines start (previously hard-coded 200).
        line_bottom_offset (int): Pixels subtracted from the last green row to get
            the y-coordinate where drawn lines end (previously hard-coded 175).

    Returns:
        None. The annotated image is written to output_path. If no green rows
        are found, a message is printed and nothing is written.
    """
    # Render at the requested DPI; the context manager closes the document
    # once the pixel data has been copied into a PIL image.
    with fitz.open(pdf_path) as doc:
        pix = doc[page_number].get_pixmap(dpi=dpi)
        img = Image.frombytes("RGB", [pix.width, pix.height], pix.samples)
    img = cv2.cvtColor(np.array(img), cv2.COLOR_RGB2BGR)  # PIL (RGB) -> OpenCV (BGR)
    hsv = cv2.cvtColor(img, cv2.COLOR_BGR2HSV)

    # --- 1. Green Row Detection ---
    # hex_to_hsv returns uint8 values; use a plain int so "hue - tolerance"
    # cannot wrap around below zero.
    target_hue = int(hex_to_hsv("#D8EDCF")[0])
    tolerance = 10
    lower_green = np.array([max(0, target_hue - tolerance), 50, 50])
    upper_green = np.array([min(179, target_hue + tolerance), 255, 255])
    green_mask = cv2.inRange(hsv, lower_green, upper_green)
    green_row_indices = np.flatnonzero(green_mask.any(axis=1))

    if green_row_indices.size == 0:
        print("No green rows detected.")
        return
    top_boundary = int(green_row_indices[0])
    bottom_boundary = int(green_row_indices[-1])

    # --- 2. Header Row Detection ---
    gray = cv2.cvtColor(img, cv2.COLOR_BGR2GRAY)  # Convert once, reuse
    header_bottom = 0
    if top_boundary > 0:  # skip when there is no region above the table
        _, binary_header = cv2.threshold(gray[:top_boundary, :], 180, 255,
                                         cv2.THRESH_BINARY_INV)
        # Lowest row above the table that contains dark pixels.
        inked_rows = np.flatnonzero(np.sum(binary_header, axis=1) > 20)
        if inked_rows.size:
            header_bottom = int(inked_rows[-1])

    # --- 3./4. Determine Vertical Lines ---
    vertical_lines = []
    if vertical_lines_file is not None and os.path.exists(vertical_lines_file):
        # Reuse previously saved lines; JSON stores them as lists, so convert
        # each entry back to a tuple of ints.
        with open(vertical_lines_file, "r") as f:
            vertical_lines = [tuple(int(v) for v in line) for line in json.load(f)]
    else:
        # Thresholding is only needed when lines are actually detected.
        table_region = gray[top_boundary:bottom_boundary, :]
        lines = None
        if table_region.shape[0] > 0:  # a single green row yields an empty crop
            thresh_table = cv2.adaptiveThreshold(table_region, 255,
                                                 cv2.ADAPTIVE_THRESH_MEAN_C,
                                                 cv2.THRESH_BINARY_INV, 11, 3)
            lines = cv2.HoughLinesP(thresh_table, 1, np.pi / 180, threshold=155,
                                    minLineLength=20, maxLineGap=8)
        if lines is not None:
            # tolist() yields plain Python ints instead of numpy scalars.
            for x1, y1, x2, y2 in lines.reshape(-1, 4).tolist():
                if abs(x2 - x1) < 5:
                    # The crop only removed rows, so only y needs shifting
                    # back into full-image coordinates.
                    vertical_lines.append((x1, y1 + top_boundary, x2, y2 + top_boundary))
        else:
            print("No vertical lines detected.")

        # Optionally, if the caller requested it, save the computed vertical lines.
        if save_vertical_lines and vertical_lines_file is not None:
            with open(vertical_lines_file, "w") as f:
                json.dump([list(line) for line in vertical_lines], f)

    # --- 5. Draw Lines (Directly on the OpenCV Image) ---
    print("Vertical lines:", vertical_lines)
    y_start = header_bottom + line_top_offset
    y_end = bottom_boundary - line_bottom_offset
    for x1, _, x2, _ in vertical_lines:
        # Only the x-positions of the stored segments are used; every line is
        # drawn over the same vertical range.
        cv2.line(img, (x1, y_start), (x2, y_end), (100, 100, 100), 1)

    # --- 6. Convert to PIL and Save (ONE conversion) ---
    img_pil = Image.fromarray(cv2.cvtColor(img, cv2.COLOR_BGR2RGB))  # back to RGB
    img_pil.save(output_path)

# Example usage:
#if __name__ == "__main__":
#    # First run: compute and save the vertical lines.
#    enhance_table_lines_from_pdf_hq("input.pdf", "output1.png",
#                                     vertical_lines_file="vertical_lines.json",
#                                     save_vertical_lines=True)
    
    # Later: load the previously saved vertical lines and apply them to a new document.
#    enhance_table_lines_from_pdf_hq("another_input.pdf", "output2.png",
#                                     vertical_lines_file="vertical_lines.json",
#                                     save_vertical_lines=False)

In [None]:
# Annotate W1.pdf; vertical_lines2.json is loaded if it exists, otherwise written.
enhance_table_lines_from_pdf_hq(
    pdf_path="W1.pdf",
    output_path="output1.png",
    vertical_lines_file="vertical_lines2.json",
    save_vertical_lines=True,
)

In [22]:
# Annotate Img2.pdf; vertical_lines3.json is loaded if it exists, otherwise written.
enhance_table_lines_from_pdf_hq(
    pdf_path="Img2.pdf",
    output_path="TEST2.png",
    vertical_lines_file="vertical_lines3.json",
    save_vertical_lines=True,
)

Vertical lines: [(2380, 2495, 2380, 186), (100, 2495, 100, 186), (2379, 2495, 2379, 186), (101, 2495, 101, 186), (283, 2307, 283, 230), (207, 2302, 207, 230), (284, 2307, 284, 230), (2300, 2301, 2300, 230), (2299, 2252, 2299, 230), (206, 327, 206, 230), (282, 2307, 282, 389), (2298, 2252, 2298, 390), (206, 2302, 206, 379), (1456, 379, 1456, 288), (1799, 1856, 1799, 1811), (483, 2149, 483, 2104), (1432, 2149, 1432, 2104), (481, 1758, 481, 1713), (2034, 1954, 2034, 1909), (1626, 1070, 1626, 1025), (934, 1070, 934, 1025), (1457, 972, 1457, 927), (1940, 1070, 1940, 1025), (482, 2051, 482, 2006), (1431, 579, 1431, 534), (932, 874, 932, 829), (1454, 2247, 1454, 2202), (1458, 1364, 1458, 1321), (1060, 776, 1060, 731), (1430, 2247, 1430, 2202), (816, 679, 816, 634), (645, 2051, 645, 2006), (1161, 1856, 1161, 1811), (2036, 1562, 2036, 1517), (2033, 776, 2033, 731), (935, 1856, 935, 1811), (1800, 874, 1800, 829), (815, 1168, 815, 1125), (2032, 1660, 2032, 1615), (1628, 2051, 1628, 2006), (933, 1