In [None]:
import csv
import math
import os
import shutil
import sys
import time
import warnings
from pathlib import Path

import cv2
import numpy as np
import torch
import easyocr
import matplotlib.pyplot as plt

from PIL import Image, ImageDraw, ImageFont
from vietocr.tool.predictor import Predictor
from vietocr.tool.config import Cfg
from pdf2image import convert_from_path

from transformers import AutoImageProcessor, TableTransformerForObjectDetection
from ultralytics import YOLO
from torchvision import transforms
from tqdm.auto import tqdm

def cv2_imshow(img_array, save_path="temp_display_image.png"):
    """Save an image to disk instead of showing it on screen.

    Args:
        img_array: Image array handed to ``cv2.imwrite``.
        save_path: Destination file path for the written image.
    """
    # cv2.imwrite reports failure through its boolean return value, so only
    # announce success when the file was actually written.
    if cv2.imwrite(save_path, img_array):
        print(f"Image saved to {save_path}")
    else:
        print(f"Failed to save image to {save_path}")

warnings.filterwarnings("ignore", category=FutureWarning)
warnings.filterwarnings("ignore", category=UserWarning)

# Feature flags read by the cells below (lib_status["..."]). The imports at the
# top of this notebook are unconditional, so reaching this line means the
# optional libraries are present; only the Colab flag has to be probed.
lib_status = {
    "google_colab": "google.colab" in sys.modules,
    "ultralytics": True,
    "transformers": True,
    "pdf2image": True,
}

# === Initialise EasyOCR ===
reader = easyocr.Reader(['vi'])
# detect_tables_cv_tt_combined reads the OCR engine under this name.
ocr_reader = reader

# === Initialise the TableTransformer structure-recognition model ===
device = "cuda" if torch.cuda.is_available() else "cpu"
model = TableTransformerForObjectDetection.from_pretrained("microsoft/table-structure-recognition-v1.1-all")
model.to(device)
model.eval()
# detect_tables_cv_tt_combined reads the structure model under this name.
model_structure_recognition = model


In [None]:
overall_start_time = time.time()
# Master switch for the display helpers below; leave it False to show no images.
SHOW_PROCESSING_IMAGES: bool = False

# Fonts for bounding-box labels, tried in order; the first one that opens wins.
font_paths_to_try = [
    "DejaVuSans-Bold.ttf",
    "arial.ttf",
    "/usr/share/fonts/truetype/dejavu/DejaVuSans-Bold.ttf",
]
DEFAULT_FONT = None
for font_path_item in font_paths_to_try:
    try:
        DEFAULT_FONT = ImageFont.truetype(font_path_item, 12)
    except IOError:
        continue
    break
else:
    # No candidate could be opened: fall back to PIL's built-in font.
    DEFAULT_FONT = ImageFont.load_default()

def display_image(title: str, image_np: np.ndarray, max_width: int = 800) -> None:
    """Show a debug image, downscaled so it is at most ``max_width`` pixels wide.

    Does nothing when ``SHOW_PROCESSING_IMAGES`` is off or the image is
    ``None``/empty. When ``lib_status["google_colab"]`` is set the image goes
    through ``cv2_imshow``; otherwise it is shown in a ``cv2.imshow`` window.

    Args:
        title: Window title used by ``cv2.imshow``.
        image_np: Image array to display.
        max_width: Images wider than this are resized down to this width.
    """
    if not SHOW_PROCESSING_IMAGES:
        return
    if image_np is None or image_np.size == 0:
        return

    src_h, src_w = image_np.shape[:2]
    if src_w > max_width:
        shrink = max_width / src_w
        frame = cv2.resize(image_np, (max_width, int(src_h * shrink)), interpolation=cv2.INTER_AREA)
    else:
        frame = image_np

    if not lib_status["google_colab"]:
        cv2.imshow(title, frame)
        cv2.waitKey(1)  # required for cv2.imshow to work outside Colab
    else:
        cv2_imshow(frame)
    time.sleep(0.01)

def display_image_with_bboxes(
    title: str, image_np: np.ndarray,
    cv_bbox_abs: tuple[int, int, int, int] | None = None,
    tt_bbox_abs: tuple[int, int, int, int] | None = None,
    max_width: int = 800
) -> None:
    """Draw the OpenCV and Table Transformer boxes on a page and display it.

    Does nothing when ``SHOW_PROCESSING_IMAGES`` is off or the image is
    ``None``/empty.

    Args:
        title: Title forwarded to ``display_image``.
        image_np: BGR page image (it is converted with ``COLOR_BGR2RGB``).
        cv_bbox_abs: Optional ``(x, y, w, h)`` box drawn in blue.
        tt_bbox_abs: Optional ``(x, y, w, h)`` box drawn in red.
        max_width: Forwarded to ``display_image``.
    """
    if not SHOW_PROCESSING_IMAGES:
        return
    if image_np is None or image_np.size == 0:
        return

    canvas = Image.fromarray(cv2.cvtColor(image_np, cv2.COLOR_BGR2RGB))
    pen = ImageDraw.Draw(canvas)

    if cv_bbox_abs:
        left, top, box_w, box_h = cv_bbox_abs
        pen.rectangle([left, top, left + box_w, top + box_h], outline="blue", width=3)
        pen.text((left + 2, top + 2), "CV Contour", fill="blue", font=DEFAULT_FONT)

    if tt_bbox_abs:
        left, top, box_w, box_h = tt_bbox_abs
        pen.rectangle([left, top, left + box_w, top + box_h], outline="red", width=3)
        # Label goes above the box when there is room, otherwise just below it.
        if top > 15:
            label_y = top - 15
        else:
            label_y = top + box_h + 2
        pen.text((left + 2, label_y), "TT Match", fill="red", font=DEFAULT_FONT)

    annotated_bgr = cv2.cvtColor(np.array(canvas), cv2.COLOR_RGB2BGR)
    display_image(title, annotated_bgr, max_width)

def display_processed_crop(title: str, processed_crop_np: np.ndarray, max_width: int = 800) -> None:
    """Display a processed table crop via ``display_image``.

    Skipped when ``SHOW_PROCESSING_IMAGES`` is off or the crop is ``None``/empty.
    """
    showable = (
        SHOW_PROCESSING_IMAGES
        and processed_crop_np is not None
        and processed_crop_np.size != 0
    )
    if showable:
        display_image(title, processed_crop_np, max_width)


In [None]:
# Module-level slot for a YOLO model; initialised to None here.
yolo_model: 'YOLO | None' = None
# Device string for YOLO: "cuda" when torch reports a GPU, otherwise "cpu".
yolo_device: str = "cuda" if torch.cuda.is_available() else "cpu"
# Class-id -> class-name map; load_yolo_model_safe overwrites it via `global`.
yolo_model_classes: dict[int, str] = {}

def load_yolo_model_safe(model_path_or_id: str, device_yolo: str) -> tuple['YOLO | None', dict[int, str]]:
    """Load a YOLO model without raising and return ``(model, class_map)``.

    On success the module-level ``yolo_model_classes`` is updated to the
    returned class map. On failure a ``RuntimeWarning`` describing the error
    is emitted, ``yolo_model_classes`` is reset to ``{}`` and ``(None, {})``
    is returned.

    Args:
        model_path_or_id: Value passed to ``YOLO(...)``. A falsy value
            short-circuits to ``(None, {})`` without touching any global.
        device_yolo: Device the model is moved to (e.g. ``"cuda"``/``"cpu"``).

    Returns:
        ``(model, {class_id: class_name})`` or ``(None, {})``.
    """
    global yolo_model_classes
    if not model_path_or_id or not lib_status["ultralytics"]:
        return None, {}

    try:
        loaded_model_local = YOLO(model_path_or_id)
        loaded_model_local.to(device_yolo)
        # One throw-away inference on a blank image, so a model that cannot
        # run on this device fails here instead of on the first real page.
        dummy_img = np.zeros((64, 64, 3), dtype=np.uint8)
        loaded_model_local(dummy_img, device=(0 if device_yolo == 'cuda' else 'cpu'), verbose=False)

        # `names` may be a dict (id -> name) or a plain sequence of names.
        model_classes_local: dict[int, str] = {}
        raw_names = getattr(loaded_model_local, 'names', None)
        if isinstance(raw_names, dict):
            model_classes_local = {int(k): v for k, v in raw_names.items() if str(k).isdigit()}
        elif isinstance(raw_names, (list, tuple)):
            model_classes_local = dict(enumerate(raw_names))
    except Exception as e_yolo_load:
        # Stay best-effort, but say why the model is unavailable instead of
        # failing silently.
        warnings.warn(f"Failed to load YOLO model '{model_path_or_id}': {e_yolo_load}", RuntimeWarning)
        yolo_model_classes = {}
        return None, {}

    yolo_model_classes = model_classes_local
    return loaded_model_local, model_classes_local


In [None]:
# Page-level table *detection* model and its image processor. Both stay None
# when loading is disabled or fails; detect_tables_cv_tt_combined checks them.
tt_image_processor_detection_global: 'AutoImageProcessor | None' = None
tt_model_detection_global: 'TableTransformerForObjectDetection | None' = None

TT_DETECTION_MODEL_NAME = "microsoft/table-transformer-detection"

if lib_status["transformers"]:
    try:
        tt_image_processor_detection_global = AutoImageProcessor.from_pretrained(TT_DETECTION_MODEL_NAME)
        tt_model_detection_global = TableTransformerForObjectDetection.from_pretrained(TT_DETECTION_MODEL_NAME)
        tt_device_local = "cuda" if torch.cuda.is_available() else "cpu"
        if tt_model_detection_global is not None:
            tt_model_detection_global.to(tt_device_local)
            # Inference only; mirrors the eval() call on the structure model.
            tt_model_detection_global.eval()
    except Exception as e_tt_load:
        # Keep the notebook running without the detector, but report why.
        warnings.warn(f"Failed to load '{TT_DETECTION_MODEL_NAME}': {e_tt_load}", RuntimeWarning)
        tt_model_detection_global = None
        tt_image_processor_detection_global = None


In [None]:
def order_points(pts: np.ndarray) -> np.ndarray:
    """Reorder four 2-D points using their coordinate sums and differences.

    Row 0 is the point with the smallest ``x + y``, row 2 the largest
    ``x + y``, row 1 the smallest ``y - x`` and row 3 the largest ``y - x``.
    Callers in this file unpack the result as ``(tl, tr, br, bl)``.

    Args:
        pts: Array of shape ``(4, 2)``.

    Returns:
        A ``float32`` array of shape ``(4, 2)``.
    """
    coord_sum = pts.sum(axis=1)
    coord_diff = np.diff(pts, axis=1)  # second column minus first column

    ordered = np.zeros((4, 2), dtype="float32")
    ordered[0] = pts[np.argmin(coord_sum)]
    ordered[1] = pts[np.argmin(coord_diff)]
    ordered[2] = pts[np.argmax(coord_sum)]
    ordered[3] = pts[np.argmax(coord_diff)]
    return ordered

def get_largest_poly_from_mask(binary_mask_img: np.ndarray) -> np.ndarray | None:
    """Return a 4-point outline of the largest external contour in a mask.

    The largest contour (by ``cv2.contourArea``) is simplified with
    ``cv2.approxPolyDP`` at 2 % of its perimeter. If that yields exactly four
    vertices they are returned; otherwise the corners of the contour's
    minimum-area rectangle are returned as integers.

    Args:
        binary_mask_img: Single-channel mask passed to ``cv2.findContours``.

    Returns:
        Four points, or ``None`` when the mask has no contours.
    """
    contours, _ = cv2.findContours(binary_mask_img, cv2.RETR_EXTERNAL, cv2.CHAIN_APPROX_SIMPLE)
    if not contours:
        return None

    largest_contour = max(contours, key=cv2.contourArea)
    peri = cv2.arcLength(largest_contour, True)
    approx_poly = cv2.approxPolyDP(largest_contour, 0.02 * peri, True)
    if len(approx_poly) == 4:
        return np.squeeze(approx_poly)

    box_points = cv2.boxPoints(cv2.minAreaRect(largest_contour))
    # np.int0 was an alias of np.intp and was removed in NumPy 2.0.
    return box_points.astype(np.intp)

def get_max_dimensions(point_array: np.ndarray) -> tuple[int, int]:
    """Return ``(max_width, max_height)`` of a quadrilateral, in whole pixels.

    The points are ordered with ``order_points``; the width is the longer of
    the two horizontal edges and the height the longer of the two vertical
    edges, each truncated to ``int``.

    Returns:
        ``(0, 0)`` when ``point_array`` is ``None`` or does not hold 4 points.
    """
    if point_array is None or len(point_array) != 4:
        return 0, 0

    tl, tr, br, bl = order_points(point_array.astype(np.float32))

    def _edge_len(p, q):
        return np.sqrt(((p[0] - q[0])**2) + ((p[1] - q[1])**2))

    max_width = max(int(_edge_len(br, bl)), int(_edge_len(tr, tl)))
    max_height = max(int(_edge_len(tr, br)), int(_edge_len(tl, bl)))
    return max_width, max_height

def get_perspective_transform_matrix(point_array: np.ndarray, target_width: int, target_height: int) -> np.ndarray | None:
    if point_array is None or len(point_array) != 4 or target_width <= 0 or target_height <= 0: return None
    src_pts_ordered = order_points(point_array.astype(np.float32))
    dst_pts_ordered = np.float32([[0,0],[target_width-1,0],[target_width-1,target_height-1],[0,target_height-1]])
    try: return cv2.getPerspectiveTransform(src_pts_ordered, dst_pts_ordered)
    except cv2.error: return None

def warp_image_from_mask(image_bgr: np.ndarray, binary_mask_img: np.ndarray) -> np.ndarray:
    """Perspective-warp an image onto the largest quadrilateral in a mask.

    Every failure path returns ``image_bgr`` unchanged: missing/empty inputs,
    no 4-point outline in the mask, a target size of 10 px or less on either
    axis, no transform matrix, or an exception during warping.

    Args:
        image_bgr: Image to warp.
        binary_mask_img: Mask handed to ``get_largest_poly_from_mask``.

    Returns:
        The warped image, or ``image_bgr`` itself when warping is not possible.
    """
    if image_bgr is None or image_bgr.size == 0:
        return image_bgr
    if binary_mask_img is None or binary_mask_img.size == 0:
        return image_bgr

    quad = get_largest_poly_from_mask(binary_mask_img)
    if quad is None or len(quad) != 4:
        return image_bgr

    out_w, out_h = get_max_dimensions(quad)
    if out_w <= 10 or out_h <= 10:
        return image_bgr

    homography = get_perspective_transform_matrix(quad, out_w, out_h)
    if homography is None:
        return image_bgr

    try:
        warped = cv2.warpPerspective(image_bgr, homography, (out_w, out_h), flags=cv2.INTER_LANCZOS4)
        if warped is not None and warped.size > 0:
            return warped
        return image_bgr
    except Exception:
        return image_bgr


In [None]:
def rotate_image_safe(image: np.ndarray, angle: float,
    border_val: tuple[int, int, int] | int | None = None,
    border_mode: int = cv2.BORDER_REPLICATE
) -> np.ndarray:
    """Rotate an image about its centre, keeping the original canvas size.

    Args:
        image: Image to rotate; ``None``/empty input is returned as is.
        angle: Rotation in degrees, passed to ``cv2.getRotationMatrix2D``.
            Angles smaller than 0.01 in magnitude are treated as no rotation.
        border_val: Border fill value; defaults to white (a 3-tuple for
            3-dimensional arrays, ``255`` otherwise).
        border_mode: OpenCV border mode for ``cv2.warpAffine``.

    Returns:
        The rotated image, or the input image when rotation is skipped,
        produces an empty result, or raises.
    """
    if image is None or image.size == 0 or abs(angle) < 0.01:
        return image
    try:
        height, width = image.shape[:2]
        pivot = (width // 2, height // 2)
        affine = cv2.getRotationMatrix2D(pivot, angle, 1.0)
        if border_val is None:
            border_val = 255 if image.ndim != 3 else (255, 255, 255)
        # NOTE(review): borderValue presumably only takes effect with
        # cv2.BORDER_CONSTANT, so it looks unused under the default
        # BORDER_REPLICATE - confirm against the OpenCV docs.
        rotated = cv2.warpAffine(
            image, affine, (width, height),
            flags=cv2.INTER_LANCZOS4, borderMode=border_mode, borderValue=border_val,
        )
        if rotated is not None and rotated.size > 0:
            return rotated
        return image
    except Exception:
        return image

def deskew_page_basic(image_bgr: np.ndarray, angle_thresh_deg: float = 1.0,
    bg_color: tuple[int, int, int] = (255, 255, 255)
) -> np.ndarray:
    """Estimate a page's skew from near-horizontal Hough lines and undo it.

    The page is thresholded, dilated with a wide kernel, edge-detected and fed
    to ``cv2.HoughLinesP``. The median angle of the segments within 30 degrees
    of horizontal is negated and applied via ``rotate_image_safe`` when its
    magnitude is at least ``angle_thresh_deg``.

    Args:
        image_bgr: BGR page image; ``None``/empty input is returned as is.
        angle_thresh_deg: Minimum correction, in degrees, worth applying.
        bg_color: Forwarded to ``rotate_image_safe`` as ``border_val``.

    Returns:
        The rotated page, or a copy of the input when no correction is applied
        or any step fails.
    """
    if image_bgr is None or image_bgr.size == 0:
        return image_bgr

    page_copy = image_bgr.copy()
    page_h, page_w = image_bgr.shape[:2]
    try:
        gray = cv2.cvtColor(image_bgr, cv2.COLOR_BGR2GRAY)
        try:
            ink_mask = cv2.adaptiveThreshold(
                gray, 255, cv2.ADAPTIVE_THRESH_GAUSSIAN_C, cv2.THRESH_BINARY_INV, 21, 10
            )
        except cv2.error:
            # Adaptive thresholding failed: fall back to a global Otsu threshold.
            _, ink_mask = cv2.threshold(gray, 0, 255, cv2.THRESH_BINARY_INV | cv2.THRESH_OTSU)

        # Kernel much wider than tall, scaled to the page size.
        smear_size = (max(15, int(page_w * 0.04)), max(3, int(page_h * 0.005)))
        smear_kernel = cv2.getStructuringElement(cv2.MORPH_RECT, smear_size)
        smeared = cv2.dilate(ink_mask, smear_kernel, iterations=2)

        edges = cv2.Canny(smeared, 50, 150, apertureSize=3)
        if cv2.countNonZero(edges) == 0:
            return page_copy

        segments = cv2.HoughLinesP(
            edges, 1, np.pi / 180, max(50, int(page_w * 0.1)),
            minLineLength=max(40, int(page_w * 0.08)),
            maxLineGap=max(10, int(page_w * 0.02)),
        )
        if segments is None or len(segments) == 0:
            return page_copy

        near_horizontal_deg = []
        for segment in segments:
            x_start, y_start, x_end, y_end = segment[0]
            if x_end == x_start:
                continue
            slope_deg = math.degrees(math.atan2(y_end - y_start, x_end - x_start))
            if abs(slope_deg) < 30.0:
                near_horizontal_deg.append(slope_deg)
        if not near_horizontal_deg:
            return page_copy

        median_deg = float(np.median(near_horizontal_deg))
        if not np.isfinite(median_deg):
            return page_copy

        correction_deg = -median_deg
        if abs(correction_deg) < angle_thresh_deg:
            return page_copy
        return rotate_image_safe(page_copy, correction_deg, border_val=bg_color)
    except Exception:
        return page_copy


In [None]:
def deskew_table_precisely_v2(image_crop_bgr: np.ndarray, **dk_params) -> tuple[np.ndarray, float]:
    """Deskew a table crop using the angle of its horizontal lines.

    Long horizontal strokes are isolated with a morphological opening, turned
    into edges, and measured with ``cv2.HoughLinesP``. The negated median
    angle is applied through ``rotate_image_safe`` when it exceeds the
    rotation threshold.

    Keyword Args (all optional, defaults in parentheses):
        morph_kernel_width_factor (0.15), morph_iterations (1),
        canny_low_thresh (50), canny_high_thresh (150), hough_threshold (25),
        hough_min_line_length_factor (0.2), hough_max_line_gap_factor (0.05),
        angle_filter_degrees (25.0), rotation_threshold_degrees (0.2).

    Returns:
        ``(image, applied_angle_degrees)``. The angle is ``0.0`` and the image
        is a copy of the input whenever no rotation is applied (crop smaller
        than 10 px on a side, no usable lines, or any error). ``None``/empty
        input is returned as is with ``0.0``.
    """
    if image_crop_bgr is None or image_crop_bgr.size == 0:
        return image_crop_bgr, 0.0

    crop_copy = image_crop_bgr.copy()
    crop_h, crop_w = crop_copy.shape[:2]
    if crop_h < 10 or crop_w < 10:
        return crop_copy, 0.0

    def option(name, default):
        return dk_params.get(name, default)

    try:
        inverted_gray = ~cv2.cvtColor(crop_copy, cv2.COLOR_BGR2GRAY)
        try:
            ink_mask = cv2.adaptiveThreshold(
                inverted_gray, 255, cv2.ADAPTIVE_THRESH_GAUSSIAN_C, cv2.THRESH_BINARY, 15, -2
            )
        except cv2.error:
            _, ink_mask = cv2.threshold(inverted_gray, 0, 255, cv2.THRESH_BINARY | cv2.THRESH_OTSU)

        # Opening with a wide, 1-px-tall kernel keeps only long horizontal strokes.
        kernel_w = max(3, int(crop_w * option('morph_kernel_width_factor', 0.15)))
        row_kernel = cv2.getStructuringElement(cv2.MORPH_RECT, (kernel_w, 1))
        ruling_mask = cv2.morphologyEx(
            ink_mask, cv2.MORPH_OPEN, row_kernel, iterations=option('morph_iterations', 1)
        )
        if cv2.countNonZero(ruling_mask) < (crop_w * 0.05):
            return crop_copy, 0.0

        ruling_edges = cv2.Canny(
            ruling_mask, option('canny_low_thresh', 50), option('canny_high_thresh', 150), apertureSize=3
        )
        if cv2.countNonZero(ruling_edges) == 0:
            return crop_copy, 0.0

        segments = cv2.HoughLinesP(
            ruling_edges, 1, np.pi / 180,
            threshold=option('hough_threshold', 25),
            minLineLength=max(15, int(crop_w * option('hough_min_line_length_factor', 0.2))),
            maxLineGap=max(5, int(crop_w * option('hough_max_line_gap_factor', 0.05))),
        )
        if segments is None or len(segments) == 0:
            return crop_copy, 0.0

        max_abs_angle = option('angle_filter_degrees', 25.0)
        ruling_angles_deg = []
        for segment in segments:
            x_start, y_start, x_end, y_end = segment[0]
            if x_end == x_start:
                continue
            slope_deg = math.degrees(math.atan2(y_end - y_start, x_end - x_start))
            if abs(slope_deg) <= max_abs_angle:
                ruling_angles_deg.append(slope_deg)
        if not ruling_angles_deg:
            return crop_copy, 0.0

        median_deg = float(np.median(ruling_angles_deg))
        if not np.isfinite(median_deg):
            return crop_copy, 0.0

        correction_deg = -median_deg
        if abs(correction_deg) > option('rotation_threshold_degrees', 0.2):
            return rotate_image_safe(crop_copy, correction_deg), correction_deg
        return crop_copy, 0.0
    except Exception:
        return crop_copy, 0.0


In [None]:
def calculate_iou_xywh(box1_xywh: tuple[int, int, int, int], box2_xywh: tuple[int, int, int, int]) -> float:
    """Intersection-over-union of two ``(x, y, w, h)`` boxes.

    Returns:
        ``0.0`` for boxes that do not overlap. When the union area is not
        positive, ``0.0`` if the intersection is zero and ``1.0`` otherwise.
    """
    ax, ay, aw, ah = box1_xywh
    bx, by, bw, bh = box2_xywh

    left, top = max(ax, bx), max(ay, by)
    right, bottom = min(ax + aw, bx + bw), min(ay + ah, by + bh)
    if right < left or bottom < top:
        return 0.0

    overlap = (right - left) * (bottom - top)
    union = float(aw * ah + bw * bh - overlap)
    if union <= 0:
        # Degenerate boxes: avoid dividing by a non-positive union.
        return 1.0 if overlap != 0 else 0.0
    return overlap / union

class MaxResize(object):
    """Callable transform that rescales an image so its longer side equals ``max_size``.

    The aspect ratio is kept; images smaller than ``max_size`` are scaled up
    as well, since the same factor is applied unconditionally.
    """

    def __init__(self, max_size=1000):
        self.max_size = max_size

    def __call__(self, image_pil):
        """Return ``image_pil`` resized by ``max_size / max(width, height)``."""
        src_w, src_h = image_pil.size
        factor = self.max_size / max(src_w, src_h)
        target_size = tuple(int(round(factor * side)) for side in (src_w, src_h))
        return image_pil.resize(target_size)

# Preprocessing applied to a table crop before it is fed to the
# structure-recognition model: resize so the longer side is 1000 px, convert
# to a tensor, then normalise each channel with the mean/std constants below.
structure_transform = transforms.Compose([
    MaxResize(1000),
    transforms.ToTensor(),
    transforms.Normalize([0.485, 0.456, 0.406], [0.229, 0.224, 0.225])
])

def box_cxcywh_to_xyxy(x):
    """Convert ``(cx, cy, w, h)`` boxes to ``(x_min, y_min, x_max, y_max)``.

    The last dimension of ``x`` is unpacked into four components and the
    result is stacked along ``dim=1``.
    """
    center_x, center_y, width, height = x.unbind(-1)
    half_w = 0.5 * width
    half_h = 0.5 * height
    corners = [center_x - half_w, center_y - half_h, center_x + half_w, center_y + half_h]
    return torch.stack(corners, dim=1)

def rescale_bboxes(out_bbox, size):
    """Convert ``(cx, cy, w, h)`` boxes to corner form and scale them by the image size.

    Args:
        out_bbox: Box tensor passed to ``box_cxcywh_to_xyxy``.
        size: ``(width, height)``; x coordinates are multiplied by the width
            and y coordinates by the height.

    Returns:
        Tensor of ``(x_min, y_min, x_max, y_max)`` boxes on ``out_bbox``'s device.
    """
    img_w, img_h = size
    per_coord_scale = torch.tensor(
        [img_w, img_h, img_w, img_h], dtype=torch.float32, device=out_bbox.device
    )
    return box_cxcywh_to_xyxy(out_bbox) * per_coord_scale

def outputs_to_objects(outputs_model, img_size, id2label_map):
    """Turn raw detection outputs for the first batch item into a list of objects.

    Args:
        outputs_model: Object exposing ``logits`` and ``pred_boxes`` tensors.
        img_size: ``(width, height)`` used to scale the predicted boxes.
        id2label_map: Mapping from class index to label; entries labelled
            ``"no object"`` are dropped.

    Returns:
        A list of ``{'label': str, 'score': float, 'bbox': [x0, y0, x1, y1]}``.
    """
    logits = outputs_model.logits
    if logits.is_cuda:
        logits = logits.cpu()
    pred_boxes = outputs_model.pred_boxes
    if pred_boxes.is_cuda:
        pred_boxes = pred_boxes.cpu()

    # Highest-probability class per query, first batch item only.
    best = logits.softmax(-1).max(-1)
    labels = best.indices.detach().numpy()[0]
    scores = best.values.detach().numpy()[0]
    scaled_boxes = rescale_bboxes(pred_boxes[0], img_size)

    objects = []
    for label, score, box in zip(labels, scores, scaled_boxes):
        class_label = id2label_map[int(label)]
        if class_label == "no object":
            continue
        objects.append({'label': class_label, 'score': float(score), 'bbox': box.tolist()})
    return objects

def get_cell_coordinates_by_row(table_data):
    """Build a row-major grid of cell boxes from detected rows and columns.

    Entries labelled ``'table row'`` are sorted top to bottom and entries
    labelled ``'table column'`` left to right. Each cell box takes its x
    extent from the column and its y extent from the row.

    Returns:
        One dict per row: ``{'row_bbox', 'cells_data', 'cell_count'}``, where
        ``cells_data`` is a list of ``{'column_bbox', 'cell_bbox'}`` dicts.
    """
    rows = sorted(
        (entry for entry in table_data if entry['label'] == 'table row'),
        key=lambda entry: entry['bbox'][1],
    )
    columns = sorted(
        (entry for entry in table_data if entry['label'] == 'table column'),
        key=lambda entry: entry['bbox'][0],
    )

    # Rows and columns are already ordered, so the grid comes out top-to-bottom
    # and left-to-right without sorting again.
    grid = []
    for row in rows:
        row_box = row['bbox']
        cells = [
            {
                'column_bbox': col['bbox'],
                'cell_bbox': [col['bbox'][0], row_box[1], col['bbox'][2], row_box[3]],
            }
            for col in columns
        ]
        grid.append({'row_bbox': row_box, 'cells_data': cells, 'cell_count': len(cells)})
    return grid

def apply_ocr_to_cells(cell_coords_list, pil_image, reader_instance):
    """OCR every cell of a table and return the text row by row.

    Args:
        cell_coords_list: Rows as produced by ``get_cell_coordinates_by_row``;
            each row's ``"cells_data"`` entries provide a ``"cell_bbox"``.
        pil_image: Image whose ``crop`` method is called with each cell box.
        reader_instance: OCR engine exposing ``readtext``; ``None`` yields ``{}``
            (guards against an uninitialised reader).

    Returns:
        ``{row_index: [cell_text, ...]}``. A cell with no OCR hits becomes
        ``""`` and shorter rows are padded with ``""`` to the widest row.
    """
    if reader_instance is None:
        return {}

    # Progress bar only when tqdm is available and debug display is enabled.
    if 'tqdm' in globals() and SHOW_PROCESSING_IMAGES:
        row_iter = tqdm(cell_coords_list, desc="OCR Rows", leave=False)
    else:
        row_iter = cell_coords_list

    ocr_data = {}
    for r_idx, row_entry in enumerate(row_iter):
        row_texts = []
        for cell in row_entry["cells_data"]:
            cell_pixels = np.array(pil_image.crop(cell["cell_bbox"]))
            hits = reader_instance.readtext(cell_pixels)
            # Each hit's second element is the recognised text.
            row_texts.append(" ".join([hit[1] for hit in hits]) if hits else "")
        ocr_data[r_idx] = row_texts

    widest = max((len(texts) for texts in ocr_data.values()), default=0)
    for r_key, texts in ocr_data.items():
        if len(texts) != widest:
            ocr_data[r_key] = texts + [""] * (widest - len(texts))
    return ocr_data


def _find_cv_table_candidates(page_bgr: np.ndarray, cv_detect_params: dict) -> list[tuple[int, int, int, int]]:
    """Propose table boxes ``(x, y, w, h)`` from the page's ruling lines (best effort).

    Horizontal and vertical strokes are isolated with morphological openings,
    merged, optionally dilated, and their external contours filtered by area
    and size. Overlapping candidates are then suppressed, largest first.
    Returns ``[]`` when nothing qualifies or any step raises.
    """
    try:
        page_height, page_width = page_bgr.shape[:2]
        inverted_gray = ~cv2.cvtColor(page_bgr, cv2.COLOR_BGR2GRAY)
        try:
            ink_mask = cv2.adaptiveThreshold(
                inverted_gray, 255, cv2.ADAPTIVE_THRESH_GAUSSIAN_C, cv2.THRESH_BINARY, 15, -2
            )
        except cv2.error:
            # Same fallback as the deskew helpers: global Otsu threshold.
            _, ink_mask = cv2.threshold(inverted_gray, 0, 255, cv2.THRESH_BINARY | cv2.THRESH_OTSU)

        min_line_ratio = cv_detect_params.get('min_line_ratio', 0.02)
        kernel_h_lines = cv2.getStructuringElement(cv2.MORPH_RECT, (max(20, int(page_width * min_line_ratio)), 1))
        kernel_v_lines = cv2.getStructuringElement(cv2.MORPH_RECT, (1, max(20, int(page_height * min_line_ratio))))
        h_lines_mask = cv2.morphologyEx(ink_mask, cv2.MORPH_OPEN, kernel_h_lines)
        v_lines_mask = cv2.morphologyEx(ink_mask, cv2.MORPH_OPEN, kernel_v_lines)
        lines_mask = cv2.addWeighted(h_lines_mask, 0.5, v_lines_mask, 0.5, 0.0)

        dilate_iterations = cv_detect_params.get('dilate_iter', 2)
        if dilate_iterations > 0:
            lines_mask = cv2.dilate(
                lines_mask, cv2.getStructuringElement(cv2.MORPH_ELLIPSE, (3, 3)), iterations=dilate_iterations
            )
        if lines_mask is None or cv2.countNonZero(lines_mask) == 0:
            return []

        contours, _ = cv2.findContours(lines_mask, cv2.RETR_EXTERNAL, cv2.CHAIN_APPROX_SIMPLE)
        min_area_pixels = cv_detect_params.get('min_area_ratio', 0.008) * page_width * page_height
        min_dimension_pixels = cv_detect_params.get('min_dim_px', 25)

        candidates = []
        for contour in contours:
            x_c, y_c, w_c, h_c = cv2.boundingRect(contour)
            area_c = cv2.contourArea(contour)
            if area_c > min_area_pixels and w_c > min_dimension_pixels and h_c > min_dimension_pixels:
                candidates.append({'bbox_xywh': (x_c, y_c, w_c, h_c), 'area': area_c})
        candidates.sort(key=lambda item: item['area'], reverse=True)

        # Greedy suppression: keep the largest candidate, drop smaller ones whose
        # box intersection covers more than the threshold share of the smaller
        # contour area.
        overlap_threshold = cv_detect_params.get('overlap_threshold_ratio', 0.7)
        suppressed: set[int] = set()
        kept_boxes = []
        for i_idx, current in enumerate(candidates):
            if i_idx in suppressed:
                continue
            kept_boxes.append(current['bbox_xywh'])
            cur_x, cur_y, cur_w, cur_h = current['bbox_xywh']
            for j_idx in range(i_idx + 1, len(candidates)):
                if j_idx in suppressed:
                    continue
                other = candidates[j_idx]
                oth_x, oth_y, oth_w, oth_h = other['bbox_xywh']
                inter_w = max(0, min(cur_x + cur_w, oth_x + oth_w) - max(cur_x, oth_x))
                inter_h = max(0, min(cur_y + cur_h, oth_y + oth_h) - max(cur_y, oth_y))
                intersection_area = inter_w * inter_h
                if intersection_area > 0 and (intersection_area / min(current['area'], other['area'])) > overlap_threshold:
                    suppressed.add(j_idx)
        return kept_boxes
    except Exception as e_cv_detect:
        warnings.warn(f"OpenCV table candidate detection failed: {e_cv_detect}", RuntimeWarning)
        return []


def _find_tt_table_boxes(page_bgr: np.ndarray, min_dim_px: int, confidence_threshold: float) -> list[tuple[int, int, int, int]]:
    """Detect tables on the page with the Table Transformer detector (best effort).

    Uses the module-level ``tt_model_detection_global`` and
    ``tt_image_processor_detection_global``. Returns ``(x, y, w, h)`` boxes
    for labels containing "table" whose width and height both exceed
    ``min_dim_px``, or ``[]`` when the detector is unavailable or fails.
    """
    if tt_model_detection_global is None or tt_image_processor_detection_global is None:
        return []
    if not lib_status["transformers"]:
        return []
    try:
        pil_page_img = Image.fromarray(cv2.cvtColor(page_bgr, cv2.COLOR_BGR2RGB))
        tt_inputs = tt_image_processor_detection_global(images=pil_page_img, return_tensors="pt")
        tt_model_device = next(tt_model_detection_global.parameters()).device
        tt_inputs = {key: value.to(tt_model_device) for key, value in tt_inputs.items()}

        with torch.no_grad():
            tt_outputs = tt_model_detection_global(**tt_inputs)

        # PIL reports (width, height); post-processing expects (height, width).
        tt_target_sizes = torch.tensor([pil_page_img.size[::-1]], device=tt_model_device)
        tt_results = tt_image_processor_detection_global.post_process_object_detection(
            tt_outputs, threshold=confidence_threshold, target_sizes=tt_target_sizes
        )[0]

        id2label = tt_model_detection_global.config.id2label
        boxes_xywh = []
        for label_val, box_xyxy_val in zip(tt_results["labels"], tt_results["boxes"]):
            if "table" not in id2label[label_val.item()].lower():
                continue
            xmin_val, ymin_val, xmax_val, ymax_val = [int(round(coord.item())) for coord in box_xyxy_val]
            w_val, h_val = xmax_val - xmin_val, ymax_val - ymin_val
            if w_val > min_dim_px and h_val > min_dim_px:
                boxes_xywh.append((xmin_val, ymin_val, w_val, h_val))
        return boxes_xywh
    except Exception as e_tt_page_detect:
        warnings.warn(f"Table Transformer page detection failed: {e_tt_page_detect}", RuntimeWarning)
        return []


def _match_cv_to_tt(cv_boxes: list, tt_boxes: list, iou_threshold: float) -> list[dict]:
    """Pair each OpenCV box (top-to-bottom, left-to-right) with its best-IoU detector box.

    A pair is kept only when the best IoU is strictly above ``iou_threshold``.
    """
    confirmed_pairs = []
    for cv_box in sorted(cv_boxes, key=lambda box: (box[1], box[0])):
        best_iou, best_tt_box = 0.0, None
        for tt_box in tt_boxes:
            iou_val = calculate_iou_xywh(cv_box, tt_box)
            if iou_val > best_iou:
                best_iou, best_tt_box = iou_val, tt_box
        if best_tt_box is not None and best_iou > iou_threshold:
            confirmed_pairs.append({"cv_bbox": cv_box, "tt_bbox": best_tt_box, "iou": best_iou})
    return confirmed_pairs


def _rectify_table_crop(crop_bgr: np.ndarray, table_deskew_params: dict) -> np.ndarray:
    """Deskew a table crop, then try to perspective-warp it; each step is best effort."""
    rectified = crop_bgr
    try:
        deskewed_output_img, _applied_angle = deskew_table_precisely_v2(crop_bgr, **table_deskew_params)
        if deskewed_output_img is not None and deskewed_output_img.size > 0:
            rectified = deskewed_output_img
    except Exception as e_deskew_crop:
        warnings.warn(f"Table deskew failed; using the unrotated crop: {e_deskew_crop}", RuntimeWarning)

    try:
        gray_for_warp = cv2.cvtColor(rectified, cv2.COLOR_BGR2GRAY)
        _, bin_for_warp = cv2.threshold(gray_for_warp, 0, 255, cv2.THRESH_BINARY_INV | cv2.THRESH_OTSU)
        warped_output_img = warp_image_from_mask(rectified, bin_for_warp)
        if warped_output_img is not None and warped_output_img.size > 0 and \
           warped_output_img.shape[0] > 10 and warped_output_img.shape[1] > 10:
            rectified = warped_output_img
    except Exception as e_warp_crop:
        warnings.warn(f"Table perspective warp failed; using the deskewed crop: {e_warp_crop}", RuntimeWarning)
    return rectified


def _write_rows_csv(csv_path, rows) -> None:
    """Write an iterable of rows to ``csv_path`` as UTF-8 CSV."""
    with open(csv_path, "w", newline='', encoding='utf-8') as csv_file:
        csv.writer(csv_file).writerows(rows)


def detect_tables_cv_tt_combined(
    page_image_path:str|Path, output_dir_prefix:str,
    cv_detect_params:dict, table_deskew_params:dict,
    iou_match_threshold:float=0.4, tt_page_confidence_threshold:float=0.6
) -> list[str]:
    """Detect, rectify, OCR and save the tables found on one page image.

    A table is accepted only when an OpenCV line-based candidate and a Table
    Transformer detection agree (IoU above ``iou_match_threshold``). Each
    accepted crop is deskewed, perspective-warped, run through the
    structure-recognition model and OCR'd cell by cell.

    Reads the module-level ``model_structure_recognition``, ``device``,
    ``ocr_reader`` and ``structure_transform``.

    Side effects per table:
        * ``<prefix>_table_CVmatchTT_<nn>_iou<x.xx>.png`` in the parent
          directory of ``output_dir_prefix``.
        * A CSV with the same stem next to the PNG.
        * ``table_output.csv`` in the working directory, overwritten each time
          (kept for backward compatibility; it ends up holding the last table).

    Args:
        page_image_path: Path of the page image, read with ``cv2.imread``.
        output_dir_prefix: Path prefix; its name (sanitised) prefixes output
            files and its parent is the output directory.
        cv_detect_params: Options for the OpenCV stage (``min_line_ratio``,
            ``dilate_iter``, ``min_area_ratio``, ``min_dim_px``,
            ``overlap_threshold_ratio``, ``padding``).
        table_deskew_params: Keyword options for ``deskew_table_precisely_v2``.
        iou_match_threshold: Minimum IoU between the two detections.
        tt_page_confidence_threshold: Score threshold for the detector.

    Returns:
        Paths of the table PNGs that were written. Failures in any stage are
        reported as ``RuntimeWarning`` and the affected table is skipped.
    """
    saved_crop_paths_list: list[str] = []
    page_file_path = Path(page_image_path)
    page_base_name = page_file_path.name

    img_page_bgr_original_data = cv2.imread(str(page_file_path))
    if img_page_bgr_original_data is None:
        return []
    page_height, page_width = img_page_bgr_original_data.shape[:2]
    if page_height <= 10 or page_width <= 10:
        return []

    cv_candidate_boxes = _find_cv_table_candidates(img_page_bgr_original_data, cv_detect_params)
    if not cv_candidate_boxes:
        # Without an OpenCV candidate nothing can be confirmed, so the
        # detector pass is skipped.
        return []
    tt_detected_boxes = _find_tt_table_boxes(
        img_page_bgr_original_data, cv_detect_params.get('min_dim_px', 25), tt_page_confidence_threshold
    )
    confirmed_pairs = _match_cv_to_tt(cv_candidate_boxes, tt_detected_boxes, iou_match_threshold)
    if not confirmed_pairs:
        return []

    output_prefix_path = Path(output_dir_prefix)
    safe_prefix_filename = "".join(
        c_char if c_char.isalnum() or c_char in ('_', '-') else '_' for c_char in output_prefix_path.name
    )
    output_dir = output_prefix_path.parent
    padding_val = cv_detect_params.get('padding', 5)

    processed_table_count = 0
    for pair_index, pair_item_data in enumerate(confirmed_pairs):
        cv_bbox_to_process_item = pair_item_data["cv_bbox"]
        iou_score_val = pair_item_data["iou"]

        if SHOW_PROCESSING_IMAGES:
            display_image_with_bboxes(
                title=f"Trang {page_base_name} - Khớp {pair_index+1} (IoU {iou_score_val:.2f}) - Bối cảnh trang gốc",
                image_np=img_page_bgr_original_data,
                cv_bbox_abs=cv_bbox_to_process_item,
                tt_bbox_abs=pair_item_data["tt_bbox"]
            )

        # Crop the OpenCV box plus padding, clamped to the page bounds.
        x_cv_coord, y_cv_coord, w_cv_dim, h_cv_dim = cv_bbox_to_process_item
        y1_crop_coord = max(0, y_cv_coord - padding_val)
        y2_crop_coord = min(page_height, y_cv_coord + h_cv_dim + padding_val)
        x1_crop_coord = max(0, x_cv_coord - padding_val)
        x2_crop_coord = min(page_width, x_cv_coord + w_cv_dim + padding_val)
        if (x2_crop_coord - x1_crop_coord) <= 10 or (y2_crop_coord - y1_crop_coord) <= 10:
            continue

        cv_crop_bgr_padded_img = img_page_bgr_original_data[y1_crop_coord:y2_crop_coord, x1_crop_coord:x2_crop_coord].copy()
        if cv_crop_bgr_padded_img.size == 0:
            continue

        final_table_img = _rectify_table_crop(cv_crop_bgr_padded_img, table_deskew_params)
        if final_table_img is None or final_table_img.size == 0:
            continue

        try:
            # In-memory BGR -> RGB conversion; no temporary PNG round-trip needed.
            table_pil = Image.fromarray(cv2.cvtColor(final_table_img, cv2.COLOR_BGR2RGB))
            pixel_values = structure_transform(table_pil).unsqueeze(0).to(device)
            with torch.no_grad():
                outputs = model_structure_recognition(pixel_values)

            # The model emits one extra class index beyond id2label for "no object".
            structure_id2label = model_structure_recognition.config.id2label.copy()
            structure_id2label[len(structure_id2label)] = "no object"

            cells = outputs_to_objects(outputs, table_pil.size, structure_id2label)
            cell_coordinates_data = get_cell_coordinates_by_row(cells)
            data_ocr = apply_ocr_to_cells(cell_coordinates_data, table_pil, ocr_reader)

            output_table_filename = f"{safe_prefix_filename}_table_CVmatchTT_{processed_table_count:02d}_iou{iou_score_val:.2f}.png"
            output_table_path = output_dir / output_table_filename

            ocr_rows = list(data_ocr.values())
            # Per-table CSV so earlier tables are not overwritten by later ones.
            _write_rows_csv(output_table_path.with_suffix(".csv"), ocr_rows)
            # Legacy fixed-name output, kept for existing consumers.
            _write_rows_csv("table_output.csv", ocr_rows)

            if cv2.imwrite(str(output_table_path), final_table_img):
                saved_crop_paths_list.append(str(output_table_path))
            else:
                warnings.warn(f"Could not write table image to {output_table_path}", RuntimeWarning)
        except Exception as e_processing:
            warnings.warn(
                f"Table {pair_index + 1} on page {page_base_name} could not be processed: {e_processing}",
                RuntimeWarning,
            )
        processed_table_count += 1

    return saved_crop_paths_list


In [None]:
# Pipeline: convert a PDF to page images and extract tables from each page

def process_pdf_for_tables_pipeline(
    pdf_file_path_main:str|Path, base_output_dir_prefix:str, pdf_processing_dpi:int, enable_pdf_page_deskew:bool,
    pdf_page_deskew_angle_thresh:float, cv_detection_config_main:dict, table_deskew_config_main:dict,
    iou_matching_thresh_config:float, tt_page_confidence_thresh_config_val:float, temp_pdf_pages_dir_path:str|Path
) -> tuple[dict[int,list[str]],int]:

    all_page_processing_results:dict[int,list[str]]={}; total_tables_extracted_count=0

    pdf_path_obj=Path(pdf_file_path_main)
    temp_pages_dir_obj=Path(temp_pdf_pages_dir_path)

    shutil.rmtree(temp_pages_dir_obj, ignore_errors=True)
    os.makedirs(temp_pages_dir_obj, exist_ok=True)

    output_main_dir=Path(base_output_dir_prefix).parent
    if not output_main_dir.exists():
        os.makedirs(output_main_dir, exist_ok=True)

    if not lib_status["pdf2image"]:
        return {}, 0

    pdf_page_pil_images_list:list[Image.Image]=[]
    try:
        start_conversion_time=time.time(); cpu_cores_count=os.cpu_count()or 4; thread_count_val=max(1,cpu_cores_count//2)
        pdf_page_pil_images_list = convert_from_path(
            pdf_path_obj, dpi=pdf_processing_dpi, fmt='png',
            thread_count=thread_count_val, use_pdftocairo=True
        )
        conversion_duration_sec=time.time()-start_conversion_time
        if not pdf_page_pil_images_list:
            raise ValueError("Chuyển đổi PDF cho ra 0 hình ảnh.")
    except Exception as e_pdf_convert:
        shutil.rmtree(temp_pages_dir_obj, ignore_errors=True)
        return {}, 0

    for current_page_idx, current_pil_image in enumerate(pdf_page_pil_images_list):
        current_page_num = current_page_idx + 1
        temp_page_image_file_path:Path|None=None

        try:
            cv_bgr_page_image = cv2.cvtColor(np.array(current_pil_image), cv2.COLOR_RGB2BGR)
            if cv_bgr_page_image is None or cv_bgr_page_image.size == 0:
                all_page_processing_results[current_page_num] = []
                continue

            image_to_process_for_saving = cv_bgr_page_image.copy()

            if enable_pdf_page_deskew:
                deskewed_page_version = deskew_page_basic(cv_bgr_page_image, pdf_page_deskew_angle_thresh)
                image_to_process_for_saving = deskewed_page_version

            temp_page_image_filename = f"page_{current_page_num:03d}.png"
            temp_page_image_file_path = temp_pages_dir_obj / temp_page_image_filename

            if not cv2.imwrite(str(temp_page_image_file_path), image_to_process_for_saving):
                all_page_processing_results[current_page_num] = []
                continue

            current_page_output_prefix = f"{base_output_dir_prefix}_pg{current_page_num:03d}"

            saved_table_paths_for_this_page = detect_tables_cv_tt_combined(
                page_image_path=temp_page_image_file_path, output_dir_prefix=current_page_output_prefix,
                cv_detect_params=cv_detection_config_main, table_deskew_params=table_deskew_config_main,
                iou_match_threshold=iou_matching_thresh_config, tt_page_confidence_threshold=tt_page_confidence_thresh_config_val
            )

            all_page_processing_results[current_page_num] = saved_table_paths_for_this_page
            num_tables_on_this_page = len(saved_table_paths_for_this_page)
            total_tables_extracted_count += num_tables_on_this_page

        except KeyboardInterrupt:
            shutil.rmtree(temp_pages_dir_obj, ignore_errors=True)
            raise
        except Exception as e_process_page:
            import traceback
            all_page_processing_results[current_page_num] = []
        finally:
            if temp_page_image_file_path and temp_page_image_file_path.exists():
                try: os.remove(temp_page_image_file_path)
                except OSError as e_remove_temp: pass

    shutil.rmtree(temp_pages_dir_obj, ignore_errors=True)
    return all_page_processing_results, total_tables_extracted_count



In [None]:
# Central settings read by the setup cell and the main cell of this notebook.
CONFIG = {
    # --- PDF rasterisation and page-level deskew ---
    "PDF_DPI": 200,
    "ENABLE_PAGE_DESKEW": True,
    "PAGE_DESKEW_ANGLE_THRESHOLD": 0.5,

    # --- Handed to the pipeline as `cv_detection_config_main` ---
    "CV_DETECT_PARAMS": {
        "min_line_ratio": 0.015,
        "min_area_ratio": 0.005,
        "min_dim_px": 20,
        "dilate_iter": 2,
        "padding": 10,
        "overlap_threshold_ratio": 0.65,
    },

    # --- Handed to the pipeline as `table_deskew_config_main` ---
    "TABLE_DESKEW_PARAMS": {
        "morph_kernel_width_factor": 0.12,
        "morph_iterations": 1,
        "hough_threshold": 20,
        "hough_min_line_length_factor": 0.15,
        "hough_max_line_gap_factor": 0.04,
        "canny_low_thresh": 40,
        "canny_high_thresh": 120,
        "angle_filter_degrees": 20.0,
        "rotation_threshold_degrees": 0.20,
    },

    # --- Thresholds handed to the pipeline ---
    "IOU_MATCH_THRESHOLD": 0.35,
    "TT_PAGE_CONFIDENCE_THRESHOLD": 0.9,

    # --- Working directories (changed to ./ for local runs) ---
    "OUTPUT_BASE_DIR": Path("./table_extraction_output_cv_priority"),
    "TEMP_PDF_PAGES_DIR": Path("./temp_pdf_pages_cv_priority"),

    # --- Optional YOLO validation ---
    "YOLO_ENABLE_VALIDATION": False,
    "YOLO_MODEL_PATH": "keremberke/yolov8n-table-detection",
}

# Load the YOLO model only when validation is switched on in CONFIG and
# ultralytics is flagged as available in lib_status.
if CONFIG["YOLO_ENABLE_VALIDATION"] and lib_status["ultralytics"]:
    yolo_model, _ = load_yolo_model_safe(CONFIG["YOLO_MODEL_PATH"], yolo_device)

# Reset both working directories: delete whatever a previous run left behind,
# then recreate them empty. All removals happen before any directory is created.
for managed_dir in (CONFIG["OUTPUT_BASE_DIR"], CONFIG["TEMP_PDF_PAGES_DIR"]):
    shutil.rmtree(managed_dir, ignore_errors=True)
for managed_dir in (CONFIG["OUTPUT_BASE_DIR"], CONFIG["TEMP_PDF_PAGES_DIR"]):
    os.makedirs(managed_dir, exist_ok=True)
del managed_dir  # do not leave the loop variable in the notebook namespace


# Resolve the PDF to process: through a Colab upload when lib_status flags
# google_colab, otherwise from a local test path. Each failure path reports its
# cause on stderr so the final exit message is not the only hint.
uploaded_pdf_file_path_main:Path|None=None
if lib_status["google_colab"]:
    try:
        content_main_dir = Path("/content/")
        # NOTE(review): this deletes every non-hidden regular file directly under
        # /content before the upload prompt - confirm that this is intended.
        for item_file in content_main_dir.iterdir():
            if item_file.is_file() and not item_file.name.startswith('.'):
                try:
                    item_file.unlink()
                except OSError as unlink_error:
                    # Best effort: a leftover file does not block the upload.
                    print(f"Could not remove {item_file}: {unlink_error}", file=sys.stderr)

        uploaded_colab_files = files.upload()
        if uploaded_colab_files:
            # Only the first uploaded file is considered.
            uploaded_pdf_filename = list(uploaded_colab_files.keys())[0]
            source_pdf_path_in_colab_runtime = content_main_dir / uploaded_pdf_filename

            if uploaded_pdf_filename.lower().endswith(".pdf"):
                if not source_pdf_path_in_colab_runtime.exists():
                    raise FileNotFoundError("Tệp PDF đã tải lên dường như đã biến mất ngay lập tức.")

                final_pdf_destination_path = CONFIG["OUTPUT_BASE_DIR"] / uploaded_pdf_filename
                try:
                    shutil.move(str(source_pdf_path_in_colab_runtime), str(final_pdf_destination_path))
                    uploaded_pdf_file_path_main = final_pdf_destination_path
                except Exception as move_error:
                    # Fall back to processing the file where it was uploaded.
                    print(
                        f"Could not move {source_pdf_path_in_colab_runtime} to "
                        f"{final_pdf_destination_path}: {move_error!r}; using the uploaded location.",
                        file=sys.stderr,
                    )
                    uploaded_pdf_file_path_main = source_pdf_path_in_colab_runtime
            else:
                print(f"Ignoring uploaded file {uploaded_pdf_filename!r}: not a .pdf file.", file=sys.stderr)
                if source_pdf_path_in_colab_runtime.exists():
                    source_pdf_path_in_colab_runtime.unlink()
        else:
            print("No file was uploaded.", file=sys.stderr)
    except Exception as upload_error:
        print(f"PDF upload failed: {upload_error!r}", file=sys.stderr)
else:
    # For local testing, point `local_test_pdf_path` at the PDF to process.
    local_test_pdf_path = Path("./example.pdf")  # << CHANGE THIS PATH
    if local_test_pdf_path.is_file() and local_test_pdf_path.name.lower().endswith(".pdf"):
        # Work on a copy inside the output directory so the original file is kept.
        destination_in_output = CONFIG["OUTPUT_BASE_DIR"] / local_test_pdf_path.name
        try:
            shutil.copy2(local_test_pdf_path, destination_in_output)
            uploaded_pdf_file_path_main = destination_in_output
        except Exception as copy_error:
            # Fall back to the original file if the copy fails.
            print(
                f"Could not copy {local_test_pdf_path} to {destination_in_output}: "
                f"{copy_error!r}; using the original file.",
                file=sys.stderr,
            )
            uploaded_pdf_file_path_main = local_test_pdf_path
    else:
        print(f"Local PDF not found or not a .pdf file: {local_test_pdf_path}", file=sys.stderr)


if not uploaded_pdf_file_path_main or not uploaded_pdf_file_path_main.exists():
    sys.exit("Thực thi bị dừng: Không có PDF hợp lệ được cung cấp hoặc tìm thấy.")


In [None]:
# ==============================================================================
#                 *** MAIN ***
# ==============================================================================
# Run the table-extraction pipeline on the selected PDF. The result variables
# are reset first so a failed run never leaves values from an earlier run behind.
page_results_dict: dict[int, list[str]] = {}
total_tables_extracted_final_count = 0

if uploaded_pdf_file_path_main and uploaded_pdf_file_path_main.exists():
    pdf_basename_val = uploaded_pdf_file_path_main.name
    pdf_name_no_ext_val = uploaded_pdf_file_path_main.stem

    # Outputs are written next to each other under OUTPUT_BASE_DIR, prefixed with the PDF stem.
    current_pdf_output_base_prefix = str(CONFIG["OUTPUT_BASE_DIR"] / pdf_name_no_ext_val)

    processing_start_main_time = time.time()
    try:
        page_results_dict, total_tables_extracted_final_count = process_pdf_for_tables_pipeline(
            pdf_file_path_main=uploaded_pdf_file_path_main,
            base_output_dir_prefix=current_pdf_output_base_prefix,
            pdf_processing_dpi=CONFIG["PDF_DPI"],
            enable_pdf_page_deskew=CONFIG["ENABLE_PAGE_DESKEW"],
            pdf_page_deskew_angle_thresh=CONFIG["PAGE_DESKEW_ANGLE_THRESHOLD"],
            cv_detection_config_main=CONFIG["CV_DETECT_PARAMS"],
            table_deskew_config_main=CONFIG["TABLE_DESKEW_PARAMS"],
            iou_matching_thresh_config=CONFIG["IOU_MATCH_THRESHOLD"],
            tt_page_confidence_thresh_config_val=CONFIG["TT_PAGE_CONFIDENCE_THRESHOLD"],
            temp_pdf_pages_dir_path=CONFIG["TEMP_PDF_PAGES_DIR"]
        )
    except KeyboardInterrupt:
        # Still not re-raised, so the cell ends normally; only the notice is new.
        print(f"Processing of {pdf_basename_val} was interrupted.", file=sys.stderr)
    except Exception:
        import traceback
        print(f"Processing of {pdf_basename_val} failed:", file=sys.stderr)
        traceback.print_exc()
    else:
        processing_duration_main_sec = time.time() - processing_start_main_time
        print(
            f"Processed {len(page_results_dict)} page(s) of {pdf_basename_val}: "
            f"{total_tables_extracted_final_count} table(s) saved in {processing_duration_main_sec:.1f}s"
        )