In [4]:
import pytesseract
pytesseract.pytesseract.tesseract_cmd = "/usr/bin/tesseract"
from pdf2image import convert_from_path
from PIL import Image, ImageDraw
import numpy as np
import json
import os
from statistics import quantiles

ModuleNotFoundError: No module named 'pdf2image'

In [None]:
# Input report to analyse; paths are relative to the notebook's working directory.
pdf_path = "../CSR Reporting/NASDAQ/NASDAQ_TEAM_2021.pdf"  # replace with the path to your PDF
# Directory handed to process_pdf as its output location.
output_image_path = "../CSR_report_processed_v3/NASDAQ/NASDAQ_TEAM_2021"
# NOTE(review): not referenced by any code visible in this notebook — confirm whether it is still needed.
output_file_path = "../CSR_report_processed_v3/NASDAQ/NASDAQ_TEAM_2021/NASDAQ_TEAM_2021.json"

In [None]:
def process_pdf(pdf_path, output_image_path, density_threshold=15,
                min_ocr_entries=300, min_text_density=0.0001, dpi=300,
                verbose=False):
    """Locate dense text regions on every page of a PDF and export them.

    Each page is rendered to an image and OCR'd word by word. Word boxes that
    touch, overlap, or lie within a page-specific gap threshold of each other
    are merged into connected regions. Regions with enough words and a high
    enough character density are kept, outlined in red on the page image, and
    written to JSON.

    Files written into ``output_image_path`` (created if missing):
      * ``page_<n>.jpg``           -- one annotated image per processed page
      * ``dense_regions.json``     -- the retained merged regions of all pages
      * ``region_candidates.json`` -- every individual OCR word box

    Both JSON files are written as UTF-8 with ``ensure_ascii=False`` so that
    non-ASCII OCR text is stored readably and independently of the platform's
    default encoding.

    Args:
        pdf_path: Path of the PDF to analyse.
        output_image_path: Directory receiving the images and JSON files.
        density_threshold: A merged region is kept only if it contains MORE
            than this many OCR words.
        min_ocr_entries: Pages whose OCR result has fewer entries than this
            (blank entries included) are skipped entirely.
        min_text_density: Minimum characters-per-pixel a merged region needs
            in order to be kept.
        dpi: Resolution used when rendering PDF pages to images.
        verbose: When True, also print per-region, per-connection and
            per-merge diagnostics (very chatty on text-heavy pages).

    Returns:
        None. Results are delivered through the files listed above.
    """
    os.makedirs(output_image_path, exist_ok=True)

    # Render every PDF page to an image.
    pages = convert_from_path(pdf_path, dpi=dpi)
    all_dense_regions = []
    all_region_data = []

    for page_number, page_image in enumerate(pages, start=1):
        print(f"\n--- Processing Page {page_number} ---")

        # OCR the page into per-word columns (left/top/width/height/text).
        data = pytesseract.image_to_data(page_image, output_type=pytesseract.Output.DICT)

        entry_count = len(data['text'])
        if entry_count < min_ocr_entries:
            print(f"Page {page_number}: skipped ({entry_count} OCR entries < {min_ocr_entries}).")
            continue

        boxes = _word_boxes(data)
        if not boxes:
            print(f"Page {page_number}: No valid text found.")
            continue

        # Merge thresholds are derived from this page's own word geometry.
        positive_widths = [box[2] - box[0] for box in boxes if box[2] - box[0] > 0]
        average_width = _lowest_fraction_mean(positive_widths)
        average_height = sum(box[3] - box[1] for box in boxes) / len(boxes)
        horizontal_threshold = int(average_width)
        vertical_threshold = int(average_height)

        print(f"Avg Width: {average_width}, Avg Height: {average_height}")
        print(f"Horizontal Threshold: {horizontal_threshold}, Vertical Threshold: {vertical_threshold}")

        # Record every individual word box for region_candidates.json.
        all_region_data.extend(
            {
                "page": page_number,
                "x1": x1, "y1": y1,
                "x2": x2, "y2": y2,
                "width": x2 - x1,
                "height": y2 - y1,
                "text_length": text_length,
                "text": text,
                "text_num": text_num
            }
            for x1, y1, x2, y2, text_length, text, text_num in boxes
        )

        adjacency = _build_adjacency(boxes, horizontal_threshold, vertical_threshold)

        if verbose:
            for i, box in enumerate(boxes):
                print(f"Region {i}: {box}")
                for j in adjacency[i]:
                    print(f"Page {page_number}: Connecting {box[5]} -> {boxes[j][5]}")

        merged_regions = _merge_components(boxes, adjacency, verbose=verbose)

        # Keep only regions that hold enough words AND are dense enough.
        dense_regions = []
        for x1, y1, x2, y2, total_text_length, all_text, total_text_num in merged_regions:
            if total_text_num <= density_threshold:
                continue
            width = x2 - x1
            height = y2 - y1
            area = width * height
            text_density = total_text_length / area if area > 0 else 0
            if text_density >= min_text_density:
                dense_regions.append({
                    "page": page_number,
                    "x1": x1, "y1": y1, "x2": x2, "y2": y2,
                    "width": width, "height": height, "area": area,
                    "total_text_length": total_text_length,
                    "text_density": text_density,
                    "all_text": all_text.strip(),
                    "text_num": total_text_num
                })

        all_dense_regions.extend(dense_regions)

        # Outline every retained region in red and save the annotated page.
        draw = ImageDraw.Draw(page_image)
        for region in dense_regions:
            draw.rectangle(
                [region["x1"], region["y1"], region["x2"], region["y2"]],
                outline="red", width=2,
            )
        page_image.save(os.path.join(output_image_path, f"page_{page_number}.jpg"))

    # Export JSON. The explicit UTF-8 encoding keeps non-ASCII OCR text from
    # failing on platforms whose default text encoding cannot represent it.
    with open(os.path.join(output_image_path, 'dense_regions.json'), 'w', encoding='utf-8') as json_file:
        json.dump(all_dense_regions, json_file, indent=4, ensure_ascii=False)

    with open(os.path.join(output_image_path, 'region_candidates.json'), 'w', encoding='utf-8') as json_file:
        json.dump(all_region_data, json_file, indent=4, ensure_ascii=False)


def _lowest_fraction_mean(values, fraction=0.25):
    """Return the mean of the smallest ``fraction`` of ``values`` (0 if that slice is empty).

    NOTE(review): the original helper was named ``trimmed_mean`` but averaged
    only the lower tail rather than trimming both ends. The behaviour is kept
    because the merge thresholds depend on it -- confirm this is intended.
    """
    ordered = np.sort(values)
    keep = int(len(ordered) * fraction)
    return np.mean(ordered[:keep]) if keep > 0 else 0


def _word_boxes(ocr_data):
    """Build ``[x1, y1, x2, y2, text_length, text, word_count]`` for each non-blank OCR entry."""
    boxes = []
    columns = zip(ocr_data['left'], ocr_data['top'], ocr_data['width'],
                  ocr_data['height'], ocr_data['text'])
    for left, top, width, height, raw_text in columns:
        text = raw_text.strip()
        if text:
            # The trailing 1 is the number of words this box represents.
            boxes.append([left, top, left + width, top + height, len(text), text, 1])
    return boxes


def _build_adjacency(boxes, horizontal_threshold, vertical_threshold):
    """Return, per box, the ascending indices of the boxes it should merge with.

    Two boxes are linked when, on BOTH axes, they either overlap or have
    facing edges no further apart than that axis' threshold. The relation is
    symmetric, so each unordered pair is tested only once.
    """
    adjacency = [[] for _ in boxes]
    for i in range(len(boxes)):
        ax1, ay1, ax2, ay2 = boxes[i][:4]
        for j in range(i + 1, len(boxes)):
            bx1, by1, bx2, by2 = boxes[j][:4]

            near_horizontally = (
                abs(ax1 - bx2) <= horizontal_threshold
                or abs(bx1 - ax2) <= horizontal_threshold
                or not (ax1 > bx2 or ax2 < bx1)
            )
            if not near_horizontally:
                continue

            near_vertically = (
                abs(ay1 - by2) <= vertical_threshold
                or abs(by1 - ay2) <= vertical_threshold
                or not (ay1 > by2 or ay2 < by1)
            )
            if near_vertically:
                adjacency[i].append(j)
                adjacency[j].append(i)
    return adjacency


def _merge_components(boxes, adjacency, verbose=False):
    """Merge each connected component of ``adjacency`` into one region via BFS.

    Every result has the same layout as a word box: the bounding box of the
    component, the summed text length, the member texts joined by single
    spaces in breadth-first discovery order, and the summed word count.
    """
    visited = set()
    merged_regions = []
    for start_idx in range(len(boxes)):
        if start_idx in visited:
            continue
        if verbose:
            print(f"Starting BFS from: {boxes[start_idx][5]} at {boxes[start_idx][0]}, {boxes[start_idx][1]}")
            print(f"Merging {boxes[start_idx][5]} at {boxes[start_idx][0]}, {boxes[start_idx][1]}")

        visited.add(start_idx)
        component = [start_idx]
        # The list grows while it is being iterated, which walks the component
        # in breadth-first order without an O(n) pop from the front.
        for idx in component:
            for neighbor_idx in adjacency[idx]:
                if neighbor_idx not in visited:
                    visited.add(neighbor_idx)
                    component.append(neighbor_idx)

        members = [boxes[k] for k in component]
        merged_regions.append([
            min(member[0] for member in members),
            min(member[1] for member in members),
            max(member[2] for member in members),
            max(member[3] for member in members),
            sum(member[4] for member in members),
            " ".join(member[5] for member in members),
            sum(member[6] for member in members),
        ])
    return merged_regions

In [None]:
# Run the dense-region extraction on the configured report; results are written under output_image_path.
process_pdf(
    pdf_path=pdf_path,
    output_image_path=output_image_path,
    density_threshold=15,
)

NameError: name 'os' is not defined

In [None]:
# # read second_round.json file
# second_round  = json.load(open('C:/Users/r12725056/anaconda3/Master/Greenwashing/CSR_report_processed/NASDAQ/second_round/second_round.json', 'r'))
# for i in range(0, 5):
#     pdf_name = second_round[i]
#     pdf_path = f'../CSR Reporting/NASDAQ/{pdf_name}.pdf'
#     output_image_path = f'../CSR_report_processed_v3/NASDAQ/{pdf_name}/{pdf_name}_v3'
#     if not os.path.exists(output_image_path):
#         os.makedirs(output_image_path)
#     process_pdf(pdf_path, output_image_path, density_threshold=15)
#     print(f"Processed {pdf_name} successfully")