In [8]:
import pytesseract
from pdf2image import convert_from_path
from PIL import Image, ImageDraw
import cv2
import numpy as np
import json
import fitz
import os

# Locations of the external OCR/PDF tools. The original machine-specific paths
# remain the defaults; set the TESSERACT_CMD / POPPLER_PATH environment
# variables to run the notebook on another machine without editing this cell.
pytesseract.pytesseract.tesseract_cmd = os.environ.get(
    "TESSERACT_CMD",
    r"C:/Users/r12725056/AppData/Local/Programs/Tesseract-OCR/tesseract.exe",
)
# NOTE(review): the visible convert_from_path(...) call does not pass this
# value, so Poppler is presumably found via PATH -- confirm before relying on it.
poppler_path = os.environ.get(
    "POPPLER_PATH",
    r"C:/Users/r12725056/anaconda3/Master/Greenwashing/poppler-24.08.0/Library/bin",
)

In [2]:
def _trimmed_mean(values, trim_ratio=0.25):
    """Return the mean of the smallest ``trim_ratio`` fraction of ``values``.

    Returns 0 when that fraction contains no element (empty input, or too few
    values for ``int(len(values) * trim_ratio)`` to reach 1). Calling
    ``np.mean`` on the resulting empty slice would yield NaN and emit a
    RuntimeWarning; the caller treated that NaN as a threshold of 0 anyway.
    """
    keep = int(len(values) * trim_ratio)
    if keep == 0:
        return 0
    return np.mean(np.sort(values)[:keep])


def _is_adjacent(region, x, y, w, h, horizontal_threshold, vertical_threshold):
    """Tell whether the box (x, y, w, h) overlaps or nearly touches ``region``.

    ``region`` is a list whose first four items are x1, y1, x2, y2. On each
    axis the box qualifies if it overlaps the region's extent or if one of its
    edges lies within the given threshold of the region's opposite edge; both
    axes must qualify.
    """
    x1, y1, x2, y2 = region[:4]

    horizontally_close = (abs(x - x2) <= horizontal_threshold
                          or abs(x1 - (x + w)) <= horizontal_threshold)
    vertically_close = (abs(y - y2) <= vertical_threshold
                        or abs(y1 - (y + h)) <= vertical_threshold)

    overlap_vertically = not (y > y2 or (y + h) < y1)
    overlap_horizontally = not (x > x2 or (x + w) < x1)

    return ((horizontally_close or overlap_horizontally)
            and (vertically_close or overlap_vertically))


def _absorb(region, x, y, w, h, text):
    """Grow ``region`` in place to cover the box and account for its text."""
    region[0] = min(region[0], x)
    region[1] = min(region[1], y)
    region[2] = max(region[2], x + w)
    region[3] = max(region[3], y + h)
    region[4] += len(text)      # total number of characters
    region[5] += " " + text     # concatenated text
    region[6] += 1              # number of merged text boxes


def process_pdf(pdf_path, output_image_path, density_threshold=15, min_text_density=0.0001):
    """Detect dense text regions on every page of a PDF and save the results.

    Every page is rendered at 300 dpi and passed to
    ``pytesseract.image_to_data``. Non-empty text boxes are merged greedily,
    in OCR output order, into rectangular regions whenever a box overlaps or
    lies within a page-specific gap threshold of an existing region. Regions
    that pass both filters below are outlined in red on the page image.

    Args:
        pdf_path: Path of the PDF to process.
        output_image_path: Directory receiving ``page_<n>.jpg`` for every page
            and ``dense_regions.json``. Created if it does not exist.
        density_threshold: A region is kept only if it merged MORE than this
            many text boxes.
        min_text_density: Minimum ratio of total characters to bounding-box
            area (``width * height``) for a region to be kept.

    Returns:
        None. Results are written to ``output_image_path``.
    """
    os.makedirs(output_image_path, exist_ok=True)

    # Render every page of the PDF to an image.
    pages = convert_from_path(pdf_path, dpi=300)
    all_dense_regions = []

    for page_i, page_image in enumerate(pages):
        print(f"\n--- Processing Page {page_i + 1} ---")

        # Per-box OCR data: parallel lists keyed by 'text', 'left', 'top',
        # 'width' and 'height'.
        data = pytesseract.image_to_data(page_image, output_type=pytesseract.Output.DICT)

        # Horizontal gap threshold: mean of the smallest 25% of all positive
        # box widths reported by OCR (boxes without text are included).
        non_zero_widths = [w for w in data['width'] if w > 0]
        horizontal_threshold = int(_trimmed_mean(non_zero_widths))

        # Vertical gap threshold: mean height of the boxes that carry text and
        # have a positive size; 0 when the page has none.
        text_heights = [
            data['height'][i] for i in range(len(data['text']))
            if data['text'][i].strip() and data['width'][i] > 0 and data['height'][i] > 0
        ]
        vertical_threshold = int(sum(text_heights) / len(text_heights)) if text_heights else 0

        # Region layout: [x1, y1, x2, y2, total_text_length, all_text, text_num].
        merged_regions = []
        current_region = None

        for i, raw_text in enumerate(data['text']):
            text = raw_text.strip()
            if not text:
                continue
            x, y, w, h = data['left'][i], data['top'][i], data['width'][i], data['height'][i]

            # Finalized regions take precedence; the first adjacent one wins.
            target = next(
                (region for region in merged_regions
                 if _is_adjacent(region, x, y, w, h, horizontal_threshold, vertical_threshold)),
                None,
            )
            if target is not None:
                _absorb(target, x, y, w, h, text)
            elif current_region is None:
                current_region = [x, y, x + w, y + h, len(text), text, 1]
            elif _is_adjacent(current_region, x, y, w, h, horizontal_threshold, vertical_threshold):
                _absorb(current_region, x, y, w, h, text)
            else:
                # The box starts a new region; finalize the one being built.
                merged_regions.append(current_region)
                current_region = [x, y, x + w, y + h, len(text), text, 1]

        if current_region is not None:
            merged_regions.append(current_region)

        dense_regions = []
        for x1, y1, x2, y2, total_text_length, all_text, total_text_num in merged_regions:
            if total_text_num <= density_threshold:
                continue
            width = x2 - x1
            height = y2 - y1
            area = width * height
            text_density = total_text_length / area if area > 0 else 0
            if text_density < min_text_density:
                continue
            dense_regions.append({
                "page": page_i + 1,
                "x1": x1,
                "y1": y1,
                "x2": x2,
                "y2": y2,
                "width": width,
                "height": height,
                "area": area,
                "total_text_length": total_text_length,
                "text_density": text_density,
                "all_text": all_text,
                "text_num": total_text_num
            })

        all_dense_regions.extend(dense_regions)

        # Outline the dense regions; the page image is saved even if none were found.
        draw = ImageDraw.Draw(page_image)
        for region in dense_regions:
            draw.rectangle(
                [region["x1"], region["y1"], region["x2"], region["y2"]],
                outline="red", width=2,
            )

        page_image.save(f'{output_image_path}/page_{page_i + 1}.jpg')

    # json.dump escapes non-ASCII by default, so the encoding does not alter the bytes written.
    with open(f'{output_image_path}/dense_regions.json', 'w', encoding='utf-8') as json_file:
        json.dump(all_dense_regions, json_file, indent=4)

In [3]:
# Run process_pdf over the reports listed in second_round.json.
SECOND_ROUND_JSON = 'C:/Users/r12725056/anaconda3/Master/Greenwashing/CSR_report_processed/NASDAQ/second_round/second_round.json'
# Index of the first list entry to process.
# NOTE(review): presumably a resume point after an earlier partial run -- set to 0 for a full run.
START_INDEX = 22

# Use a context manager so the file handle is closed right after loading.
with open(SECOND_ROUND_JSON, 'r') as json_file:
    second_round = json.load(json_file)

for pdf_name in second_round[START_INDEX:]:
    pdf_path = f'../CSR Reporting/NASDAQ/{pdf_name}.pdf'
    output_image_path = f'../CSR_report_processed/NASDAQ/{pdf_name}/{pdf_name}_v2'
    # exist_ok avoids the separate existence check (and its race window).
    os.makedirs(output_image_path, exist_ok=True)
    process_pdf(pdf_path, output_image_path, density_threshold=15)
    print(f"Processed {pdf_name} successfully")


--- Processing Page 1 ---

--- Processing Page 2 ---


  return _methods._mean(a, axis=axis, dtype=dtype,
  ret = ret.dtype.type(ret / rcount)



--- Processing Page 3 ---

--- Processing Page 4 ---

--- Processing Page 5 ---

--- Processing Page 6 ---

--- Processing Page 7 ---

--- Processing Page 8 ---

--- Processing Page 9 ---

--- Processing Page 10 ---

--- Processing Page 11 ---

--- Processing Page 12 ---

--- Processing Page 13 ---

--- Processing Page 14 ---

--- Processing Page 15 ---
Processed NASDAQ_AEGN_2004 successfully

--- Processing Page 1 ---

--- Processing Page 2 ---

--- Processing Page 3 ---

--- Processing Page 4 ---

--- Processing Page 5 ---

--- Processing Page 6 ---

--- Processing Page 7 ---

--- Processing Page 8 ---

--- Processing Page 9 ---

--- Processing Page 10 ---

--- Processing Page 11 ---

--- Processing Page 12 ---

--- Processing Page 13 ---

--- Processing Page 14 ---

--- Processing Page 15 ---

--- Processing Page 16 ---

--- Processing Page 17 ---

--- Processing Page 18 ---

--- Processing Page 19 ---

--- Processing Page 20 ---

--- Processing Page 21 ---

--- Processing Page 22 




--- Processing Page 1 ---

--- Processing Page 2 ---

--- Processing Page 3 ---

--- Processing Page 4 ---

--- Processing Page 5 ---

--- Processing Page 6 ---

--- Processing Page 7 ---

--- Processing Page 8 ---

--- Processing Page 9 ---

--- Processing Page 10 ---

--- Processing Page 11 ---

--- Processing Page 12 ---

--- Processing Page 13 ---

--- Processing Page 14 ---

--- Processing Page 15 ---

--- Processing Page 16 ---

--- Processing Page 17 ---
Processed NASDAQ_FB_2019 successfully

--- Processing Page 1 ---

--- Processing Page 2 ---

--- Processing Page 3 ---

--- Processing Page 4 ---

--- Processing Page 5 ---

--- Processing Page 6 ---

--- Processing Page 7 ---

--- Processing Page 8 ---

--- Processing Page 9 ---

--- Processing Page 10 ---

--- Processing Page 11 ---

--- Processing Page 12 ---

--- Processing Page 13 ---

--- Processing Page 14 ---

--- Processing Page 15 ---

--- Processing Page 16 ---

--- Processing Page 17 ---

--- Processing Page 18 ---
