In [10]:
import pytesseract
pytesseract.pytesseract.tesseract_cmd = "/usr/bin/tesseract"
from pdf2image import convert_from_path
from PIL import Image, ImageDraw
import numpy as np
import json
import os
from statistics import quantiles
# from IPython.display import display
# import matplotlib.pyplot as plt

In [11]:
def merge_overlapping_regions(merged_regions, horizontal_threshold, vertical_threshold):
    """Merge regions whose bounding boxes overlap, in one greedy pass.

    Every region is a 9-item list laid out as
    ``[x1, y1, x2, y2, text_length, text, text_num, sum_w, sum_h]``, where
    ``(x1, y1)`` and ``(x2, y2)`` are the top-left and bottom-right corners.

    Regions are visited in input order.  Each one is folded into the first
    already-kept region whose box it intersects (touching edges count as an
    intersection); otherwise it is kept as a new region.  When two regions are
    merged the box becomes their union, the numeric counters are summed and
    the texts are joined with a single space.

    Args:
        merged_regions: Iterable of regions in the layout described above.
            It is not modified, and neither are the regions it contains.
        horizontal_threshold: Accepted for interface compatibility; currently
            unused by the overlap test.
        vertical_threshold: Accepted for interface compatibility; currently
            unused by the overlap test.

    Returns:
        A new list of (copied) regions after merging.
    """
    new_merged = []
    for region in merged_regions:
        # Regions store corner coordinates, not (x, y, width, height).
        rx1, ry1, rx2, ry2 = region[:4]

        for existing in new_merged:
            ex1, ey1, ex2, ey2 = existing[:4]

            # Symmetric interval-intersection test, so the outcome does not
            # depend on which of the two boxes happens to come first.
            overlap_horizontally = rx1 <= ex2 and ex1 <= rx2
            overlap_vertically = ry1 <= ey2 and ey1 <= ry2

            if overlap_horizontally and overlap_vertically:
                existing[0] = min(ex1, rx1)
                existing[1] = min(ey1, ry1)
                existing[2] = max(ex2, rx2)
                existing[3] = max(ey2, ry2)
                existing[4] += region[4]
                existing[5] += " " + region[5]
                existing[6] += region[6]
                existing[7] += region[7]
                existing[8] += region[8]
                break
        else:
            # No overlap found: keep a copy so later merges never mutate the
            # caller's region objects.
            new_merged.append(list(region))

    return new_merged


In [12]:

def _size_threshold(sizes, ratio):
    """Return, as an int, the value found `ratio` of the way through the sorted sizes.

    Returns 0 when `sizes` is empty.
    """
    if not sizes:
        return 0
    ordered = np.sort(sizes)
    # Clamp so that a ratio of 1.0 still selects the last element.
    position = min(int(len(ordered) * ratio), len(ordered) - 1)
    picked = ordered[position]
    return 0 if np.isnan(picked) else int(picked)


def _group_words_into_regions(data, horizontal_threshold, vertical_threshold, lookback=3):
    """Greedily group OCR text boxes into regions.

    Each region is a list
    ``[x1, y1, x2, y2, text_length, text, text_num, sum_w, sum_h]``.
    A text box is compared against the `lookback` most recently created
    regions and merged into the best candidate, or starts a new region when
    no candidate qualifies.
    """
    regions = []

    for i in range(len(data['text'])):
        text = data['text'][i].strip()
        if not text:
            continue

        x, y, w, h = data['left'][i], data['top'][i], data['width'][i], data['height'][i]

        best_merge_value = np.inf
        best_merge_region = None

        for region in regions[-lookback:]:
            # Only the box corners are needed here.  Unpacking the remaining
            # fields into names such as sum_w/sum_h would clobber the word's
            # own width/height that are accumulated below.
            x1, y1, x2, y2 = region[:4]

            # Smallest edge-to-edge distance between the word and the region.
            min_x = min(abs(x - x2), abs(x1 - (x + w)), abs(x - x1), abs(x2 - (x + w)))
            min_y = min(abs(y - y2), abs(y1 - (y + h)), abs(y - y1), abs(y2 - (y + h)))

            # The word's top-left corner lies inside the region's box.
            starts_inside = (x1 <= x <= x2) and (y1 <= y <= y2)
            is_close = min_x <= horizontal_threshold and min_y <= vertical_threshold

            if starts_inside:
                merge_value = 0  # always preferred over a merely close region
            elif is_close:
                merge_value = min_x * min_y
            else:
                continue

            # Strict comparison: on ties the earlier candidate wins.
            if merge_value < best_merge_value:
                best_merge_value = merge_value
                best_merge_region = region

        if best_merge_region is None:
            regions.append([x, y, x + w, y + h, len(text), text, 1, w, h])
        else:
            best_merge_region[0] = min(best_merge_region[0], x)
            best_merge_region[1] = min(best_merge_region[1], y)
            best_merge_region[2] = max(best_merge_region[2], x + w)
            best_merge_region[3] = max(best_merge_region[3], y + h)
            best_merge_region[4] += len(text)
            best_merge_region[5] += " " + text
            best_merge_region[6] += 1
            best_merge_region[7] += w
            best_merge_region[8] += h

    return regions


def _select_dense_regions(regions, page_number, density_threshold, min_text_density):
    """Turn the regions that pass the word-count and density filters into JSON-ready dicts."""
    dense_regions = []
    for x1, y1, x2, y2, total_text_length, all_text, total_text_num, _sum_w, _sum_h in regions:
        if total_text_num <= density_threshold:
            continue

        width = x2 - x1
        height = y2 - y1
        area = width * height
        text_density = total_text_length / area if area > 0 else 0  # avoid division by zero
        if text_density < min_text_density:
            continue

        dense_regions.append({
            "page": page_number,
            "x1": x1,
            "y1": y1,
            "x2": x2,
            "y2": y2,
            "width": width,
            "height": height,
            "area": area,
            "total_text_length": total_text_length,
            "text_density": text_density,
            "all_text": all_text,
            "text_num": total_text_num
        })
    return dense_regions


def process_pdf(pdf_path, output_image_path, density_threshold=15, min_text_density=0.0):
    """OCR every page of a PDF, group the text into regions and save the results.

    For each page the function runs ``pytesseract.image_to_data``, groups the
    recognised text boxes into regions, keeps the regions that pass the
    filters, draws a red rectangle around each kept region and saves the page
    as ``page_<n>.jpg``.  After all pages are processed, the kept regions of
    the whole document are written to ``dense_regions.json``.

    Args:
        pdf_path: Path of the PDF to process.
        output_image_path: Directory that receives the page images and the
            JSON file; created if it does not exist.
        density_threshold: A region is kept only if it contains more than
            this many text boxes.  Despite the name, this is compared against
            the box count, not against the text density.
        min_text_density: A region is kept only if its text density
            (characters per pixel of box area) is at least this value.  The
            default of 0.0 keeps every region.

    Returns:
        None.  The results are written to `output_image_path`.
    """
    os.makedirs(output_image_path, exist_ok=True)
    pages = convert_from_path(pdf_path, dpi=300)
    all_dense_regions = []

    for page_number, page_image in enumerate(pages, start=1):
        print(f"\n--- Processing Page {page_number} ---")

        # Extract the text boxes of the page.
        data = pytesseract.image_to_data(page_image, output_type=pytesseract.Output.DICT)

        # Merge thresholds: the median width and the 80th-percentile height.
        # NOTE(review): these statistics are taken over every box with a
        # positive size, including boxes that carry no text -- confirm that
        # this is intended rather than restricting them to text boxes.
        horizontal_threshold = _size_threshold([w for w in data['width'] if w > 0], 0.5)
        vertical_threshold = _size_threshold([h for h in data['height'] if h > 0], 0.8)

        merged_regions = _group_words_into_regions(data, horizontal_threshold, vertical_threshold)
        dense_regions = _select_dense_regions(
            merged_regions, page_number, density_threshold, min_text_density
        )
        all_dense_regions.extend(dense_regions)

        # Outline the kept regions on the page image before saving it.
        draw = ImageDraw.Draw(page_image)
        for region in dense_regions:
            draw.rectangle(
                [region["x1"], region["y1"], region["x2"], region["y2"]],
                outline="red",
                width=2,
            )
        page_image.save(os.path.join(output_image_path, f'page_{page_number}.jpg'))

    json_path = os.path.join(output_image_path, 'dense_regions.json')
    with open(json_path, 'w', encoding='utf-8') as json_file:
        json.dump(all_dense_regions, json_file, indent=4)

In [13]:
import os
import re
# Reports that must never be processed.
skip_reports = {"NYSE_LTM_2024", "NASDAQ_SEDG_2022", "NASDAQ_SEDG_2023"}

# Input and output folders, resolved to absolute paths.
pdf_folder = os.path.abspath("../CSR Reporting/CSR_report_new_collect/")
output_folder = os.path.abspath("../CSR_report_processed_v4/CSR_report_new_collect/")

# Maximum number of PDFs to process in this run.
num_files_to_process = 10000

# Collect every PDF below pdf_folder (sub-folders included).  os.walk yields
# entries in arbitrary order, so the list is sorted to make both the processing
# order and the subset selected by num_files_to_process reproducible.
pdf_files = sorted(
    os.path.abspath(os.path.join(root, file))
    for root, _, files in os.walk(pdf_folder)
    for file in files
    if file.endswith(".pdf")
)

# Keep only the PDFs that still need processing.
pdf_files_to_process = []
for pdf_file in pdf_files:
    report_name = os.path.splitext(os.path.basename(pdf_file))[0]  # file name without extension
    output_path = os.path.join(output_folder, report_name)
    if report_name in skip_reports:
        continue  # explicitly excluded report

    # Only reports whose name ends in a year of 2019 or earlier are processed.
    match = re.search(r'(\d{4})$', report_name)
    if match:
        year = int(match.group(1))
        if year > 2019:
            continue  # skip 2020 and later
    else:
        continue  # no trailing year in the name: skip

    # A report counts as unprocessed when its output folder is missing or
    # contains no .json file yet.
    if not os.path.exists(output_path) or not any(f.endswith(".json") for f in os.listdir(output_path)):
        pdf_files_to_process.append(pdf_file)

# Cap the number of PDFs handled in this run.
pdf_files_to_process = pdf_files_to_process[:num_files_to_process]

# Process the selected PDFs one by one.
for pdf_file in pdf_files_to_process:
    report_name = os.path.splitext(os.path.basename(pdf_file))[0]  # file name without extension
    output_image_path = os.path.join(output_folder, report_name)

    # Make sure the output folder exists.
    os.makedirs(output_image_path, exist_ok=True)

    try:
        print(f"🚀 處理 PDF: {pdf_file}")  # debug aid: shows the path being processed
        process_pdf(pdf_file, output_image_path, density_threshold=0)
        print(f"✅ 成功處理 PDF: {pdf_file}")
    except Exception as e:
        # Best effort: report the failure and continue with the next PDF.
        print(f"❌ 無法處理 {pdf_file}: {e}")

🚀 處理 PDF: /media/francia/hdd_1/research_hub/csr_project/CSR Reporting/CSR_report_new_collect/NYSE_RACE_2017.pdf

--- Processing Page 1 ---

--- Processing Page 2 ---

--- Processing Page 3 ---

--- Processing Page 4 ---

--- Processing Page 5 ---

--- Processing Page 6 ---

--- Processing Page 7 ---

--- Processing Page 8 ---

--- Processing Page 9 ---

--- Processing Page 10 ---

--- Processing Page 11 ---

--- Processing Page 12 ---

--- Processing Page 13 ---

--- Processing Page 14 ---

--- Processing Page 15 ---

--- Processing Page 16 ---

--- Processing Page 17 ---

--- Processing Page 18 ---

--- Processing Page 19 ---

--- Processing Page 20 ---

--- Processing Page 21 ---

--- Processing Page 22 ---

--- Processing Page 23 ---

--- Processing Page 24 ---

--- Processing Page 25 ---

--- Processing Page 26 ---

--- Processing Page 27 ---

--- Processing Page 28 ---

--- Processing Page 29 ---

--- Processing Page 30 ---

--- Processing Page 31 ---

--- Processing Page 32 ---



In [14]:
import os
# Count the .json files found anywhere below the processed-output folder;
# every .json file is reported as one processed folder.
output_folder = "../CSR_report_processed_v4/CSR_report_new_collect/"
folder_count = sum(
    1
    for _root, _dirs, filenames in os.walk(output_folder)
    for filename in filenames
    if filename.endswith(".json")
)
print(f"Total {folder_count} folders are processed.")

Total 1224 folders are processed.


In [15]:
# Display the current value of `pdf_file`, the loop variable left over from the
# PDF-processing cell; this only works if that cell has already been run.
pdf_file

'/media/francia/hdd_1/research_hub/csr_project/CSR Reporting/CSR_report_new_collect/NYSE_LTM_2017.pdf'