# This notebook only extracts graphs, text, and layout from PDF files

In [1]:
# !pip install PyMuPDF
# !pip install doclayout-yolo==0.0.4
# !pip install olmocr

### Read PDF Data

In [2]:
import os

# from paddle.static.amp.fp16_lists import white_list

# Project root is taken to be the parent of the current working directory
# (i.e. the notebook is expected to be launched from a sub-folder of the project).
PROJECT_ROOT = os.path.dirname(os.getcwd())

# Build every path from components with os.path.join so the separators are
# correct on any OS. A literal "Data\PDF" is only a nested path on Windows; on
# POSIX systems it would be a single directory whose name contains a backslash.
PDF_File_Path = os.path.join(PROJECT_ROOT, "Data", "PDF")

# Folder that receives one JSON file of extracted text per PDF.
PDF_Text_Extract_Output_Path = os.path.join(PROJECT_ROOT, "Data", "Extract_Output", "text")
# Create the folder if it does not exist yet.
os.makedirs(PDF_Text_Extract_Output_Path, exist_ok=True)

# Folder that receives the layout-detection results (annotated images and JSON).
PDF_Layout_Extract_Output_Path = os.path.join(PROJECT_ROOT, "Data", "Extract_Output", "layout")
# Create the folder if it does not exist yet.
os.makedirs(PDF_Layout_Extract_Output_Path, exist_ok=True)

print("PDF file path: {}\nPDF Extract Output Path: {}".format(PDF_File_Path, PDF_Text_Extract_Output_Path))

PDF file path: D:\pycharmProjects\File-Multimodel-Analysis\Data\PDF
PDF Extract Output Path: D:\pycharmProjects\File-Multimodel-Analysis\Data/Extract_Output/text


## Text Extraction

### Sample text extraction (a single PDF)

In [3]:
import torch
import base64

from io import BytesIO
from PIL import Image
from transformers import AutoProcessor, Qwen2VLForConditionalGeneration

from olmocr.data.renderpdf import render_pdf_to_base64png
from olmocr.prompts import build_finetuning_prompt
from olmocr.prompts.anchor import get_anchor_text

# Inference is pinned to the CPU; swap in the commented line below to use a GPU when one is available.
# device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
device = torch.device("cpu")

# The processor is loaded from the base Qwen2-VL instruct checkpoint.
processor = AutoProcessor.from_pretrained("Qwen/Qwen2-VL-7B-Instruct", use_fast=True)

# olmOCR weights loaded in bfloat16, switched to eval mode and moved onto `device`.
model = Qwen2VLForConditionalGeneration.from_pretrained(
    "allenai/olmOCR-7B-0225-preview",
    torch_dtype=torch.bfloat16,
)
model.eval()
model.to(device)

def extract_text_per_page(pdf_path:str,page_num:int,model_current=model,processor_current=processor):
    """Run the OCR vision-language model on a single PDF page and return the decoded text.

    The page is rendered to a PNG (longest side 1024 px), wrapped in a one-turn
    chat prompt, and passed to ``model_current.generate`` with sampling enabled,
    so repeated calls on the same page may return different text.

    Args:
        pdf_path: Path of the PDF file to read.
        page_num: Page number handed to ``render_pdf_to_base64png``.
        model_current: Generation model to use. The default is the ``model``
            object that existed when this function was defined.
        processor_current: Processor used for the chat template, input
            encoding and decoding. The default is the ``processor`` object
            that existed when this function was defined.

    Returns:
        The list produced by ``tokenizer.batch_decode`` for the newly generated
        tokens (one entry, since a single sequence is requested).
    """
    image_base64 = render_pdf_to_base64png(pdf_path, page_num, target_longest_image_dim=1024)

    # Anchor-text prompting is currently disabled:
    # anchor_text = get_anchor_text(pdf_path, page_num, pdf_engine="pdfreport", target_length=4000)
    # prompt = build_finetuning_prompt(anchor_text)
    # NOTE(review): the prompt below still mentions "raw textual content that was
    # previously extracted", although no anchor text is attached while the lines
    # above stay commented out - confirm this is intended.
    messages = [
                {
                    "role": "user",
                    "content": [
                        {"type": "text", "text": "Below is the image of one page of a document, as well as some raw textual content that was previously extracted for it. Just return the plain text representation of this document as if you were reading it naturally. Do not hallucinate."},
                        {"type": "image_url", "image_url": {"url": f"data:image/png;base64,{image_base64}"}},
                    ],
                }
            ]

    # Use the processor that was passed in (not the module-level one) so that a
    # caller-supplied processor is honoured for the template as well as the encoding.
    text = processor_current.apply_chat_template(messages, tokenize=False, add_generation_prompt=True)
    main_image = Image.open(BytesIO(base64.b64decode(image_base64)))

    inputs = processor_current(
        text=[text],
        images=[main_image],
        padding=True,
        return_tensors="pt",
    )
    # Put the inputs on the same device as the model that will consume them,
    # instead of relying on a separate global `device` staying in sync with it.
    target_device = model_current.device
    inputs = {key: value.to(target_device) for (key, value) in inputs.items()}

    # Generate the output (sampling is on, so results are not deterministic).
    output = model_current.generate(
                **inputs,
                temperature=0.8,
                max_new_tokens=8000,
                num_return_sequences=1,
                do_sample=True,
            )

    # generate() returns prompt + continuation; keep only the continuation.
    prompt_length = inputs["input_ids"].shape[1]
    new_tokens = output[:, prompt_length:]
    text_output = processor_current.tokenizer.batch_decode(
        new_tokens,
        skip_special_tokens=True
    )
    return text_output
# ['{"primary_language":"en","is_rotation_valid":true,"rotation_correction":0,"is_table":false,"is_diagram":false,"natural_text":"Molmo and PixMo:\\nOpen Weights and Open Data\\nfor State-of-the']


  from .autonotebook import tqdm as notebook_tqdm
Loading checkpoint shards: 100%|██████████| 4/4 [00:00<00:00, 57.77it/s]
You have video processor config saved in `preprocessor.json` file which is deprecated. Video processor configs should be saved in their own `video_preprocessor.json` file. You can rename the file or load and save the processor back which renames it automatically. Loading from `preprocessor.json` will be removed in v5.0.


### Extract the natural text from one page of a PDF

In [4]:
import json
# text_output=extract_text_per_page("C:/Users/Tommy/Downloads/ESGreporttemplate/A/2023-CSR-report_e.pdf",5)
# output_data = json.loads(text_output[0])
# print(output_data['natural_text'])
# !python -m olmocr.pipeline ./localworkspace --markdown --pdfs D:\pycharmProjects\File-Multimodel-Analysis\Data\PDF\2023-CSR-report_e.pdf


### Extract all text from all PDFs

In [5]:
import fitz
from Tools.Basic_Functions import *
def extract_text_all_pdf():
    """Extract the text of every page of every PDF found under ``PDF_File_Path``.

    For each PDF returned by ``list_pdfs_in_folder`` the pages are processed one
    by one with ``extract_text_per_page`` and the results are written to
    ``<PDF_Text_Extract_Output_Path>/<file name without extension>.json`` with
    the keys ``filefullname``, ``filename``, ``filepath`` and ``content``
    (``content`` maps the 1-based page number, as a string, to that page's output).
    """
    pdf_paths = list_pdfs_in_folder(PDF_File_Path)
    for pdf_name, pdf_path in pdf_paths.items():
        # File name without the ".pdf" extension.
        name = os.path.splitext(pdf_name)[0]

        # The document is opened only to learn how many pages it has; the
        # context manager closes the file handle again straight away.
        with fitz.open(pdf_path) as doc:
            page_count = len(doc)

        content_output_each_pdf = {}
        for page_index in range(1, page_count + 1):
            # Rely on extract_text_per_page's default model/processor: they were
            # bound when that function was defined, so they keep pointing at the
            # OCR model even if the global name `model` is rebound later on.
            content_output_each_pdf["{}".format(page_index)] = extract_text_per_page(pdf_path, page_index)
            print(f"{pdf_name} page {page_index} has been extracted.")

        text_output_each_pdf = {
            "filefullname": pdf_name,
            "filename": name,
            "filepath": pdf_path,
            "content": content_output_each_pdf,
        }
        # Save the extracted text of this PDF.
        save_dict_to_json(text_output_each_pdf, os.path.join(PDF_Text_Extract_Output_Path, "{}.json".format(name)))
        print("File has been extracted successfully: {}\n".format(pdf_name))

In [None]:
# Run text extraction over every PDF under PDF_File_Path; one JSON file per PDF
# is written to PDF_Text_Extract_Output_Path.
extract_text_all_pdf()
# print(list_pdfs_in_folder(parent))

2023-CSR-report_e.pdf page 1 has been extracted.


# Layout Extraction

In [None]:
from doclayout_yolo import YOLOv10
import numpy as np
import cv2
from pdf2image import convert_from_path

# Load the layout-detection weights; the path is relative to the current working directory.
# NOTE(review): this rebinds the global name `model`, which earlier in this
# notebook holds the Qwen2-VL OCR model. Code that reads the global `model`
# after this cell has run gets the YOLOv10 object instead - consider a
# distinct name for the layout model.
model = YOLOv10("../Tools/doclayout_yolo_docstructbench_imgsz1024.pt")

class detect_object:
    """A single detection: class label, the four box coordinates and a confidence value."""

    def __init__(self, clas_name, x1, y1, x2, y2, conf):
        # Values are stored exactly as given; no conversion happens here.
        self.clas_name, self.x1, self.y1 = clas_name, x1, y1
        self.x2, self.y2, self.conf = x2, y2, conf

    def class_to_dict(self):
        """Return the detection as a dict; every value except the class name is converted with str()."""
        serialised = {"class_name": self.clas_name}
        for field in ("x1", "y1", "x2", "y2", "conf"):
            serialised[field] = str(getattr(self, field))
        return serialised


def extract_layout_per_pdf(pdf_name:str,pdf_path:str):
    """Detect layout objects on every page of one PDF and save the results.

    The PDF is rasterised at 300 dpi, all pages are passed to the global
    layout ``model`` and, for each page:

    * every detected box is recorded as a dict (see ``detect_object.class_to_dict``);
    * an annotated image is written to
      ``<PDF_Layout_Extract_Output_Path>/images/<name>/page<N>.jpg``.

    Finally a JSON file with the keys ``filename``, ``filepath`` and
    ``obj_detected`` (1-based page number as a string -> list of box dicts) is
    written to ``<PDF_Layout_Extract_Output_Path>/jsons/<name>.json``.

    Args:
        pdf_name: File name of the PDF including its extension.
        pdf_path: Path of the PDF file to process.
    """
    name = os.path.splitext(pdf_name)[0]

    # The output folders do not depend on the page, so create them once
    # instead of on every loop iteration.
    image_output_path = os.path.join(PDF_Layout_Extract_Output_Path, "images", name)
    os.makedirs(image_output_path, exist_ok=True)
    json_save_path = os.path.join(PDF_Layout_Extract_Output_Path, "jsons")
    os.makedirs(json_save_path, exist_ok=True)

    pages = convert_from_path(pdf_path, dpi=300, fmt='png')
    det_res = model.predict(
        pages,             # Images to predict (one per page)
        imgsz=1024,        # Prediction image size
        conf=0.2,          # Confidence threshold
        device=torch.device("cuda" if torch.cuda.is_available() else "cpu"),  # Device to use
    )

    obj_per_page = {}
    for page_index, res in enumerate(det_res, start=1):
        # Each row of the box tensor unpacks as (x1, y1, x2, y2, conf, class id).
        obj_per_page[f"{page_index}"] = [
            detect_object(res.names[int(cls_id)], x1, y1, x2, y2, conf).class_to_dict()
            for x1, y1, x2, y2, conf, cls_id in res.boxes.data.cpu().numpy()
        ]

        # Draw the detections on the page image.
        annotated = res.plot(pil=True, line_width=3, font_size=16)
        # Reverse the channel axis: PIL uses RGB while OpenCV expects BGR order.
        arr = np.array(annotated)[:, :, ::-1]

        image_file = os.path.join(image_output_path, f"page{page_index}.jpg")
        if not cv2.imwrite(image_file, arr, [cv2.IMWRITE_JPEG_QUALITY, 90]):
            # Name the file as well as the page so the failure can be located.
            print(f"{pdf_name} page {page_index} saved failed")

    # Store the collected detections as a JSON file.
    content_of_pdf = {
        "filename": pdf_name,
        "filepath": pdf_path,
        "obj_detected": obj_per_page,
    }
    save_dict_to_json(content_of_pdf, os.path.join(json_save_path, "{}.json".format(name)))


def extract_layout_all_pdf():
    """Run ``extract_layout_per_pdf`` on every PDF that ``list_pdfs_in_folder`` reports for ``PDF_File_Path``."""
    # Optional allow-list for processing only selected files (disabled):
    # white_list=["INNOLUX%202023%20ESG%20Report%20e-book%20EN_768472.pdf","Stellantis-2023-CSR-Report.pdf","Sustainability-Report-2023-Final-Version-2.pdf"]
    discovered = list_pdfs_in_folder(PDF_File_Path)
    for file_name, file_path in discovered.items():
        # if file_name in white_list:
        extract_layout_per_pdf(pdf_path=file_path, pdf_name=file_name)

In [None]:
# extract_layout_per_pdf(pdf_path="C:/Users/Tommy/Downloads/ESGreporttemplate/A/2023-CSR-report_e.pdf",pdf_name="2023-CSR-report_e.pdf")

## Extract layout information (title, figure, table) from all PDFs

In [None]:
#extract_layout_all_pdf()