# This notebook only extracts graphs, text, and layout from PDF files

### Read PDF Data

In [None]:
import os
# Input and output folders are resolved against the parent of the current
# working directory. The paths are assembled from individual components with
# os.path.join so that they work on every platform; a hard-coded backslash
# ("Data\PDF") is only a separator on Windows and would name a single,
# non-existent entry elsewhere.
PDF_File_Path = os.path.join(os.path.dirname(os.getcwd()), "Data", "PDF")

PDF_Extract_Output_Path = os.path.join(
    os.path.dirname(os.getcwd()), "Data", "Extract_Output", "text"
)
# Create the output folder up front in case it does not exist yet.
os.makedirs(PDF_Extract_Output_Path, exist_ok=True)

print("PDF file path: {}\nPDF Extract Output Path: {}".format(PDF_File_Path, PDF_Extract_Output_Path) )

## Text Extraction

### Sample text extraction (a single PDF)

In [None]:
import torch
import base64

from io import BytesIO
from PIL import Image
from transformers import AutoProcessor, Qwen2VLForConditionalGeneration

from olmocr.data.renderpdf import render_pdf_to_base64png
from olmocr.prompts import build_finetuning_prompt
from olmocr.prompts.anchor import get_anchor_text

# Pick the GPU when one is available, otherwise fall back to the CPU.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# Load the olmOCR checkpoint in bfloat16 and switch it to inference mode.
model = Qwen2VLForConditionalGeneration.from_pretrained(
    "allenai/olmOCR-7B-0225-preview",
    torch_dtype=torch.bfloat16,
)
model.eval()

# The processor (tokenizer + image preprocessing) comes from the base
# Qwen2-VL instruct checkpoint.
processor = AutoProcessor.from_pretrained(
    "Qwen/Qwen2-VL-7B-Instruct",
    use_fast=True,
)

# Move the model weights onto the selected device.
model.to(device)

def extract_text_per_page(pdf_path: str, page_num: int, model_current=model, processor_current=processor):
    """Run the vision-language model on one rendered PDF page and return its text.

    The page is rendered to a PNG (longest side 1024 px), wrapped in a single
    user chat message together with a fixed instruction, and passed to
    ``model_current.generate`` with sampling enabled (temperature 0.8).

    Args:
        pdf_path: Path of the PDF file to read.
        page_num: Page number forwarded to ``render_pdf_to_base64png``. The
            batch caller in this file passes 1-based page numbers.
        model_current: Generation model; defaults to the module-level ``model``.
        processor_current: Processor used for the chat template, input
            preparation and decoding; defaults to the module-level
            ``processor``.

    Returns:
        A list of decoded strings, one per generated sequence (a single
        element, because ``num_return_sequences`` is 1). Only the newly
        generated tokens are decoded; the prompt is stripped.
    """
    image_base64 = render_pdf_to_base64png(pdf_path, page_num, target_longest_image_dim=1024)

    # NOTE(review): the instruction below mentions "raw textual content that
    # was previously extracted", but no anchor text is actually supplied. The
    # anchor-text prompt path is currently disabled:
    # anchor_text = get_anchor_text(pdf_path, page_num, pdf_engine="pdfreport", target_length=4000)
    # prompt = build_finetuning_prompt(anchor_text)

    # Build the full chat prompt: one user turn with the instruction and the page image.
    messages = [
        {
            "role": "user",
            "content": [
                {"type": "text", "text": "Below is the image of one page of a document, as well as some raw textual content that was previously extracted for it. Just return the plain text representation of this document as if you were reading it naturally. Do not hallucinate."},
                {"type": "image_url", "image_url": {"url": f"data:image/png;base64,{image_base64}"}},
            ],
        }
    ]

    # Apply the chat template with the processor that was passed in (not the
    # module-level one), so a caller-supplied processor is honoured end to end.
    text = processor_current.apply_chat_template(messages, tokenize=False, add_generation_prompt=True)
    main_image = Image.open(BytesIO(base64.b64decode(image_base64)))

    inputs = processor_current(
        text=[text],
        images=[main_image],
        padding=True,
        return_tensors="pt",
    )
    # Place the inputs on the same device as the model that will consume them,
    # rather than on the module-level device, so a caller-supplied model works.
    target_device = model_current.device
    inputs = {key: value.to(target_device) for (key, value) in inputs.items()}

    # Generate the output
    output = model_current.generate(
        **inputs,
        temperature=0.8,
        max_new_tokens=10000,
        num_return_sequences=1,
        do_sample=True,
    )

    # Decode only the tokens produced after the prompt.
    prompt_length = inputs["input_ids"].shape[1]
    new_tokens = output[:, prompt_length:]
    text_output = processor_current.tokenizer.batch_decode(
        new_tokens,
        skip_special_tokens=True
    )
    return text_output
# ['{"primary_language":"en","is_rotation_valid":true,"rotation_correction":0,"is_table":false,"is_diagram":false,"natural_text":"Molmo and PixMo:\\nOpen Weights and Open Data\\nfor State-of-the']


### Extract the natural text from a single PDF page

In [None]:
import json
# text_output=extract_text_per_page("C:/Users/Tommy/Downloads/ESGreporttemplate/A/2023-CSR-report_e.pdf",5)
# output_data = json.loads(text_output[0])
# print(output_data['natural_text'])
# !python -m olmocr.pipeline ./localworkspace --markdown --pdfs D:\pycharmProjects\File-Multimodel-Analysis\Data\PDF\2023-CSR-report_e.pdf


### Extract all text from all pdfs

In [None]:
import fitz

def list_pdfs_in_folder(path):
    """Map every PDF file name in ``path`` to its full path.

    The folder must contain nothing but PDF files: any sub-folder or any file
    whose name does not end in ``.pdf`` (case-insensitive) aborts the listing.

    Args:
        path: Folder to scan (not recursive).

    Returns:
        dict of ``{file name: joined path}`` in ``os.listdir`` order.

    Raises:
        ValueError: if ``path`` is not a folder, or if it contains an entry
            that is not a regular file or is not a PDF.
    """
    if not os.path.isdir(path):
        raise ValueError(f"this path is not a folder: {path}")

    found = {}
    for entry in os.listdir(path):
        entry_path = os.path.join(path, entry)
        has_pdf_suffix = entry.lower().endswith('.pdf')

        if not os.path.isfile(entry_path):
            raise ValueError(f"this is not a file: {entry_path}")
        elif not has_pdf_suffix:
            raise ValueError(f"some file is not pdf: {entry_path}")
        else:
            found[entry] = entry_path

    return found

def save_dict_to_json(data: dict, filepath: str, *, indent: int = 4, ensure_ascii: bool = False) -> None:
    """Serialize ``data`` as JSON into ``filepath`` (UTF-8, overwriting).

    Args:
        data: Dictionary to write.
        filepath: Destination file; it is created or truncated.
        indent: Indentation width handed to ``json.dump``.
        ensure_ascii: When False (the default) non-ASCII characters are
            written as-is instead of being escaped.
    """
    dump_options = {"indent": indent, "ensure_ascii": ensure_ascii}
    with open(filepath, mode="w", encoding="utf-8") as out_file:
        json.dump(data, out_file, **dump_options)

def extract_text_all_pdf():
    """Extract text from every page of every PDF in ``PDF_File_Path``.

    For each PDF a dictionary is built holding ``"filename"`` (name without
    extension), ``"filepath"`` and one entry per page keyed by the 1-based
    page number as a string; each page entry is the value returned by
    ``extract_text_per_page``. The dictionary is saved as
    ``<filename>.json`` in ``PDF_Extract_Output_Path``.

    Raises:
        ValueError: propagated from ``list_pdfs_in_folder`` when the input
            folder is missing or contains anything other than PDF files.
    """
    pdf_paths = list_pdfs_in_folder(PDF_File_Path)
    for pdf_name, pdf_path in pdf_paths.items():
        # Strip the extension; the bare name is also used for the output file.
        name = os.path.splitext(pdf_name)[0]
        text_output_each_pdf = {"filename": name, "filepath": pdf_path}

        # The document is opened only to learn the page count. Use a context
        # manager so the handle is released instead of leaking one open
        # document per processed PDF.
        with fitz.open(pdf_path) as doc:
            page_count = doc.page_count

        # Pages are numbered from 1 when handed to extract_text_per_page.
        for page_index in range(1, page_count + 1):
            text_output_each_page = extract_text_per_page(
                pdf_path,
                page_index,
                model_current=model,
                processor_current=processor,
            )
            text_output_each_pdf["{}".format(page_index)] = text_output_each_page

        save_dict_to_json(text_output_each_pdf, os.path.join(PDF_Extract_Output_Path, "{}.json".format(name)))
        print("File has been extracted successfully: {}\n".format(pdf_name))

In [None]:
# Run the batch extraction: every PDF found in PDF_File_Path is processed and
# one JSON file per PDF is written to PDF_Extract_Output_Path.
extract_text_all_pdf()
# print(list_pdfs_in_folder(parent))

## Extract Layout