In [7]:
import torch
import base64
import urllib.request

from io import BytesIO
from PIL import Image
from transformers import AutoProcessor, Qwen2VLForConditionalGeneration

from olmocr.data.renderpdf import render_pdf_to_base64png
from olmocr.prompts import build_finetuning_prompt
from olmocr.prompts.anchor import get_anchor_text

# Initialize the model
model = Qwen2VLForConditionalGeneration.from_pretrained("allenai/olmOCR-7B-0225-preview", torch_dtype=torch.bfloat16).eval()
processor = AutoProcessor.from_pretrained("Qwen/Qwen2-VL-7B-Instruct")
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model.to(device)

"""
# Grab a sample PDF
urllib.request.urlretrieve("https://molmo.allenai.org/paper.pdf", "./paper.pdf")
# Render page 1 to an image
image_base64 = render_pdf_to_base64png("./paper.pdf", 1, target_longest_image_dim=1024)
# Build the prompt, using document metadata
anchor_text = get_anchor_text("./paper.pdf", 1, pdf_engine="pdfreport", target_length=4000)
prompt = build_finetuning_prompt(anchor_text)
"""

# Grab a sample PDF
pdf_path = "./SASA.pdf"
# Render page 1 to an image
image_base64 = render_pdf_to_base64png("./SASA.pdf", 3, target_longest_image_dim=1024)
# Build the prompt, using document metadata
anchor_text = get_anchor_text("./SASA.pdf", 1, pdf_engine="pdfreport", target_length=4000)
prompt = build_finetuning_prompt(anchor_text)

# Build the full prompt
messages = [
            {
                "role": "user",
                "content": [
                    {"type": "text", "text": prompt},
                    {"type": "image_url", "image_url": {"url": f"data:image/png;base64,{image_base64}"}},
                ],
            }
        ]

# Apply the chat template and processor
text = processor.apply_chat_template(messages, tokenize=False, add_generation_prompt=True)
main_image = Image.open(BytesIO(base64.b64decode(image_base64)))

inputs = processor(
    text=[text],
    images=[main_image],
    padding=True,
    return_tensors="pt",
)
inputs = {key: value.to(device) for (key, value) in inputs.items()}

# Generate the output
output = model.generate(
            **inputs,
            temperature=0.8,
            max_new_tokens=50,
            num_return_sequences=1,
            do_sample=True,
        )

# Decode the output
prompt_length = inputs["input_ids"].shape[1]
new_tokens = output[:, prompt_length:]
text_output = processor.tokenizer.batch_decode(
    new_tokens, skip_special_tokens=True
)

# Save the output to a new text file with the PDF name appended with the page number
output_filename = f"{pdf_path.split('/')[-1].split('.')[0]}_page_3_output.txt"
with open(output_filename, 'w') as f:
    f.write(output[0])

print(text_output)
# ['{"primary_language":"en","is_rotation_valid":true,"rotation_correction":0,"is_table":false,"is_diagram":false,"natural_text":"Molmo and PixMo:\\nOpen Weights and Open Data\\nfor State-of-the']

Loading checkpoint shards:   0%|          | 0/4 [00:00<?, ?it/s]

TypeError: write() argument must be str, not Tensor

In [8]:
import os
import torch
import base64
import urllib.request

from io import BytesIO
from PIL import Image
from transformers import AutoProcessor, Qwen2VLForConditionalGeneration

from olmocr.data.renderpdf import render_pdf_to_base64png
from olmocr.prompts import build_finetuning_prompt
from olmocr.prompts.anchor import get_anchor_text

# Initialize the model
model = Qwen2VLForConditionalGeneration.from_pretrained("allenai/olmOCR-7B-0225-preview", torch_dtype=torch.bfloat16).eval()
processor = AutoProcessor.from_pretrained("Qwen/Qwen2-VL-7B-Instruct")
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model.to(device)

# Specify the source folder containing PDF files
source_folder = "./source"

# Iterate through all PDF files in the source folder
for filename in os.listdir(source_folder):
    if filename.endswith(".pdf"):  # Process only PDF files
        pdf_path = os.path.join(source_folder, filename)
        print(f"Processing file: {pdf_path}")
        
        # Render the first page of the PDF to a base64-encoded PNG image
        try:
            image_base64 = render_pdf_to_base64png(pdf_path, 1, target_longest_image_dim=1024)
        except Exception as e:
            print(f"Error rendering PDF {filename}: {e}")
            continue
        
        # Extract anchor text from the first page of the PDF
        try:
            anchor_text = get_anchor_text(pdf_path, 1, pdf_engine="pdfreport", target_length=4000)
        except Exception as e:
            print(f"Error extracting anchor text from {filename}: {e}")
            continue
        
        # Build the prompt using document metadata
        prompt = build_finetuning_prompt(anchor_text)
        
        # Build the full prompt with text and image content
        messages = [
            {
                "role": "user",
                "content": [
                    {"type": "text", "text": prompt},
                    {"type": "image_url", "image_url": {"url": f"data:image/png;base64,{image_base64}"}},
                ],
            }
        ]
        
        # Apply the chat template and processor
        text = processor.apply_chat_template(messages, tokenize=False, add_generation_prompt=True)
        
        # Decode the base64 image to a PIL Image object
        try:
            main_image = Image.open(BytesIO(base64.b64decode(image_base64)))
        except Exception as e:
            print(f"Error decoding image for {filename}: {e}")
            continue
        
        # Prepare inputs for the model
        inputs = processor(
            text=[text],
            images=[main_image],
            padding=True,
            return_tensors="pt",
        )
        inputs = {key: value.to(device) for (key, value) in inputs.items()}
        
        # Generate output from the model
        try:
            output = model.generate(
                **inputs,
                temperature=0.8,
                max_new_tokens=50,
                num_return_sequences=1,
                do_sample=True,
            )
            
            # Decode the output tokens into text
            prompt_length = inputs["input_ids"].shape[1]
            new_tokens = output[:, prompt_length:]
            text_output = processor.tokenizer.batch_decode(new_tokens, skip_special_tokens=True)
            
            print(f"Output for {filename}: {text_output}")
        
        except Exception as e:
            print(f"Error generating output for {filename}: {e}")


Loading checkpoint shards:   0%|          | 0/4 [00:00<?, ?it/s]

Processing file: ./source/SASA.pdf
Output for SASA.pdf: ['{"primary_language":"en","is_rotation_valid":true,"rotation_correction":0,"is_table":true,"is_diagram":false,"natural_text":"| Item | Vendor Article No | Article No | Article Description | Article Status | Color | Item']


In [13]:
import os
import torch
import base64
import urllib.request

from io import BytesIO
from PIL import Image
from transformers import AutoProcessor, Qwen2VLForConditionalGeneration
from pdf2image import convert_from_path

from PyPDF2 import PdfReader
from olmocr.prompts import build_finetuning_prompt
from olmocr.prompts.anchor import get_anchor_text

# Initialize the model
model = Qwen2VLForConditionalGeneration.from_pretrained("allenai/olmOCR-7B-0225-preview", torch_dtype=torch.bfloat16).eval()
processor = AutoProcessor.from_pretrained("Qwen/Qwen2-VL-7B-Instruct")
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model.to(device)

# Specify the source folder containing PDF files
source_folder = "./source"

# Iterate through all PDF files in the source folder
for filename in os.listdir(source_folder):
    if filename.endswith(".pdf"):  # Process only PDF files
        pdf_path = os.path.join(source_folder, filename)
        print(f"Processing file: {pdf_path}")
        
        # Determine the number of pages in the PDF
        pdf_reader = PdfReader(pdf_path)
        num_pages = len(pdf_reader.pages)
        
        # Process each page of the PDF
        for page_num in range(1, num_pages + 1):
            print(f"Processing page {page_num} of {filename}")
            
            # Render the current page to an image
            images = convert_from_path(pdf_path)
            image = images[page_num - 1]  # Adjust for 0-based indexing
            
            # Optionally resize the image to fit within target dimensions
            width, height = image.size
            target_longest_image_dim = 1024
            ratio = min(target_longest_image_dim / width, target_longest_image_dim / height)
            new_size = (int(width * ratio), int(height * ratio))
            image = image.resize(new_size)
            
            # Convert the image to base64
            buffered = BytesIO()
            image.save(buffered, format="PNG")
            image_base64 = base64.b64encode(buffered.getvalue()).decode("utf-8")
            
            # Extract anchor text from the current page of the PDF
            try:
                anchor_text = get_anchor_text(pdf_path, page_num, pdf_engine="pdfreport", target_length=4000)
            except Exception as e:
                print(f"Error extracting anchor text from page {page_num} of {filename}: {e}")
                continue
            
            # Build the prompt using document metadata
            prompt = build_finetuning_prompt(anchor_text)
            
            # Build the full prompt with text and image content
            messages = [
                {
                    "role": "user",
                    "content": [
                        {"type": "text", "text": prompt},
                        {"type": "image_url", "image_url": {"url": f"data:image/png;base64,{image_base64}"}},
                    ],
                }
            ]
            
            # Apply the chat template and processor
            text = processor.apply_chat_template(messages, tokenize=False, add_generation_prompt=True)
            
            # Decode the base64 image to a PIL Image object
            try:
                main_image = Image.open(BytesIO(base64.b64decode(image_base64)))
            except Exception as e:
                print(f"Error decoding image for page {page_num} of {filename}: {e}")
                continue
            
            # Prepare inputs for the model
            inputs = processor(
                text=[text],
                images=[main_image],
                padding=True,
                return_tensors="pt",
            )
            inputs = {key: value.to(device) for (key, value) in inputs.items()}
            
            # Generate output from the model
            try:
                output = model.generate(
                    **inputs,
                    temperature=0.8,
                    max_new_tokens=50,
                    num_return_sequences=1,
                    do_sample=True,
                )
                
                # Decode the output tokens into text
                prompt_length = inputs["input_ids"].shape[1]
                new_tokens = output[:, prompt_length:]
                text_output = processor.tokenizer.batch_decode(new_tokens, skip_special_tokens=True)
                
                print(f"Output for page {page_num} of {filename}: {text_output}")
            
            except Exception as e:
                print(f"Error generating output for page {page_num} of {filename}: {e}")


Loading checkpoint shards:   0%|          | 0/4 [00:00<?, ?it/s]

Processing file: ./source/SASA.pdf
Processing page 1 of SASA.pdf
Output for page 1 of SASA.pdf: ['{"primary_language":"en","is_rotation_valid":true,"rotation_correction":0,"is_table":true,"is_diagram":false,"natural_text":"| Item | Vendor Article No | Article No | Article Status | Color | Item category | Size']
Processing page 2 of SASA.pdf
Output for page 2 of SASA.pdf: ['{"primary_language":"en","is_rotation_valid":true,"rotation_correction":0,"is_table":true,"is_diagram":false,"natural_text":"| Item Description | Article No | Article Status | Color | Item category | Size | Delivery Date']
Processing page 3 of SASA.pdf
Output for page 3 of SASA.pdf: ['{"primary_language":"en","is_rotation_valid":true,"rotation_correction":0,"is_table":false,"is_diagram":false,"natural_text":"Purchase Order\\n\\nPO Number : 45028888160']


In [15]:
import os
import torch
import base64
import urllib.request
import camelot

from io import BytesIO
from PIL import Image
from transformers import AutoProcessor, Qwen2VLForConditionalGeneration
from pdf2image import convert_from_path

from PyPDF2 import PdfReader
from olmocr.prompts import build_finetuning_prompt
from olmocr.prompts.anchor import get_anchor_text

# Initialize the model
model = Qwen2VLForConditionalGeneration.from_pretrained("allenai/olmOCR-7B-0225-preview", torch_dtype=torch.bfloat16).eval()
processor = AutoProcessor.from_pretrained("Qwen/Qwen2-VL-7B-Instruct")
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model.to(device)

# Specify the source folder containing PDF files
source_folder = "./source"

# Iterate through all PDF files in the source folder
for filename in os.listdir(source_folder):
    if filename.endswith(".pdf"):  # Process only PDF files
        pdf_path = os.path.join(source_folder, filename)
        print(f"Processing file: {pdf_path}")
        
        # Determine the number of pages in the PDF
        pdf_reader = PdfReader(pdf_path)
        num_pages = len(pdf_reader.pages)
        
        # Process each page of the PDF
        for page_num in range(1, num_pages + 1):
            print(f"Processing page {page_num} of {filename}")
            
            # Render the current page to an image
            images = convert_from_path(pdf_path)
            image = images[page_num - 1]  # Adjust for 0-based indexing
            
            # Optionally resize the image to fit within target dimensions
            width, height = image.size
            target_longest_image_dim = 1024
            ratio = min(target_longest_image_dim / width, target_longest_image_dim / height)
            new_size = (int(width * ratio), int(height * ratio))
            image = image.resize(new_size)
            
            # Convert the image to base64
            buffered = BytesIO()
            image.save(buffered, format="PNG")
            image_base64 = base64.b64encode(buffered.getvalue()).decode("utf-8")
            
            # Extract anchor text from the current page of the PDF
            try:
                anchor_text = get_anchor_text(pdf_path, page_num, pdf_engine="pdfreport", target_length=4000)
            except Exception as e:
                print(f"Error extracting anchor text from page {page_num} of {filename}: {e}")
                continue
            
            # Build the prompt using document metadata
            prompt = build_finetuning_prompt(anchor_text)
            
            # Build the full prompt with text and image content
            messages = [
                {
                    "role": "user",
                    "content": [
                        {"type": "text", "text": prompt},
                        {"type": "image_url", "image_url": {"url": f"data:image/png;base64,{image_base64}"}},
                    ],
                }
            ]
            
            # Apply the chat template and processor
            text = processor.apply_chat_template(messages, tokenize=False, add_generation_prompt=True)
            
            # Decode the base64 image to a PIL Image object
            try:
                main_image = Image.open(BytesIO(base64.b64decode(image_base64)))
            except Exception as e:
                print(f"Error decoding image for page {page_num} of {filename}: {e}")
                continue
            
            # Prepare inputs for the model
            inputs = processor(
                text=[text],
                images=[main_image],
                padding=True,
                return_tensors="pt",
            )
            inputs = {key: value.to(device) for (key, value) in inputs.items()}
            
            # Generate output from the model
            try:
                output = model.generate(
                    **inputs,
                    temperature=0.8,
                    max_new_tokens=50,
                    num_return_sequences=1,
                    do_sample=True,
                )
                
                # Decode the output tokens into text
                prompt_length = inputs["input_ids"].shape[1]
                new_tokens = output[:, prompt_length:]
                text_output = processor.tokenizer.batch_decode(new_tokens, skip_special_tokens=True)
                
                print(f"Output for page {page_num} of {filename}: {text_output}")
                
                # Check if the output indicates a table
                if '"is_table": true' in str(text_output):
                    # Extract tables using camelot
                    tables = camelot.read_pdf(pdf_path, pages=str(page_num))
                    
                    # Export the first table to CSV
                    if tables:
                        tables[0].to_csv(f"table_page_{page_num}.csv")
                        print(f"Table extracted from page {page_num} and saved to CSV.")
            
            except Exception as e:
                print(f"Error generating output for page {page_num} of {filename}: {e}")


Loading checkpoint shards:   0%|          | 0/4 [00:00<?, ?it/s]

Processing file: ./source/SASA.pdf
Processing page 1 of SASA.pdf
Output for page 1 of SASA.pdf: ['{"primary_language":"en","is_rotation_valid":true,"rotation_correction":0,"is_table":true,"is_diagram":false,"natural_text":"| Item | Vendor Article No | Article No | Article Status | Color | Item category | Size']
Processing page 2 of SASA.pdf
Output for page 2 of SASA.pdf: ['{"primary_language":"en","is_rotation_valid":true,"rotation_correction":0,"is_table":true,"is_diagram":false,"natural_text":"| Item Description | Article No | Article Status | Color | Item category | Size | Delivery Date']
Processing page 3 of SASA.pdf
Output for page 3 of SASA.pdf: ['{"primary_language":"en","is_rotation_valid":true,"rotation_correction":0,"is_table":false,"is_diagram":false,"natural_text":"1. Terms and Conditions (T&C): Unless otherwise agreed in writing between the parties, these']


In [16]:
import os
import torch
import base64
import urllib.request
import camelot

from io import BytesIO
from PIL import Image
from transformers import AutoProcessor, Qwen2VLForConditionalGeneration
from pdf2image import convert_from_path

from PyPDF2 import PdfReader
from olmocr.prompts import build_finetuning_prompt
from olmocr.prompts.anchor import get_anchor_text

# Initialize the model
model = Qwen2VLForConditionalGeneration.from_pretrained("allenai/olmOCR-7B-0225-preview", torch_dtype=torch.bfloat16).eval()
processor = AutoProcessor.from_pretrained("Qwen/Qwen2-VL-7B-Instruct")
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model.to(device)

# Specify the source folder containing PDF files
source_folder = "./source"

# Iterate through all PDF files in the source folder
for filename in os.listdir(source_folder):
    if filename.endswith(".pdf"):  # Process only PDF files
        pdf_path = os.path.join(source_folder, filename)
        print(f"Processing file: {pdf_path}")
        
        # Determine the number of pages in the PDF
        pdf_reader = PdfReader(pdf_path)
        num_pages = len(pdf_reader.pages)
        
        # Process each page of the PDF
        for page_num in range(1, num_pages + 1):
            print(f"Processing page {page_num} of {filename}")
            
            # Render the current page to an image
            images = convert_from_path(pdf_path)
            image = images[page_num - 1]  # Adjust for 0-based indexing
            
            # Optionally resize the image to fit within target dimensions
            width, height = image.size
            target_longest_image_dim = 1024
            ratio = min(target_longest_image_dim / width, target_longest_image_dim / height)
            new_size = (int(width * ratio), int(height * ratio))
            image = image.resize(new_size)
            
            # Convert the image to base64
            buffered = BytesIO()
            image.save(buffered, format="PNG")
            image_base64 = base64.b64encode(buffered.getvalue()).decode("utf-8")
            
            # Extract anchor text from the current page of the PDF
            try:
                anchor_text = get_anchor_text(pdf_path, page_num, pdf_engine="pdfreport", target_length=4000)
            except Exception as e:
                print(f"Error extracting anchor text from page {page_num} of {filename}: {e}")
                continue
            
            # Build the prompt using document metadata
            prompt = build_finetuning_prompt(anchor_text)
            
            # Build the full prompt with text and image content
            messages = [
                {
                    "role": "user",
                    "content": [
                        {"type": "text", "text": prompt},
                        {"type": "image_url", "image_url": {"url": f"data:image/png;base64,{image_base64}"}},
                    ],
                }
            ]
            
            # Apply the chat template and processor
            text = processor.apply_chat_template(messages, tokenize=False, add_generation_prompt=True)
            
            # Decode the base64 image to a PIL Image object
            try:
                main_image = Image.open(BytesIO(base64.b64decode(image_base64)))
            except Exception as e:
                print(f"Error decoding image for page {page_num} of {filename}: {e}")
                continue
            
            # Prepare inputs for the model
            inputs = processor(
                text=[text],
                images=[main_image],
                padding=True,
                return_tensors="pt",
            )
            inputs = {key: value.to(device) for (key, value) in inputs.items()}
            
            # Generate output from the model
            try:
                output = model.generate(
                    **inputs,
                    temperature=0.8,
                    max_new_tokens=50,
                    num_return_sequences=1,
                    do_sample=True,
                )
                
                # Decode the output tokens into text
                prompt_length = inputs["input_ids"].shape[1]
                new_tokens = output[:, prompt_length:]
                text_output = processor.tokenizer.batch_decode(new_tokens, skip_special_tokens=True)
                
                print(f"Output for page {page_num} of {filename}: {text_output}")
                
                # Check if the output indicates a table
                if '"is_table": true' in str(text_output):
                    # Extract tables using camelot
                    try:
                        tables = camelot.read_pdf(pdf_path, pages=str(page_num))
                        
                        # Export the first table to CSV
                        if tables:
                            tables[0].to_csv(f"table_page_{page_num}.csv")
                            print(f"Table extracted from page {page_num} and saved to CSV.")
                    except Exception as e:
                        print(f"Error extracting table from page {page_num} of {filename}: {e}")
            
            except Exception as e:
                print(f"Error generating output for page {page_num} of {filename}: {e}")


Loading checkpoint shards:   0%|          | 0/4 [00:00<?, ?it/s]

Processing file: ./source/SASA.pdf
Processing page 1 of SASA.pdf
Output for page 1 of SASA.pdf: ['{"primary_language":"en","is_rotation_valid":true,"rotation_correction":0,"is_table":true,"is_diagram":false,"natural_text":"| Item | Vendor Article No | Article No | Article Status | Color | Item category | Size']
Processing page 2 of SASA.pdf
Output for page 2 of SASA.pdf: ['{"primary_language":"en","is_rotation_valid":true,"rotation_correction":0,"is_table":true,"is_diagram":false,"natural_text":"| Item Description | Article No | Article Status | Color | Item category | Size | Delivery Date']
Processing page 3 of SASA.pdf
Output for page 3 of SASA.pdf: ['{"primary_language":"en","is_rotation_valid":true,"rotation_correction":0,"is_table":false,"is_diagram":false,"natural_text":"Purchase Order\\n\\nPO Number : 45028888160']


In [23]:
import os
import torch
import base64
import urllib.request
import camelot
import re  # Import the re module for regular expressions

from io import BytesIO
from PIL import Image
from transformers import AutoProcessor, Qwen2VLForConditionalGeneration
from pdf2image import convert_from_path

from PyPDF2 import PdfReader
from olmocr.prompts import build_finetuning_prompt
from olmocr.prompts.anchor import get_anchor_text

# Initialize the model
model = Qwen2VLForConditionalGeneration.from_pretrained("allenai/olmOCR-7B-0225-preview", torch_dtype=torch.bfloat16).eval()
processor = AutoProcessor.from_pretrained("Qwen/Qwen2-VL-7B-Instruct")
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model.to(device)

# Specify the source folder containing PDF files
source_folder = "./source"

# Iterate through all PDF files in the source folder
for filename in os.listdir(source_folder):
    if filename.endswith(".pdf"):  # Process only PDF files
        pdf_path = os.path.join(source_folder, filename)
        print(f"Processing file: {pdf_path}")
        
        # Determine the number of pages in the PDF
        pdf_reader = PdfReader(pdf_path)
        num_pages = len(pdf_reader.pages)
        
        # Initialize a list to store natural text from all pages
        all_natural_text = []
        
        # Process each page of the PDF
        for page_num in range(1, num_pages + 1):
            print(f"Processing page {page_num} of {filename}")
            
            # Render the current page to an image
            images = convert_from_path(pdf_path)
            image = images[page_num - 1]  # Adjust for 0-based indexing
            
            # Optionally resize the image to fit within target dimensions
            width, height = image.size
            target_longest_image_dim = 1024
            ratio = min(target_longest_image_dim / width, target_longest_image_dim / height)
            new_size = (int(width * ratio), int(height * ratio))
            image = image.resize(new_size)
            
            # Convert the image to base64
            buffered = BytesIO()
            image.save(buffered, format="PNG")
            image_base64 = base64.b64encode(buffered.getvalue()).decode("utf-8")
            
            # Extract anchor text from the current page of the PDF
            try:
                anchor_text = get_anchor_text(pdf_path, page_num, pdf_engine="pdfreport", target_length=4000)
            except Exception as e:
                print(f"Error extracting anchor text from page {page_num} of {filename}: {e}")
                continue
            
            # Build the prompt using document metadata
            prompt = build_finetuning_prompt(anchor_text)
            
            # Build the full prompt with text and image content
            messages = [
                {
                    "role": "user",
                    "content": [
                        {"type": "text", "text": prompt},
                        {"type": "image_url", "image_url": {"url": f"data:image/png;base64,{image_base64}"}},
                    ],
                }
            ]
            
            # Apply the chat template and processor
            text = processor.apply_chat_template(messages, tokenize=False, add_generation_prompt=True)
            
            # Decode the base64 image to a PIL Image object
            try:
                main_image = Image.open(BytesIO(base64.b64decode(image_base64)))
            except Exception as e:
                print(f"Error decoding image for page {page_num} of {filename}: {e}")
                continue
            
            # Prepare inputs for the model
            inputs = processor(
                text=[text],
                images=[main_image],
                padding=True,
                return_tensors="pt",
            )
            inputs = {key: value.to(device) for (key, value) in inputs.items()}
            
            # Generate output from the model
            try:
                output = model.generate(
                    **inputs,
                    temperature=0.8,
                    max_new_tokens=50,
                    num_return_sequences=1,
                    do_sample=True,
                )
                
                # Decode the output tokens into text
                prompt_length = inputs["input_ids"].shape[1]
                new_tokens = output[:, prompt_length:]
                text_output = processor.tokenizer.batch_decode(new_tokens, skip_special_tokens=True)
                
                print(f"Output for page {page_num} of {filename}: {text_output}")
                
                # Manually extract natural text from the output
                for output_text in text_output:
                    if output_text:
                        match = re.search(r'"natural_text":"([^"]*)"', output_text)
                        if match:
                            natural_text = match.group(1)
                            all_natural_text.append(natural_text)
                        else:
                            # If parsing fails, append the full output text as a fallback
                            all_natural_text.append(output_text)
                            print(f"Error parsing natural text from page {page_num} of {filename}. Using full output text instead.")
            
            except Exception as e:
                print(f"Error generating output for page {page_num} of {filename}: {e}")
        
        # Save or print all natural text
        print(f"All natural text from {filename}:")
        for i, natural_text in enumerate(all_natural_text):
            print(f"Page {i+1}: {natural_text}")
        
        # Optionally save all natural text to a file
        with open(f"{os.path.splitext(filename)[0]}_natural_text.txt", "w") as f:
            for natural_text in all_natural_text:
                f.write(natural_text + "\n\n")
        print(f"Natural text saved to {os.path.splitext(filename)[0]}_natural_text.txt")


Loading checkpoint shards:   0%|          | 0/4 [00:00<?, ?it/s]

Processing file: ./source/SASA.pdf
Processing page 1 of SASA.pdf
Output for page 1 of SASA.pdf: ['{"primary_language":"en","is_rotation_valid":true,"rotation_correction":0,"is_table":true,"is_diagram":false,"natural_text":"| Item | Vendor Article No | Article No | Article Status | Color | Item category | Size']
Error parsing natural text from page 1 of SASA.pdf. Using full output text instead.
Processing page 2 of SASA.pdf
Output for page 2 of SASA.pdf: ['{"primary_language":"en","is_rotation_valid":true,"rotation_correction":0,"is_table":true,"is_diagram":false,"natural_text":"| Item Description | Article No | Article Status | Color | Item category | Size | Delivery Date']
Error parsing natural text from page 2 of SASA.pdf. Using full output text instead.
Processing page 3 of SASA.pdf
Output for page 3 of SASA.pdf: ['{"primary_language":"en","is_rotation_valid":true,"rotation_correction":0,"is_table":false,"is_diagram":false,"natural_text":"1. Terms and Conditions (T&C): Unless otherw