# In [13]:
import fitz  # PyMuPDF
from reportlab.pdfgen import canvas
from reportlab.lib.pagesizes import letter

# Function to extract text with coordinates using PyMuPDF
def extract_text_with_coordinates(pdf_path):
    """Collect every text span of a PDF together with its bounding box.

    Args:
        pdf_path: Path of the PDF file to open with PyMuPDF.

    Returns:
        A list of ``(x0, y0, x1, y1, text, font_size, page_num)`` tuples, one
        per text span, in the order PyMuPDF reports them. ``x0, y0, x1, y1``
        is the span's ``bbox``, ``font_size`` its ``size`` and ``page_num``
        the zero-based index of the page the span was found on.
    """
    data = []
    # The context manager closes the document even if extraction fails.
    with fitz.open(pdf_path) as doc:
        for page_num, page in enumerate(doc):
            blocks = page.get_text("dict")["blocks"]
            for block in blocks:
                # Image blocks have no "lines" entry; treat them as having
                # no text instead of failing with a KeyError.
                for line in block.get("lines", ()):
                    for span in line["spans"]:
                        x0, y0, x1, y1 = span["bbox"]
                        text = span["text"]
                        font_size = span["size"]
                        data.append(
                            (x0, y0, x1, y1, text, font_size, page_num)
                        )
    return data

# Function to create a new PDF with extracted text and coordinates
def create_pdf_with_text_coordinates(output_path, text_data, page_size=letter):
    """Write a new PDF that draws each extracted span at its recorded position.

    Args:
        output_path: Destination file for the generated PDF.
        text_data: Iterable of ``(x0, y0, x1, y1, text, font_size, page_num)``
            tuples; ``page_num`` is a zero-based page index. Only ``x0``,
            ``y0``, ``text``, ``font_size`` and ``page_num`` are used.
        page_size: ``(width, height)`` of every output page; the height is
            used to flip the y-axis.

    Every span is drawn in Helvetica at its recorded font size. Output pages
    are only produced up to the last page that has an entry in ``text_data``.
    """
    c = canvas.Canvas(output_path, pagesize=page_size)
    page_height = page_size[1]
    current_page = 0
    for x0, y0, x1, y1, text, font_size, page_num in text_data:
        if page_num != current_page:
            # Emit one page break per page boundary crossed, so pages that
            # contributed no spans become blank pages instead of vanishing
            # and shifting all later text one page forward. A backwards jump
            # still starts a single fresh page, as before.
            for _ in range(max(page_num - current_page, 1)):
                c.showPage()
            current_page = page_num
        # The font must be set again after each showPage(), so set it per span.
        c.setFont("Helvetica", font_size)
        # Flip the y-axis: the input measures y downwards from the top of the
        # page, the canvas upwards from the bottom.
        # NOTE(review): y0 is the top edge of the span's bbox while drawString
        # positions the text baseline, so text lands roughly one line too
        # high - confirm whether the span's baseline should be carried instead.
        c.drawString(x0, page_height - y0, text)
    c.save()

# Source document and destination of the regenerated copy
input_pdf_path = "ast_sci_data_tables_sample.pdf"
output_pdf_path = "output1.pdf"

# Step 1: read every text span (with its position) from the source PDF
text_data = extract_text_with_coordinates(input_pdf_path)

# Step 2: redraw those spans into a fresh PDF
create_pdf_with_text_coordinates(output_pdf_path, text_data)

# Report where the result was written
print(f"New PDF created at {output_pdf_path}")


# Output: New PDF created at output1.pdf
