# Approach 1: PyMuPDF

In [1]:
# Uncomment to install PyMuPDF into the kernel's environment (imported as `fitz`).
# %pip install pymupdf

In [2]:
import fitz
# Sanity check: print the module docstring, which reports the installed
# PyMuPDF / MuPDF versions and the Python runtime (see the cell output).
print(fitz.__doc__)

PyMuPDF 1.25.2: Python bindings for the MuPDF 1.25.2 library (rebased implementation).
Python 3.11 running on darwin (64-bit).



In [3]:
import fitz
import os

# Make sure the default image output folder exists. `exist_ok=True` replaces
# the separate exists()-then-create check, so there is no window in which the
# folder can appear between the check and the creation.
os.makedirs("images", exist_ok=True)
    
def extract_data_from_pdf(sourse_filename, dest_image_folder_name="images", dest_text_filename="output.txt") -> None:
    """Dump the text and the embedded images of a PDF to disk.

    The text of every page is appended to ``dest_text_filename`` under a
    ``--- Page N ---`` header (pages are numbered from 1). Every image
    referenced by a page is written to ``dest_image_folder_name`` as
    ``page_<page>_image_<index>.<ext>``, where ``<ext>`` is the extension
    reported by PyMuPDF for that image.

    Args:
        sourse_filename: Path of the PDF to read (passed to ``fitz.open``).
        dest_image_folder_name: Folder that receives the image files. It is
            created (including parents) when it does not exist yet.
        dest_text_filename: Text file to (over)write, encoded as UTF-8.

    Returns:
        None. A one-line summary is printed when extraction finishes.
    """
    # Create the target folder here rather than relying on the caller: without
    # this, any folder other than a pre-existing one fails with
    # FileNotFoundError as soon as the first image is written.
    if dest_image_folder_name:
        os.makedirs(dest_image_folder_name, exist_ok=True)

    # Both the PDF handle and the text file are closed on exit, even when
    # extraction fails part-way through.
    with fitz.open(sourse_filename) as doc, \
            open(dest_text_filename, "w", encoding="utf-8") as text_file:
        for page_number, page in enumerate(doc, start=1):
            # Text of the page, preceded by a header so pages stay separable.
            text_file.write(f"--- Page {page_number} ---\n")
            text_file.write(page.get_text())
            text_file.write("\n\n")

            # Images referenced by the page; the first item of each entry is
            # the xref used to pull the raw image out of the document.
            for img_index, img in enumerate(page.get_images(full=True), start=1):
                base_image = doc.extract_image(img[0])
                image_extension = base_image["ext"]  # e.g. 'png', 'jpeg'
                image_filename = os.path.join(
                    dest_image_folder_name,
                    f"page_{page_number}_image_{img_index}.{image_extension}",
                )

                with open(image_filename, "wb") as img_file:
                    img_file.write(base_image["image"])

    print(f"Text saved to '{dest_text_filename}' and images saved to the '{dest_image_folder_name}' folder.")

### Extract from a PDF with unstructured data such as images, tables, and equations

In [4]:
# Run the extractor on the sample PDF that mixes text with images, tables and equations.
extract_data_from_pdf(
    "PDF_with_unstructured_Data.pdf",
    dest_image_folder_name="images",
    dest_text_filename="PDF_with_unstructured_Data.txt",
)

Text saved to 'PDF_with_unstructured_Data.txt' and images saved to the 'images' folder.


### Extract from a PDF that contains only text

In [5]:
# Run the extractor on the text-only sample PDF, using separate output locations.
extract_data_from_pdf(
    "Simple Text Data.pdf",
    dest_image_folder_name="text_images",
    dest_text_filename="text_output.txt",
)

Text saved to 'text_output.txt' and images saved to the 'text_images' folder.
