In [1]:
pip install --upgrade pymupdf

Collecting pymupdf
  Downloading PyMuPDF-1.24.11-cp38-abi3-macosx_11_0_arm64.whl.metadata (3.4 kB)
Downloading PyMuPDF-1.24.11-cp38-abi3-macosx_11_0_arm64.whl (18.2 MB)
[2K   [38;2;114;156;31m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m18.2/18.2 MB[0m [31m37.5 MB/s[0m eta [36m0:00:00[0m MB/s[0m eta [36m0:00:01[0m01[0m
[?25hInstalling collected packages: pymupdf
Successfully installed pymupdf-1.24.11
Note: you may need to restart the kernel to use updated packages.


In [3]:
pip install pymongo

Collecting pymongo
  Downloading pymongo-4.10.1-cp310-cp310-macosx_11_0_arm64.whl.metadata (22 kB)
Downloading pymongo-4.10.1-cp310-cp310-macosx_11_0_arm64.whl (835 kB)
[2K   [38;2;114;156;31m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m835.7/835.7 kB[0m [31m13.7 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: pymongo
Successfully installed pymongo-4.10.1
Note: you may need to restart the kernel to use updated packages.


In [None]:
import os
import json
import pymupdf
import csv
from pymongo import MongoClient
from pymongo.errors import ConnectionFailure
from dotenv import load_dotenv

# Load environment variables
load_dotenv()

# MongoDB URI
mongo_uri = os.getenv("MONGO_URI")

def connect_mongo():
    try:
        client = MongoClient(mongo_uri)
        client.admin.command('ping')
        print("MongoDB connection successful.")
        return client
    except ConnectionFailure as e:
        print(f"Could not connect to MongoDB: {e}")
        return None

In [None]:

def extract_tables(page):
    tables = []
    for table in page.find_tables():
        tables.append(table.extract())
    return tables

def save_table(table, filename):
    with open(filename, mode='w', newline='', encoding='utf-8') as file:
        writer = csv.writer(file)
        writer.writerows(table)

In [1]:



def extract_data_from_pdf(pdf_path, output_folder, mongo_collection):
    pdf_name = os.path.splitext(os.path.basename(pdf_path))[0]
    pdf_output_folder = os.path.join(output_folder, pdf_name)
    os.makedirs(pdf_output_folder, exist_ok=True)

    images_folder = os.path.join(pdf_output_folder, "extracted_images")
    os.makedirs(images_folder, exist_ok=True)

    doc = pymupdf.open(pdf_path)
    all_pages_data = []

    for page_num in range(len(doc)):
        page = doc[page_num]
        text = page.get_text()
        
        # Extract tables
        tables = extract_tables(page)
        table_filenames = []
        for i, table in enumerate(tables):
            csv_filename = os.path.join(pdf_output_folder, f'table_page_{page_num + 1}_{i + 1}.csv')
            relative_csv_filename = os.path.relpath(csv_filename, output_folder)
            table_filenames.append(relative_csv_filename)
            save_table(table, csv_filename)

        # Extract images
        image_filenames = []
        images = page.get_images(full=True)
        for img_index, img in enumerate(images):
            xref = img[0]
            base_image = doc.extract_image(xref)
            image_bytes = base_image["image"]
            image_ext = base_image["ext"]
            image_filename = os.path.join(images_folder, f'image_page_{page_num + 1}_{img_index + 1}.{image_ext}')
            relative_image_filename = os.path.relpath(image_filename, output_folder)
            with open(image_filename, "wb") as image_file:
                image_file.write(image_bytes)
            image_filenames.append(relative_image_filename)

        page_data = {
            "page_number": page_num + 1,
            "text": text,
            "images": image_filenames,
            "tables": table_filenames
        }
        all_pages_data.append(page_data)

    pdf_data = {
        "pdf_name": pdf_name,
        "pages": all_pages_data
    }

    mongo_collection.insert_one(pdf_data)
    doc.close()

def process_pdfs_in_folder(pdf_folder, output_folder, mongo_collection):
    for filename in os.listdir(pdf_folder):
        if filename.endswith(".pdf"):
            pdf_path = os.path.join(pdf_folder, filename)
            print(f"Processing {filename}...")
            extract_data_from_pdf(pdf_path, output_folder, mongo_collection)

if __name__ == "__main__":
    client = connect_mongo()
    if client:
        db = client["pdf_database"]
        collection = db["pdf_collection_one_last_Test"]

        pdf_folder = "../extracted"
        output_folder = "./extracted_data_one_last_test/"

        process_pdfs_in_folder(pdf_folder, output_folder, collection)

MongoDB connection successful.
Processing e9a2c537-8232-4c3f-85b0-b52de6bcba99.pdf...
Processing 32f386b9-73ee-4455-b412-ddad508aa979.pdf...
Processing 634fca59-03b2-4cdf-9ce4-0205df22f256.pdf...
Processing Job Listing.pdf...
Processing 8f697523-6988-4c4f-8d72-760a45681f68.pdf...
Processing c4456885-2f03-436f-8fe9-0b4ca6822cdb.pdf...
Processing d50b8ecb-a8aa-4696-ad84-403ef15e2c8b.pdf...
Processing 021a5339-744f-42b7-bd9b-9368b3efda7a.pdf...
Processing 680d7d77-c0c7-49c8-88fd-f8ec623645e9.pdf...
Processing b3654e47-4307-442c-a09c-945b33b913c6.pdf...
Processing 67e8878b-5cef-4375-804e-e6291fdbe78a.pdf...
Processing 4044eab7-1282-42bd-a559-3bf3a4d5858e.pdf...
Processing 7c215d46-91c7-424e-9f22-37d43ab73ea6.pdf...
Processing 366e2f2b-8632-4ef2-81eb-bc3877489217.pdf...
Processing ca0a4c14-4b97-43e7-8923-539d61050ae3.pdf...
