In [1]:
import os
import json
import fitz  # PyMuPDF
import pdfplumber
import csv
import pandas as pd
from pymongo import MongoClient
from pymongo.errors import ConnectionFailure
from dotenv import load_dotenv
import os

# Load environment variables
load_dotenv()

# MongoDB URI
mongo_uri = os.getenv("MONGO_URI")

# Connect to MongoDB
def connect_mongo():
    try:
        client = MongoClient(mongo_uri)
        client.admin.command('ping')  # Test connection
        print("MongoDB connection successful.")
        return client
    except ConnectionFailure as e:
        print(f"Could not connect to MongoDB: {e}")
        return None


In [2]:

# Function to extract data from a single PDF
def extract_data_from_pdf(pdf_path, output_folder, mongo_collection):
    # Create a dedicated folder for the current PDF
    pdf_name = os.path.splitext(os.path.basename(pdf_path))[0]
    pdf_output_folder = os.path.join(output_folder, pdf_name)
    os.makedirs(pdf_output_folder, exist_ok=True)

    # Create an images folder inside the PDF output folder
    images_folder = os.path.join(pdf_output_folder, 'images')
    os.makedirs(images_folder, exist_ok=True)

    # Open the PDF with PyMuPDF (fitz) for text and images
    doc = fitz.open(pdf_path)

    # Open the PDF with pdfplumber for table extraction
    with pdfplumber.open(pdf_path) as pdf:
        for page_num in range(len(doc)):
            page = doc[page_num]

            # Extract text
            text = page.get_text()
            json_data = {
                "page_number": page_num + 1,
                "text": text
            }

            # Save text to JSON
            json_filename = os.path.join(pdf_output_folder, f'page_{page_num + 1}.json')
            with open(json_filename, 'w') as json_file:
                json.dump(json_data, json_file)

            # Extract images
            image_list = page.get_images(full=True)
            image_filenames = []
            for img_index, img in enumerate(image_list):
                xref = img[0]
                base_image = doc.extract_image(xref)
                image_bytes = base_image["image"]
                image_filename = os.path.join(images_folder, f'image_page_{page_num + 1}_{img_index + 1}.png')

                # Save image
                with open(image_filename, 'wb') as img_file:
                    img_file.write(image_bytes)
                    image_filenames.append(image_filename)

            # Extract tables using pdfplumber
            plumber_page = pdf.pages[page_num]
            tables = plumber_page.extract_tables()
            for i, table in enumerate(tables):
                csv_filename = os.path.join(pdf_output_folder, f'table_page_{page_num + 1}_{i + 1}.csv')
                with open(csv_filename, mode='w', newline='') as file:
                    writer = csv.writer(file)
                    for row in table:
                        writer.writerow(row)

                print(f"Saved table {i+1} on Page {page_num + 1} to {csv_filename}")

                # Insert extracted data into MongoDB
                mongo_collection.insert_one({
                    "pdf_name": pdf_name,
                    "page_number": page_num + 1,
                    "text": text,
                    "images": [os.path.basename(img) for img in image_filenames],  # Store only the image filenames
                    "table_csv": os.path.basename(csv_filename)
                })

    doc.close()

In [3]:

# Function to process all PDFs in a folder
def process_pdfs_in_folder(pdf_folder, output_folder, mongo_collection):
    for filename in os.listdir(pdf_folder):
        if filename.endswith(".pdf"):
            pdf_path = os.path.join(pdf_folder, filename)
            print(f"Processing {filename}...")
            extract_data_from_pdf(pdf_path, output_folder, mongo_collection)

# Example usage
if __name__ == "__main__":
    client = connect_mongo()
    if client:
        db = client["pdf_database"]  # Replace with your database name
        collection = db["pdf_collection"]  # Replace with your collection name

        pdf_folder = "../extracted"  # Folder containing the PDFs
        output_folder = "./extracted_data/"  # Folder to store extracted data

        process_pdfs_in_folder(pdf_folder, output_folder, collection)


MongoDB connection successful.
Processing e9a2c537-8232-4c3f-85b0-b52de6bcba99.pdf...
Saved table 1 on Page 1 to ./extracted_data/e9a2c537-8232-4c3f-85b0-b52de6bcba99/table_page_1_1.csv
Saved table 1 on Page 2 to ./extracted_data/e9a2c537-8232-4c3f-85b0-b52de6bcba99/table_page_2_1.csv
Saved table 1 on Page 3 to ./extracted_data/e9a2c537-8232-4c3f-85b0-b52de6bcba99/table_page_3_1.csv
Processing 32f386b9-73ee-4455-b412-ddad508aa979.pdf...
Saved table 1 on Page 1 to ./extracted_data/32f386b9-73ee-4455-b412-ddad508aa979/table_page_1_1.csv
Processing 634fca59-03b2-4cdf-9ce4-0205df22f256.pdf...
Processing Job Listing.pdf...
Processing be353748-74eb-4904-8f17-f180ce087f1a.pdf...
Saved table 1 on Page 68 to ./extracted_data/be353748-74eb-4904-8f17-f180ce087f1a/table_page_68_1.csv
Saved table 1 on Page 81 to ./extracted_data/be353748-74eb-4904-8f17-f180ce087f1a/table_page_81_1.csv
Processing 8f697523-6988-4c4f-8d72-760a45681f68.pdf...
Processing c4456885-2f03-436f-8fe9-0b4ca6822cdb.pdf...
Saved