In [None]:
import os
import requests
import csv
import fitz  
from pymongo import MongoClient
from pymongo.errors import ConnectionFailure
from dotenv import load_dotenv

# Load environment variables
load_dotenv()

# Set API key for pdf.co
API_KEY = os.getenv("API_KEY")

# MongoDB URI
mongo_uri = os.getenv("MONGO_URI")

def connect_mongo():
    try:
        client = MongoClient(mongo_uri)
        client.admin.command('ping')
        print("MongoDB connection successful.")
        return client
    except ConnectionFailure as e:
        print(f"Could not connect to MongoDB: {e}")
        return None

In [None]:


def upload_file(pdf_path):
    url = "https://api.pdf.co/v1/file/upload"
    headers = {'x-api-key': API_KEY}
    files = {'file': open(pdf_path, 'rb')}
    response = requests.post(url, headers=headers, files=files)
    if response.status_code == 200:
        data = response.json()
        if not data['error']:
            return data['url']
        else:
            print(f"Error uploading file: {data['message']}")
            return None
    else:
        print(f"Error uploading file: {response.status_code} {response.reason}")
        return None

def get_pdf_info(pdf_url):
    url = "https://api.pdf.co/v1/pdf/info"
    headers = {'x-api-key': API_KEY}
    params = {'url': pdf_url}
    response = requests.post(url, headers=headers, data=params)
    if response.status_code == 200:
        data = response.json()
        if not data['error']:
            info_json = data['info']
            return info_json
        else:
            print(f"Error getting PDF info: {data['message']}")
            return None
    else:
        print(f"Error getting PDF info: {response.status_code} {response.reason}")
        return None

def extract_tables_from_pdf_url(pdf_url, pages):
    url = "https://api.pdf.co/v1/pdf/convert/to/csv"
    headers = {'x-api-key': API_KEY}
    params = {
        'url': pdf_url,
        'pages': pages,
        'inline': 'true',
        'isOCR': 'true',           # Enable OCR for better table extraction
        'ocrLanguages': 'eng',     # Specify OCR language
        'unwrap': 'true',          # Unwrap lines for better CSV format
    }
    response = requests.post(url, headers=headers, data=params)
    if response.status_code == 200:
        data = response.json()
        if not data['error']:
            return data['body']
        else:
            print(f"Error extracting tables: {data['message']}")
            return None
    else:
        print(f"Error extracting tables: {response.status_code} {response.reason}")
        return None

def extract_text_from_pdf(pdf_path):
    try:
        doc = fitz.open(pdf_path)
        extracted_text = {}
        for page_num in range(len(doc)):
            page = doc.load_page(page_num)
            extracted_text[page_num + 1] = page.get_text("text")
        doc.close()
        return extracted_text
    except Exception as e:
        print(f"Error extracting text from {pdf_path}: {e}")
        return None

In [None]:


def extract_data_from_pdf(pdf_path, output_folder, mongo_collection):
    pdf_name = os.path.splitext(os.path.basename(pdf_path))[0]

    # Check if the PDF already exists in MongoDB
    if mongo_collection.find_one({"pdf_name": pdf_name}):
        print(f"PDF {pdf_name} already exists in MongoDB. Skipping processing.")
        return

    pdf_output_folder = os.path.join(output_folder, pdf_name)
    os.makedirs(pdf_output_folder, exist_ok=True)

    # Upload the PDF and get its URL
    pdf_url = upload_file(pdf_path)
    if not pdf_url:
        print(f"Failed to upload PDF {pdf_path}")
        return

    # Get PDF info to get the number of pages
    pdf_info = get_pdf_info(pdf_url)
    if not pdf_info:
        print(f"Failed to get info for PDF {pdf_path}")
        return

    # Extract the page count correctly
    num_pages = int(pdf_info.get('PageCount', pdf_info.get('pageCount', 0)))
    if num_pages == 0:
        print(f"Could not determine the number of pages for {pdf_name}")
        return

    # Extract text from the PDF
    extracted_text = extract_text_from_pdf(pdf_path)
    if extracted_text:
        print(f"Extracted text for {pdf_name}.")
    else:
        print(f"No text found for {pdf_name}.")
        extracted_text = {}

    all_pages_data = []

    for page_num in range(1, num_pages + 1):
        pages_str = str(page_num)
        print(f"Processing page {page_num} of {num_pages}...")

        # Extract tables
        table_csv = extract_tables_from_pdf_url(pdf_url, pages_str)
        table_filenames = []
        if table_csv:
            csv_filename = os.path.join(pdf_output_folder, f'table_page_{page_num}.csv')
            with open(csv_filename, 'w', encoding='utf-8') as f:
                f.write(table_csv)
            relative_csv_filename = os.path.relpath(csv_filename, output_folder)
            table_filenames.append(relative_csv_filename)
            print(f"Extracted table for page {page_num}.")
        else:
            print(f"No tables found on page {page_num}")

        page_data = {
            "page_number": page_num,
            "text": extracted_text.get(page_num, ""),  # Get text for this page
            "tables": table_filenames
        }
        all_pages_data.append(page_data)

    # Prepare data to be inserted into MongoDB
    pdf_data = {
        "pdf_name": pdf_name,
        "pages": all_pages_data
    }

    mongo_collection.insert_one(pdf_data)
    print(f"Inserted PDF {pdf_name} into MongoDB.")

In [16]:
def process_pdfs_in_folder(pdf_folder, output_folder, mongo_collection):
    for filename in os.listdir(pdf_folder):
        if filename.endswith(".pdf"):
            pdf_path = os.path.join(pdf_folder, filename)
            print(f"Processing {filename}...")
            extract_data_from_pdf(pdf_path, output_folder, mongo_collection)

if __name__ == "__main__":
    client = connect_mongo()
    if client:
        db = client["pdf_database_test"]
        collection = db["pdf_collection_pdfco_15"]

        pdf_folder = "../extracted2"
        output_folder = "./extracted_data_pdf_co_15/"

        process_pdfs_in_folder(pdf_folder, output_folder, collection)


MongoDB connection successful.
Processing e9a2c537-8232-4c3f-85b0-b52de6bcba99.pdf...
Extracted text for e9a2c537-8232-4c3f-85b0-b52de6bcba99.
Processing page 1 of 3...
Extracted table for page 1.
Processing page 2 of 3...
Extracted table for page 2.
Processing page 3 of 3...
Error extracting tables: 450 CUSTOM
No tables found on page 3
Inserted PDF e9a2c537-8232-4c3f-85b0-b52de6bcba99 into MongoDB.
Processing 634fca59-03b2-4cdf-9ce4-0205df22f256.pdf...
Extracted text for 634fca59-03b2-4cdf-9ce4-0205df22f256.
Processing page 1 of 1...
Error extracting tables: 450 CUSTOM
No tables found on page 1
Inserted PDF 634fca59-03b2-4cdf-9ce4-0205df22f256 into MongoDB.
Processing Job Listing.pdf...
Extracted text for Job Listing.
Processing page 1 of 1...
Error extracting tables: 450 CUSTOM
No tables found on page 1
Inserted PDF Job Listing into MongoDB.
Processing 8f697523-6988-4c4f-8d72-760a45681f68.pdf...
Extracted text for 8f697523-6988-4c4f-8d72-760a45681f68.
Processing page 1 of 30...
Extra