In [1]:
import pdfplumber
import json
import os

def extract_table_coordinates(pdf_path):
    """Locate every table in a PDF and report its bounding box and size.

    Args:
        pdf_path: Path of the PDF file to open with ``pdfplumber``.

    Returns:
        A list with one dict per detected table, in page order. Each dict
        holds ``"coordinates"`` (``x1``, ``y1``, ``x2``, ``y2`` taken from the
        table's own bounding box), ``"rows"`` and ``"columns"``.
    """
    tables_info = []

    with pdfplumber.open(pdf_path) as pdf:
        for page in pdf.pages:
            # find_tables() yields table objects that carry their own bbox.
            # extract_tables() returns only the cell text, which is why the
            # earlier code fell back to page.bbox and reported the full page
            # rectangle for every table.
            for table in page.find_tables():
                x1, y1, x2, y2 = table.bbox

                # extract() gives the same list-of-rows that extract_tables()
                # produced, so the row/column counts are unchanged.
                cells = table.extract()
                rows = len(cells)
                columns = len(cells[0]) if rows > 0 else 0

                # Append the table information
                tables_info.append({
                    "coordinates": {
                        "x1": x1,
                        "y1": y1,
                        "x2": x2,
                        "y2": y2
                    },
                    "rows": rows,
                    "columns": columns
                })
    return tables_info

def process_invoices(folder_path):
    """Extract table metadata from every PDF file directly inside a folder.

    Args:
        folder_path: Directory to scan. Sub-directories are not descended into.

    Returns:
        A dict mapping each PDF file name (not the full path) to the list
        returned by ``extract_table_coordinates`` for that file. Keys are
        inserted in sorted file-name order.
    """
    all_tables = {}
    # os.listdir() returns entries in arbitrary order; sorting keeps the
    # progress log and the resulting mapping stable from run to run.
    for pdf_file in sorted(os.listdir(folder_path)):
        # Compare the extension case-insensitively so "INVOICE.PDF" is not
        # silently skipped.
        if not pdf_file.lower().endswith('.pdf'):
            continue
        pdf_path = os.path.join(folder_path, pdf_file)
        # A directory whose name happens to end in ".pdf" cannot be opened
        # as a document, so leave it out.
        if not os.path.isfile(pdf_path):
            continue
        print(f"Extracting tables from {pdf_path}...")
        all_tables[pdf_file] = extract_table_coordinates(pdf_path)

    return all_tables

# Folder that holds the PDF invoices to scan
input_folder_path = 'C:/Users/91808/Desktop/Task2/Input'  # Update with your path

# The JSON report is written into the same folder as the invoices
output_json_path = os.path.join(input_folder_path, 'table_coordinates.json')

# Gather the table metadata for every invoice in the folder
tables_data = process_invoices(input_folder_path)

# Persist the collected metadata as pretty-printed JSON
with open(output_json_path, mode='w') as json_file:
    json.dump(tables_data, json_file, indent=4)

print(f"Table coordinates extracted and saved to {output_json_path}.")


Extracting tables from C:/Users/91808/Desktop/Task2/Input\1.pdf...
Extracting tables from C:/Users/91808/Desktop/Task2/Input\2.pdf...
Extracting tables from C:/Users/91808/Desktop/Task2/Input\3.pdf...
Extracting tables from C:/Users/91808/Desktop/Task2/Input\4.pdf...
Extracting tables from C:/Users/91808/Desktop/Task2/Input\5.pdf...
Extracting tables from C:/Users/91808/Desktop/Task2/Input\6.pdf...
Extracting tables from C:/Users/91808/Desktop/Task2/Input\7.pdf...
Extracting tables from C:/Users/91808/Desktop/Task2/Input\8.pdf...
Table coordinates extracted and saved to C:/Users/91808/Desktop/Task2/Input\table_coordinates.json.


In [6]:
# Same directory as `input_folder_path` above, spelled as a raw string with backslash separators
invoice_folder = r"C:\Users\91808\Desktop\Task2\Input"