In [1]:
from dotenv import load_dotenv
from openai import OpenAI
import os

# Load environment variables from the .env file so the API key does not have
# to be written into the notebook.
load_dotenv()

# Read the API key from the environment. os.getenv returns None when
# OPENAI_API_KEY is not defined.

open_api_key = os.getenv('OPENAI_API_KEY')

# Initialize the OpenAI client with that key. The raw key is also passed
# directly to the modules.pdf_extraction helpers in later cells.

openai_client = OpenAI(api_key = open_api_key)



In [5]:
from modules.pdf_extraction import select_pdf_file

from modules.pdf_extraction import get_validated_table_info
from modules.pdf_extraction import get_page_pixel_data
from modules.pdf_extraction import process_tables_to_df
from dotenv import load_dotenv
from openai import OpenAI
import pandas as pd
import asyncio
import os
import pymupdf
import time



# Ask for the PDF to process (select_pdf_file is a project helper; presumably
# it returns a filesystem path -- confirm) and open it with PyMuPDF.
pdf_path = select_pdf_file()
doc = pymupdf.open(pdf_path)
total_pages = doc.page_count  # total number of pages in the document
# All pages by default; swap in an explicit range such as range(1562, 1567)
# to process only a slice of a large document.
page_indices = range(total_pages) # page_indices = range(1562,1567)
# Zero-based index of the single page inspected by the cells below.
page_no = 0

# Load that page and pull its plain text; later cells reuse `extracted_text`.
page = doc.load_page(page_no)
extracted_text = page.get_text()

In [7]:

# Disabled alternative text source:
# extracted_text = extract_text_from_pages(pdf_path, pages=page_no)

# Render the selected page as an image (PNG at 500 dpi is requested); the
# result is handed to the LLM helpers below as `base64_image`.
base64_image = get_page_pixel_data(pdf_path=pdf_path, page_no=page_no, 
                    dpi = 500, image_type = 'png')

# Ask the LLM which tables are on the page. The helper returns four values;
# `table_headers` and `table_location` are indexed in parallel by later cells.
# Top-level `await` relies on the notebook's running event loop.
num_tables, table_headers, table_location, confidence_score_0 = await get_validated_table_info(
    text_input=extracted_text, 
    open_api_key=open_api_key, 
    base64_image=base64_image
)



In [18]:
from modules.pdf_extraction import vision_llm_parser
max_retries = 1
model = 'gpt-4o'
user_text = 'Extract all data from the table(s).'
for attempt in range(max_retries):
    tasks = []
    async with asyncio.TaskGroup() as tg:
        for table in table_headers:
            tasks.append(tg.create_task(
                vision_llm_parser(
                    user_text=user_text,
                    text_input=extracted_text,
                    table_to_target=table,
                    base64_image=base64_image,
                    open_api_key=open_api_key,
                    model= model
                )
            ))
    model_results = [task.result() for task in tasks]
    results_output = model_results

In [17]:
# Alias the raw parser responses and display them for inspection.
dd = results_output
dd

['Based on the user\'s request and the provided table header:\n\n### Dictionary Output:\n\n```python\n[\n    {"0929": "HSA 2014WA 08", "Size": "1.1/2"},\n    {"0929": "HSA 2014WA 09", "Size": "2"},\n    {"0929": "HSA 2014WA 10", "Size": "2.1/2"},\n    {"0929": "HSA 2014WA 11", "Size": "3"},\n    {"0929": "HSA 2014WA 12", "Size": "4"},\n    {"0929": "HSA 2014WA 13", "Size": "5"},\n    {"0929": "HSA 2014WA 14", "Size": "6"},\n    {"0929": "HSA 2014WA 16", "Size": "8"},\n    {"0929": "HSA 2014WA 18", "Size": "10"},\n    {"0929": "HSA 2014WA 20", "Size": "12"}\n]\n```\n\nThis output is extracted from the specified table in the top-left position and formatted into a dictionary suitable for conversion to a DataFrame.',
 'Sure, here is the extracted data in dictionary format:\n\n```python\n[\n    {"0929": "HDA 2014WA 08", "Size": "1.1/2"},\n    {"0929": "HDA 2014WA 09", "Size": "2"},\n    {"0929": "HDA 2014WA 10", "Size": "2.1/2"},\n    {"0929": "HDA 2014WA 11", "Size": "3"},\n    {"0929": "H

In [19]:
from modules.pdf_extraction import extract_df_from_string

df_list = []

# Label rows with the page that was actually loaded above (page_no) instead of
# a hard-coded index, so 'page_number' always matches the text and image that
# produced the responses.
page_number = page_no

for i, out in enumerate(results_output):
    # Each response is parsed exactly once; a parsing error propagates and
    # stops the cell (the previous while/break wrapper never retried anything).
    df = extract_df_from_string(out)

    # Normalize columns: cast to str, trim whitespace and stray quotes, Title Case.
    df.columns = df.columns.astype(str).str.strip().str.strip('"\'').str.title()

    if table_location[i] == 'Table is present in both the image and the text document':
        # Keep only cell values whose string form occurs verbatim in the page
        # text; anything else is replaced with "N/A".
        df[df.columns] = df[df.columns].map(
            lambda val: val if str(val) in extracted_text else "N/A"
        )
        df['table_header_position'] = table_headers[i]
    else:
        df['table_header_position'] = table_headers[i]
        # Stored 1-based for human readers.
        df['page_number'] = page_number + 1

    df_list.append(df)

In [20]:
# Number of DataFrames collected in df_list.
len(df_list)

8

In [3]:

import logging
import asyncio
async def process_tables_to_df(
    table_headers, 
    table_location, 
    user_text, 
    extracted_text, 
    base64_image, 
    open_api_key, 
    page_number,
    max_retries=3,
    initial_delay=1,
    backoff_factor=2,
    max_extract_retries_for_extraction_failures=2,
    model='gpt-4o'
):
    """
    Process tables by calling an LLM parser with exponential backoff.
    """
    logging.info(f"Processing tables to DataFrame for page {page_number + 1}")
    

    # 1) Try first model: 'gpt-4o'
    delay = initial_delay
    for attempt in range(max_retries):
        try:
            logging.debug(f"[Model {model}] Attempt {attempt+1} of {max_retries}. Delay={delay}")
            tasks = []
            async with asyncio.TaskGroup() as tg:
                for table in table_headers:
                    tasks.append(tg.create_task(
                        vision_llm_parser(
                            user_text=user_text,
                            text_input=extracted_text,
                            table_to_target=table,
                            base64_image=base64_image,
                            open_api_key=open_api_key,
                            model= model
                        )
                    ))
            results_output = [task.result() for task in tasks]

            logging.info(f"Successfully retrieved data using model '{model}'.")
            break
        except Exception as e:
            logging.warning(
                f"[Model {model}] Attempt {attempt+1} of {max_retries} failed: {e}. "
                f"Retrying in {delay} second(s)..."
            )
            if attempt == max_retries - 1:
                logging.warning(f"Max retries with '{model}' exhausted.")
            else:
                await asyncio.sleep(delay)
                delay *= backoff_factor

    # 2) Process the results into DataFrames
    df_list = []
    for i, out in enumerate(results_output):
        extract_retry_count = 0
        max_extract_retries = max_extract_retries_for_extraction_failures  # Maximum number of retries for extraction failures
        
        while extract_retry_count <= max_extract_retries:

                df = extract_df_from_string(out)
                logging.debug(f"Parsed DataFrame for table index {i} with shape {df.shape}")

                # Normalize columns
                df.columns = df.columns.astype(str).str.strip().str.strip('"\'').str.title()
                if table_location[i] == 'Table is present in both the image and the text document':
                    df[df.columns] = df[df.columns].map(
                        lambda val: val if str(val) in extracted_text else "N/A"
                    )
                    df['table_header_position'] = table_headers[i]
                else:
                    df['table_header_position'] = table_headers[i]
                    df['page_number'] = page_number + 1

                df_list.append(df)


    return df_list

In [19]:
import logging
import re
def extract_table_info(text):
    """
    Extracts the table count and table headers from a pattern description string.

    The text is searched for ``Number of Tables on the Page: <int>`` and for
    ``Table Headers: <h1> || <h2> || ...``. The header list ends at a line that
    starts with ``3.`` or, if there is none, at the end of the text.

    Parameters:
        text (str): Text containing table pattern description. Empty or None
            input yields ``(None, [])``.

    Returns:
        tuple: (num_tables, table_headers) where num_tables is an int, or None
        when no count is present, and table_headers is a list of header
        strings with surrounding whitespace and double quotes removed. Empty
        entries (e.g. from a trailing ``||``) are dropped.
    """
    logging.debug("Extracting table info from text.")

    if not text:
        logging.debug("No text supplied; nothing to extract.")
        return None, []

    match_num_tables = re.search(r'Number of Tables on the Page:\s*(\d+)', text)
    if match_num_tables:
        num_tables = int(match_num_tables.group(1))
        logging.debug(f"Found number of tables: {num_tables}")
    else:
        num_tables = None
        logging.debug("No table count found in text.")

    # The header list is terminated either by a following "3." item on its own
    # line or by the end of the text, so the "3." section is optional.
    match_headers = re.search(r'Table Headers:\s*(.*?)(?:\s*\n\s*3\.|$)', text, re.DOTALL)
    if match_headers:
        headers_text = match_headers.group(1).strip()
        # Trim whitespace, surrounding double quotes, and whitespace that was
        # inside the quotes; then drop segments that end up empty.
        cleaned = (h.strip().strip('"').strip() for h in headers_text.split('||'))
        table_headers = [h for h in cleaned if h]
        logging.debug(f"Extracted table headers: {table_headers}")
    else:
        table_headers = []
        logging.debug("No table headers found.")

    return num_tables, table_headers

# Sample description string in the "Number of Tables ... / Table Headers: a || b"
# format, used in the next cell to try out extract_table_info.
out = '1. Number of Tables on the Page: 3 2. Table Headers: "1 Wire, 40 Metre Coils - Can be found on the top right position of the page" || "1 Wire, 100 Metre Coils - Can be found in the middle left position of the page" || "Hydraulic Hose, EN853, DIN20022, SAE100R1AT Technical Data - Can be found on the bottom of the page"'

In [20]:
# Parse the sample string; the cell output shows the (num_tables, table_headers) tuple.
extract_table_info(out)

(3,
 ['1 Wire, 40 Metre Coils - Can be found on the top right position of the page',
  '1 Wire, 100 Metre Coils - Can be found in the middle left position of the page',
  'Hydraulic Hose, EN853, DIN20022, SAE100R1AT Technical Data - Can be found on the bottom of the page'])

In [4]:
from modules.pdf_extraction import select_pdf_file
from modules.pdf_extraction import get_validated_table_info
from modules.pdf_extraction import get_page_pixel_data
from modules.pdf_extraction import process_tables_to_df
from dotenv import load_dotenv
from openai import OpenAI
import pandas as pd
import asyncio
import os
import pymupdf



file_name = 'test_7'
user_text = 'Extract all data from the table(s).'

# 1. Load Credentials

# Load environment variables from the .env file
load_dotenv()
# Get the API key from the environment variable (None when it is not set)
open_api_key = os.getenv('OPENAI_API_KEY')
# Initialize OpenAI client
openai_client = OpenAI(api_key = open_api_key)


# 2. Select PDF file and extract text
pdf_path = select_pdf_file()
doc = pymupdf.open(pdf_path)
total_pages = doc.page_count  # total number of pages in the document
# page_indices can be any iterable of zero-based page numbers to process,
# e.g. range(1562, 1567) for a slice of a large document.
page_indices = range(total_pages)


# Start timing
# `time` is imported here because this cell's own import block does not
# provide it; without this the cell fails on a fresh kernel.
import time

start_time = time.time()

async def process_page():
    tasks = []
    results_output_page = []
    # Create all tasks first 
    async with asyncio.TaskGroup() as tg:
        for page_no in page_indices:
            page = doc.load_page(page_no)
            
            tabs = page.find_tables()
            num_tables_0 = len(tabs.tables)
            
            # Check for the presence of tables with pymupdf. This will mean images with tables will be ignored. 
            if num_tables_0 == 0:
                print(f"No tables found on page from pymupdf {page_no + 1}, skipping...")
                continue

            extracted_text = page.get_text()
            
            # extracted_text = extract_text_from_pages(pdf_path, pages=page_no)
            base64_image = get_page_pixel_data(pdf_path=pdf_path, page_no=page_no, 
                                dpi = 500, image_type = 'png')
        
            num_tables, table_headers, table_location, confidence_score_0 = await get_validated_table_info(
                text_input=extracted_text, 
                open_api_key=open_api_key, 
                base64_image=base64_image
            )

            if num_tables == 0:
                print(f"No tables found on page by LLM {page_no + 1}, skipping...")
                continue
    
            tasks.append(tg.create_task(process_tables_to_df(
                table_headers, 
                table_location,
                user_text, 
                extracted_text, 
                base64_image, 
                open_api_key,
                page_number=page_no)))
            
        # Await all tasks to complete
        for task in tasks:
            results_output_page.append(await task)
    
    if not results_output_page:
        raise ValueError("No tables found on any of the processed pages")
            
    # df_out_1 = pd.concat(results_output, ignore_index=True)
    return results_output_page


# Run the whole pipeline; top-level `await` relies on the notebook's running
# event loop. The result is a list with one entry per processed page.
output_final = await process_page()




In [6]:
# Display the nested result (a list per page, each holding that page's DataFrames).
output_final

[[            0929   Size                              table_header_position  \
  0  HSA 2014WA 08  1.1/2  - "Single Acting, Ductile Iron Body, EPDM Line...   
  1  HSA 2014WA 09      2  - "Single Acting, Ductile Iron Body, EPDM Line...   
  2  HSA 2014WA 10  2.1/2  - "Single Acting, Ductile Iron Body, EPDM Line...   
  3  HSA 2014WA 11      3  - "Single Acting, Ductile Iron Body, EPDM Line...   
  4  HSA 2014WA 12      4  - "Single Acting, Ductile Iron Body, EPDM Line...   
  5  HSA 2014WA 13      5  - "Single Acting, Ductile Iron Body, EPDM Line...   
  6  HSA 2014WA 14      6  - "Single Acting, Ductile Iron Body, EPDM Line...   
  7  HSA 2014WA 16      8  - "Single Acting, Ductile Iron Body, EPDM Line...   
  8  HSA 2014WA 18     10  - "Single Acting, Ductile Iron Body, EPDM Line...   
  9  HSA 2014WA 20     12  - "Single Acting, Ductile Iron Body, EPDM Line...   
  
     page_number  
  0            1  
  1            1  
  2            1  
  3            1  
  4            1  
  5

In [22]:
from modules.pdf_extraction import write_output_final
# Write the extracted tables to an Excel workbook (the helper logs the target
# path). NOTE(review): the meaning of `option=1` and `gap_rows=2` is defined in
# modules.pdf_extraction -- presumably a layout mode and blank rows between
# tables; confirm there.
write_output_final(output_final, excel_path='files/Tester_2.xlsx', option=1, gap_rows=2)

2025-03-01 21:28:25,297 - INFO - Writing output to Excel at 'files/Tester_2.xlsx' with option=1.
2025-03-01 21:28:25,373 - INFO - Excel file writing complete.


In [4]:
import pymupdf

from modules.pdf_extraction import select_pdf_file

# Stand-alone check of pymupdf's table detection on the first page.
# Note: this cell rebinds `pdf_path`, `doc`, `page`, `tabs` and `num_tables`,
# which other cells in this notebook also use.
pdf_path = select_pdf_file()

# Open some document, for example a PDF (could also be EPUB, XPS, etc.)
doc = pymupdf.open(pdf_path)

# Load a desired page. This works via 0-based numbers
page = doc[0]  # this is the first page

# Look for tables on this page and display the table count.
# A result of 0 means pymupdf detected no table on the page.
tabs = page.find_tables()
num_tables = len(tabs.tables)
num_tables

0

In [21]:
# import os
# from PyPDF2 import PdfReader, PdfWriter


# from modules.pdf_extraction import select_pdf_file
# from modules.pdf_extraction import extract_text_from_pages

# pdf_path = select_pdf_file()

# def split_pdf(input_path, output_path_1, output_path_2, split_page):
#     """
#     Split a PDF file into two separate PDF files.
    
#     Args:
#         input_path (str): Path to the input PDF file
#         output_path_1 (str): Path where to save the first part
#         output_path_2 (str): Path where to save the second part
#         split_page (int): The page number where to split (this page will be the first page of the second PDF)
#     """
#     try:
#         # Create PDF reader object
#         reader = PdfReader(input_path)
        
#         # Get total number of pages
#         total_pages = len(reader.pages)
        
#         if split_page >= total_pages:
#             raise ValueError("Split page number cannot be greater than total pages")
        
#         # Create two PDF writer objects
#         writer1 = PdfWriter()
#         writer2 = PdfWriter()
        
#         # Add pages to first output PDF (before split point)
#         for page in range(split_page):
#             writer1.add_page(reader.pages[page])
            
#         # Add pages to second output PDF (from split point to end)
#         for page in range(split_page, total_pages):
#             writer2.add_page(reader.pages[page])
            
#         # Save the first part
#         with open(output_path_1, 'wb') as output1:
#             writer1.write(output1)
            
#         # Save the second part
#         with open(output_path_2, 'wb') as output2:
#             writer2.write(output2)
            
#         return True
    
#     except Exception as e:
#         print(f"An error occurred: {str(e)}")
#         return False

# def main():
#     # Example usage
#     input_file = pdf_path  # Replace with your PDF file
#     output_file1 = "part1.pdf"
#     output_file2 = "part2.pdf"
#     split_at_page = 1000   # 0-based index of the page that becomes the first page of the second PDF
    
#     if os.path.exists(input_file):
#         success = split_pdf(input_file, output_file1, output_file2, split_at_page)
#         if success:
#             print(f"PDF split successfully!")
#             print(f"First part saved as: {output_file1}")
#             print(f"Second part saved as: {output_file2}")
#         else:
#             print("Failed to split PDF.")
#     else:
#         print(f"Input file '{input_file}' not found.")
        
# main()

