In [1]:
from dotenv import load_dotenv
from openai import OpenAI
import os

# Pull the variables defined in the local .env file into the process environment.
load_dotenv()

# Read the OpenAI key from the environment (None when the variable is unset).
open_api_key = os.environ.get('OPENAI_API_KEY')

# Build the OpenAI client from that key.
openai_client = OpenAI(api_key=open_api_key)



In [5]:
from modules.pdf_extraction import select_pdf_file

from modules.pdf_extraction import get_validated_table_info
from modules.pdf_extraction import get_page_pixel_data
from modules.pdf_extraction import process_tables_to_df
from dotenv import load_dotenv
from openai import OpenAI
import pandas as pd
import asyncio
import os
import pymupdf
import time



# select_pdf_file() supplies the path of the PDF to work on
# (presumably through a file picker — confirm in modules.pdf_extraction).
pdf_path = select_pdf_file()
doc = pymupdf.open(pdf_path)
total_pages = doc.page_count  # total number of pages in the document
page_indices = range(total_pages) # all pages; narrow with e.g. range(1562,1567)
# The following cells only look at this single page (0-based index).
page_no = 0

# Load that page and keep its plain text for the LLM calls below.
page = doc.load_page(page_no)
extracted_text = page.get_text()

In [7]:

# Alternative text source (markdown extraction), currently disabled:
# extracted_text = extract_text_from_pages(pdf_path, pages=page_no)

# Rasterise the page as a 500-dpi PNG; the helper's return value is passed on
# to the LLM calls as `base64_image`.
base64_image = get_page_pixel_data(pdf_path=pdf_path, page_no=page_no, 
                    dpi = 500, image_type = 'png')

# Top-level `await` relies on IPython's autoawait; outside a notebook this call
# has to live inside an async function.
# The helper returns a 4-tuple: table count, per-table headers, per-table
# location labels and a confidence score.
num_tables, table_headers, table_location, confidence_score_0 = await get_validated_table_info(
    text_input=extracted_text, 
    open_api_key=open_api_key, 
    base64_image=base64_image
)



In [18]:
from modules.pdf_extraction import vision_llm_parser
max_retries = 1
model = 'gpt-4o'
user_text = 'Extract all data from the table(s).'
for attempt in range(max_retries):
    tasks = []
    async with asyncio.TaskGroup() as tg:
        for table in table_headers:
            tasks.append(tg.create_task(
                vision_llm_parser(
                    user_text=user_text,
                    text_input=extracted_text,
                    table_to_target=table,
                    base64_image=base64_image,
                    open_api_key=open_api_key,
                    model= model
                )
            ))
    model_results = [task.result() for task in tasks]
    results_output = model_results

In [17]:
# Alias the raw LLM responses and echo them for inspection.
dd = results_output
dd

['Based on the user\'s request and the provided table header:\n\n### Dictionary Output:\n\n```python\n[\n    {"0929": "HSA 2014WA 08", "Size": "1.1/2"},\n    {"0929": "HSA 2014WA 09", "Size": "2"},\n    {"0929": "HSA 2014WA 10", "Size": "2.1/2"},\n    {"0929": "HSA 2014WA 11", "Size": "3"},\n    {"0929": "HSA 2014WA 12", "Size": "4"},\n    {"0929": "HSA 2014WA 13", "Size": "5"},\n    {"0929": "HSA 2014WA 14", "Size": "6"},\n    {"0929": "HSA 2014WA 16", "Size": "8"},\n    {"0929": "HSA 2014WA 18", "Size": "10"},\n    {"0929": "HSA 2014WA 20", "Size": "12"}\n]\n```\n\nThis output is extracted from the specified table in the top-left position and formatted into a dictionary suitable for conversion to a DataFrame.',
 'Sure, here is the extracted data in dictionary format:\n\n```python\n[\n    {"0929": "HDA 2014WA 08", "Size": "1.1/2"},\n    {"0929": "HDA 2014WA 09", "Size": "2"},\n    {"0929": "HDA 2014WA 10", "Size": "2.1/2"},\n    {"0929": "HDA 2014WA 11", "Size": "3"},\n    {"0929": "H

In [19]:
from modules.pdf_extraction import extract_df_from_string

# Turn every raw LLM response in `results_output` into a DataFrame and collect
# the frames in `df_list` (one per response, same order).
df_list = []
max_extract_retries_for_extraction_failures = 1
# NOTE(review): hard-coded page index, independent of `page_no` used when the
# page was loaded — confirm it matches the page being parsed.
page_number = 7

for i, out in enumerate(results_output):
    extract_retry_count = 0
    max_extract_retries = max_extract_retries_for_extraction_failures  # Maximum number of retries for extraction failures

    # NOTE(review): the counter is never incremented and the body always ends
    # in `break`, so this loop runs exactly once per response; a parsing error
    # propagates instead of being retried.
    while extract_retry_count <= max_extract_retries:
            df = extract_df_from_string(out)

            # Normalize columns: stringify, trim whitespace and surrounding
            # quotes, then Title Case.
            df.columns = df.columns.astype(str).str.strip().str.strip('"\'').str.title()
            if table_location[i] == 'Table is present in both the image and the text document':
                # Keep a cell value only if its string form occurs somewhere in
                # the page text; everything else becomes "N/A".
                df[df.columns] = df[df.columns].map(
                    lambda val: val if str(val) in extracted_text else "N/A"
                )
                df['table_header_position'] = table_headers[i]
            else:
                df['table_header_position'] = table_headers[i]
                # NOTE(review): `page_number` is only written in this branch —
                # confirm the other branch is meant to omit it.
                df['page_number'] = page_number + 1

            df_list.append(df)
            break  # Successfully extracted, exit the retry loop

In [20]:
len(df_list)

8

In [3]:

import logging
import asyncio
async def process_tables_to_df(
    table_headers, 
    table_location, 
    user_text, 
    extracted_text, 
    base64_image, 
    open_api_key, 
    page_number,
    max_retries=3,
    initial_delay=1,
    backoff_factor=2,
    max_extract_retries_for_extraction_failures=2,
    model='gpt-4o'
):
    """
    Query the vision LLM for every table on a page and parse the replies into DataFrames.

    One ``vision_llm_parser`` request is issued per entry of ``table_headers``; the
    requests run concurrently inside an ``asyncio.TaskGroup``. If the batch fails it
    is retried with exponential backoff. Each reply is then parsed with
    ``extract_df_from_string`` and post-processed (column names normalised,
    bookkeeping columns added).

    Parameters:
        table_headers: One entry per table. Each entry is sent to the parser as
            ``table_to_target`` and stored in the ``table_header_position`` column.
        table_location: Per-table location label, indexed like ``table_headers``.
            For the label 'Table is present in both the image and the text document'
            every cell whose string form does not occur in ``extracted_text`` is
            replaced by "N/A".
        user_text: Instruction forwarded to the parser.
        extracted_text: Page text forwarded to the parser and used for the
            cell validation described above.
        base64_image: Page image forwarded to the parser.
        open_api_key: API key forwarded to the parser.
        page_number: Page index; ``page_number + 1`` is what gets logged and stored.
        max_retries: Maximum number of attempts for the batch of LLM requests.
        initial_delay: Seconds to sleep after the first failed attempt.
        backoff_factor: Multiplier applied to the delay after each failed attempt.
        max_extract_retries_for_extraction_failures: Number of additional parse
            attempts per table after the first one fails.
        model: Model name forwarded to the parser.

    Returns:
        list: The parsed DataFrames, in ``table_headers`` order.

    Raises:
        RuntimeError: If no batch attempt succeeded (including ``max_retries <= 0``).
            The last underlying error, if any, is chained as the cause.
        Exception: Whatever ``extract_df_from_string`` raised on the final parse
            attempt for a table.
    """
    logging.info(f"Processing tables to DataFrame for page {page_number + 1}")

    def _request_table(table):
        # Build (without awaiting) one parser call for a single table header.
        return vision_llm_parser(
            user_text=user_text,
            text_input=extracted_text,
            table_to_target=table,
            base64_image=base64_image,
            open_api_key=open_api_key,
            model=model
        )

    # 1) Query the model for every table, retrying the whole batch with backoff.
    results_output = None  # stays None until one attempt succeeds
    last_error = None
    delay = initial_delay
    for attempt in range(max_retries):
        try:
            logging.debug(f"[Model {model}] Attempt {attempt+1} of {max_retries}. Delay={delay}")
            async with asyncio.TaskGroup() as tg:
                tasks = [tg.create_task(_request_table(table)) for table in table_headers]
            results_output = [task.result() for task in tasks]

            logging.info(f"Successfully retrieved data using model '{model}'.")
            break
        except Exception as e:
            # TaskGroup reports child failures as an ExceptionGroup, which
            # derives from Exception and is therefore caught here.
            last_error = e
            if attempt == max_retries - 1:
                logging.warning(f"[Model {model}] Attempt {attempt+1} of {max_retries} failed: {e}.")
                logging.warning(f"Max retries with '{model}' exhausted.")
            else:
                logging.warning(
                    f"[Model {model}] Attempt {attempt+1} of {max_retries} failed: {e}. "
                    f"Retrying in {delay} second(s)..."
                )
                await asyncio.sleep(delay)
                delay *= backoff_factor

    if results_output is None:
        # Previously this fell through to an UnboundLocalError on `results_output`.
        raise RuntimeError(
            f"Could not retrieve table data for page {page_number + 1} "
            f"after {max_retries} attempt(s) with model '{model}'."
        ) from last_error

    # 2) Process the results into DataFrames
    df_list = []
    total_extract_attempts = max_extract_retries_for_extraction_failures + 1
    for i, out in enumerate(results_output):
        # Bounded attempts per table; the previous `while` loop never advanced
        # its counter nor broke out, so it appended the same frame forever.
        for extract_attempt in range(total_extract_attempts):
            try:
                df = extract_df_from_string(out)
            except Exception as e:
                if extract_attempt == total_extract_attempts - 1:
                    logging.error(
                        f"Extraction failed for table index {i} after "
                        f"{total_extract_attempts} attempt(s): {e}"
                    )
                    raise
                logging.warning(
                    f"Extraction attempt {extract_attempt + 1} of {total_extract_attempts} "
                    f"failed for table index {i}: {e}. Requesting a new response..."
                )
                # Ask the model again for this table: re-parsing the same string
                # would presumably fail the same way.
                out = await _request_table(table_headers[i])
                continue

            logging.debug(f"Parsed DataFrame for table index {i} with shape {df.shape}")

            # Normalize columns: stringify, trim whitespace and surrounding
            # quotes, then Title Case.
            df.columns = df.columns.astype(str).str.strip().str.strip('"\'').str.title()
            if table_location[i] == 'Table is present in both the image and the text document':
                # Keep a cell value only if its string form occurs in the page text.
                df[df.columns] = df[df.columns].map(
                    lambda val: val if str(val) in extracted_text else "N/A"
                )
                df['table_header_position'] = table_headers[i]
            else:
                df['table_header_position'] = table_headers[i]
                # NOTE(review): `page_number` is only written in this branch —
                # confirm the other branch is meant to omit it.
                df['page_number'] = page_number + 1

            df_list.append(df)
            break  # parsed successfully, move on to the next table

    return df_list

In [19]:
import logging
import re
def extract_table_info(text):
    """
    Extracts table information from a pattern description string.

    The text is expected to contain a 'Number of Tables on the Page: <int>'
    entry and a 'Table Headers: ...' entry whose headers are separated by '||'.
    The header section ends at a following line that starts with '3.' or, if
    there is none, at the end of the text.

    Parameters:
        text (str): Text containing table pattern description

    Returns:
        tuple: (num_tables, table_headers)
            num_tables is an int, or None when no count is present.
            table_headers is a list of header strings with surrounding
            whitespace and double quotes removed; blank entries (an empty
            header section, or a stray/trailing '||') are dropped, so the
            list is empty when no header text is found.
    """
    logging.debug("Extracting table info from text.")

    match_num_tables = re.search(r'Number of Tables on the Page:\s*(\d+)', text)
    if match_num_tables:
        num_tables = int(match_num_tables.group(1))
        logging.debug(f"Found number of tables: {num_tables}")
    else:
        num_tables = None
        logging.debug("No table count found in text.")

    # The "3." terminator is optional: without it the headers run to the end of the text.
    match_headers = re.search(r'Table Headers:\s*(.*?)(?:\s*\n\s*3\.|$)', text, re.DOTALL)
    if match_headers:
        headers_text = match_headers.group(1).strip()
        # Remove any extra quotes and whitespace
        cleaned = (h.strip().strip('"') for h in headers_text.split('||'))
        # ''.split('||') yields [''], so blank pieces must be filtered out
        # rather than reported as a header.
        table_headers = [h for h in cleaned if h.strip()]
        logging.debug(f"Extracted table headers: {table_headers}")
    else:
        table_headers = []
        logging.debug("No table headers found.")

    return num_tables, table_headers

out = '1. Number of Tables on the Page: 3 2. Table Headers: "1 Wire, 40 Metre Coils - Can be found on the top right position of the page" || "1 Wire, 100 Metre Coils - Can be found in the middle left position of the page" || "Hydraulic Hose, EN853, DIN20022, SAE100R1AT Technical Data - Can be found on the bottom of the page"'

In [20]:
extract_table_info(out)

(3,
 ['1 Wire, 40 Metre Coils - Can be found on the top right position of the page',
  '1 Wire, 100 Metre Coils - Can be found in the middle left position of the page',
  'Hydraulic Hose, EN853, DIN20022, SAE100R1AT Technical Data - Can be found on the bottom of the page'])

In [4]:
from modules.pdf_extraction import select_pdf_file
from modules.pdf_extraction import get_validated_table_info
from modules.pdf_extraction import get_page_pixel_data
from modules.pdf_extraction import process_tables_to_df
from dotenv import load_dotenv
from openai import OpenAI
import pandas as pd
import asyncio
import os
import pymupdf



# `time` is needed for the timing below but is missing from this cell's import
# list; import it here so the cell does not depend on an earlier cell.
import time

file_name = 'test_7'
user_text='Extract all data from the table(s).'

# 1. Load credentials

# Load environment variables from the .env file
load_dotenv()
# Get the API key from the environment variable
open_api_key = os.getenv('OPENAI_API_KEY')
# Initialize OpenAI client
openai_client = OpenAI(api_key = open_api_key)


# 2. Select PDF file and extract text
pdf_path = select_pdf_file()
doc = pymupdf.open(pdf_path)
total_pages = doc.page_count  # total number of pages in the document
# page_indices can be any iterable of 0-based page numbers to process,
# e.g. range(1562, 1567) to restrict the run.
page_indices = range(total_pages)


# Start timing
start_time = time.time()

async def process_page():
    """
    Run table extraction over every page in ``page_indices``.

    Relies on the notebook globals ``doc``, ``pdf_path``, ``page_indices``,
    ``open_api_key`` and ``user_text``. Pages are skipped when either pymupdf
    or the LLM validation reports zero tables. For the remaining pages a
    ``process_tables_to_df`` task is scheduled in a shared TaskGroup.

    Returns:
        list: One ``process_tables_to_df`` result per processed page, in page order.

    Raises:
        ValueError: If no page produced a result.
    """
    scheduled = []
    async with asyncio.TaskGroup() as group:
        for page_no in page_indices:
            current_page = doc.load_page(page_no)

            # Pre-filter with pymupdf's table finder. Pages whose tables exist
            # only inside images are therefore ignored.
            pymupdf_table_count = len(current_page.find_tables().tables)
            if pymupdf_table_count == 0:
                print(f"No tables found on page from pymupdf {page_no + 1}, skipping...")
                continue

            page_text = current_page.get_text()
            page_image = get_page_pixel_data(
                pdf_path=pdf_path,
                page_no=page_no,
                dpi=500,
                image_type='png',
            )

            # Validation is awaited page by page; only the per-page table
            # processing below runs as background tasks.
            llm_table_count, headers, locations, _confidence = await get_validated_table_info(
                text_input=page_text,
                open_api_key=open_api_key,
                base64_image=page_image,
            )
            if llm_table_count == 0:
                print(f"No tables found on page by LLM {page_no + 1}, skipping...")
                continue

            page_job = process_tables_to_df(
                headers,
                locations,
                user_text,
                page_text,
                page_image,
                open_api_key,
                page_number=page_no,
            )
            scheduled.append(group.create_task(page_job))

        # Collect results in scheduling order while still inside the group.
        results_output_page = [await pending for pending in scheduled]

    if not results_output_page:
        raise ValueError("No tables found on any of the processed pages")

    return results_output_page


output_final = await process_page()




In [6]:
output_final

[[            0929   Size                              table_header_position  \
  0  HSA 2014WA 08  1.1/2  - "Single Acting, Ductile Iron Body, EPDM Line...   
  1  HSA 2014WA 09      2  - "Single Acting, Ductile Iron Body, EPDM Line...   
  2  HSA 2014WA 10  2.1/2  - "Single Acting, Ductile Iron Body, EPDM Line...   
  3  HSA 2014WA 11      3  - "Single Acting, Ductile Iron Body, EPDM Line...   
  4  HSA 2014WA 12      4  - "Single Acting, Ductile Iron Body, EPDM Line...   
  5  HSA 2014WA 13      5  - "Single Acting, Ductile Iron Body, EPDM Line...   
  6  HSA 2014WA 14      6  - "Single Acting, Ductile Iron Body, EPDM Line...   
  7  HSA 2014WA 16      8  - "Single Acting, Ductile Iron Body, EPDM Line...   
  8  HSA 2014WA 18     10  - "Single Acting, Ductile Iron Body, EPDM Line...   
  9  HSA 2014WA 20     12  - "Single Acting, Ductile Iron Body, EPDM Line...   
  
     page_number  
  0            1  
  1            1  
  2            1  
  3            1  
  4            1  
  5

In [22]:
from modules.pdf_extraction import write_output_final
write_output_final(output_final, excel_path='files/Tester_2.xlsx', option=1, gap_rows=2)

2025-03-01 21:28:25,297 - INFO - Writing output to Excel at 'files/Tester_2.xlsx' with option=1.
2025-03-01 21:28:25,373 - INFO - Excel file writing complete.


In [1]:
from modules.pdf_extraction import extract_text_from_pages
from modules.pdf_extraction import select_pdf_file

pdf_path = select_pdf_file()


extract_text_from_pages(pdf_input=pdf_path)

Processing C:/Users/derri/Downloads/ilovepdf_extracted-pages/Updated-Nutritional-Guidelines-58.pdf...


'# Table VII-11. Recommended nutrient levels for complete food for dogs and cats Units per kg metabolic bodyweight (dogs kg BW[0.75], cats kg BW[0.67])\n\n|Nutrient|UNIT|Minimum Recommended Nutrient Levels per kg metabolic BW (dogs kg BW0.75; cats kg BW0.67)|Col4|\n|---|---|---|---|\n|||Adult Dog Maintenance|Adult Cat Maintenance|\n|Protein*|g|4.95|6.25|\n|Arginine*|g|0.14|0.25|\n|Histidine|g|0.06|0.08|\n|Isoleucine|g|0.13|0.12|\n|Leucine|g|0.23|0.29|\n|Lysine*|g|0.12|0.09|\n|Methionine*|g|0.11|0.04|\n|Methionine + Cystine*|g|0.21|0.09|\n|Phenylalanine|g|0.15|0.12|\n|Phenylalanine + Tyrosine*|g|0.24|0.44|\n|Threonine|g|0.14|0.15|\n|Tryptophan|g|0.05|0.04|\n|Valine|g|0.16|0.15|\n|Taurine (canned pet food)*|g||0.05|\n|Taurine (dry pet food)*|g||0.03|\n|Fat*|g|1.51|2.25|\n|Linoleic acid (ω-6)*|g|0.36|0.13|\n|Arachidonic acid (ω-6)|mg|-|1.50|\n|Alpha-linolenic acid (ω-3)*|g|-|-|\n|EPA + DHA (ω-3)*|g|-|-|\n|Minerals||||\n|Calcium|g|0.14a,b|0.10|\n|Phosphorus|g|0.11h|0.06f,g|\n|Potassium|g|0

In [34]:
import pymupdf4llm
from modules.pdf_extraction import select_pdf_file

pdf_path = select_pdf_file()

md_text = pymupdf4llm.to_markdown(pdf_path)
md_text


Processing C:/Users/derri/Downloads/ilovepdf_extracted-pages/Updated-Nutritional-Guidelines-58.pdf...


'# Table VII-11. Recommended nutrient levels for complete food for dogs and cats Units per kg metabolic bodyweight (dogs kg BW[0.75], cats kg BW[0.67])\n\n|Nutrient|UNIT|Minimum Recommended Nutrient Levels per kg metabolic BW (dogs kg BW0.75; cats kg BW0.67)|Col4|\n|---|---|---|---|\n|||Adult Dog Maintenance|Adult Cat Maintenance|\n|Protein*|g|4.95|6.25|\n|Arginine*|g|0.14|0.25|\n|Histidine|g|0.06|0.08|\n|Isoleucine|g|0.13|0.12|\n|Leucine|g|0.23|0.29|\n|Lysine*|g|0.12|0.09|\n|Methionine*|g|0.11|0.04|\n|Methionine + Cystine*|g|0.21|0.09|\n|Phenylalanine|g|0.15|0.12|\n|Phenylalanine + Tyrosine*|g|0.24|0.44|\n|Threonine|g|0.14|0.15|\n|Tryptophan|g|0.05|0.04|\n|Valine|g|0.16|0.15|\n|Taurine (canned pet food)*|g||0.05|\n|Taurine (dry pet food)*|g||0.03|\n|Fat*|g|1.51|2.25|\n|Linoleic acid (ω-6)*|g|0.36|0.13|\n|Arachidonic acid (ω-6)|mg|-|1.50|\n|Alpha-linolenic acid (ω-3)*|g|-|-|\n|EPA + DHA (ω-3)*|g|-|-|\n|Minerals||||\n|Calcium|g|0.14a,b|0.10|\n|Phosphorus|g|0.11h|0.06f,g|\n|Potassium|g|0

In [27]:
md_text

'## Tables VII-8a provides equations for growth curves (GfE 1989, Meyer H and Zentek J 1992), valid from weaning age (8 weeks) to 1 year. Although it is appreciated that growth patterns show a certain individual variability, we consider the recommended growth curves to be a powerful tool to generate an estimate of the optimal actual body weight throughout the growth period. This is a critical parameter to predict the ME requirement in puppies and subsequently needed to create feeding guides. However, regular weighing of puppies, based on the veterinarian`s recommendation and individual correction for\n\n\n## food (i.e. energy) allocation to ensure growth along the curve is recommended for puppy owners.\n\n Tables VII-8b provides average energy requirements during growth and reproduction in dogs Energy requirements for lactation depend on the litter size. Table VII-8b provides equations to calculate the average energy needs of lactating bitches at different stages of lactation. For some

In [None]:
import pymupdf
pdf_path = select_pdf_file()

# Open some document, for example a PDF (could also be EPUB, XPS, etc.)
doc = pymupdf.open(pdf_path)
doc

Document('C:/Users/derri/Downloads/Updated-Nutritional-Guidelines-59.pdf')

In [6]:
doc[0].get_text()

' Page        59 of 98\nFEDIAF  Nutritional Guidelines  |  Publication October 2021\nTable VII-12.  \nImpact of energy requirement on nutrient intake and minimum \nrecommendations\n7.2.5.  Impact of energy requirement on product formulation\nBalanced nutrition ensuring adequate intakes of energy, \nprotein, minerals and vitamins is essential for cats and \ndogs to ensure health and longevity. In order to achieve \nthe recommended intake of energy and nutrients, \nproducts must be formulated to match these needs. The \nFEDIAF recommendations are principally based on NRC \nrecommendations (NRC 2006j) as well as on other peer \nreviewed science as referenced in the substantiation \ntables. Major differences between FEDIAF and NRC \nrecommendations for adult cats and dogs are driven by a \nsystematic adjustment applied to all essential nutrients \ndue to different assumptions on daily maintenance energy \nrequirements.\nThe NRC adult maintenance recommendations for dogs \nare based on an a

In [7]:
# Save markdown text to file
with open('output.md', 'w', encoding='utf-8') as f:
    f.write(md_text)

# Also display the markdown text
md_text

'# Table VII-12.  Impact of energy requirement on nutrient intake and minimum recommendations\n\n|Example: Impact of energy requirement on dry matter and nutrient intake|Col2|Col3|Col4|Col5|\n|---|---|---|---|---|\n||4 kg cat||15 kg dog||\n|MER|100 kcal/kg BW0.67|75 kcal/kg BW0.67|110 kcal/kg BW0.75|95 kcal/kg BW0.75|\n|Daily energy intake|253 kcal|189 kcal|838 kcal|724 kcal|\n|DM intake (400 kcal/100 g DM)|63 g|47 g|210 g|181 g|\n|Total daily Zn requirement|4.75 mg||15 mg||\n|Adequate Zn level|7.5 mg/100 g DM|10.0 mg/100 g DM|7.2 mg/100 g DM|8.34 mg/100 g DM|\n\n\nAdequate Zn level 7.5 mg/100 g DM 10.0 mg/100 g DM 7.2 mg/100 g DM 8.34 mg/100 g DM\n\n# 7.2.5. Impact of energy requirement on product formulation\n\nBalanced nutrition ensuring adequate intakes of energy, impact activity (Burger IH 1994, Connor MM et al. 2000, Kealy\nprotein, minerals and vitamins is essential for cats and _RD et al. 2002). Studies investigating the maintenance energy_\ndogs to ensure health and longevity.

In [1]:
import pymupdf

from modules.pdf_extraction import select_pdf_file

pdf_path = select_pdf_file()

# Open some document, for example a PDF (could also be EPUB, XPS, etc.)
doc = pymupdf.open(pdf_path)

# Load a desired page. This works via 0-based numbers
page = doc[0]  # this is the first page

# Look for tables on this page and display the table count
tabs = page.find_tables()
num_tables = len(tabs.tables)
num_tables

0

In [None]:

# def extract_text_from_pages(pdf_input, pages=None):
#     """
#     Extracts text from specified pages in a PDF file using pymupdf4llm's markdown generation.

#     Parameters:
#         pdf_input (str or file-like object): The path to the PDF file or a file-like object.
#         pages (int, list, tuple, or None): 
#             - If an integer, extracts text from that specific page (0-indexed).
#             - If a list of integers, extracts text from the specified pages.
#             - If a tuple of two integers, treats it as a range (start, end) and extracts from start (inclusive)
#               to end (exclusive).
#             - If None, extracts text from all pages.

#     Returns:
#         str: The markdown text extracted from the specified pages.
#     """
#     logging.info("Starting text extraction from PDF using pymupdf4llm markdown.")
#     logging.debug(f"Received pdf_input={pdf_input}, pages={pages}")
    
#     # For file-like objects, save to temporary file
#     if not isinstance(pdf_input, str):
#         import tempfile
#         with tempfile.NamedTemporaryFile(suffix='.pdf', delete=False) as temp_file:
#             pdf_input.seek(0)
#             temp_file.write(pdf_input.read())
#             temp_path = temp_file.name
#         pdf_path = temp_path
#     else:
#         pdf_path = pdf_input
    
#     # Open the PDF to get total page count for validation
#     doc = pymupdf.open(pdf_path)
#     total_pages = doc.page_count
#     logging.debug(f"PDF has {total_pages} pages.")
    
#     # Determine which pages to extract
#     if pages is None:
#         # Use pymupdf4llm to extract all pages
#         markdown_text = pymupdf4llm.to_markdown(pdf_path)
#         doc.close()
#         return markdown_text
    
#     # For specific pages, we need to handle page selection
#     if isinstance(pages, int):
#         if pages < 0 or pages >= total_pages:
#             logging.error(f"Page index {pages} is out of range. Total pages: {total_pages}")
#             raise ValueError(f"Page index {pages} is out of range. Total pages: {total_pages}")
#         page_indices = [pages]
#     elif isinstance(pages, (list, tuple)):
#         if isinstance(pages, tuple) and len(pages) == 2:
#             start, end = pages
#             if not (isinstance(start, int) and isinstance(end, int)):
#                 logging.error("Start and end values must be integers.")
#                 raise ValueError("Start and end values must be integers.")
#             if start < 0 or end > total_pages or start >= end:
#                 logging.error("Invalid page range specified.")
#                 raise ValueError("Invalid page range specified.")
#             page_indices = range(start, end)
#         else:
#             page_indices = []
#             for p in pages:
#                 if not isinstance(p, int):
#                     logging.error("Page indices must be integers.")
#                     raise ValueError("Page indices must be integers.")
#                 if p < 0 or p >= total_pages:
#                     logging.error(f"Page index {p} is out of range. Total pages: {total_pages}")
#                     raise ValueError(f"Page index {p} is out of range. Total pages: {total_pages}")
#                 page_indices.append(p)
#     else:
#         logging.error("Parameter 'pages' must be an int, list, tuple, or None.")
#         raise ValueError("Parameter 'pages' must be an int, list, tuple, or None.")
    
#     doc.close()
    
#     # For selected pages, we'll use pymupdf4llm's page parameter if available
#     # If pymupdf4llm doesn't support page selection directly, we'll extract markdown for all
#     # pages and then filter the content
#     try:
#         # Try to use page parameter if available in pymupdf4llm
#         markdown_text = pymupdf4llm.to_markdown(pdf_path, pages=page_indices)
#     except TypeError:
#         # If pages parameter is not supported, extract all and then filter
#         all_markdown = pymupdf4llm.to_markdown(pdf_path)
        
#         # Split by page markers if they exist in the markdown output
#         # Note: This assumes pymupdf4llm adds page markers or we can identify page boundaries
#         # This part may need adjustment based on actual pymupdf4llm output format
#         page_sections = all_markdown.split("# Page ")
        
#         if len(page_sections) <= 1:
#             # If no clear page markers, return with warning
#             logging.warning("Could not identify page boundaries in markdown output. Returning all pages.")
#             return all_markdown
        
#         # First element is content before any page marker, usually empty
#         page_sections = page_sections[1:]  # Skip the first element (pre-page content)
        
#         selected_content = []
#         for i in page_indices:
#             if i < len(page_sections):
#                 # Re-add the page marker
#                 selected_content.append(f"# Page {page_sections[i]}")
        
#         markdown_text = "".join(selected_content)
    
#     logging.info("Completed markdown extraction.")
    
#     # Clean up temporary file if created
#     if not isinstance(pdf_input, str):
#         import os
#         try:
#             os.unlink(temp_path)
#         except:
#             logging.warning(f"Failed to delete temporary file: {temp_path}")
    
#     return markdown_text

In [5]:
import pandas as pd

# Get first table and convert to DataFrame
# `tabs` is the find_tables() result from an earlier cell; tabs[0] fails if
# that cell detected no tables on the page.
tab1 = tabs[0]
table_data = tab1.extract()

# No header row is designated here, so the columns get default integer labels.
df = pd.DataFrame(table_data) # Convert list of lists to DataFrame
df # Display the DataFrame

Unnamed: 0,0,1,2,3,4,5
0,NET-33-06-20,Standard,Ø325 x 110mm,12W/14W/\n16W/18W,1200lm/1400lm/\n1600lm/1800lm,3000K/4000K/5500K
1,NET-33-06-30,Emergency/Self Test,,,,
2,NET-33-06-24,Microwave Sensor,,16W,1600lm,
3,NET-33-06-34,Microwave Sensor Emergency/Self Test,,,,
4,NET-33-06-70,Photocell,,12W/14W/\n16W/18W,1200lm/1400lm/\n1600lm/1800lm,
5,NET-33-06-71,Photocell Emergency/Self Test,,,,
6,NET-33-06-48,Photocell Microwave Sensor,,16W,1600lm,
7,NET-33-06-49,Photocell Microwave Sensor Emergency/Self Test,,,,


In [21]:
# import os
# from PyPDF2 import PdfReader, PdfWriter


# from modules.pdf_extraction import select_pdf_file
# from modules.pdf_extraction import extract_text_from_pages

# pdf_path = select_pdf_file()

# def split_pdf(input_path, output_path_1, output_path_2, split_page):
#     """
#     Split a PDF file into two separate PDF files.
    
#     Args:
#         input_path (str): Path to the input PDF file
#         output_path_1 (str): Path where to save the first part
#         output_path_2 (str): Path where to save the second part
#         split_page (int): The page number where to split (this page will be the first page of the second PDF)
#     """
#     try:
#         # Create PDF reader object
#         reader = PdfReader(input_path)
        
#         # Get total number of pages
#         total_pages = len(reader.pages)
        
#         if split_page >= total_pages:
#             raise ValueError("Split page number cannot be greater than total pages")
        
#         # Create two PDF writer objects
#         writer1 = PdfWriter()
#         writer2 = PdfWriter()
        
#         # Add pages to first output PDF (before split point)
#         for page in range(split_page):
#             writer1.add_page(reader.pages[page])
            
#         # Add pages to second output PDF (from split point to end)
#         for page in range(split_page, total_pages):
#             writer2.add_page(reader.pages[page])
            
#         # Save the first part
#         with open(output_path_1, 'wb') as output1:
#             writer1.write(output1)
            
#         # Save the second part
#         with open(output_path_2, 'wb') as output2:
#             writer2.write(output2)
            
#         return True
    
#     except Exception as e:
#         print(f"An error occurred: {str(e)}")
#         return False

# def main():
#     # Example usage
#     input_file = pdf_path  # Replace with your PDF file
#     output_file1 = "part1.pdf"
#     output_file2 = "part2.pdf"
#     split_at_page = 1000   # Split after first page (0-based index)
    
#     if os.path.exists(input_file):
#         success = split_pdf(input_file, output_file1, output_file2, split_at_page)
#         if success:
#             print(f"PDF split successfully!")
#             print(f"First part saved as: {output_file1}")
#             print(f"Second part saved as: {output_file2}")
#         else:
#             print("Failed to split PDF.")
#     else:
#         print(f"Input file '{input_file}' not found.")
        
# main()

