<a href="https://colab.research.google.com/github/Deku78/My-Website/blob/main/llama_parse_eg_2.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
%pip install llama-parse tabula-py pandas

In [None]:
%pip install nest_asyncio




In [2]:
import os
import nest_asyncio
from llama_parse import LlamaParse
import tabula
import pandas as pd
import io

# Apply nest_asyncio to avoid the event loop conflict
nest_asyncio.apply()

# Set the environment variable for API key
os.environ["LLAMA_CLOUD_API_KEY"] = "llx-elu4ezXSHAfXr3cYXUk1Dua72ZyBCk3WCR4Xs7UkSBEqwlIH"

def extract_tables_from_pdf_llama(pdf_file_path):
    # Define the parsing instruction for table extraction
    parsing_instruction = """
    You are an expert NLP Table Extractor. Your task is to extract ALL tables from the given PDF and present them in Markdown format. Follow these guidelines:

    1. Identify and extract every table in the document, regardless of its structure or complexity.
    2. Convert each table to Markdown format, preserving its original structure as closely as possible.
    3. Do not use any specific Markdown table syntax. Instead, use spacing and alignment to represent the table structure.
    4. Handle tables that span multiple pages by combining them into a single coherent table.
    5. Accurately represent any merged cells, headers, or complex layouts in your Markdown representation.
    6. Include column headers if present.
    7. Separate multiple tables with a horizontal line (---) and a newline.
    8. Do not add any explanations or additional text. Only output the table content in Markdown format.

    Extract and present ALL tables found in the document, maintaining their original structure and content as accurately as possible in Markdown format.
    """

    # Load the PDF and apply the table-specific instructions
    llama_parser = LlamaParse(
        result_type="markdown",
        parsing_instruction=parsing_instruction
    )

    try:
        document = llama_parser.load_data(pdf_file_path)
        if document and len(document) > 0:
            return document[0].text
        else:
            print("No tables extracted from the PDF using LlamaParse.")
            return ""
    except Exception as e:
        print(f"Error extracting tables with LlamaParse: {str(e)}")
        return ""

def extract_tables_from_pdf_tabula(pdf_file_path):
    try:
        # Extract tables using Tabula
        tables = tabula.read_pdf(pdf_file_path, pages='all', multiple_tables=True)

        if not tables:
            print("No tables extracted from the PDF using Tabula.")
            return ""

        # Convert tables to Markdown format
        markdown_tables = []
        for table in tables:
            output = io.StringIO()
            table.to_markdown(output, index=False)
            markdown_tables.append(output.getvalue())

        # Join tables with separators
        return "\n\n---\n\n".join(markdown_tables)
    except Exception as e:
        print(f"Error extracting tables with Tabula: {str(e)}")
        return ""

def combine_extraction_results(llama_result, tabula_result):
    combined_result = ""

    if llama_result:
        combined_result += "LlamaParse Extraction:\n\n" + llama_result + "\n\n"

    if tabula_result:
        combined_result += "Tabula Extraction:\n\n" + tabula_result

    return combined_result

def process_pdfs(pdf_file_paths):
    for pdf_file_path in pdf_file_paths:
        print(f"\nProcessing PDF: {pdf_file_path}")

        llama_tables = extract_tables_from_pdf_llama(pdf_file_path)
        tabula_tables = extract_tables_from_pdf_tabula(pdf_file_path)

        combined_results = combine_extraction_results(llama_tables, tabula_tables)

        print("Extracted Tables:")
        print(combined_results)

# List of PDF file paths to process
pdf_file_paths = [
    "/content/sample_data/Sample1.1_Change Order.pdf",
]

process_pdfs(pdf_file_paths)





Processing PDF: /content/sample_data/Sample1.1_Change Order.pdf
Started parsing the file under job_id 9875b5ca-a6b3-475e-9aab-5b10fb489bbb




Extracted Tables:
LlamaParse Extraction:

```
CHANGE ORDER

Project Name:                            ABC Services                                      Project Manager:
Customer Name:                           ACME LLC                                  John Parman
john.parman@gmail.com
CDW Affiliate:                           Global Tech, LLC
Effective Date:                          August 12, 2021                                   Requesting Party:
Change ID:                               ABC-12458-004                                        John Parman


Tabula Extraction:

| Project Name:   | Unnamed: 0      | BC Services    | Unnamed: 1            | Project Manager:     |
|:----------------|:----------------|:---------------|:----------------------|:---------------------|
| Customer Name:  | ACME LLC        | CME LLC        | John Parman           | John Parma           |
|                 |                 |                | john.parman@gmail.com |                      |
| nan       