In [2]:
%pip install camelot-py[cv]

Defaulting to user installation because normal site-packages is not writeable
Note: you may need to restart the kernel to use updated packages.




In [7]:
import camelot
print(camelot.__version__)
import sys
print(sys.executable)



1.0.0
C:\ProgramData\anaconda3\python.exe


In [6]:
import camelot
import os

def extract_tables(file_path, output_path, compress=False):
    """
    使用 Camelot 从 PDF 中提取表格，并导出为 CSV 文件。
    :param file_path: 输入 PDF 文件路径
    :param output_path: 输出 CSV 文件路径（将存储所有提取的表格）
    :param compress: 是否对输出进行压缩（默认 False）
    """
    try:
        # 提取所有页面的表格
        tables = camelot.read_pdf(file_path, flavor='lattice', pages='all')
        if not tables:
            print("未在 PDF 文件中检测到任何表格。")
            return
        
        # 如果输出目录不存在，则创建目录
        output_dir = os.path.dirname(output_path)
        if output_dir and not os.path.exists(output_dir):
            os.makedirs(output_dir)
        
        # 导出所有提取的表格到 CSV 文件
        tables.export(output_path, f='csv', compress=compress)
        print(f"表格已成功导出至 {output_path}")
    except Exception as e:
        print("在提取表格时发生错误：", e)

def main():
    # 在主函数中直接定义文件路径
    file_path = "2.02_EN.pdf"  # 输入 PDF 文件路径
    output_path = "output_lattice/table.csv"   # 输出 CSV 文件路径
    compress = False                   # 是否压缩输出文件

    if not os.path.isfile(file_path):
        print(f"输入文件 {file_path} 不存在。")
        return

    extract_tables(file_path, output_path, compress)

if __name__ == "__main__":
    main()


未在 PDF 文件中检测到任何表格。


In [9]:
!pip install PyPDF2
from PyPDF2 import PdfReader

reader = PdfReader("2.02_EN.pdf")
text = ""
for page in reader.pages:
    text += page.extract_text() or ""
if text.strip():
    print("这是文本型 PDF")
else:
    print("这可能是图像型 PDF")


Defaulting to user installation because normal site-packages is not writeable
Collecting PyPDF2
  Downloading pypdf2-3.0.1-py3-none-any.whl.metadata (6.8 kB)
Downloading pypdf2-3.0.1-py3-none-any.whl (232 kB)
Installing collected packages: PyPDF2
Successfully installed PyPDF2-3.0.1
这是文本型 PDF


In [12]:
%pip install pymupdf

Defaulting to user installation because normal site-packages is not writeable
Note: you may need to restart the kernel to use updated packages.


In [13]:
import fitz  # PyMuPDF
import json

def extract_pdf_table_to_json(pdf_path, page_num=0, output_file="table.json"):
    """
    提取 PDF 指定页面的第一个表格并转换为 JSON
    :param pdf_path: PDF 文件路径
    :param page_num: 目标页面编号（默认第0页）
    :param output_file: 输出 JSON 文件名
    """
    doc = fitz.open(pdf_path)
    
    try:
        page = doc[page_num]
    except IndexError:
        print(f"错误：页面 {page_num} 不存在")
        return

    # 查找页面中的表格
    tables = page.find_tables()
    
    if len(tables.tables) == 0:
        print("未找到表格")
        return
    
    # 提取第一个表格数据
    table_data = tables[0].extract()
    
    # 转换为 JSON 格式（二维数组）
    json_data = {
        "table": table_data,
        "metadata": {
            "page": page_num + 1,  # 转换为人类可读页码
            "columns": len(table_data[0]),
            "rows": len(table_data)
        }
    }
    
    # 保存到文件
    with open(output_file, "w", encoding="utf-8") as f:
        json.dump(json_data, f, ensure_ascii=False, indent=2)
    
    print(f"表格已保存至 {output_file}")

# 使用示例
extract_pdf_table_to_json(
    pdf_path="2.02_EN.pdf",
    page_num=0,
    output_file="output.json"
)

未找到表格


In [15]:
import fitz

def debug_pdf_tables(pdf_path, page_num=0):
    doc = fitz.open(pdf_path)
    page = doc[page_num]
    
    # 查看页面文本结构
    print("页面原始文本结构:")
    print(page.get_text("dict"))
    
    # 检测所有表格
    tables = page.find_tables()
    print(f"\n检测到 {len(tables.tables)} 个表格候选区域")
    
    # 绘制表格识别框（可视化调试）
    for i, table in enumerate(tables.tables):
        print(f"表格 {i+1} 的边界框坐标:", table.bbox)
        # 在PDF上绘制红色框
        highlight = page.add_highlight_annot(table.bbox)
        highlight.set_colors(stroke=(1, 0, 0))  # 红色边框
        highlight.update()
    
    # 保存带标注的PDF
    doc.save("debug_tables.pdf")
    print("已生成调试文件: debug_tables.pdf")

debug_pdf_tables("2.02_EN.pdf", page_num=0)

页面原始文本结构:
{'width': 419.52801513671875, 'height': 595.2760009765625, 'blocks': [{'number': 1, 'type': 0, 'bbox': (99.21260070800781, 65.32991790771484, 219.97662353515625, 82.03192138671875), 'lines': [{'spans': [{'size': 14.0, 'flags': 20, 'bidi': 0, 'char_flags': 16, 'font': 'FrutigerLTStd-Bold', 'color': 13027014, 'alpha': 255, 'ascender': 0.9430000185966492, 'descender': -0.25, 'text': '2.02 Contributions', 'origin': (99.21260070800781, 78.53192138671875), 'bbox': (99.21260070800781, 65.32991790771484, 219.97662353515625, 82.03192138671875)}], 'wmode': 0, 'dir': (1.0, 0.0), 'bbox': (99.21260070800781, 65.32991790771484, 219.97662353515625, 82.03192138671875)}]}, {'number': 2, 'type': 0, 'bbox': (81.49610137939453, 110.29820251464844, 363.80706787109375, 169.0712890625), 'lines': [{'spans': [{'size': 16.0, 'flags': 20, 'bidi': 0, 'char_flags': 16, 'font': 'FrutigerLTStd-Bold', 'color': 1907995, 'alpha': 255, 'ascender': 0.9430000185966492, 'descender': -0.25, 'text': 'Self-employed 

In [16]:
%pip install pdfplumber

Note: you may need to restart the kernel to use updated packages.Defaulting to user installation because normal site-packages is not writeable
Collecting pdfplumber
  Downloading pdfplumber-0.11.5-py3-none-any.whl.metadata (42 kB)
Collecting pdfminer.six==20231228 (from pdfplumber)
  Downloading pdfminer.six-20231228-py3-none-any.whl.metadata (4.2 kB)
Downloading pdfplumber-0.11.5-py3-none-any.whl (59 kB)
Downloading pdfminer.six-20231228-py3-none-any.whl (5.6 MB)
   ---------------------------------------- 0.0/5.6 MB ? eta -:--:--
   ----------------------------- ---------- 4.2/5.6 MB 22.9 MB/s eta 0:00:01
   ---------------------------------------- 5.6/5.6 MB 21.4 MB/s eta 0:00:00
Installing collected packages: pdfminer.six, pdfplumber
  Attempting uninstall: pdfminer.six
    Found existing installation: pdfminer.six 20240706
    Uninstalling pdfminer.six-20240706:
      Successfully uninstalled pdfminer.six-20240706
Successfully installed pdfminer.six-20231228 pdfplumber-0.11.5



ERROR: pip's dependency resolver does not currently take into account all the packages that are installed. This behaviour is the source of the following dependency conflicts.
camelot-py 1.0.0 requires pdfminer-six>=20240706, but you have pdfminer-six 20231228 which is incompatible.


In [18]:
import pdfplumber
import pandas as pd
with pdfplumber.open("2.02_EN.pdf") as pdf:
    for page in pdf.pages:
        tables = page.extract_tables()
        for table in tables:
            # 直接转成DataFrame
            import pandas as pd
            df = pd.DataFrame(table[1:], columns=table[0])
            print(df)


  Year of birth Reference age
0    Up to 1960            64
Empty DataFrame
Columns: [1962, 64 plus 6 months]
Index: []
Empty DataFrame
Columns: [From 1964, 65]
Index: []
  Contribution rates    None
0               OASI   8.1 %
1                 DI   1.4 %
2                 IC   0.5 %
3              Total  10.0 %
   Annual income in francs           None  \
0              of at least  but less than   
1                   10 100         17 600   
2                   17 600         23 000   
3                   23 000         25 500   
4                   25 500         28 000   
5                   28 000         30 500   
6                   30 500         33 000   
7                   33 000         35 500   
8                   35 500         38 000   
9                   38 000         40 500   
10                  40 500         43 000   
11                  43 000         45 500   
12                  45 500         48 000   
13                  48 000         50 500   
14       

In [21]:
import pdfplumber
import json
import pandas as pd
from pathlib import Path

def tables_to_json(pdf_path, output_dir="output_tables"):
    """
    将PDF中所有表格转为统一JSON格式
    :param pdf_path: PDF文件路径
    :param output_dir: 输出目录
    """
    # 创建输出目录
    Path(output_dir).mkdir(parents=True, exist_ok=True)
    
    with pdfplumber.open(pdf_path) as pdf:
        for page_num, page in enumerate(pdf.pages):
            # 提取当前页所有表格
            tables = page.extract_tables()
            
            if not tables:
                print(f"页码 {page_num+1} 未检测到表格")
                continue
                
            for table_num, table in enumerate(tables, 1):
                # 清洗数据（处理None值和换行符）
                cleaned_table = [
                    [
                        cell.replace("\n", " ").strip() if cell is not None else ""
                        for cell in row
                    ]
                    for row in table
                ]   
                # 构建统一JSON结构
                json_data = {
                    "metadata": {
                        "source": pdf_path,
                        "page": page_num + 1,
                        "table_number": table_num,
                        "dimensions": {
                            "rows": len(cleaned_table),
                            "columns": len(cleaned_table[0]) if cleaned_table else 0
                        }
                    },
                    "data": {
                        "headers": cleaned_table[0] if cleaned_table else [],
                        "rows": cleaned_table[1:] if len(cleaned_table) > 1 else []
                    }
                }
                
                # 生成文件名
                filename = f"page_{page_num+1}_table_{table_num}.json"
                output_path = Path(output_dir) / filename
                
                # 保存JSON文件
                with open(output_path, "w", encoding="utf-8") as f:
                    json.dump(json_data, f, ensure_ascii=False, indent=2)
                
                print(f"已保存表格：{filename}")

# 使用示例
tables_to_json(
    pdf_path="2.02_EN.pdf",
    output_dir="plumber_json_output"
)

页码 1 未检测到表格
页码 2 未检测到表格
页码 3 未检测到表格
已保存表格：page_4_table_1.json
已保存表格：page_4_table_2.json
已保存表格：page_4_table_3.json
已保存表格：page_5_table_1.json
已保存表格：page_5_table_2.json
已保存表格：page_6_table_1.json
页码 7 未检测到表格
已保存表格：page_8_table_1.json
已保存表格：page_8_table_2.json
页码 9 未检测到表格
页码 10 未检测到表格
页码 11 未检测到表格
页码 12 未检测到表格


In [None]:
import fitz  # PyMuPDF
import json
import glob
from pathlib import Path

def process_pdf_tables(input_dir=".", output_base="output_tables"):
    """
    批量处理指定目录下的所有PDF文件表格
    :param input_dir: 输入目录路径（默认当前目录）
    :param output_base: 输出根目录（默认output_tables）
    """
    # 获取所有PDF文件（包括子目录）
    pdf_files = glob.glob(f"{input_dir}/**/*.pdf", recursive=True)
    
    if not pdf_files:
        print(f"在 {input_dir} 目录中未找到PDF文件")
        return

    for pdf_path in pdf_files:
        try:
            # 创建对应输出目录
            pdf_stem = Path(pdf_path).stem  # 获取不带扩展名的文件名
            output_dir = Path(output_base) / pdf_stem
            output_dir.mkdir(parents=True, exist_ok=True)
            
            print(f"\n正在处理文件：{Path(pdf_path).name}")
            
            with fitz.open(pdf_path) as doc:
                process_single_pdf(doc, pdf_path, output_dir)
                
        except Exception as e:
            print(f"处理文件 {pdf_path} 失败：{str(e)}")

def process_single_pdf(doc, pdf_path, output_dir):
    """处理单个PDF文件"""
    for page_index in range(len(doc)):
        page = doc[page_index]
        tables = page.find_tables()
        
        if not tables:
            print(f"  第 {page_index + 1} 页没有检测到表格")
            continue

        for table_num, table in enumerate(tables, start=1):
            try:
                # 提取表格数据
                table_data = table.extract()
                
                # 构建JSON结构
                json_data = {
                    "metadata": {
                        "source": str(pdf_path),
                        "page": page_index + 1,
                        "table_number": table_num,
                        "bbox": list(table.bbox),
                        "dimensions": {
                            "rows": len(table_data),
                            "columns": len(table_data[0]) if table_data else 0
                        }
                    },
                    "data": table_data
                }
                
                # 生成文件名
                filename = f"page_{page_index+1}_table_{table_num}.json"
                output_path = output_dir / filename
                
                # 保存文件
                with open(output_path, "w", encoding="utf-8") as f:
                    json.dump(json_data, f, ensure_ascii=False, indent=2)
                
                print(f"  已保存表格：{filename}")
                
            except Exception as e:
                print(f"  表格处理失败：{str(e)}")

# 使用示例（处理当前目录及其子目录下的所有PDF）
process_pdf_tables(
    input_dir=".", 
    output_base="output_tables"
)

已保存表格：page_4_table_1.json
已保存表格：page_5_table_1.json
已保存表格：page_5_table_2.json
已保存表格：page_6_table_1.json
已保存表格：page_8_table_1.json
已保存表格：page_8_table_2.json


# Camelot
Stream
Suitable for tables where whitespaces between cells simulate the table structure. It leverages PDFMiner's functionality to group characters into words and sentences, analyzing margins to infer table boundaries. Ideal for borderless tables but struggles with complex layouts.

Lattice
Designed for tables with explicit demarcation lines. Detects line segments and intersections via image processing (using OpenCV) to define precise table boundaries. Highly accurate for multi-table pages and merged cells but fails for borderless tables.

Network
Relies on text element bounding boxes to identify horizontal/vertical alignment patterns. Effective for tables without lines but with strong text alignment. Struggles with irregular or loosely structured layouts.

Hybrid
Combines Network's text alignment analysis with Lattice's line detection. Uses Lattice's precise boundaries to enhance Network results. Optimized for mixed-layout tables (partially lined + text-aligned) but computationally intensive.

While Camelot's official examples include similar tables, none of the four modes worked reliably for our specific case due to irregular text alignment and partial/no borders.


In [None]:
import camelot
import json
from pathlib import Path

def extract_tables_with_camelot(pdf_path, output_dir="camelot_output"):
    """
    使用Camelot提取表格并保存为JSON
    :param pdf_path: PDF文件路径
    :param output_dir: 输出目录
    """

    Path(output_dir).mkdir(parents=True, exist_ok=True)
    
  
    try:
        tables = camelot.read_pdf(
            pdf_path, 
            flavor="network",
            edge_tol=50,   
            row_tol=10      
        )
    except Exception as e:
        print(f"文件解析失败: {str(e)}")
        return

    if not tables:
        print("未检测到任何表格")
        return

    for table_num, table in enumerate(tables, 1):
        df = table.df
        cleaned_table = [
            [cell.strip().replace("\n", " ") if cell else "" 
             for cell in row
            ]
            for row in df.values.tolist()
        ]
             
        json_data = {
            "metadata": {
                "source": pdf_path,
                "page": table.page + 1,  
                "table_number": table_num,
                "accuracy": round(table.parsing_report["accuracy"], 2),
                "dimensions": {
                    "rows": len(cleaned_table),
                    "columns": len(cleaned_table[0]) if cleaned_table else 0
                }
            },
            "data": {
                "headers": cleaned_table[0] if cleaned_table else [],
                "rows": cleaned_table[1:] if len(cleaned_table) > 1 else []
            }
        }
        
        
        filename = f"page_{table.page+1}_table_{table_num}.json"
        output_path = Path(output_dir) / filename
        
        
        with open(output_path, "w", encoding="utf-8") as f:
            json.dump(json_data, f, ensure_ascii=False, indent=2)
        
        print(f"已保存表格：{filename}")

extract_tables_with_camelot(
    pdf_path="2.01_EN.pdf",
    output_dir="camelot_json_output"
)

未检测到任何表格


In [None]:
import camelot
import json
from pathlib import Path
import matplotlib.pyplot as plt  

def extract_tables_with_camelot(pdf_path, output_dir="camelot_output", debug=False, visualize=False):
    """
    使用Camelot提取表格并保存为JSON，同时增加调试信息和可视化图像的保存
    :param pdf_path: PDF文件路径
    :param output_dir: 输出目录
    :param debug: 是否打印调试信息
    :param visualize: 是否生成并显示可视化图像
    """
    
    Path(output_dir).mkdir(parents=True, exist_ok=True)
    
    
    try:
        tables = camelot.read_pdf(
            pdf_path, 
            flavor="network",  # 如遇到问题，可尝试改为 "lattice"
            edge_tol=50,    # 提高边缘检测敏感度
            row_tol=10      # 优化行间距识别
        )
    except Exception as e:
        print(f"文件解析失败: {str(e)}")
        return

    if not tables or len(tables) == 0:
        print("未检测到任何表格")
        return

    for table_num, table in enumerate(tables, 1):
        # 如果开启调试，打印当前表格的解析报告和其他信息
        if debug:
            print(f"------------------------------")
            print(f"处理第 {table_num} 个表格 (页面 {table.page + 1})")
            print("解析报告:", table.parsing_report)
            print("边界框:", table._bbox)  # 内部边界信息
        
        # 获取数据（自动处理合并单元格）
        df = table.df
        
        # 清洗数据（处理空值和换行符）
        cleaned_table = [
            [cell.strip().replace("\n", " ") if cell else "" 
             for cell in row
            ]
            for row in df.values.tolist()
        ]
        
        # 构建JSON结构
        json_data = {
            "metadata": {
                "source": pdf_path,
                "page": table.page + 1,  # Camelot页码从0开始
                "table_number": table_num,
                "accuracy": round(table.parsing_report.get("accuracy", 0), 2),
                "dimensions": {
                    "rows": len(cleaned_table),
                    "columns": len(cleaned_table[0]) if cleaned_table else 0
                }
            },
            "data": {
                "headers": cleaned_table[0] if cleaned_table else [],
                "rows": cleaned_table[1:] if len(cleaned_table) > 1 else []
            }
        }
        
        # 生成JSON文件名
        json_filename = f"page_{table.page+1}_table_{table_num}.json"
        json_output_path = Path(output_dir) / json_filename
        
        # 保存JSON文件
        with open(json_output_path, "w", encoding="utf-8") as f:
            json.dump(json_data, f, ensure_ascii=False, indent=2)
        print(f"已保存表格数据：{json_filename}")
        
        # 如果开启可视化，生成并保存表格的图像
        if visualize:
            try:
                # 使用 grid 图展示表格检测结果
                plot = camelot.plot(table, kind='grid', figsize=(10, 10))
                # 保存图像文件
                img_filename = f"debug_page_{table.page+1}_table_{table_num}.png"
                img_output_path = Path(output_dir) / img_filename
                plot.savefig(img_output_path)
                print(f"已保存调试图像：{img_filename}")
                # 显示图像
                plt.show()
            except Exception as e:
                print(f"可视化出错: {str(e)}")

# 使用示例
extract_tables_with_camelot(
    pdf_path="column_separators.pdf",
    output_dir="camelot_json_output",
    debug=True,      
    visualize=True   
)


------------------------------
处理第 1 个表格 (页面 2)
解析报告: {'accuracy': 100.0, 'whitespace': 33.91, 'order': 1, 'page': 1}
边界框: (31.56, 51.966239999999885, 738.46704, 590.64576)
已保存表格数据：page_2_table_1.json
可视化出错: PlotMethods.__call__() got an unexpected keyword argument 'figsize'


In [44]:
!echo LLAMA_CLOUD_API_KEY=llx-WQMylwiPgluWrsrbK8NHnNISeemscgGSjB6tiKdBJOeA4SzJ > .env

In [45]:
%pip install llama-index llama-parse python-dotenv
%pip install nest_asyncio
%pip install llama-cloud-services llama-index-core llama-index-readers-file

Defaulting to user installation because normal site-packages is not writeableNote: you may need to restart the kernel to use updated packages.

Defaulting to user installation because normal site-packages is not writeableNote: you may need to restart the kernel to use updated packages.

Defaulting to user installation because normal site-packages is not writeableNote: you may need to restart the kernel to use updated packages.



In [46]:
from dotenv import load_dotenv
import os

# check if load or not
load_dotenv()  
api_key = os.getenv("LLAMA_CLOUD_API_KEY")
print("API Key:", api_key)  

API Key: llx-WQMylwiPgluWrsrbK8NHnNISeemscgGSjB6tiKdBJOeA4SzJ


In [58]:
from llama_parse import LlamaParse
from llama_index.core import SimpleDirectoryReader
import nest_asyncio
nest_asyncio.apply() 


parser = LlamaParse(api_key=os.getenv("LLAMA_CLOUD_API_KEY"), result_type="markdown")
file_extractor = {".pdf": parser}
documents = SimpleDirectoryReader(
    input_files=['2.01_EN.pdf'],
    file_extractor=file_extractor
).load_data() 
print(len(documents))

Started parsing the file under job_id 0cc3d78b-11ef-4b3e-94fb-03dd024201dc
....18


In [62]:
import os
import io
import base64
import json
import re
from pathlib import Path
from dotenv import load_dotenv
from pdf2image import convert_from_path
from google.api_core.exceptions import InvalidArgument
import google.generativeai as genai
from llama_parse import LlamaParse
from llama_index.core import SimpleDirectoryReader
import nest_asyncio
from pydantic import BaseModel, root_validator
from typing import List, Optional

# 应用 nest_asyncio 解决 Jupyter Notebook 环境下的异步问题
nest_asyncio.apply()

# 加载环境变量并配置 Gemini/LlamaParse API
load_dotenv()
api_key = os.getenv("LLAMA_CLOUD_API_KEY")
print("API Key:", api_key)

# 定义 Pydantic 模型（结构化输出）
class CellContent(BaseModel):
    content: str

class Table(BaseModel):
    header_names: List[str]
    n_cols: int
    n_rows: int
    row_content: List[List[CellContent]]

class ParsedTable(BaseModel):
    tables: List[Table]
    description: str

class PageResult(BaseModel):
    page: int
    parsed_table: Optional[ParsedTable]
    error: Optional[str]
    raw_output: str

# 如果你需要定义外层文档结构，可以使用 RootModel（Pydantic v2 的方式），但这里直接使用 List[PageResult]即可。

# 定义一个清洗函数，去除返回文本中可能的 markdown 代码块标记
def clean_json_output(raw_text: str) -> str:
    raw_text = raw_text.strip()
    # 使用正则移除开头的 ```json 和结尾的 ```
    pattern = r"^```(?:json)?\s*|```$"
    cleaned = re.sub(pattern, "", raw_text, flags=re.DOTALL)
    return cleaned.strip()

# 定义用于提取表格的 prompt（要求输出纯 JSON，不含 markdown）
prompt = """<instructions>
  <context>
    You will receive an image converted from a PDF page. The image may contain table data as well as irrelevant elements such as logos, headers, and footers.
  </context>
  <goal>
    Please extract only the table data from the image and ignore any non-relevant content.
    Your tasks are:
      1. Identify and extract all tables.
      2. Discard any information that is not part of a table.
  </goal>
  <response_format>
    Please output the parsed result in pure JSON format with the following structure:
    {
      "tables": [
        {
          "header_names": ["Column1", "Column2", "..."],
          "n_cols": integer,
          "n_rows": integer,
          "row_content": [
            [ {"content": "Cell content"}, {"content": "Cell content"}, ... ],
            ...
          ]
        }
      ],
      "description": "Short description summarizing the table content."
    }
  </response_format>
  <additional_instructions>
    - Extract and return only table-related data, ignoring logos, headers, footers, etc.
    - Output must be in pure JSON format without any markdown code fences or additional commentary.
  </additional_instructions>
</instructions>"""

# 配置 LlamaParse，指定 result_type 为 "markdown"（如果模型不支持直接 json，则可能返回 markdown 格式，此时后处理函数会处理）
parser = LlamaParse(api_key=api_key, result_type="markdown")
# 建立文件提取器映射：扩展名为 .pdf 由 LlamaParse 处理
file_extractor = {".pdf": parser}

# 使用 SimpleDirectoryReader 读取 PDF 文件（你可以在 input_files 中指定多个文件）
documents = SimpleDirectoryReader(
    input_files=['2.01_EN.pdf'],
    file_extractor=file_extractor
).load_data()
print("Documents loaded:", len(documents))

results = []  # 用于存储每一页的解析结果

# 遍历每个 Document（通常每个 Document 对应 PDF 的一页）
for i, doc in enumerate(documents):
    print(f"Processing document {i+1}/{len(documents)}...")
    raw_output = doc.text  # 假设 doc.text 包含 LlamaParse 返回的内容
    cleaned_output = clean_json_output(raw_output)
    try:
        # 尝试解析为 Python 对象
        page_json = json.loads(cleaned_output)
        # 利用 Pydantic 模型验证输出结构
        parsed_table = ParsedTable.parse_obj(page_json)
        results.append({
            "page": i + 1,
            "parsed_table": parsed_table.dict(),
            "raw_output": raw_output
        })
    except Exception as e:
        print(f"Document {i+1}: Error parsing output with Pydantic: {e}")
        results.append({
            "page": i + 1,
            "parsed_table": None,
            "error": str(e),
            "raw_output": raw_output
        })

# 保存结果到 JSON 文件
output_dir = "output"
os.makedirs(output_dir, exist_ok=True)
output_file = os.path.join(output_dir, "parsed_tables.json")
with open(output_file, "w", encoding="utf-8") as f:
    json.dump(results, f, ensure_ascii=False, indent=2)
print("JSON output saved at:", output_file)


API Key: llx-WQMylwiPgluWrsrbK8NHnNISeemscgGSjB6tiKdBJOeA4SzJ
Started parsing the file under job_id af6a0a1c-6d68-4e40-b6c4-622ca808a3e5
Documents loaded: 18
Processing document 1/18...
Document 1: Error parsing output with Pydantic: Expecting value: line 1 column 1 (char 0)
Processing document 2/18...
Document 2: Error parsing output with Pydantic: Expecting value: line 1 column 1 (char 0)
Processing document 3/18...
Document 3: Error parsing output with Pydantic: Expecting value: line 1 column 1 (char 0)
Processing document 4/18...
Document 4: Error parsing output with Pydantic: Expecting value: line 1 column 1 (char 0)
Processing document 5/18...
Document 5: Error parsing output with Pydantic: Expecting value: line 1 column 1 (char 0)
Processing document 6/18...
Document 6: Error parsing output with Pydantic: Expecting value: line 1 column 1 (char 0)
Processing document 7/18...
Document 7: Error parsing output with Pydantic: Expecting value: line 1 column 1 (char 0)
Processing docum

In [None]:
# 定义定制的 prompt，要求仅提取表格数据并输出为 JSON 格式
prompt = """<instructions>
  <context>
    You will receive an pdf and the pdf may contain table data as well as irrelevant elements such as logos, headers, and footers.
  </context>
  <goal>
    Please extract only the table data from the pdf and ignore any non-relevant content. Your tasks are:
    1. Identify and extract all tables.
    2. Discard any information that is not part of a table.
    3. Parse the table data and output the results in a JSON format.
    4. Provide a short description of the extracted table data.
  </goal>
  <response_format>
    Please output the parsed result in the following JSON structure:
    
    {
      "tables": [
        {
          "header_names": ["Column1", "Column2", "..."],
          "n_cols": integer,
          "n_rows": integer,
          "row_content": [
            { "content": "Cell content" },
            { "content": "Cell content" },
            ...
          ]
        }
      ],
      "description": "Short description"
    }
  </response_format>
  <additional_instructions>
    - Extract and return only table-related data, ensuring that all non-table elements (e.g., logos, headers, footers) are ignored.
    - The output must be in pure JSON format without any additional commentary.
    - You can use XML-like formatting in your instructions to enhance clarity.
  </additional_instructions>
</instructions>"""

# 实例化 LlamaParse 时传入自定义 prompt，要求结果为 JSON 格式输出
parser = LlamaParse(
    api_key=os.getenv("LLAMA_CLOUD_API_KEY"),
    result_type="json",
    prompt=prompt
)

# 配置文件提取器：对于 .pdf 文件使用我们定制的 parser
file_extractor = {".pdf": parser}

# 使用 SimpleDirectoryReader 读取 PDF 文件并解析数据
documents = SimpleDirectoryReader(
    input_files=['data/2.01_EN.pdf'],
    file_extractor=file_extractor
).load_data()

print(f"Number of documents loaded: {len(documents)}")
for doc in documents:
    # 此处 doc.text 应该为纯 JSON 格式的字符串（如果 LLM 正常返回的话）
    print("Parsed Document:")
    print(doc.text)


Started parsing the file under job_id 948f40db-e2b7-4096-9bf8-c16d5309a9d5
Error while parsing the file '<bytes/buffer>': 'json'
Number of documents loaded: 0
