In [2]:
%pip install camelot-py[cv]

Defaulting to user installation because normal site-packages is not writeable
Note: you may need to restart the kernel to use updated packages.




In [7]:
import camelot
print(camelot.__version__)
import sys
print(sys.executable)



1.0.0
C:\ProgramData\anaconda3\python.exe


In [6]:
import camelot
import os

def extract_tables(file_path, output_path, compress=False):
    """
    使用 Camelot 从 PDF 中提取表格，并导出为 CSV 文件。
    :param file_path: 输入 PDF 文件路径
    :param output_path: 输出 CSV 文件路径（将存储所有提取的表格）
    :param compress: 是否对输出进行压缩（默认 False）
    """
    try:
        # 提取所有页面的表格
        tables = camelot.read_pdf(file_path, flavor='lattice', pages='all')
        if not tables:
            print("未在 PDF 文件中检测到任何表格。")
            return
        
        # 如果输出目录不存在，则创建目录
        output_dir = os.path.dirname(output_path)
        if output_dir and not os.path.exists(output_dir):
            os.makedirs(output_dir)
        
        # 导出所有提取的表格到 CSV 文件
        tables.export(output_path, f='csv', compress=compress)
        print(f"表格已成功导出至 {output_path}")
    except Exception as e:
        print("在提取表格时发生错误：", e)

def main():
    # 在主函数中直接定义文件路径
    file_path = "2.02_EN.pdf"  # 输入 PDF 文件路径
    output_path = "output_lattice/table.csv"   # 输出 CSV 文件路径
    compress = False                   # 是否压缩输出文件

    if not os.path.isfile(file_path):
        print(f"输入文件 {file_path} 不存在。")
        return

    extract_tables(file_path, output_path, compress)

if __name__ == "__main__":
    main()


未在 PDF 文件中检测到任何表格。


In [9]:
!pip install PyPDF2
from PyPDF2 import PdfReader

reader = PdfReader("2.02_EN.pdf")
text = ""
for page in reader.pages:
    text += page.extract_text() or ""
if text.strip():
    print("这是文本型 PDF")
else:
    print("这可能是图像型 PDF")


Defaulting to user installation because normal site-packages is not writeable
Collecting PyPDF2
  Downloading pypdf2-3.0.1-py3-none-any.whl.metadata (6.8 kB)
Downloading pypdf2-3.0.1-py3-none-any.whl (232 kB)
Installing collected packages: PyPDF2
Successfully installed PyPDF2-3.0.1
这是文本型 PDF


In [12]:
%pip install pymupdf

Defaulting to user installation because normal site-packages is not writeable
Note: you may need to restart the kernel to use updated packages.


In [13]:
import fitz  # PyMuPDF
import json

def extract_pdf_table_to_json(pdf_path, page_num=0, output_file="table.json"):
    """
    提取 PDF 指定页面的第一个表格并转换为 JSON
    :param pdf_path: PDF 文件路径
    :param page_num: 目标页面编号（默认第0页）
    :param output_file: 输出 JSON 文件名
    """
    doc = fitz.open(pdf_path)
    
    try:
        page = doc[page_num]
    except IndexError:
        print(f"错误：页面 {page_num} 不存在")
        return

    # 查找页面中的表格
    tables = page.find_tables()
    
    if len(tables.tables) == 0:
        print("未找到表格")
        return
    
    # 提取第一个表格数据
    table_data = tables[0].extract()
    
    # 转换为 JSON 格式（二维数组）
    json_data = {
        "table": table_data,
        "metadata": {
            "page": page_num + 1,  # 转换为人类可读页码
            "columns": len(table_data[0]),
            "rows": len(table_data)
        }
    }
    
    # 保存到文件
    with open(output_file, "w", encoding="utf-8") as f:
        json.dump(json_data, f, ensure_ascii=False, indent=2)
    
    print(f"表格已保存至 {output_file}")

# 使用示例
extract_pdf_table_to_json(
    pdf_path="2.02_EN.pdf",
    page_num=0,
    output_file="output.json"
)

未找到表格


In [15]:
import fitz

def debug_pdf_tables(pdf_path, page_num=0):
    doc = fitz.open(pdf_path)
    page = doc[page_num]
    
    # 查看页面文本结构
    print("页面原始文本结构:")
    print(page.get_text("dict"))
    
    # 检测所有表格
    tables = page.find_tables()
    print(f"\n检测到 {len(tables.tables)} 个表格候选区域")
    
    # 绘制表格识别框（可视化调试）
    for i, table in enumerate(tables.tables):
        print(f"表格 {i+1} 的边界框坐标:", table.bbox)
        # 在PDF上绘制红色框
        highlight = page.add_highlight_annot(table.bbox)
        highlight.set_colors(stroke=(1, 0, 0))  # 红色边框
        highlight.update()
    
    # 保存带标注的PDF
    doc.save("debug_tables.pdf")
    print("已生成调试文件: debug_tables.pdf")

debug_pdf_tables("2.02_EN.pdf", page_num=0)

页面原始文本结构:
{'width': 419.52801513671875, 'height': 595.2760009765625, 'blocks': [{'number': 1, 'type': 0, 'bbox': (99.21260070800781, 65.32991790771484, 219.97662353515625, 82.03192138671875), 'lines': [{'spans': [{'size': 14.0, 'flags': 20, 'bidi': 0, 'char_flags': 16, 'font': 'FrutigerLTStd-Bold', 'color': 13027014, 'alpha': 255, 'ascender': 0.9430000185966492, 'descender': -0.25, 'text': '2.02 Contributions', 'origin': (99.21260070800781, 78.53192138671875), 'bbox': (99.21260070800781, 65.32991790771484, 219.97662353515625, 82.03192138671875)}], 'wmode': 0, 'dir': (1.0, 0.0), 'bbox': (99.21260070800781, 65.32991790771484, 219.97662353515625, 82.03192138671875)}]}, {'number': 2, 'type': 0, 'bbox': (81.49610137939453, 110.29820251464844, 363.80706787109375, 169.0712890625), 'lines': [{'spans': [{'size': 16.0, 'flags': 20, 'bidi': 0, 'char_flags': 16, 'font': 'FrutigerLTStd-Bold', 'color': 1907995, 'alpha': 255, 'ascender': 0.9430000185966492, 'descender': -0.25, 'text': 'Self-employed 

In [16]:
%pip install pdfplumber

Note: you may need to restart the kernel to use updated packages.Defaulting to user installation because normal site-packages is not writeable
Collecting pdfplumber
  Downloading pdfplumber-0.11.5-py3-none-any.whl.metadata (42 kB)
Collecting pdfminer.six==20231228 (from pdfplumber)
  Downloading pdfminer.six-20231228-py3-none-any.whl.metadata (4.2 kB)
Downloading pdfplumber-0.11.5-py3-none-any.whl (59 kB)
Downloading pdfminer.six-20231228-py3-none-any.whl (5.6 MB)
   ---------------------------------------- 0.0/5.6 MB ? eta -:--:--
   ----------------------------- ---------- 4.2/5.6 MB 22.9 MB/s eta 0:00:01
   ---------------------------------------- 5.6/5.6 MB 21.4 MB/s eta 0:00:00
Installing collected packages: pdfminer.six, pdfplumber
  Attempting uninstall: pdfminer.six
    Found existing installation: pdfminer.six 20240706
    Uninstalling pdfminer.six-20240706:
      Successfully uninstalled pdfminer.six-20240706
Successfully installed pdfminer.six-20231228 pdfplumber-0.11.5



ERROR: pip's dependency resolver does not currently take into account all the packages that are installed. This behaviour is the source of the following dependency conflicts.
camelot-py 1.0.0 requires pdfminer-six>=20240706, but you have pdfminer-six 20231228 which is incompatible.


In [18]:
import pdfplumber
import pandas as pd
with pdfplumber.open("2.02_EN.pdf") as pdf:
    for page in pdf.pages:
        tables = page.extract_tables()
        for table in tables:
            # 直接转成DataFrame
            import pandas as pd
            df = pd.DataFrame(table[1:], columns=table[0])
            print(df)


  Year of birth Reference age
0    Up to 1960            64
Empty DataFrame
Columns: [1962, 64 plus 6 months]
Index: []
Empty DataFrame
Columns: [From 1964, 65]
Index: []
  Contribution rates    None
0               OASI   8.1 %
1                 DI   1.4 %
2                 IC   0.5 %
3              Total  10.0 %
   Annual income in francs           None  \
0              of at least  but less than   
1                   10 100         17 600   
2                   17 600         23 000   
3                   23 000         25 500   
4                   25 500         28 000   
5                   28 000         30 500   
6                   30 500         33 000   
7                   33 000         35 500   
8                   35 500         38 000   
9                   38 000         40 500   
10                  40 500         43 000   
11                  43 000         45 500   
12                  45 500         48 000   
13                  48 000         50 500   
14       

In [21]:
import pdfplumber
import json
import pandas as pd
from pathlib import Path

def tables_to_json(pdf_path, output_dir="output_tables"):
    """
    将PDF中所有表格转为统一JSON格式
    :param pdf_path: PDF文件路径
    :param output_dir: 输出目录
    """
    # 创建输出目录
    Path(output_dir).mkdir(parents=True, exist_ok=True)
    
    with pdfplumber.open(pdf_path) as pdf:
        for page_num, page in enumerate(pdf.pages):
            # 提取当前页所有表格
            tables = page.extract_tables()
            
            if not tables:
                print(f"页码 {page_num+1} 未检测到表格")
                continue
                
            for table_num, table in enumerate(tables, 1):
                # 清洗数据（处理None值和换行符）
                cleaned_table = [
                    [
                        cell.replace("\n", " ").strip() if cell is not None else ""
                        for cell in row
                    ]
                    for row in table
                ]   
                # 构建统一JSON结构
                json_data = {
                    "metadata": {
                        "source": pdf_path,
                        "page": page_num + 1,
                        "table_number": table_num,
                        "dimensions": {
                            "rows": len(cleaned_table),
                            "columns": len(cleaned_table[0]) if cleaned_table else 0
                        }
                    },
                    "data": {
                        "headers": cleaned_table[0] if cleaned_table else [],
                        "rows": cleaned_table[1:] if len(cleaned_table) > 1 else []
                    }
                }
                
                # 生成文件名
                filename = f"page_{page_num+1}_table_{table_num}.json"
                output_path = Path(output_dir) / filename
                
                # 保存JSON文件
                with open(output_path, "w", encoding="utf-8") as f:
                    json.dump(json_data, f, ensure_ascii=False, indent=2)
                
                print(f"已保存表格：{filename}")

# 使用示例
tables_to_json(
    pdf_path="2.02_EN.pdf",
    output_dir="plumber_json_output"
)

页码 1 未检测到表格
页码 2 未检测到表格
页码 3 未检测到表格
已保存表格：page_4_table_1.json
已保存表格：page_4_table_2.json
已保存表格：page_4_table_3.json
已保存表格：page_5_table_1.json
已保存表格：page_5_table_2.json
已保存表格：page_6_table_1.json
页码 7 未检测到表格
已保存表格：page_8_table_1.json
已保存表格：page_8_table_2.json
页码 9 未检测到表格
页码 10 未检测到表格
页码 11 未检测到表格
页码 12 未检测到表格


In [None]:
import fitz  # PyMuPDF
import json
import glob
from pathlib import Path

def process_pdf_tables(input_dir=".", output_base="output_tables"):
    """
    批量处理指定目录下的所有PDF文件表格
    :param input_dir: 输入目录路径（默认当前目录）
    :param output_base: 输出根目录（默认output_tables）
    """
    # 获取所有PDF文件（包括子目录）
    pdf_files = glob.glob(f"{input_dir}/**/*.pdf", recursive=True)
    
    if not pdf_files:
        print(f"在 {input_dir} 目录中未找到PDF文件")
        return

    for pdf_path in pdf_files:
        try:
            # 创建对应输出目录
            pdf_stem = Path(pdf_path).stem  # 获取不带扩展名的文件名
            output_dir = Path(output_base) / pdf_stem
            output_dir.mkdir(parents=True, exist_ok=True)
            
            print(f"\n正在处理文件：{Path(pdf_path).name}")
            
            with fitz.open(pdf_path) as doc:
                process_single_pdf(doc, pdf_path, output_dir)
                
        except Exception as e:
            print(f"处理文件 {pdf_path} 失败：{str(e)}")

def process_single_pdf(doc, pdf_path, output_dir):
    """处理单个PDF文件"""
    for page_index in range(len(doc)):
        page = doc[page_index]
        tables = page.find_tables()
        
        if not tables:
            print(f"  第 {page_index + 1} 页没有检测到表格")
            continue

        for table_num, table in enumerate(tables, start=1):
            try:
                # 提取表格数据
                table_data = table.extract()
                
                # 构建JSON结构
                json_data = {
                    "metadata": {
                        "source": str(pdf_path),
                        "page": page_index + 1,
                        "table_number": table_num,
                        "bbox": list(table.bbox),
                        "dimensions": {
                            "rows": len(table_data),
                            "columns": len(table_data[0]) if table_data else 0
                        }
                    },
                    "data": table_data
                }
                
                # 生成文件名
                filename = f"page_{page_index+1}_table_{table_num}.json"
                output_path = output_dir / filename
                
                # 保存文件
                with open(output_path, "w", encoding="utf-8") as f:
                    json.dump(json_data, f, ensure_ascii=False, indent=2)
                
                print(f"  已保存表格：{filename}")
                
            except Exception as e:
                print(f"  表格处理失败：{str(e)}")

# 使用示例（处理当前目录及其子目录下的所有PDF）
process_pdf_tables(
    input_dir=".", 
    output_base="output_tables"
)

已保存表格：page_4_table_1.json
已保存表格：page_5_table_1.json
已保存表格：page_5_table_2.json
已保存表格：page_6_table_1.json
已保存表格：page_8_table_1.json
已保存表格：page_8_table_2.json


# Camelot
Stream
Suitable for tables where whitespaces between cells simulate the table structure. It leverages PDFMiner's functionality to group characters into words and sentences, analyzing margins to infer table boundaries. Ideal for borderless tables but struggles with complex layouts.

Lattice
Designed for tables with explicit demarcation lines. Detects line segments and intersections via image processing (using OpenCV) to define precise table boundaries. Highly accurate for multi-table pages and merged cells but fails for borderless tables.

Network
Relies on text element bounding boxes to identify horizontal/vertical alignment patterns. Effective for tables without lines but with strong text alignment. Struggles with irregular or loosely structured layouts.

Hybrid
Combines Network's text alignment analysis with Lattice's line detection. Uses Lattice's precise boundaries to enhance Network results. Optimized for mixed-layout tables (partially lined + text-aligned) but computationally intensive.

While Camelot's official examples include similar tables, none of the four modes worked reliably for our specific case due to irregular text alignment and partial/no borders.


In [None]:
import camelot
import json
from pathlib import Path

def extract_tables_with_camelot(pdf_path, output_dir="camelot_output"):
    """
    使用Camelot提取表格并保存为JSON
    :param pdf_path: PDF文件路径
    :param output_dir: 输出目录
    """

    Path(output_dir).mkdir(parents=True, exist_ok=True)
    
  
    try:
        tables = camelot.read_pdf(
            pdf_path, 
            flavor="network",
            edge_tol=50,   
            row_tol=10      
        )
    except Exception as e:
        print(f"文件解析失败: {str(e)}")
        return

    if not tables:
        print("未检测到任何表格")
        return

    for table_num, table in enumerate(tables, 1):
        df = table.df
        cleaned_table = [
            [cell.strip().replace("\n", " ") if cell else "" 
             for cell in row
            ]
            for row in df.values.tolist()
        ]
             
        json_data = {
            "metadata": {
                "source": pdf_path,
                "page": table.page + 1,  
                "table_number": table_num,
                "accuracy": round(table.parsing_report["accuracy"], 2),
                "dimensions": {
                    "rows": len(cleaned_table),
                    "columns": len(cleaned_table[0]) if cleaned_table else 0
                }
            },
            "data": {
                "headers": cleaned_table[0] if cleaned_table else [],
                "rows": cleaned_table[1:] if len(cleaned_table) > 1 else []
            }
        }
        
        
        filename = f"page_{table.page+1}_table_{table_num}.json"
        output_path = Path(output_dir) / filename
        
        
        with open(output_path, "w", encoding="utf-8") as f:
            json.dump(json_data, f, ensure_ascii=False, indent=2)
        
        print(f"已保存表格：{filename}")

extract_tables_with_camelot(
    pdf_path="2.01_EN.pdf",
    output_dir="camelot_json_output"
)

未检测到任何表格


In [None]:
import camelot
import json
from pathlib import Path
import matplotlib.pyplot as plt  

def extract_tables_with_camelot(pdf_path, output_dir="camelot_output", debug=False, visualize=False):
    """
    使用Camelot提取表格并保存为JSON，同时增加调试信息和可视化图像的保存
    :param pdf_path: PDF文件路径
    :param output_dir: 输出目录
    :param debug: 是否打印调试信息
    :param visualize: 是否生成并显示可视化图像
    """
    
    Path(output_dir).mkdir(parents=True, exist_ok=True)
    
    
    try:
        tables = camelot.read_pdf(
            pdf_path, 
            flavor="network",  # 如遇到问题，可尝试改为 "lattice"
            edge_tol=50,    # 提高边缘检测敏感度
            row_tol=10      # 优化行间距识别
        )
    except Exception as e:
        print(f"文件解析失败: {str(e)}")
        return

    if not tables or len(tables) == 0:
        print("未检测到任何表格")
        return

    for table_num, table in enumerate(tables, 1):
        # 如果开启调试，打印当前表格的解析报告和其他信息
        if debug:
            print(f"------------------------------")
            print(f"处理第 {table_num} 个表格 (页面 {table.page + 1})")
            print("解析报告:", table.parsing_report)
            print("边界框:", table._bbox)  # 内部边界信息
        
        # 获取数据（自动处理合并单元格）
        df = table.df
        
        # 清洗数据（处理空值和换行符）
        cleaned_table = [
            [cell.strip().replace("\n", " ") if cell else "" 
             for cell in row
            ]
            for row in df.values.tolist()
        ]
        
        # 构建JSON结构
        json_data = {
            "metadata": {
                "source": pdf_path,
                "page": table.page + 1,  # Camelot页码从0开始
                "table_number": table_num,
                "accuracy": round(table.parsing_report.get("accuracy", 0), 2),
                "dimensions": {
                    "rows": len(cleaned_table),
                    "columns": len(cleaned_table[0]) if cleaned_table else 0
                }
            },
            "data": {
                "headers": cleaned_table[0] if cleaned_table else [],
                "rows": cleaned_table[1:] if len(cleaned_table) > 1 else []
            }
        }
        
        # 生成JSON文件名
        json_filename = f"page_{table.page+1}_table_{table_num}.json"
        json_output_path = Path(output_dir) / json_filename
        
        # 保存JSON文件
        with open(json_output_path, "w", encoding="utf-8") as f:
            json.dump(json_data, f, ensure_ascii=False, indent=2)
        print(f"已保存表格数据：{json_filename}")
        
        # 如果开启可视化，生成并保存表格的图像
        if visualize:
            try:
                # 使用 grid 图展示表格检测结果
                plot = camelot.plot(table, kind='grid', figsize=(10, 10))
                # 保存图像文件
                img_filename = f"debug_page_{table.page+1}_table_{table_num}.png"
                img_output_path = Path(output_dir) / img_filename
                plot.savefig(img_output_path)
                print(f"已保存调试图像：{img_filename}")
                # 显示图像
                plt.show()
            except Exception as e:
                print(f"可视化出错: {str(e)}")

# 使用示例
extract_tables_with_camelot(
    pdf_path="column_separators.pdf",
    output_dir="camelot_json_output",
    debug=True,      
    visualize=True   
)


------------------------------
处理第 1 个表格 (页面 2)
解析报告: {'accuracy': 100.0, 'whitespace': 33.91, 'order': 1, 'page': 1}
边界框: (31.56, 51.966239999999885, 738.46704, 590.64576)
已保存表格数据：page_2_table_1.json
可视化出错: PlotMethods.__call__() got an unexpected keyword argument 'figsize'


In [44]:
!echo LLAMA_CLOUD_API_KEY=llx-WQMylwiPgluWrsrbK8NHnNISeemscgGSjB6tiKdBJOeA4SzJ > .env

In [45]:
%pip install llama-index llama-parse python-dotenv
%pip install nest_asyncio
%pip install llama-cloud-services llama-index-core llama-index-readers-file

Defaulting to user installation because normal site-packages is not writeableNote: you may need to restart the kernel to use updated packages.

Defaulting to user installation because normal site-packages is not writeableNote: you may need to restart the kernel to use updated packages.

Defaulting to user installation because normal site-packages is not writeableNote: you may need to restart the kernel to use updated packages.



In [46]:
from dotenv import load_dotenv
import os

# check if load or not
load_dotenv()  
api_key = os.getenv("LLAMA_CLOUD_API_KEY")
print("API Key:", api_key)  

API Key: llx-WQMylwiPgluWrsrbK8NHnNISeemscgGSjB6tiKdBJOeA4SzJ


In [58]:
from llama_parse import LlamaParse
from llama_index.core import SimpleDirectoryReader
import nest_asyncio
nest_asyncio.apply() 


parser = LlamaParse(api_key=os.getenv("LLAMA_CLOUD_API_KEY"), result_type="markdown")
file_extractor = {".pdf": parser}
documents = SimpleDirectoryReader(
    input_files=['2.01_EN.pdf'],
    file_extractor=file_extractor
).load_data() 
print(len(documents))

Started parsing the file under job_id 0cc3d78b-11ef-4b3e-94fb-03dd024201dc
....18


In [62]:
import os
import io
import base64
import json
import re
from pathlib import Path
from dotenv import load_dotenv
from pdf2image import convert_from_path
from google.api_core.exceptions import InvalidArgument
import google.generativeai as genai
from llama_parse import LlamaParse
from llama_index.core import SimpleDirectoryReader
import nest_asyncio
from pydantic import BaseModel, root_validator
from typing import List, Optional

# 应用 nest_asyncio 解决 Jupyter Notebook 环境下的异步问题
nest_asyncio.apply()

# 加载环境变量并配置 Gemini/LlamaParse API
load_dotenv()
api_key = os.getenv("LLAMA_CLOUD_API_KEY")
print("API Key:", api_key)

# 定义 Pydantic 模型（结构化输出）
class CellContent(BaseModel):
    content: str

class Table(BaseModel):
    header_names: List[str]
    n_cols: int
    n_rows: int
    row_content: List[List[CellContent]]

class ParsedTable(BaseModel):
    tables: List[Table]
    description: str

class PageResult(BaseModel):
    page: int
    parsed_table: Optional[ParsedTable]
    error: Optional[str]
    raw_output: str

# 如果你需要定义外层文档结构，可以使用 RootModel（Pydantic v2 的方式），但这里直接使用 List[PageResult]即可。

# 定义一个清洗函数，去除返回文本中可能的 markdown 代码块标记
def clean_json_output(raw_text: str) -> str:
    raw_text = raw_text.strip()
    # 使用正则移除开头的 ```json 和结尾的 ```
    pattern = r"^```(?:json)?\s*|```$"
    cleaned = re.sub(pattern, "", raw_text, flags=re.DOTALL)
    return cleaned.strip()

# 定义用于提取表格的 prompt（要求输出纯 JSON，不含 markdown）
prompt = """<instructions>
  <context>
    You will receive an image converted from a PDF page. The image may contain table data as well as irrelevant elements such as logos, headers, and footers.
  </context>
  <goal>
    Please extract only the table data from the image and ignore any non-relevant content.
    Your tasks are:
      1. Identify and extract all tables.
      2. Discard any information that is not part of a table.
  </goal>
  <response_format>
    Please output the parsed result in pure JSON format with the following structure:
    {
      "tables": [
        {
          "header_names": ["Column1", "Column2", "..."],
          "n_cols": integer,
          "n_rows": integer,
          "row_content": [
            [ {"content": "Cell content"}, {"content": "Cell content"}, ... ],
            ...
          ]
        }
      ],
      "description": "Short description summarizing the table content."
    }
  </response_format>
  <additional_instructions>
    - Extract and return only table-related data, ignoring logos, headers, footers, etc.
    - Output must be in pure JSON format without any markdown code fences or additional commentary.
  </additional_instructions>
</instructions>"""

# 配置 LlamaParse，指定 result_type 为 "markdown"（如果模型不支持直接 json，则可能返回 markdown 格式，此时后处理函数会处理）
parser = LlamaParse(api_key=api_key, result_type="markdown")
# 建立文件提取器映射：扩展名为 .pdf 由 LlamaParse 处理
file_extractor = {".pdf": parser}

# 使用 SimpleDirectoryReader 读取 PDF 文件（你可以在 input_files 中指定多个文件）
documents = SimpleDirectoryReader(
    input_files=['2.01_EN.pdf'],
    file_extractor=file_extractor
).load_data()
print("Documents loaded:", len(documents))

results = []  # 用于存储每一页的解析结果

# 遍历每个 Document（通常每个 Document 对应 PDF 的一页）
for i, doc in enumerate(documents):
    print(f"Processing document {i+1}/{len(documents)}...")
    raw_output = doc.text  # 假设 doc.text 包含 LlamaParse 返回的内容
    cleaned_output = clean_json_output(raw_output)
    try:
        # 尝试解析为 Python 对象
        page_json = json.loads(cleaned_output)
        # 利用 Pydantic 模型验证输出结构
        parsed_table = ParsedTable.parse_obj(page_json)
        results.append({
            "page": i + 1,
            "parsed_table": parsed_table.dict(),
            "raw_output": raw_output
        })
    except Exception as e:
        print(f"Document {i+1}: Error parsing output with Pydantic: {e}")
        results.append({
            "page": i + 1,
            "parsed_table": None,
            "error": str(e),
            "raw_output": raw_output
        })

# 保存结果到 JSON 文件
output_dir = "output"
os.makedirs(output_dir, exist_ok=True)
output_file = os.path.join(output_dir, "parsed_tables.json")
with open(output_file, "w", encoding="utf-8") as f:
    json.dump(results, f, ensure_ascii=False, indent=2)
print("JSON output saved at:", output_file)


API Key: llx-WQMylwiPgluWrsrbK8NHnNISeemscgGSjB6tiKdBJOeA4SzJ
Started parsing the file under job_id af6a0a1c-6d68-4e40-b6c4-622ca808a3e5
Documents loaded: 18
Processing document 1/18...
Document 1: Error parsing output with Pydantic: Expecting value: line 1 column 1 (char 0)
Processing document 2/18...
Document 2: Error parsing output with Pydantic: Expecting value: line 1 column 1 (char 0)
Processing document 3/18...
Document 3: Error parsing output with Pydantic: Expecting value: line 1 column 1 (char 0)
Processing document 4/18...
Document 4: Error parsing output with Pydantic: Expecting value: line 1 column 1 (char 0)
Processing document 5/18...
Document 5: Error parsing output with Pydantic: Expecting value: line 1 column 1 (char 0)
Processing document 6/18...
Document 6: Error parsing output with Pydantic: Expecting value: line 1 column 1 (char 0)
Processing document 7/18...
Document 7: Error parsing output with Pydantic: Expecting value: line 1 column 1 (char 0)
Processing docum

In [None]:
# 定义定制的 prompt，要求仅提取表格数据并输出为 JSON 格式
prompt = """<instructions>
  <context>
    You will receive an pdf and the pdf may contain table data as well as irrelevant elements such as logos, headers, and footers.
  </context>
  <goal>
    Please extract only the table data from the pdf and ignore any non-relevant content. Your tasks are:
    1. Identify and extract all tables.
    2. Discard any information that is not part of a table.
    3. Parse the table data and output the results in a JSON format.
    4. Provide a short description of the extracted table data.
  </goal>
  <response_format>
    Please output the parsed result in the following JSON structure:
    
    {
      "tables": [
        {
          "header_names": ["Column1", "Column2", "..."],
          "n_cols": integer,
          "n_rows": integer,
          "row_content": [
            { "content": "Cell content" },
            { "content": "Cell content" },
            ...
          ]
        }
      ],
      "description": "Short description"
    }
  </response_format>
  <additional_instructions>
    - Extract and return only table-related data, ensuring that all non-table elements (e.g., logos, headers, footers) are ignored.
    - The output must be in pure JSON format without any additional commentary.
    - You can use XML-like formatting in your instructions to enhance clarity.
  </additional_instructions>
</instructions>"""

# 实例化 LlamaParse 时传入自定义 prompt，要求结果为 JSON 格式输出
parser = LlamaParse(
    api_key=os.getenv("LLAMA_CLOUD_API_KEY"),
    result_type="json",
    prompt=prompt
)

# 配置文件提取器：对于 .pdf 文件使用我们定制的 parser
file_extractor = {".pdf": parser}

# 使用 SimpleDirectoryReader 读取 PDF 文件并解析数据
documents = SimpleDirectoryReader(
    input_files=['data/2.01_EN.pdf'],
    file_extractor=file_extractor
).load_data()

print(f"Number of documents loaded: {len(documents)}")
for doc in documents:
    # 此处 doc.text 应该为纯 JSON 格式的字符串（如果 LLM 正常返回的话）
    print("Parsed Document:")
    print(doc.text)


Started parsing the file under job_id 948f40db-e2b7-4096-9bf8-c16d5309a9d5
Error while parsing the file '<bytes/buffer>': 'json'
Number of documents loaded: 0


# Gemini

In [3]:
import os
import json
import re
from pathlib import Path
from dotenv import load_dotenv
import google.generativeai as genai
from pdf2image import convert_from_path
import base64
from io import BytesIO
from pydantic import BaseModel
from typing import List, Optional

# 加载环境变量并配置 Gemini API
load_dotenv()
api_key = os.getenv("GOOGLE_API_KEY")
genai.configure(api_key=api_key)

# 初始化 Gemini Pro Vision 模型
model = genai.GenerativeModel('gemini-pro-vision')

# 定义 Pydantic 模型
class Metadata(BaseModel):
    source: str
    page: int
    table_number: int
    dimensions: dict
    

class TableData(BaseModel):
    headers: List[str]
    rows: List[List[Optional[str]]]

class TableOutput(BaseModel):
    metadata: Metadata
    data: TableData

def convert_pdf_to_images(pdf_path: str) -> List[Path]:
    """将PDF转换为图片"""
    images = convert_from_path(pdf_path)
    image_paths = []
    
    output_dir = Path("temp_images")
    output_dir.mkdir(exist_ok=True)
    
    for i, image in enumerate(images):
        image_path = output_dir / f"page_{i+1}.png"
        image.save(image_path)
        image_paths.append(image_path)
    
    return image_paths

def get_image_data(image_path: str) -> str:
    """将图片转换为base64编码"""
    with open(image_path, "rb") as image_file:
        return base64.b64encode(image_file.read()).decode()

def extract_tables_from_image(image_path: str, page_num: int) -> dict:
    """使用Gemini提取图片中的表格"""
    try:
        image_data = get_image_data(image_path)
        
        # 构建结构化提示词
        prompt = """你是一个专业的表格识别助手。请仔细分析图片中的表格内容。

任务：
1. 识别并提取表格的所有内容
2. 确定表格的维度（行数和列数）
3. 分离表头和数据行
4. 保持原始数据的完整性和准确性

输出要求：
- 使用JSON格式
- 保持数据的层次结构
- 确保所有单元格内容完整捕获
- 处理空单元格时使用null

示例输出格式：
{
  "metadata": {
    "source": "输入的PDF文件名",
    "page": 当前页码,
    "table_number": 当前页中的表格序号,
    "dimensions": {
      "rows": 实际行数,
      "columns": 实际列数
    }
  },
  "data": {
    "headers": ["完整的表头1", "完整的表头2", ...],
    "rows": [
      ["第1行第1列", "第1行第2列", ...],
      ["第2行第1列", "第2行第2列", ...],
      ...
    ]
  }
}"""

        # 配置生成参数
        generation_config = {
            "temperature": 0.1,  # 降低随机性
            "top_p": 0.8,
            "top_k": 40
        }
        
        # 调用Gemini API
        response = model.generate_content(
            contents=[prompt, image_data],
            generation_config=generation_config
        )
        
        # 解析响应
        result = json.loads(response.text)
        
        # 使用Pydantic验证
        validated_output = TableOutput(**result)
        return validated_output.dict()
        
    except Exception as e:
        print(f"处理页面 {page_num} 时出错: {str(e)}")
        return None

def process_pdf(pdf_path: str) -> List[dict]:
    """处理整个PDF文件"""
    results = []
    image_paths = convert_pdf_to_images(pdf_path)
    
    for i, image_path in enumerate(image_paths):
        print(f"正在处理第 {i+1} 页...")
        result = extract_tables_from_image(str(image_path), i+1)
        if result:
            results.append(result)
    
    return results

# 主程序
if __name__ == "__main__":
    pdf_path = "2.01_EN.pdf"
    results = process_pdf(pdf_path)
    
    # 保存结果
    output_dir = Path("output")
    output_dir.mkdir(exist_ok=True)
    
    with open(output_dir / "parsed_tables.json", "w", encoding="utf-8") as f:
        json.dump(results, f, ensure_ascii=False, indent=2)
    
    print("表格解析完成，结果已保存到 output/parsed_tables.json")

正在处理第 1 页...
处理页面 1 时出错: 403 Permission denied: Consumer 'api_key:AIzaSyC4kSTD0mAJM0zUzrUWXKOwuUj2G4qGrkQ' has been suspended. [reason: "CONSUMER_SUSPENDED"
domain: "googleapis.com"
metadata {
  key: "service"
  value: "generativelanguage.googleapis.com"
}
metadata {
  key: "containerInfo"
  value: "api_key:AIzaSyC4kSTD0mAJM0zUzrUWXKOwuUj2G4qGrkQ"
}
metadata {
  key: "consumer"
  value: "projects/750767877696"
}
, locale: "en-US"
message: "Permission denied: Consumer \'api_key:AIzaSyC4kSTD0mAJM0zUzrUWXKOwuUj2G4qGrkQ\' has been suspended."
]
正在处理第 2 页...
处理页面 2 时出错: 403 Permission denied: Consumer 'api_key:AIzaSyC4kSTD0mAJM0zUzrUWXKOwuUj2G4qGrkQ' has been suspended. [reason: "CONSUMER_SUSPENDED"
domain: "googleapis.com"
metadata {
  key: "service"
  value: "generativelanguage.googleapis.com"
}
metadata {
  key: "containerInfo"
  value: "api_key:AIzaSyC4kSTD0mAJM0zUzrUWXKOwuUj2G4qGrkQ"
}
metadata {
  key: "consumer"
  value: "projects/750767877696"
}
, locale: "en-US"
message: "Permissio

In [15]:
import google.generativeai as genai

from google.generativeai import GenerativeModel

In [1]:
%pip install "google-genai>=1"

Defaulting to user installation because normal site-packages is not writeable
Collecting google-genai>=1
  Downloading google_genai-1.3.0-py3-none-any.whl.metadata (28 kB)
Collecting httpx<1.0.0dev,>=0.28.1 (from google-genai>=1)
  Using cached httpx-0.28.1-py3-none-any.whl.metadata (7.1 kB)
Collecting websockets<15.0dev,>=13.0 (from google-genai>=1)
  Downloading websockets-14.2-cp312-cp312-win_amd64.whl.metadata (7.0 kB)
Downloading google_genai-1.3.0-py3-none-any.whl (137 kB)
Using cached httpx-0.28.1-py3-none-any.whl (73 kB)
Downloading websockets-14.2-cp312-cp312-win_amd64.whl (164 kB)
Installing collected packages: websockets, httpx, google-genai
  Attempting uninstall: httpx
    Found existing installation: httpx 0.27.2
    Uninstalling httpx-0.27.2:
      Successfully uninstalled httpx-0.27.2
Successfully installed google-genai-1.3.0 httpx-0.28.1 websockets-14.2
Note: you may need to restart the kernel to use updated packages.




In [18]:
from google import genai
# Create a client
api_key = "AIzaSyA_ToQSQnQVA0eM2Pnjzi0Zz2c4utJk-TE"
client = genai.Client(api_key=api_key)
 
# Define the model you are going to use
model_id =  "gemini-2.0-flash" # or "gemini-2.0-flash-lite-preview-02-05"  , "gemini-2.0-pro-exp-02-05"

In [None]:
import requests

# 下载文件函数
def download_file(url, filename):
    response = requests.get(url)
    if response.status_code == 200:
        with open(filename, 'wb') as f:
            f.write(response.content)
        print(f"已成功下载: {filename}")
    else:
        print(f"下载失败: {filename}")

# 下载示例PDF文件
download_file(
    "https://storage.googleapis.com/generativeai-downloads/data/pdf_structured_outputs/handwriting_form.pdf",
    "handwriting_form.pdf"
)
download_file(
    "https://storage.googleapis.com/generativeai-downloads/data/pdf_structured_outputs/invoice.pdf",
    "invoice.pdf"
)

# Newest version of gemini_pydantic

In [45]:
%pip install -U -q "google-genai"

Note: you may need to restart the kernel to use updated packages.


In [54]:
from google import genai
from google.genai.types import HttpOptions

client = genai.Client(
    project="gen-lang-client-0117133250",
    location="us-central1",
    vertexai=True,
    http_options=HttpOptions(api_version="v1"))
response = client.models.generate_content(
    model="gemini-2.0-flash-001",
    contents="How does AI work?",
)
print(response.text)
# Example response:
# Okay, let's break down how AI works. It's a broad field, so I'll focus on the ...
#
# Here's a simplified overview:
# ...

Artificial Intelligence (AI) is a broad field that aims to create machines that can perform tasks that typically require human intelligence. Instead of just following pre-programmed rules, AI systems learn from data and adapt to new situations. Here's a breakdown of how it works, simplified into key concepts:

**1. Core Idea: Learning from Data**

*   AI's fundamental principle is to learn from data rather than being explicitly programmed for every possible scenario. The more data an AI system is exposed to, the better it becomes at identifying patterns, making predictions, and solving problems.

**2. Key Components & Techniques:**

*   **Machine Learning (ML):** This is the most common and widely used approach in AI. ML algorithms allow computers to learn from data without being explicitly programmed.
    *   **Supervised Learning:** The algorithm is trained on a labeled dataset, meaning each input is paired with the correct output (e.g., images of cats labeled as "cat"). The algorith

In [None]:
%pip install "google-genai>=1"
%pip install requests

In [81]:
from google import genai
from google.genai import types
client = genai.Client(api_key='AIzaSyC4kSTD0mAJM0zUzrUWXKOwuUj2G4qGrkQ')

# client = genai.Client(
#     vertexai=True, 
#     project='your-project-id', 
#     location='us-central1',
#     http_options=types.HttpOptions(api_version='v1')
# )


# 上传本地文件
file = client.files.upload(file='2.02_EN.pdf', config={'display_name': '2.02_EN.pdf'})
import requests

# def download_file(url, filename):
#     response = requests.get(url)
#     if response.status_code == 200:
#         with open(filename, 'wb') as f:
#             f.write(response.content)
#         print(f"已成功下载: {filename}")
#     else:
#         print(f"下载失败: {filename}")

# # 下载文件
# download_file(
#     "https://storage.cloud.google.com/table_processing/2.02_EN.pdf?authuser=1",
#     "2.02_EN.pdf"
# )

file_pdf = client.files.upload(file='2.02_EN.pdf', config={'display_name': '2.02_EN'})
response = client.models.generate_content(
    model='gemini-2.0-flash-001',
    contents=['Could you summarize this file?', file]
)
print(response.text)
 

This file is a leaflet about self-employed contributions to Old-Age and Survivors' Insurance (OASI), Disability Insurance (DI) and Income Compensation Insurance (IC) in Switzerland. It covers topics such as who is considered self-employed, the obligation to pay contributions, contribution amounts, how contributions are calculated, contributions on account, final contributions, payment of contributions, default interest, credit interest, contributions by OASI pensioners, and contributions on IC compensation and on daily allowances paid by DI, UI and military insurance. The leaflet is valid as of 1st January 2025.



In [82]:
file_size = client.models.count_tokens(model=model_id,contents=file_pdf)
print(f'File: {file_pdf.display_name} equals to {file_size.total_tokens} tokens')
# File: invoice equals to 821 tokens

File: 2.02_EN equals to 3097 tokens


In [83]:
from google import genai
from google.genai import types
from pydantic import BaseModel, Field
from typing import List, Union, Optional
import json
# 定义表格单元格的数据类型
class CellValue(BaseModel):
    value: Union[str, int, float] = Field(description="单元格的值，可以是字符串、整数或浮点数")
    data_type: str = Field(description="数据类型：text, number, date 等")

# 定义表格元数据
class TableMetadata(BaseModel):
    table_id: str = Field(description="表格的唯一标识符")
    pdf_name: str = Field(description="表格所在的PDF文件名")
    summary: Optional[str] = Field(default=None, description="表格概述")
    title: Optional[str] = Field(default=None, description="表格标题")
    language: str = Field(description="表格内容的语言，如：de, fr, it, en")
    source_url: Optional[str] = Field(default=None, description="表格来源的URL")
    page_number: Optional[int] = Field(default=None, description="PDF页码")
    total_rows: int = Field(description="总行数")
    total_columns: int = Field(description="总列数")

# 定义表格数据
class TableData(BaseModel):
    headers: List[str] = Field(description="表格列标题")
    rows: List[List[CellValue]] = Field(description="表格数据行")

# 定义完整表格
class Table(BaseModel):
    metadata: TableMetadata = Field(description="表格元数据")
    data: TableData = Field(description="表格内容数据")

In [98]:
from google import genai
from google.genai import types
from pydantic import BaseModel, Field
from typing import List, Union, Optional, Any
import json
import base64
import os

# 定义表格单元格的数据类型 - 修改value类型为Any以容纳可能的None值
class CellValue(BaseModel):
    value: Optional[Any] = Field(description="单元格的值，可以是字符串、整数、浮点数或null")
    data_type: str = Field(description="数据类型：text, number, date 等")

# 定义表格元数据
class TableMetadata(BaseModel):
    table_id: str = Field(description="表格的唯一标识符")
    pdf_name: str = Field(description="表格所在的PDF文件名")
    summary: Optional[str] = Field(default=None, description="表格概述")
    title: Optional[str] = Field(default=None, description="表格标题")
    language: str = Field(description="表格内容的语言，如：de, fr, it, en")
    source_url: Optional[str] = Field(default=None, description="表格来源的URL")
    page_number: Optional[int] = Field(default=None, description="PDF页码")
    total_rows: int = Field(description="总行数")
    total_columns: int = Field(description="总列数")

# 定义表格数据
class TableData(BaseModel):
    headers: List[Optional[str]] = Field(description="表格列标题")
    rows: List[List[CellValue]] = Field(description="表格数据行")

# 定义完整表格
class Table(BaseModel):
    metadata: TableMetadata = Field(description="表格元数据")
    data: TableData = Field(description="表格内容数据")

def parse_pdf_to_table(pdf_bytes: bytes, pdf_name: str) -> Table:
    # 初始化客户端
    api_key = "AIzaSyA_ToQSQnQVA0eM2Pnjzi0Zz2c4utJk-TE"
    client = genai.Client(api_key=api_key)
    
    # 获取Pydantic模型的JSON schema
    table_schema = Table.model_json_schema()
    
    # 构造提示词：嵌入JSON Schema
    instruction = f"""
    请解析下方PDF中的表格，并严格按以下JSON格式返回数据：
    {json.dumps(table_schema, indent=2)}
    
    附加要求：
    1. metadata.pdf_name 必须为 {pdf_name}
    2. 若表格缺失或无法解析，返回错误字段 "error"
    3. 请注意所有单元格值可以为null，不要留空值
    """
    
    # 创建文本部分
    text_part = types.Part(text=instruction)
    
    # 创建PDF部分
    pdf_part = types.Part(
        inline_data=types.Blob(
            mime_type="application/pdf",
            data=pdf_bytes
        )
    )
    
    # 创建内容
    contents = [
        text_part,
        pdf_part
    ]
    
    # 使用正确的API调用方式
    response = client.models.generate_content(
        model="gemini-2.0-flash",
        contents=contents,
        config=types.GenerateContentConfig(
            response_mime_type="application/json"
        )
    )
    
    # 解析响应
    try:
        # 获取响应文本
        response_text = response.text
        
        # 输出原始响应到文件
        with open("raw_response.json", "w", encoding="utf-8") as f:
            f.write(response_text)
        print(f"原始响应已保存到 raw_response.json")
        
        # 验证返回的是有效JSON
        json_data = json.loads(response_text)
        
        # 检查返回的是列表还是字典
        if isinstance(json_data, list) and len(json_data) > 0:
            # 如果返回的是列表，取第一个元素
            json_data = json_data[0]
        
        # 清理数据，确保没有验证错误
        if "data" in json_data and "rows" in json_data["data"]:
            for row_idx, row in enumerate(json_data["data"]["rows"]):
                for cell_idx, cell in enumerate(row):
                    # 确保每个单元格都有value和data_type
                    if "value" not in cell:
                        cell["value"] = None
                    if "data_type" not in cell:
                        cell["data_type"] = "text"
        
        # 使用Pydantic解析
        table = Table.model_validate(json_data)
        
        # 将解析后的表格保存为漂亮的JSON
        with open("parsed_table.json", "w", encoding="utf-8") as f:
            f.write(table.model_dump_json(indent=2, exclude_none=True))
        print(f"解析后的表格数据已保存到 parsed_table.json")
        
        # 将表格数据导出为HTML以便查看
        html_content = table_to_html(table)
        with open("table_preview.html", "w", encoding="utf-8") as f:
            f.write(html_content)
        print(f"表格HTML预览已保存到 table_preview.html")
        
        return table
    except Exception as e:
        raise ValueError(f"解析失败: {str(e)}")

def table_to_html(table: Table) -> str:
    """将表格转换为HTML格式"""
    html = f"""
    <!DOCTYPE html>
    <html>
    <head>
        <title>{table.metadata.title or 'PDF表格解析结果'}</title>
        <style>
            body {{ font-family: Arial, sans-serif; margin: 20px; }}
            h1 {{ color: #333; }}
            table {{ border-collapse: collapse; width: 100%; margin-top: 20px; }}
            th, td {{ border: 1px solid #ddd; padding: 8px; text-align: left; }}
            th {{ background-color: #f2f2f2; }}
            .metadata {{ background-color: #f9f9f9; padding: 10px; border-radius: 5px; margin-bottom: 20px; }}
            .metadata p {{ margin: 5px 0; }}
        </style>
    </head>
    <body>
        <h1>{table.metadata.title or 'PDF表格解析结果'}</h1>
        
        <div class="metadata">
            <h3>元数据</h3>
            <p><strong>表格ID:</strong> {table.metadata.table_id}</p>
            <p><strong>PDF文件:</strong> {table.metadata.pdf_name}</p>
            <p><strong>语言:</strong> {table.metadata.language}</p>
            <p><strong>总行数:</strong> {table.metadata.total_rows}</p>
            <p><strong>总列数:</strong> {table.metadata.total_columns}</p>
            {f'<p><strong>概述:</strong> {table.metadata.summary}</p>' if table.metadata.summary else ''}
            {f'<p><strong>页码:</strong> {table.metadata.page_number}</p>' if table.metadata.page_number is not None else ''}
        </div>
        
        <h3>表格数据</h3>
        <table>
            <thead>
                <tr>
    """
    
    # 添加表头
    for header in table.data.headers:
        html += f"<th>{header or ''}</th>"
    
    html += """
                </tr>
            </thead>
            <tbody>
    """
    
    # 添加表格内容
    for row in table.data.rows:
        html += "<tr>"
        for cell in row:
            # 处理不同类型的单元格值
            cell_value = cell.value if cell.value is not None else ""
            html += f"<td>{cell_value}</td>"
        html += "</tr>"
    
    html += """
            </tbody>
        </table>
    </body>
    </html>
    """
    
    return html

# -------------------------- 
# 使用示例
# -------------------------- 
if __name__ == "__main__":
    # 读取PDF文件
    with open("2.02_EN.pdf", "rb") as f:
        pdf_bytes = f.read()
    
    # 调用函数
    try:
        table = parse_pdf_to_table(pdf_bytes, "2.02_EN.pdf")
        print("\n解析完成！")
    except ValueError as e:
        print(f"错误: {e}")

原始响应已保存到 raw_response.json
解析后的表格数据已保存到 parsed_table.json
表格HTML预览已保存到 table_preview.html

解析完成！


# With Description

In [101]:
# from google import genai
# from google.genai import types
# from pydantic import BaseModel, Field
# from typing import List, Union, Optional, Any, Dict
# import json
# import base64
# import os
# import time

# # 定义表格单元格的数据类型 - 修改value类型为Any以容纳可能的None值
# class CellValue(BaseModel):
#     value: Optional[Any] = Field(description="单元格的值，可以是字符串、整数、浮点数或null")
#     data_type: str = Field(description="数据类型：text, number, date 等")

# # 定义表格元数据
# class TableMetadata(BaseModel):
#     table_id: str = Field(description="表格的唯一标识符")
#     pdf_name: str = Field(description="表格所在的PDF文件名")
#     summary: Optional[str] = Field(default=None, description="表格概述")
#     title: Optional[str] = Field(default=None, description="表格标题")
#     language: str = Field(description="表格内容的语言，如：de, fr, it, en")
#     source_url: Optional[str] = Field(default=None, description="表格来源的URL")
#     page_number: Optional[int] = Field(default=None, description="PDF页码")
#     total_rows: int = Field(description="总行数")
#     total_columns: int = Field(description="总列数")

# # 定义表格数据
# class TableData(BaseModel):
#     headers: List[Optional[str]] = Field(description="表格列标题")
#     rows: List[List[CellValue]] = Field(description="表格数据行")

# # 定义完整表格
# class Table(BaseModel):
#     metadata: TableMetadata = Field(description="表格元数据")
#     data: TableData = Field(description="表格内容数据")

# # 定义多个表格的列表，用于整个PDF的处理结果
# class PdfTables(BaseModel):
#     tables: List[Table] = Field(description="PDF中所有表格的列表")
#     total_tables: int = Field(description="表格总数")

# def generate_table_title(client, table: Table) -> str:
#     """使用Gemini API生成表格的描述性标题"""
#     try:
#         # 准备表格数据的文本表示
#         headers_text = ", ".join([h for h in table.data.headers if h])
        
#         # 提取前几行数据作为示例
#         sample_rows = []
#         for row_idx, row in enumerate(table.data.rows):
#             if row_idx >= 3:  # 只取前3行
#                 break
#             row_text = ", ".join([str(cell.value) for cell in row if cell.value is not None])
#             sample_rows.append(row_text)
        
#         rows_text = "\n".join(sample_rows)
        
#         # 构造提示词
#         prompt = f"""
#         请为以下表格生成一个简短、准确、具体的标题，描述表格的主题内容。
#         不要使用"表格"、"数据"等通用词作为标题的开头。
        
#         表格头部: {headers_text}
#         表格数据样例:
#         {rows_text}
#         页码: {table.metadata.page_number or '未知'}
        
#         表格标题:
#         """
        
#         # 调用API生成标题
#         response = client.models.generate_content(
#             model="gemini-2.0-flash",
#             contents=prompt,
#             config=types.GenerateContentConfig(
#                 max_output_tokens=50,  # 限制输出长度
#                 temperature=0.2  # 降低随机性
#             )
#         )
        
#         # 提取并清理标题
#         title = response.text.strip()
#         # 如果标题有引号，去掉引号
#         if title.startswith('"') and title.endswith('"'):
#             title = title[1:-1]
#         if title.startswith("'") and title.endswith("'"):
#             title = title[1:-1]
            
#         return title if title else "未命名表格"
    
#     except Exception as e:
#         print(f"生成表格标题时出错: {str(e)}")
#         return "未命名表格"

# def parse_pdf_tables(pdf_bytes: bytes, pdf_name: str) -> PdfTables:
#     """解析PDF中的所有表格"""
#     # 初始化客户端
#     api_key = "AIzaSyA_ToQSQnQVA0eM2Pnjzi0Zz2c4utJk-TE"
#     client = genai.Client(api_key=api_key)
    
#     # 获取Pydantic模型的JSON schema
#     table_schema = Table.model_json_schema()
    
#     # 构造提示词：嵌入JSON Schema，强调提取所有表格
#     instruction = f"""
#     请依次解析PDF中的所有表格，按照以下JSON格式返回每个表格的数据。
#     这个PDF包含多页和多个表格，请完整提取每一个表格，不要遗漏。
    
#     表格结构定义：
#     {json.dumps(table_schema, indent=2)}
    
#     请返回一个JSON数组，每个元素代表一个表格，格式必须符合上述模式。
    
#     附加要求：
#     1. 每个表格的metadata.pdf_name必须为 {pdf_name}
#     2. 每个表格的metadata.table_id应该是从"table_1"开始递增编号
#     3. 每个表格的metadata.page_number必须指明表格出现的PDF页码
#     4. 请为每个表格创建一个metadata.title，描述表格的主题
#     5. 请注意所有单元格值可以为null，不要留空值
#     6. 请提取所有页面中的所有表格，不要遗漏任何一个
#     """
    
#     # 创建文本部分
#     text_part = types.Part(text=instruction)
    
#     # 创建PDF部分
#     pdf_part = types.Part(
#         inline_data=types.Blob(
#             mime_type="application/pdf",
#             data=pdf_bytes
#         )
#     )
    
#     # 创建内容
#     contents = [
#         text_part,
#         pdf_part
#     ]
    
#     # 使用更长的最大输出token数，以确保能够返回所有表格
#     response = client.models.generate_content(
#         model="gemini-2.0-flash",
#         contents=contents,
#         config=types.GenerateContentConfig(
#             response_mime_type="application/json",
#             max_output_tokens=8192  # 增加输出长度限制
#         )
#     )
    
#     # 解析响应
#     try:
#         # 获取响应文本
#         response_text = response.text
        
#         # 输出原始响应到文件
#         with open("raw_response.json", "w", encoding="utf-8") as f:
#             f.write(response_text)
#         print(f"原始响应已保存到 raw_response.json")
        
#         # 验证返回的是有效JSON
#         json_data = json.loads(response_text)
        
#         # 确保json_data是列表
#         if not isinstance(json_data, list):
#             json_data = [json_data]
        
#         # 初始化表格列表
#         tables = []
        
#         # 处理每个表格
#         for table_idx, table_json in enumerate(json_data):
#             try:
#                 # 清理数据，确保没有验证错误
#                 if "data" in table_json and "rows" in table_json["data"]:
#                     for row_idx, row in enumerate(table_json["data"]["rows"]):
#                         for cell_idx, cell in enumerate(row):
#                             # 确保每个单元格都有value和data_type
#                             if "value" not in cell:
#                                 cell["value"] = None
#                             if "data_type" not in cell:
#                                 cell["data_type"] = "text"
                
#                 # 使用Pydantic解析单个表格
#                 table = Table.model_validate(table_json)
                
#                 # 如果表格没有标题，生成一个
#                 if not table.metadata.title:
#                     title = generate_table_title(client, table)
#                     table.metadata.title = title
#                     print(f"为表格 {table_idx+1} 生成标题: {title}")
                
#                 tables.append(table)
#             except Exception as e:
#                 print(f"警告: 表格 {table_idx+1} 解析失败: {str(e)}")
        
#         # 创建完整的PdfTables对象
#         pdf_tables = PdfTables(
#             tables=tables,
#             total_tables=len(tables)
#         )
        
#         # 将解析后的所有表格保存为JSON
#         with open("parsed_tables.json", "w", encoding="utf-8") as f:
#             f.write(pdf_tables.model_dump_json(indent=2, exclude_none=True))
#         print(f"解析后的所有表格数据已保存到 parsed_tables.json")
        
#         # 将每个表格生成单独的HTML文件
#         os.makedirs("table_previews", exist_ok=True)
#         for idx, table in enumerate(tables):
#             html_content = table_to_html(table, idx+1)
#             filename = f"table_previews/table_{idx+1}.html"
#             with open(filename, "w", encoding="utf-8") as f:
#                 f.write(html_content)
        
#         # 创建一个索引HTML文件，列出所有表格
#         create_index_html(tables)
#         print(f"表格HTML预览已保存到 table_previews/ 目录")
        
#         return pdf_tables
        
#     except Exception as e:
#         # 如果整体解析失败，尝试分批处理
#         print(f"整体解析失败，尝试按页处理: {str(e)}")
#         return parse_pdf_tables_by_pages(client, pdf_bytes, pdf_name)

# def parse_pdf_tables_by_pages(client, pdf_bytes: bytes, pdf_name: str) -> PdfTables:
#     """按页解析PDF中的表格，用于处理大型PDF"""
#     tables = []
#     table_id_counter = 1
    
#     # 尝试最多处理30页
#     for page in range(1, 31):
#         try:
#             # 构造提示词
#             page_instruction = f"""
#             请解析PDF第{page}页中的所有表格，按照JSON格式返回。
#             如果该页没有表格，请返回空数组[]。
            
#             附加要求：
#             1. 每个表格的metadata.pdf_name必须为 {pdf_name}
#             2. 每个表格的metadata.table_id应该是"table_{table_id_counter}"
#             3. 每个表格的metadata.page_number必须为 {page}
#             4. 请为每个表格创建一个metadata.title，描述表格的主题
#             5. 请注意所有单元格值可以为null，不要留空值
#             """
            
#             # 创建内容
#             contents = [
#                 types.Part(text=page_instruction),
#                 types.Part(
#                     inline_data=types.Blob(
#                         mime_type="application/pdf",
#                         data=pdf_bytes
#                     )
#                 )
#             ]
            
#             # 使用API
#             page_response = client.models.generate_content(
#                 model="gemini-2.0-flash",
#                 contents=contents,
#                 config=types.GenerateContentConfig(
#                     response_mime_type="application/json"
#                 )
#             )
            
#             # 解析响应
#             page_text = page_response.text
#             page_data = json.loads(page_text)
            
#             # 确保page_data是列表
#             if not isinstance(page_data, list):
#                 if not page_data:  # 空响应
#                     continue
#                 page_data = [page_data]
            
#             if not page_data:  # 空列表，无表格
#                 print(f"第{page}页没有检测到表格")
#                 continue
            
#             # 处理页面上的每个表格
#             for table_json in page_data:
#                 try:
#                     # 清理数据并修正表格ID
#                     if "metadata" in table_json:
#                         table_json["metadata"]["table_id"] = f"table_{table_id_counter}"
#                         table_json["metadata"]["page_number"] = page
                    
#                     if "data" in table_json and "rows" in table_json["data"]:
#                         for row in table_json["data"]["rows"]:
#                             for cell in row:
#                                 if "value" not in cell:
#                                     cell["value"] = None
#                                 if "data_type" not in cell:
#                                     cell["data_type"] = "text"
                    
#                     # 使用Pydantic解析
#                     table = Table.model_validate(table_json)
                    
#                     # 如果表格没有标题，生成一个
#                     if not table.metadata.title:
#                         title = generate_table_title(client, table)
#                         table.metadata.title = title
#                         print(f"为表格 {table_id_counter} 生成标题: {title}")
                    
#                     tables.append(table)
#                     table_id_counter += 1
#                 except Exception as table_err:
#                     print(f"警告: 第{page}页表格解析失败: {str(table_err)}")
            
#             print(f"已处理第{page}页，找到 {len(page_data)} 个表格")
            
#         except Exception as page_err:
#             print(f"警告: 处理第{page}页时出错: {str(page_err)}")
#             # 如果连续3页没有表格，假设已到达PDF末尾
#             if page > 3 and len(tables) == 0:
#                 break
        
#         # 防止API限流
#         time.sleep(1)
    
#     # 创建完整的PdfTables对象
#     pdf_tables = PdfTables(
#         tables=tables,
#         total_tables=len(tables)
#     )
    
#     # 将解析后的所有表格保存为JSON
#     with open("parsed_tables.json", "w", encoding="utf-8") as f:
#         f.write(pdf_tables.model_dump_json(indent=2, exclude_none=True))
#     print(f"解析后的所有表格数据已保存到 parsed_tables.json")
    
#     # 将每个表格生成单独的HTML文件
#     os.makedirs("table_previews", exist_ok=True)
#     for idx, table in enumerate(tables):
#         html_content = table_to_html(table, idx+1)
#         filename = f"table_previews/table_{idx+1}.html"
#         with open(filename, "w", encoding="utf-8") as f:
#             f.write(html_content)
    
#     # 创建一个索引HTML文件，列出所有表格
#     create_index_html(tables)
#     print(f"表格HTML预览已保存到 table_previews/ 目录")
    
#     return pdf_tables

# def table_to_html(table: Table, index: int = 0) -> str:
#     """将表格转换为HTML格式"""
#     html = f"""
#     <!DOCTYPE html>
#     <html>
#     <head>
#         <title>表格 {index}: {table.metadata.title or '未命名表格'}</title>
#         <style>
#             body {{ font-family: Arial, sans-serif; margin: 20px; }}
#             h1 {{ color: #333; }}
#             table {{ border-collapse: collapse; width: 100%; margin-top: 20px; }}
#             th, td {{ border: 1px solid #ddd; padding: 8px; text-align: left; }}
#             th {{ background-color: #f2f2f2; }}
#             .metadata {{ background-color: #f9f9f9; padding: 10px; border-radius: 5px; margin-bottom: 20px; }}
#             .metadata p {{ margin: 5px 0; }}
#             .navigation {{ margin-top: 20px; }}
#             .navigation a {{ margin-right: 10px; padding: 5px 10px; background-color: #f0f0f0; text-decoration: none; color: #333; border-radius: 3px; }}
#         </style>
#     </head>
#     <body>
#         <div class="navigation">
#             <a href="index.html">返回表格列表</a>
#         </div>
        
#         <h1>表格 {index}: {table.metadata.title or '未命名表格'}</h1>
        
#         <div class="metadata">
#             <h3>元数据</h3>
#             <p><strong>表格ID:</strong> {table.metadata.table_id}</p>
#             <p><strong>PDF文件:</strong> {table.metadata.pdf_name}</p>
#             <p><strong>语言:</strong> {table.metadata.language}</p>
#             <p><strong>页码:</strong> {table.metadata.page_number or '未知'}</p>
#             <p><strong>总行数:</strong> {table.metadata.total_rows}</p>
#             <p><strong>总列数:</strong> {table.metadata.total_columns}</p>
#             {f'<p><strong>概述:</strong> {table.metadata.summary}</p>' if table.metadata.summary else ''}
#         </div>
        
#         <h3>表格数据</h3>
#         <table>
#             <thead>
#                 <tr>
#     """
    
#     # 添加表头
#     for header in table.data.headers:
#         html += f"<th>{header or ''}</th>"
    
#     html += """
#                 </tr>
#             </thead>
#             <tbody>
#     """
    
#     # 添加表格内容
#     for row in table.data.rows:
#         html += "<tr>"
#         for cell in row:
#             # 处理不同类型的单元格值
#             cell_value = cell.value if cell.value is not None else ""
#             html += f"<td>{cell_value}</td>"
#         html += "</tr>"
    
#     html += """
#             </tbody>
#         </table>
#     </body>
#     </html>
#     """
    
#     return html

# def create_index_html(tables: List[Table]) -> None:
#     """创建一个HTML索引页面，列出所有表格"""
#     html = """
#     <!DOCTYPE html>
#     <html>
#     <head>
#         <title>PDF表格提取结果</title>
#         <style>
#             body { font-family: Arial, sans-serif; margin: 20px; }
#             h1 { color: #333; }
#             .table-list { display: flex; flex-wrap: wrap; }
#             .table-card { 
#                 border: 1px solid #ddd;
#                 border-radius: 5px;
#                 padding: 15px;
#                 margin: 10px;
#                 width: 280px;
#                 background-color: #f9f9f9;
#             }
#             .table-card h3 { margin-top: 0; }
#             .table-card p { margin: 5px 0; }
#             .table-card a { 
#                 display: block;
#                 text-align: center;
#                 margin-top: 10px;
#                 padding: 8px;
#                 background-color: #4CAF50;
#                 color: white;
#                 text-decoration: none;
#                 border-radius: 3px;
#             }
#             .summary { margin-bottom: 20px; }
#         </style>
#     </head>
#     <body>
#         <h1>PDF表格提取结果</h1>
        
#         <div class="summary">
#             <p><strong>PDF文件:</strong> """ + (tables[0].metadata.pdf_name if tables else "未知") + """</p>
#             <p><strong>提取表格总数:</strong> """ + str(len(tables)) + """</p>
#         </div>
        
#         <h2>表格列表</h2>
#         <div class="table-list">
#     """
    
#     # 为每个表格创建一个卡片
#     for idx, table in enumerate(tables):
#         html += f"""
#         <div class="table-card">
#             <h3>表格 {idx+1}: {table.metadata.title or '未命名表格'}</h3>
#             <p><strong>表格ID:</strong> {table.metadata.table_id}</p>
#             <p><strong>页码:</strong> {table.metadata.page_number or '未知'}</p>
#             <p><strong>行数:</strong> {table.metadata.total_rows}</p>
#             <p><strong>列数:</strong> {table.metadata.total_columns}</p>
#             <a href="table_{idx+1}.html">查看表格</a>
#         </div>
#         """
    
#     html += """
#         </div>
#     </body>
#     </html>
#     """
    
#     # 保存索引HTML
#     with open("table_previews/index.html", "w", encoding="utf-8") as f:
#         f.write(html)

# # -------------------------- 
# # 使用示例
# # -------------------------- 
# if __name__ == "__main__":
#     # 读取PDF文件
#     with open("2.02_EN.pdf", "rb") as f:
#         pdf_bytes = f.read()
    
#     # 调用函数
#     try:
#         pdf_tables = parse_pdf_tables(pdf_bytes, "2.02_EN.pdf")
#         print(f"\n解析完成！共提取 {pdf_tables.total_tables} 个表格。")
#         print("请打开 table_previews/index.html 查看所有表格。")
#     except ValueError as e:
#         print(f"错误: {e}")

原始响应已保存到 raw_response.json
解析后的所有表格数据已保存到 parsed_tables.json
表格HTML预览已保存到 table_previews/ 目录

解析完成！共提取 6 个表格。
请打开 table_previews/index.html 查看所有表格。


In [104]:
from google import genai
from google.genai import types
from pydantic import BaseModel, Field
from typing import List, Union, Optional, Any, Dict
import json
import base64
import os
import time

# Define table cell data type - Modified value type to Any to accommodate possible None values
class CellValue(BaseModel):
    value: Optional[Any] = Field(description="Cell value, can be string, integer, float or null")
    data_type: str = Field(description="Data type: text, number, date, etc.")

# Define table metadata
class TableMetadata(BaseModel):
    table_id: str = Field(description="Unique identifier for the table")
    pdf_name: str = Field(description="PDF file name containing the table")
    summary: Optional[str] = Field(default=None, description="Table summary")
    title: Optional[str] = Field(default=None, description="Table title")
    language: str = Field(description="Language of table content, e.g.: de, fr, it, en")
    source_url: Optional[str] = Field(default=None, description="Source URL of the table")
    page_number: Optional[int] = Field(default=None, description="PDF page number")
    total_rows: int = Field(description="Total number of rows")
    total_columns: int = Field(description="Total number of columns")

# Define table data
class TableData(BaseModel):
    headers: List[Optional[str]] = Field(description="Table column headers")
    rows: List[List[CellValue]] = Field(description="Table data rows")

# Define complete table
class Table(BaseModel):
    metadata: TableMetadata = Field(description="Table metadata")
    data: TableData = Field(description="Table content data")

# Define list of multiple tables for the entire PDF processing result
class PdfTables(BaseModel):
    tables: List[Table] = Field(description="List of all tables in the PDF")
    total_tables: int = Field(description="Total number of tables")

def generate_table_title(client, table: Table) -> str:
    """Generate a descriptive title for the table using Gemini API"""
    try:
        # Prepare text representation of table data
        headers_text = ", ".join([h for h in table.data.headers if h])
        
        # Extract first few rows of data as examples
        sample_rows = []
        for row_idx, row in enumerate(table.data.rows):
            if row_idx >= 3:  # Only take first 3 rows
                break
            row_text = ", ".join([str(cell.value) for cell in row if cell.value is not None])
            sample_rows.append(row_text)
        
        rows_text = "\n".join(sample_rows)
        
        # Construct prompt
        prompt = f"""
        Please generate a short, accurate, and specific title for the following table that describes its content.
        Do not use generic words like "table", "data", etc. at the beginning of the title.
        
        Table headers: {headers_text}
        Table data samples:
        {rows_text}
        Page number: {table.metadata.page_number or 'Unknown'}
        
        Table title:
        """
        
        # Call API to generate title
        response = client.models.generate_content(
            model="gemini-2.0-flash",
            contents=prompt,
            config=types.GenerateContentConfig(
                max_output_tokens=50,  # Limit output length
                temperature=0.2  # Reduce randomness
            )
        )
        
        # Extract and clean title
        title = response.text.strip()
        # Remove quotes if present
        if title.startswith('"') and title.endswith('"'):
            title = title[1:-1]
        if title.startswith("'") and title.endswith("'"):
            title = title[1:-1]
            
        return title if title else "Untitled Table"
    
    except Exception as e:
        print(f"Error generating table title: {str(e)}")
        return "Untitled Table"

def parse_pdf_tables(pdf_bytes: bytes, pdf_name: str) -> PdfTables:
    """Parse all tables in a PDF"""
    # Initialize client
    api_key = "AIzaSyA_ToQSQnQVA0eM2Pnjzi0Zz2c4utJk-TE"
    client = genai.Client(api_key=api_key)
    
    # Get JSON schema from Pydantic model
    table_schema = Table.model_json_schema()
    
    # Construct prompt: embed JSON Schema, emphasize extracting all tables
    instruction = f"""
    Please parse all tables in the PDF sequentially and return each table's data in the following JSON format.
    This PDF contains multiple pages and tables, please extract every table completely without omission.
    
    Table structure definition:
    {json.dumps(table_schema, indent=2)}
    
    Please return a JSON array, with each element representing a table, conforming to the above schema.
    
    Additional requirements:
    1. Each table's metadata.pdf_name must be {pdf_name}
    2. Each table's metadata.table_id should be incrementally numbered starting from "table_1"
    3. Each table's metadata.page_number must indicate the PDF page number where the table appears
    4. Please create a metadata.title for each table describing the table's subject
    5. Note that all cell values can be null, do not leave empty values
    6. Please extract all tables from all pages, do not miss any
    """
    
    # Create text part
    text_part = types.Part(text=instruction)
    
    # Create PDF part
    pdf_part = types.Part(
        inline_data=types.Blob(
            mime_type="application/pdf",
            data=pdf_bytes
        )
    )
    
    # Create content
    contents = [
        text_part,
        pdf_part
    ]
    
    # Use longer maximum output token count to ensure all tables can be returned
    response = client.models.generate_content(
        model="gemini-2.0-flash",
        contents=contents,
        config=types.GenerateContentConfig(
            response_mime_type="application/json",
            max_output_tokens=8192  # Increase output length limit
        )
    )
    
    # Parse response
    try:
        # Get response text
        response_text = response.text
        
        # Output raw response to file
        with open("raw_response.json", "w", encoding="utf-8") as f:
            f.write(response_text)
        print(f"Raw response saved to raw_response.json")
        
        # Validate that return is valid JSON
        json_data = json.loads(response_text)
        
        # Ensure json_data is a list
        if not isinstance(json_data, list):
            json_data = [json_data]
        
        # Initialize table list
        tables = []
        
        # Process each table
        for table_idx, table_json in enumerate(json_data):
            try:
                # Clean data to ensure no validation errors
                if "data" in table_json and "rows" in table_json["data"]:
                    for row_idx, row in enumerate(table_json["data"]["rows"]):
                        for cell_idx, cell in enumerate(row):
                            # Ensure each cell has value and data_type
                            if "value" not in cell:
                                cell["value"] = None
                            if "data_type" not in cell:
                                cell["data_type"] = "text"
                
                # Use Pydantic to parse individual table
                table = Table.model_validate(table_json)
                
                # Generate title if table doesn't have one
                if not table.metadata.title:
                    title = generate_table_title(client, table)
                    table.metadata.title = title
                    print(f"Generated title for table {table_idx+1}: {title}")
                
                tables.append(table)
            except Exception as e:
                print(f"Warning: Table {table_idx+1} parsing failed: {str(e)}")
        
        # Create complete PdfTables object
        pdf_tables = PdfTables(
            tables=tables,
            total_tables=len(tables)
        )
        
        # Save all parsed tables as JSON
        with open("parsed_tables.json", "w", encoding="utf-8") as f:
            f.write(pdf_tables.model_dump_json(indent=2, exclude_none=True))
        print(f"All parsed table data saved to parsed_tables.json")
        
        # Generate separate HTML file for each table
        os.makedirs("table_previews_f", exist_ok=True)
        for idx, table in enumerate(tables):
            html_content = table_to_html(table, idx+1)
            filename = f"table_previews_f/table_{idx+1}.html"
            with open(filename, "w", encoding="utf-8") as f:
                f.write(html_content)
        
        # Create an index HTML file listing all tables
        create_index_html(tables)
        print(f"Table HTML previews saved to table_previews_f/ directory")
        
        return pdf_tables
        
    except Exception as e:
        # If overall parsing fails, try batch processing
        print(f"Overall parsing failed, trying page-by-page processing: {str(e)}")
        return parse_pdf_tables_by_pages(client, pdf_bytes, pdf_name)

def parse_pdf_tables_by_pages(client, pdf_bytes: bytes, pdf_name: str) -> PdfTables:
    """Parse tables in PDF page by page, for handling large PDFs"""
    tables = []
    table_id_counter = 1
    
    # Try processing up to 30 pages
    for page in range(1, 31):
        try:
            # Construct prompt
            page_instruction = f"""
            Please parse all tables on page {page} of the PDF and return in JSON format.
            If there are no tables on this page, please return an empty array [].
            
            Additional requirements:
            1. Each table's metadata.pdf_name must be {pdf_name}
            2. Each table's metadata.table_id should be "table_{table_id_counter}"
            3. Each table's metadata.page_number must be {page}
            4. Please create a metadata.title for each table describing the table's subject
            5. Note that all cell values can be null, do not leave empty values
            """
            
            # Create content
            contents = [
                types.Part(text=page_instruction),
                types.Part(
                    inline_data=types.Blob(
                        mime_type="application/pdf",
                        data=pdf_bytes
                    )
                )
            ]
            
            # Use API
            page_response = client.models.generate_content(
                model="gemini-2.0-flash",
                contents=contents,
                config=types.GenerateContentConfig(
                    response_mime_type="application/json"
                )
            )
            
            # Parse response
            page_text = page_response.text
            page_data = json.loads(page_text)
            
            # Ensure page_data is a list
            if not isinstance(page_data, list):
                if not page_data:  # Empty response
                    continue
                page_data = [page_data]
            
            if not page_data:  # Empty list, no tables
                print(f"No tables detected on page {page}")
                continue
            
            # Process each table on the page
            for table_json in page_data:
                try:
                    # Clean data and fix table ID
                    if "metadata" in table_json:
                        table_json["metadata"]["table_id"] = f"table_{table_id_counter}"
                        table_json["metadata"]["page_number"] = page
                    
                    if "data" in table_json and "rows" in table_json["data"]:
                        for row in table_json["data"]["rows"]:
                            for cell in row:
                                if "value" not in cell:
                                    cell["value"] = None
                                if "data_type" not in cell:
                                    cell["data_type"] = "text"
                    
                    # Use Pydantic to parse
                    table = Table.model_validate(table_json)
                    
                    # Generate title if table doesn't have one
                    if not table.metadata.title:
                        title = generate_table_title(client, table)
                        table.metadata.title = title
                        print(f"Generated title for table {table_id_counter}: {title}")
                    
                    tables.append(table)
                    table_id_counter += 1
                except Exception as table_err:
                    print(f"Warning: Page {page} table parsing failed: {str(table_err)}")
            
            print(f"Processed page {page}, found {len(page_data)} tables")
            
        except Exception as page_err:
            print(f"Warning: Error processing page {page}: {str(page_err)}")
            # If 3 consecutive pages have no tables, assume we've reached the end of the PDF
            if page > 3 and len(tables) == 0:
                break
        
        # Prevent API throttling
        time.sleep(1)
    
    # Create complete PdfTables object
    pdf_tables = PdfTables(
        tables=tables,
        total_tables=len(tables)
    )
    
    # Save all parsed tables as JSON
    with open("parsed_tables.json", "w", encoding="utf-8") as f:
        f.write(pdf_tables.model_dump_json(indent=2, exclude_none=True))
    print(f"All parsed table data saved to parsed_tables.json")
    
    # Generate separate HTML file for each table
    os.makedirs("table_previews_f", exist_ok=True)
    for idx, table in enumerate(tables):
        html_content = table_to_html(table, idx+1)
        filename = f"table_previews_f/table_{idx+1}.html"
        with open(filename, "w", encoding="utf-8") as f:
            f.write(html_content)
    
    # Create an index HTML file listing all tables
    create_index_html(tables)
    print(f"Table HTML previews saved to table_previews_f/ directory")
    
    return pdf_tables

def table_to_html(table: Table, index: int = 0) -> str:
    """Convert table to HTML format"""
    html = f"""
    <!DOCTYPE html>
    <html>
    <head>
        <title>Table {index}: {table.metadata.title or 'Untitled Table'}</title>
        <style>
            body {{ font-family: Arial, sans-serif; margin: 20px; }}
            h1 {{ color: #333; }}
            table {{ border-collapse: collapse; width: 100%; margin-top: 20px; }}
            th, td {{ border: 1px solid #ddd; padding: 8px; text-align: left; }}
            th {{ background-color: #f2f2f2; }}
            .metadata {{ background-color: #f9f9f9; padding: 10px; border-radius: 5px; margin-bottom: 20px; }}
            .metadata p {{ margin: 5px 0; }}
            .navigation {{ margin-top: 20px; }}
            .navigation a {{ margin-right: 10px; padding: 5px 10px; background-color: #f0f0f0; text-decoration: none; color: #333; border-radius: 3px; }}
        </style>
    </head>
    <body>
        <div class="navigation">
            <a href="index.html">Return to Table List</a>
        </div>
        
        <h1>Table {index}: {table.metadata.title or 'Untitled Table'}</h1>
        
        <div class="metadata">
            <h3>Metadata</h3>
            <p><strong>Table ID:</strong> {table.metadata.table_id}</p>
            <p><strong>PDF File:</strong> {table.metadata.pdf_name}</p>
            <p><strong>Language:</strong> {table.metadata.language}</p>
            <p><strong>Page Number:</strong> {table.metadata.page_number or 'Unknown'}</p>
            <p><strong>Total Rows:</strong> {table.metadata.total_rows}</p>
            <p><strong>Total Columns:</strong> {table.metadata.total_columns}</p>
            {f'<p><strong>Summary:</strong> {table.metadata.summary}</p>' if table.metadata.summary else ''}
        </div>
        
        <h3>Table Data</h3>
        <table>
            <thead>
                <tr>
    """
    
    # Add headers
    for header in table.data.headers:
        html += f"<th>{header or ''}</th>"
    
    html += """
                </tr>
            </thead>
            <tbody>
    """
    
    # Add table content
    for row in table.data.rows:
        html += "<tr>"
        for cell in row:
            # Handle different types of cell values
            cell_value = cell.value if cell.value is not None else ""
            html += f"<td>{cell_value}</td>"
        html += "</tr>"
    
    html += """
            </tbody>
        </table>
    </body>
    </html>
    """
    
    return html

def create_index_html(tables: List[Table]) -> None:
    """Create an HTML index page listing all tables"""
    html = """
    <!DOCTYPE html>
    <html>
    <head>
        <title>PDF Table Extraction Results</title>
        <style>
            body { font-family: Arial, sans-serif; margin: 20px; }
            h1 { color: #333; }
            .table-list { display: flex; flex-wrap: wrap; }
            .table-card { 
                border: 1px solid #ddd;
                border-radius: 5px;
                padding: 15px;
                margin: 10px;
                width: 280px;
                background-color: #f9f9f9;
            }
            .table-card h3 { margin-top: 0; }
            .table-card p { margin: 5px 0; }
            .table-card a { 
                display: block;
                text-align: center;
                margin-top: 10px;
                padding: 8px;
                background-color: #4CAF50;
                color: white;
                text-decoration: none;
                border-radius: 3px;
            }
            .summary { margin-bottom: 20px; }
        </style>
    </head>
    <body>
        <h1>PDF Table Extraction Results</h1>
        
        <div class="summary">
            <p><strong>PDF File:</strong> """ + (tables[0].metadata.pdf_name if tables else "Unknown") + """</p>
            <p><strong>Total Tables Extracted:</strong> """ + str(len(tables)) + """</p>
        </div>
        
        <h2>Table List</h2>
        <div class="table-list">
    """
    
    # Create a card for each table
    for idx, table in enumerate(tables):
        html += f"""
        <div class="table-card">
            <h3>Table {idx+1}: {table.metadata.title or 'Untitled Table'}</h3>
            <p><strong>Table ID:</strong> {table.metadata.table_id}</p>
            <p><strong>Page Number:</strong> {table.metadata.page_number or 'Unknown'}</p>
            <p><strong>Rows:</strong> {table.metadata.total_rows}</p>
            <p><strong>Columns:</strong> {table.metadata.total_columns}</p>
            <a href="table_{idx+1}.html">View Table</a>
        </div>
        """
    
    html += """
        </div>
    </body>
    </html>
    """
    
    # Save index HTML
    with open("table_previews_f/index.html", "w", encoding="utf-8") as f:
        f.write(html)

# -------------------------- 
# Usage example
# -------------------------- 
if __name__ == "__main__":
    # Read PDF file
    with open("2.02_f.pdf", "rb") as f:
        pdf_bytes = f.read()
    
    # Call function
    try:
        pdf_tables = parse_pdf_tables(pdf_bytes, "2.02_f.pdf")
        print(f"\nExtraction completed! Total tables extracted: {pdf_tables.total_tables}.")
        print("Please open table_previews_f/index.html to view all tables.")
    except ValueError as e:
        print(f"Error: {e}")

Raw response saved to raw_response.json
All parsed table data saved to parsed_tables.json
Table HTML previews saved to table_previews_f/ directory

Extraction completed! Total tables extracted: 7.
Please open table_previews_f/index.html to view all tables.


In [None]:
for model in client.models.list():
    print(model)

name='models/chat-bison-001' display_name='PaLM 2 Chat (Legacy)' description='A legacy text-only model optimized for chat conversations' version='001' endpoints=None labels=None tuned_model_info=TunedModelInfo(base_model=None, create_time=None, update_time=None) input_token_limit=4096 output_token_limit=1024 supported_actions=['generateMessage', 'countMessageTokens']
name='models/text-bison-001' display_name='PaLM 2 (Legacy)' description='A legacy model that understands text and generates text as an output' version='001' endpoints=None labels=None tuned_model_info=TunedModelInfo(base_model=None, create_time=None, update_time=None) input_token_limit=8196 output_token_limit=1024 supported_actions=['generateText', 'countTextTokens', 'createTunedTextModel']
name='models/embedding-gecko-001' display_name='Embedding Gecko' description='Obtain a distributed representation of a text.' version='001' endpoints=None labels=None tuned_model_info=TunedModelInfo(base_model=None, create_time=None, up