In [1]:
from pprint import pprint
from paddleocr import PaddleOCR
import requests
import json


def req_chat(prompt, content,  history=None, api_url='http://127.0.0.1:6006/api/chat'):
    # 请求数据
    data = {
        "prompt": prompt,
        "content": content,
        "history": [],  # 如果需要，可以传递历史对话记录
    }

    # 设置请求头
    headers = {
        "Content-Type": "application/json"
    }

    # 发送 POST 请求
    try:
        # log
        open('log.txt', 'a').write(f'prompt: {prompt}, content: "{content}"\n')
        response = requests.post(
            api_url, headers=headers, data=json.dumps(data))
        response.raise_for_status()  # 检查是否有 HTTP 错误

        # 解析返回数据
        response_data = response.json()['response']
        return response_data

    except requests.exceptions.RequestException as e:
        print(f"An error occurred: {e}")
        return None


def ocr2content(img_path):
    # need to run only once to download and load model into memory
    ocr = PaddleOCR(lang='ch')
    content = ""
    result = ocr.ocr(img_path, cls=False)
    for idx in range(len(result)):
        res = result[idx]
        for (bbox, (text, prob)) in res:
            bbox_str = str(bbox)
            text_str = str(text)
            content += f"{bbox_str} {text_str}\n"
    return content


ocr_chat_prompt = """
#### 背景信息：
您需要从发票图像的文本识别结果中提取以下关键信息。以下是目标字段和说明：
- InvoiceNum: 发票号。
- InvoiceDate: 开票日期。
- NoteDrawer: 开票人名称。
- TotalAmount: 发票总金额。
- TotalTax: 发票总税额。
- PurchaserName: 购买方名称。
- PurchaserRegisterNum: 购买方纳税人识别号。
- SellerName: 销售方名称。
- SellerRegisterNum: 销售方纳税人识别号。
- CommodityDetails: 包括发票中商品详情。
输入是由 OCR 输出的文字和坐标列表，如下：
```
[([[x1, y1], [x2, y2], [x3, y3], [x4, y4]], "文本内容")]
```
其中，第一个元素是文本框的四个顶点坐标，第二个元素是识别出的文本内容。
#### 输出要求：
根据输入的位置信息和具体的字段，提取所需信息，输出如下结构化字典：
```
fields = {
    "InvoiceNum": "24422000000113017597",
    "InvoiceDate": "2024年08月30日",
    "NoteDrawer": "开票人姓名",
    "TotalAmount": "1000.00",
    "TotalTax": "100.00",
    "TotalAmountWithTax": "1100.00",
    "SellerName": "销售方名称",
    "SellerRegisterNum": "123456789012345",
    "PurchaserName": "购买方名称",
    "PurchaserRegisterNum": "987654321098765",
    "CommodityDetails": [
        {"Name": "商品1", "Quantity": "2", "UnitPrice": "50.00", "Amount": "100.00"},
        {"Name": "商品2", "Quantity": "1", "UnitPrice": "900.00", "Amount": "900.00"}
    ]
}
```
"""

ocr_chat_prompt_o = """

提取所需信息并填入相应字段，严格按格式输出如下json字典，不要输出多余的信息：
fields = {
    "发票号": "24422000000113017597",
    "开票日期": "2024年08月30日",
    "开票人名称": "张三",
    "发票总金额": "1000.00",
    "发票总税额": "100.00",
    "价税合计": "1100.00",
    "购买方名称。": "购买方名称",
    "购买方纳税人识别号。": "987654321098765",
    "销售方名称": "销售方名称",
    "销售方纳税人识别号。": "123456789012345",
    "商品详情": [
        {"Name": "商品1", "Quantity": "2", "UnitPrice": "50.00", "Amount": "100.00"},
        {"Name": "商品2", "Quantity": "1", "UnitPrice": "900.00", "Amount": "900.00"}
    ]
}
"""


def ocr_chat(img_path):
    content = ocr2content(img_path)
    response = req_chat(content=content, prompt=ocr_chat_prompt_o)
    return response




In [2]:

img_path = './whu/test2/3.jpg'
response = ocr_chat(img_path)
pprint(response)


[2024/11/28 12:26:46] ppocr DEBUG: Namespace(help='==SUPPRESS==', use_gpu=False, use_xpu=False, use_npu=False, use_mlu=False, ir_optim=True, use_tensorrt=False, min_subgraph_size=15, precision='fp32', gpu_mem=500, gpu_id=0, image_dir=None, page_num=0, det_algorithm='DB', det_model_dir='/Users/aliancn/.paddleocr/whl/det/ch/ch_PP-OCRv4_det_infer', det_limit_side_len=960, det_limit_type='max', det_box_type='quad', det_db_thresh=0.3, det_db_box_thresh=0.6, det_db_unclip_ratio=1.5, max_batch_size=10, use_dilation=False, det_db_score_mode='fast', det_east_score_thresh=0.8, det_east_cover_thresh=0.1, det_east_nms_thresh=0.2, det_sast_score_thresh=0.5, det_sast_nms_thresh=0.2, det_pse_thresh=0, det_pse_box_thresh=0.85, det_pse_min_area=16, det_pse_scale=1, scales=[8, 16, 32], alpha=1.0, beta=1.0, fourier_degree=5, rec_algorithm='SVTR_LCNet', rec_model_dir='/Users/aliancn/.paddleocr/whl/rec/ch/ch_PP-OCRv4_rec_infer', rec_image_inverse=True, rec_image_shape='3, 48, 320', rec_batch_num=6, max_tex

In [6]:
import json
from pprint import pprint

def extract_json_from_response(response):
    # 提取 JSON 数据部分
    start_idx = response.find('fields = {')
    if start_idx == -1:
        raise ValueError("No JSON data found in the response")

    # 找到 JSON 数据的结束位置
    end_idx = response.find('}', start_idx) + 1
    json_str = response[start_idx + len('fields = '):end_idx]

    # 将 JSON 字符串转换为 Python 字典
    fields = json.loads(json_str)
    return fields


In [12]:
responsewww = (
    'fields = {\n'
    '    "发票号": "24427200000076461843",\n'
    '    "开票日期": "2024年09月20日",\n'
    '    "开票人名称": "王梅",\n'
    '    "购买方名称": "武汉大学",\n'
    '    "购买方纳税人识别号": "12100000701737123P",\n'
    '    "销售方名称": "武汉京东金德贸易有限公司",\n'
    '    "销售方纳税人识别号": "12100000701737123P",\n'
    '    "发票总金额": "998.06",\n'
    '    "发票总税额": "114.93",\n'
    '    "价税合计": "998.99",\n'
    '    "商品详情": [\n'
    '        {\n'
    '            "Name": "计算机外部设备*联想",\n'
    '            "Quantity": "2",\n'
    '            "UnitPrice": "884.07",\n'
    '            "Amount": "884.07",\n'
    '            "TaxRate": "13%",\n'
    '            "TaxAmount": "114.93"\n'
    '        }\n'
    '    ]\n'
    '}'
)

# 提取并打印 JSON 数据
fields = extract_json_from_response(responsewww)
pprint(fields)

JSONDecodeError: Expecting ',' delimiter: line 20 column 10 (char 506)

In [6]:
file_path = './whu/image.pdf'
import os
poppler_path = "/opt/homebrew/bin"
os.environ['PATH'] += ':/opt/homebrew/bin/pdftoppm'  # 替换为你的 Poppler 安装路径
if file_path.endswith('.pdf'):
    print(f"Converting PDF to image: {file_path}")
    from pdf2image import convert_from_path
    images = convert_from_path(file_path, poppler_path=poppler_path)
    if images:
        image_path = file_path.replace('.pdf', '.jpg')
        images[0].save(image_path, 'JPEG')
        print(f"PDF successfully converted to image: {image_path}")
    else:
        print(f"Failed to convert PDF to image: {file_path}")

Converting PDF to image: ./whu/image.pdf
PDF successfully converted to image: ./whu/image.jpg
