In [None]:
# Check whether page headers and footers are effectively removed before OCR. Run this on every new PDF to find suitable crop parameters.

import fitz
import time
import os
from paddleocr import PaddleOCR
import numpy as np
from IPython.display import display
import ipywidgets as widgets

# Module-level PaddleOCR instance, configured with lang='ch' and use_gpu=False.
# NOTE(review): `ocr` is not referenced by any code visible in this notebook export.
# NOTE(review): use_tensorrt=True together with use_gpu=False looks contradictory — confirm
# against the PaddleOCR documentation.
ocr = PaddleOCR(det=False, use_gpu=False, enable_mkldnn=True, use_tensorrt=True, use_angle_cls=True, lang='ch')

def pdf_to_jpg(name, *, crop_top=45, crop_bottom=20, crop_right=10, crop_left=10,
               zoom_x=0.8, zoom_y=0.8, rotate=0):
    """Show an interactive, page-by-page preview of a PDF next to its cropped version.

    A button and an output area are displayed. Every click renders the next
    page twice (full page on the left, cropped page on the right) so the crop
    margins can be tuned visually before running the real conversion.

    Despite the function name, pages are rendered as PNG images.

    Args:
        name: Path of the PDF to preview.
        crop_top: Margin removed from the top edge, in page units.
        crop_bottom: Margin removed from the bottom edge, in page units.
        crop_right: Margin removed from the right edge, in page units.
        crop_left: Margin removed from the left edge, in page units.
        zoom_x: Horizontal render scale.
        zoom_y: Vertical render scale.
        rotate: Rotation in degrees applied to the render matrix.

    Returns:
        None. The widgets are displayed as a side effect.
    """
    # The document stays open on purpose: the click callback below keeps
    # reading pages from it for as long as the widget is alive.
    pdfdoc = fitz.open(name)
    page_index = 0

    # The render matrix does not depend on the page, so build it once.
    trans = fitz.Matrix(zoom_x, zoom_y).prerotate(rotate)

    def display_next_image(b):
        """Render the next page (original and cropped) into the output area."""
        nonlocal page_index

        if page_index >= pdfdoc.page_count:
            output.clear_output(wait=True)
            with output:
                print("所有图片已显示完毕。")
            return

        page = pdfdoc[page_index]
        rect = page.rect

        # Crop rectangle: the page rectangle shrunk by the four margins.
        crop = fitz.Rect(
            rect.x0 + crop_left,
            rect.y0 + crop_top,
            rect.x1 - crop_right,
            rect.y1 - crop_bottom,
        )
        pm_cropped = page.get_pixmap(matrix=trans, alpha=False, clip=crop)
        pm_original = page.get_pixmap(matrix=trans, alpha=False)

        # Convert the pixmaps to image widgets.
        img_cropped = widgets.Image(value=pm_cropped.tobytes(), format='png')
        img_original = widgets.Image(value=pm_original.tobytes(), format='png')

        output.clear_output(wait=True)
        with output:
            display(widgets.HBox([img_original, img_cropped]))
        page_index += 1

    button = widgets.Button(description="显示下一张图片")
    output = widgets.Output()
    button.on_click(display_next_image)

    display(button, output)

# Launch the interactive crop preview for the PDF that is currently being processed.
pdf_to_jpg(r'./Part3/P3_16.pdf')

In [None]:
# 使用LlamaParse库进行pdf转txt,每个账号每天只能跑1000页。

import os
import fitz  # PyMuPDF
import nest_asyncio
from llama_parse import LlamaParse
from PyPDF2 import PdfWriter, PdfReader
import glob

# Crop margins removed from each page edge, e.g. to strip the header and footer.
# Values may be negative. The original note describes the unit as pixels.
# NOTE(review): the functions below apply these to PDF page boxes, so the unit
# is presumably PDF points — confirm.
crop_top = 25
crop_bottom = 10
crop_right = 10
crop_left = 10
# Extra offsets passed to crop_pdf_pypdf2 together with the margins above.
shift_y = 15
shift_x = 5

# Zoom factors intended to improve sharpness; normally keep them >= 1. If the
# PDF text is very small, increase x and y together.
# NOTE(review): in the visible code these globals are only read by crop_pdf.
zoom_x = 2.0
zoom_y = 2.0
rotate = int(0)

PDF_file_path = r'./Part3/P3_16.pdf'
# Directory that holds every intermediate file.
output_file_path = r'./temp2'
output_txt_path = r'./part3_txt'
# Maximum number of pages per split part (see split_pdf).
max_pages = 750
# doc = fitz.open(PDF_file_path)

# All temporary files are now stored under ./temp2 (output_file_path); this helper clears the PDFs there before each run.
def clear_pdf_files(directory):
    """Make sure *directory* exists and delete every ``*.pdf`` file directly inside it.

    Deletion is best-effort: a file that cannot be removed is reported on
    stdout and the remaining files are still processed.

    Args:
        directory: Path of the working directory. It is created, including any
            missing parent directories, when it does not exist yet.

    Returns:
        None.
    """
    if not os.path.exists(directory):
        # makedirs also creates missing parents; exist_ok covers the case where
        # the directory appears between the check and the call.
        os.makedirs(directory, exist_ok=True)

    # Escape the directory so glob metacharacters in its name ([, ], *, ?) are
    # matched literally; only the '*.pdf' part is a pattern.
    pattern = os.path.join(glob.escape(directory), '*.pdf')

    for pdf_file in glob.glob(pattern):
        try:
            os.remove(pdf_file)
        except OSError as e:
            print(f"Error deleting {pdf_file}: {e}")
        else:
            print(f"Deleted {pdf_file}")

clear_pdf_files(output_file_path)

#剪切PDF文件，一份最大750页。分割后不推荐一次传输。因为一个api的上限就是1k。可以分次上传后再手动合并。
def split_pdf(input_path, max_pages, output_prefix):
    """Split a PDF into consecutive parts of at most *max_pages* pages each.

    When the document already fits into one part it is stored as
    ``<output_prefix>/temp.pdf``. Otherwise the parts are written as
    ``<output_prefix>/temp_part_1.pdf``, ``temp_part_2.pdf``, and so on.

    Args:
        input_path: Path of the PDF to split.
        max_pages: Maximum number of pages per part; must be positive.
        output_prefix: Directory that receives the output file(s).

    Returns:
        None.

    Raises:
        ValueError: If *max_pages* is not positive.
    """
    if max_pages <= 0:
        raise ValueError(f"max_pages must be positive, got {max_pages!r}")

    pdf_document = fitz.open(input_path)
    try:
        total_pages = len(pdf_document)

        if total_pages <= max_pages:
            print(f"PDF has {total_pages} pages, no need to split.")
            output_path = os.path.join(output_prefix, "temp.pdf")

            same_file = (
                os.path.normcase(os.path.abspath(output_path))
                == os.path.normcase(os.path.abspath(input_path))
            )
            if same_file:
                # The input already is <output_prefix>/temp.pdf. Overwriting a
                # file that is still open as the copy source is unsafe, and
                # there is nothing to do anyway.
                return

            output_pdf = fitz.open()
            try:
                # to_page is the index of the last page to copy (inclusive).
                output_pdf.insert_pdf(pdf_document, from_page=0, to_page=total_pages - 1)
                output_pdf.save(output_path)
            finally:
                output_pdf.close()
            return

        # One part per max_pages-sized window over the page indices.
        for part, start_page in enumerate(range(0, total_pages, max_pages), start=1):
            end_page = min(start_page + max_pages, total_pages)
            output_path = os.path.join(output_prefix, f"temp_part_{part}.pdf")

            output_pdf = fitz.open()
            try:
                # Copy the whole page range in one call instead of page by page.
                output_pdf.insert_pdf(pdf_document, from_page=start_page, to_page=end_page - 1)
                output_pdf.save(output_path)
            finally:
                output_pdf.close()

            print(f"Created {output_path} with pages {start_page + 1} to {end_page}")
    finally:
        pdf_document.close()

#新的剪切pdf的方法。如果前面的无法剪切报错cropbox不在mediabox内时可以改用这个。这个产出的pdf文件大小特别大。现在最好先不要用了。。。
def crop_pdf_new(input_pdf, output_pdf, crop_left, crop_top, crop_right, crop_bottom):
    """Crop every page by rasterizing the clipped area into a new, image-only PDF.

    Each page is rendered to a pixmap restricted to the crop rectangle, and the
    pixmap is placed on a fresh page of the same pixel dimensions. The output
    pages contain images only, so the file is large.

    Args:
        input_pdf: Path of the source PDF.
        output_pdf: Path of the PDF to write.
        crop_left: Margin removed from the left edge, in page units.
        crop_top: Margin removed from the top edge, in page units.
        crop_right: Margin removed from the right edge, in page units.
        crop_bottom: Margin removed from the bottom edge, in page units.

    Returns:
        None.
    """
    # Keep the zoom at 2 or below: every page is stored as an image, so a larger
    # zoom makes the PDF too big to upload.
    zoom_x = 2.0
    zoom_y = 2.0
    matrix = fitz.Matrix(zoom_x, zoom_y)

    doc = fitz.open(input_pdf)
    try:
        new_doc = fitz.open()
        try:
            for page in doc:
                rect = page.rect

                # Page rectangle shrunk by the four margins.
                clip = fitz.Rect(
                    rect.x0 + crop_left,
                    rect.y0 + crop_top,
                    rect.x1 - crop_right,
                    rect.y1 - crop_bottom,
                )

                pix = page.get_pixmap(matrix=matrix, clip=clip)
                # NOTE(review): set_dpi appears to only tag the pixmap with a
                # resolution value rather than resample it — confirm against
                # the PyMuPDF docs if a real size reduction is expected here.
                pix.set_dpi(150, 150)

                # The new page takes the pixmap's pixel size as its dimensions.
                new_page = new_doc.new_page(width=pix.width, height=pix.height)
                new_page.insert_image(new_page.rect, pixmap=pix)

            new_doc.save(output_pdf)
        finally:
            new_doc.close()
    finally:
        doc.close()

#另一个新的剪切pdf的方法。和旧方法类似。
def crop_pdf_pypdf2(input_pdf, output_pdf, crop_left, crop_top, crop_right, crop_bottom, shift_x, shift_y):
    """Crop every page by shrinking and shifting its media box, then write a new PDF.

    For an original media box ``(left, bottom, right, top)`` the new box is::

        left   + crop_left   + shift_x
        bottom + crop_bottom + shift_y
        right  - crop_right  + shift_x
        top    - crop_top    + shift_y

    The previous implementation differed in three ways, so margins tuned
    against it may need re-tuning:

    * it assigned ``upper_Right`` (a typo for ``upper_right``), so the
      upper-right corner was never set through that line;
    * it added ``shift_x`` twice to the lower-left corner;
    * it re-read the box width and height after already changing the box, so
      later corners were computed from shrunken values.

    NOTE(review): only the media box is changed; whether pages that carry a
    separate crop box need it updated as well is not handled here — confirm.

    Args:
        input_pdf: Path of the source PDF.
        output_pdf: Path of the PDF to write.
        crop_left: Margin removed from the left edge, in page units.
        crop_top: Margin removed from the top edge, in page units.
        crop_right: Margin removed from the right edge, in page units.
        crop_bottom: Margin removed from the bottom edge, in page units.
        shift_x: Horizontal offset applied to the whole cropped box.
        shift_y: Vertical offset applied to the whole cropped box.

    Returns:
        None.
    """
    with open(input_pdf, 'rb') as file:
        reader = PdfReader(file)
        writer = PdfWriter()

        for page in reader.pages:
            # Read the original box once, before mutating it: assigning a
            # corner changes the box that width/height are derived from.
            left, bottom, right, top = (float(value) for value in page.mediabox)

            page.mediabox.lower_left = (
                left + crop_left + shift_x,
                bottom + crop_bottom + shift_y,
            )
            page.mediabox.upper_right = (
                right - crop_right + shift_x,
                top - crop_top + shift_y,
            )
            writer.add_page(page)

        # Write while the input file is still open, as the original code did.
        with open(output_pdf, 'wb') as output_file:
            writer.write(output_file)

#旧的剪切pdf的方法。
def crop_pdf(input_pdf, output_pdf, crop_left, crop_top, crop_right, crop_bottom):
    """Crop every page by setting its crop box, then save the document once.

    Negative margins are clamped to zero, so the crop box never extends beyond
    the page rectangle. Pages narrower or shorter than 10 units are reported
    and left uncropped.

    The document is saved a single time after all pages are processed. The
    previous implementation rewrote the whole file after each page.

    Args:
        input_pdf: Path of the source PDF.
        output_pdf: Path of the PDF to write.
        crop_left: Margin removed from the left edge, in page units.
        crop_top: Margin removed from the top edge, in page units.
        crop_right: Margin removed from the right edge, in page units.
        crop_bottom: Margin removed from the bottom edge, in page units.

    Returns:
        None.
    """
    doc = fitz.open(input_pdf)
    try:
        for n, page in enumerate(doc, start=1):
            rect = page.rect

            # A degenerate page rectangle usually means the page could not be read.
            if rect.x1 - rect.x0 < 10 or rect.y1 - rect.y0 < 10:
                print(f"Cannot read page {n}! Check it manually.")
                continue

            # max(margin, 0) keeps each edge inside the page rectangle.
            clip = fitz.Rect(
                rect.x0 + max(crop_left, 0),
                rect.y0 + max(crop_top, 0),
                rect.x1 - max(crop_right, 0),
                rect.y1 - max(crop_bottom, 0),
            )
            page.set_cropbox(clip)

        doc.save(output_pdf)
    finally:
        doc.close()

# nest_asyncio.apply()

# The LlamaParse key is a credential and must not be stored in the notebook.
# Read it from the environment instead. The key that was previously committed
# here should be treated as leaked and revoked.
API_KEY = os.environ.get("LLAMA_CLOUD_API_KEY")
if not API_KEY:
    raise RuntimeError("Set the LLAMA_CLOUD_API_KEY environment variable to your LlamaParse API key.")
# reference: https://cloud.llamaindex.ai/parse

# NOTE(review): language is "en" although the OCR preview cell is configured
# for Chinese — confirm this is the intended parsing language.
parser = LlamaParse(
    api_key=API_KEY,  # taken from the LLAMA_CLOUD_API_KEY environment variable
    result_type="markdown",  # "markdown" and "text" are available
    num_workers=4,  # if multiple files passed, split in `num_workers` API calls
    verbose=True,
    language="en",  # Optionally you can define a language, default=en
)

# Cropped copy of the source PDF; this is also the input of split_pdf below.
temp_pdf_path = os.path.join(output_file_path, "temp.pdf")

# Alternative cropper: crop_pdf(PDF_file_path, temp_pdf_path, crop_left, crop_top, crop_right, crop_bottom)
crop_pdf_pypdf2(PDF_file_path, temp_pdf_path, crop_left, crop_top, crop_right, crop_bottom, shift_x, shift_y)

split_pdf(temp_pdf_path, max_pages, output_file_path)

# documents = parser.load_data(temp_pdf_path)

# # 处理返回的文档
# with open(os.path.join(output_txt_path, PDF_file_path.split('.pdf')[0].split('/')[-1] + '_cropped.txt'), 'w+') as w:
    # for i in documents:
        # w.write(i.get_text())


In [16]:
import os

from llama_parse import LlamaParse
import nest_asyncio

# NOTE(review): presumably required because the notebook already runs an event
# loop while parser.load_data drives its own — confirm.
nest_asyncio.apply()

# The LlamaParse key is a credential and must not be stored in the notebook.
# Read it from the environment instead. The key that was previously committed
# here should be treated as leaked and revoked.
API_KEY = os.environ.get("LLAMA_CLOUD_API_KEY")
if not API_KEY:
    raise RuntimeError("Set the LLAMA_CLOUD_API_KEY environment variable to your LlamaParse API key.")
# reference: https://cloud.llamaindex.ai/parse

pdf_file = './temp2/temp_part_2.pdf'

parser = LlamaParse(
    api_key=API_KEY,  # taken from the LLAMA_CLOUD_API_KEY environment variable
    result_type="markdown",  # "markdown" and "text" are available
    num_workers=4,  # if multiple files passed, split in `num_workers` API calls
    verbose=True,
    language="en",  # Optionally you can define a language, default=en
)
documents = parser.load_data(pdf_file)

# Write the parsed documents to a text file. The encoding is fixed to UTF-8 so
# the result does not depend on the platform's default encoding, and the
# target directory is created when it is missing.
output_txt_file = './part3_txt/P3_16/temp_part_2_cropped.txt'
os.makedirs(os.path.dirname(output_txt_file), exist_ok=True)
with open(output_txt_file, 'w', encoding='utf-8') as w:
    w.writelines(document.get_text() for document in documents)

INFO 2024-07-04 15:12:23,540 _client.py:1773] HTTP Request: POST https://api.cloud.llamaindex.ai/api/parsing/upload "HTTP/1.1 200 OK"


Started parsing the file under job_id b041e78d-6c29-4b54-9242-f469fa3923a5


INFO 2024-07-04 15:12:25,495 _client.py:1773] HTTP Request: GET https://api.cloud.llamaindex.ai/api/parsing/job/b041e78d-6c29-4b54-9242-f469fa3923a5 "HTTP/1.1 200 OK"
INFO 2024-07-04 15:12:28,464 _client.py:1773] HTTP Request: GET https://api.cloud.llamaindex.ai/api/parsing/job/b041e78d-6c29-4b54-9242-f469fa3923a5 "HTTP/1.1 200 OK"
INFO 2024-07-04 15:12:31,442 _client.py:1773] HTTP Request: GET https://api.cloud.llamaindex.ai/api/parsing/job/b041e78d-6c29-4b54-9242-f469fa3923a5 "HTTP/1.1 200 OK"
INFO 2024-07-04 15:12:34,389 _client.py:1773] HTTP Request: GET https://api.cloud.llamaindex.ai/api/parsing/job/b041e78d-6c29-4b54-9242-f469fa3923a5 "HTTP/1.1 200 OK"
INFO 2024-07-04 15:12:37,368 _client.py:1773] HTTP Request: GET https://api.cloud.llamaindex.ai/api/parsing/job/b041e78d-6c29-4b54-9242-f469fa3923a5 "HTTP/1.1 200 OK"
INFO 2024-07-04 15:12:40,335 _client.py:1773] HTTP Request: GET https://api.cloud.llamaindex.ai/api/parsing/job/b041e78d-6c29-4b54-9242-f469fa3923a5 "HTTP/1.1 200 OK

.

INFO 2024-07-04 15:12:55,156 _client.py:1773] HTTP Request: GET https://api.cloud.llamaindex.ai/api/parsing/job/b041e78d-6c29-4b54-9242-f469fa3923a5 "HTTP/1.1 200 OK"
INFO 2024-07-04 15:12:58,117 _client.py:1773] HTTP Request: GET https://api.cloud.llamaindex.ai/api/parsing/job/b041e78d-6c29-4b54-9242-f469fa3923a5 "HTTP/1.1 200 OK"
INFO 2024-07-04 15:13:01,080 _client.py:1773] HTTP Request: GET https://api.cloud.llamaindex.ai/api/parsing/job/b041e78d-6c29-4b54-9242-f469fa3923a5 "HTTP/1.1 200 OK"
INFO 2024-07-04 15:13:04,026 _client.py:1773] HTTP Request: GET https://api.cloud.llamaindex.ai/api/parsing/job/b041e78d-6c29-4b54-9242-f469fa3923a5 "HTTP/1.1 200 OK"
INFO 2024-07-04 15:13:06,996 _client.py:1773] HTTP Request: GET https://api.cloud.llamaindex.ai/api/parsing/job/b041e78d-6c29-4b54-9242-f469fa3923a5 "HTTP/1.1 200 OK"
INFO 2024-07-04 15:13:09,978 _client.py:1773] HTTP Request: GET https://api.cloud.llamaindex.ai/api/parsing/job/b041e78d-6c29-4b54-9242-f469fa3923a5 "HTTP/1.1 200 OK

.

INFO 2024-07-04 15:13:24,823 _client.py:1773] HTTP Request: GET https://api.cloud.llamaindex.ai/api/parsing/job/b041e78d-6c29-4b54-9242-f469fa3923a5 "HTTP/1.1 200 OK"
INFO 2024-07-04 15:13:27,798 _client.py:1773] HTTP Request: GET https://api.cloud.llamaindex.ai/api/parsing/job/b041e78d-6c29-4b54-9242-f469fa3923a5 "HTTP/1.1 200 OK"
INFO 2024-07-04 15:13:30,767 _client.py:1773] HTTP Request: GET https://api.cloud.llamaindex.ai/api/parsing/job/b041e78d-6c29-4b54-9242-f469fa3923a5 "HTTP/1.1 200 OK"
INFO 2024-07-04 15:13:33,750 _client.py:1773] HTTP Request: GET https://api.cloud.llamaindex.ai/api/parsing/job/b041e78d-6c29-4b54-9242-f469fa3923a5 "HTTP/1.1 200 OK"
INFO 2024-07-04 15:13:36,733 _client.py:1773] HTTP Request: GET https://api.cloud.llamaindex.ai/api/parsing/job/b041e78d-6c29-4b54-9242-f469fa3923a5 "HTTP/1.1 200 OK"
INFO 2024-07-04 15:13:39,693 _client.py:1773] HTTP Request: GET https://api.cloud.llamaindex.ai/api/parsing/job/b041e78d-6c29-4b54-9242-f469fa3923a5 "HTTP/1.1 200 OK

.

INFO 2024-07-04 15:13:54,632 _client.py:1773] HTTP Request: GET https://api.cloud.llamaindex.ai/api/parsing/job/b041e78d-6c29-4b54-9242-f469fa3923a5 "HTTP/1.1 200 OK"
INFO 2024-07-04 15:13:57,639 _client.py:1773] HTTP Request: GET https://api.cloud.llamaindex.ai/api/parsing/job/b041e78d-6c29-4b54-9242-f469fa3923a5 "HTTP/1.1 200 OK"
INFO 2024-07-04 15:14:00,618 _client.py:1773] HTTP Request: GET https://api.cloud.llamaindex.ai/api/parsing/job/b041e78d-6c29-4b54-9242-f469fa3923a5 "HTTP/1.1 200 OK"
INFO 2024-07-04 15:14:03,640 _client.py:1773] HTTP Request: GET https://api.cloud.llamaindex.ai/api/parsing/job/b041e78d-6c29-4b54-9242-f469fa3923a5 "HTTP/1.1 200 OK"
INFO 2024-07-04 15:14:06,615 _client.py:1773] HTTP Request: GET https://api.cloud.llamaindex.ai/api/parsing/job/b041e78d-6c29-4b54-9242-f469fa3923a5 "HTTP/1.1 200 OK"
INFO 2024-07-04 15:14:09,609 _client.py:1773] HTTP Request: GET https://api.cloud.llamaindex.ai/api/parsing/job/b041e78d-6c29-4b54-9242-f469fa3923a5 "HTTP/1.1 200 OK

.

INFO 2024-07-04 15:14:24,441 _client.py:1773] HTTP Request: GET https://api.cloud.llamaindex.ai/api/parsing/job/b041e78d-6c29-4b54-9242-f469fa3923a5 "HTTP/1.1 200 OK"
INFO 2024-07-04 15:14:27,425 _client.py:1773] HTTP Request: GET https://api.cloud.llamaindex.ai/api/parsing/job/b041e78d-6c29-4b54-9242-f469fa3923a5 "HTTP/1.1 200 OK"
INFO 2024-07-04 15:14:30,387 _client.py:1773] HTTP Request: GET https://api.cloud.llamaindex.ai/api/parsing/job/b041e78d-6c29-4b54-9242-f469fa3923a5 "HTTP/1.1 200 OK"
INFO 2024-07-04 15:14:33,382 _client.py:1773] HTTP Request: GET https://api.cloud.llamaindex.ai/api/parsing/job/b041e78d-6c29-4b54-9242-f469fa3923a5 "HTTP/1.1 200 OK"
INFO 2024-07-04 15:14:36,382 _client.py:1773] HTTP Request: GET https://api.cloud.llamaindex.ai/api/parsing/job/b041e78d-6c29-4b54-9242-f469fa3923a5 "HTTP/1.1 200 OK"
INFO 2024-07-04 15:14:39,358 _client.py:1773] HTTP Request: GET https://api.cloud.llamaindex.ai/api/parsing/job/b041e78d-6c29-4b54-9242-f469fa3923a5 "HTTP/1.1 200 OK

.

INFO 2024-07-04 15:14:54,233 _client.py:1773] HTTP Request: GET https://api.cloud.llamaindex.ai/api/parsing/job/b041e78d-6c29-4b54-9242-f469fa3923a5 "HTTP/1.1 200 OK"
INFO 2024-07-04 15:14:57,328 _client.py:1773] HTTP Request: GET https://api.cloud.llamaindex.ai/api/parsing/job/b041e78d-6c29-4b54-9242-f469fa3923a5 "HTTP/1.1 200 OK"
INFO 2024-07-04 15:15:00,280 _client.py:1773] HTTP Request: GET https://api.cloud.llamaindex.ai/api/parsing/job/b041e78d-6c29-4b54-9242-f469fa3923a5 "HTTP/1.1 200 OK"
INFO 2024-07-04 15:15:03,252 _client.py:1773] HTTP Request: GET https://api.cloud.llamaindex.ai/api/parsing/job/b041e78d-6c29-4b54-9242-f469fa3923a5 "HTTP/1.1 200 OK"
INFO 2024-07-04 15:15:06,221 _client.py:1773] HTTP Request: GET https://api.cloud.llamaindex.ai/api/parsing/job/b041e78d-6c29-4b54-9242-f469fa3923a5 "HTTP/1.1 200 OK"
INFO 2024-07-04 15:15:09,184 _client.py:1773] HTTP Request: GET https://api.cloud.llamaindex.ai/api/parsing/job/b041e78d-6c29-4b54-9242-f469fa3923a5 "HTTP/1.1 200 OK

.

INFO 2024-07-04 15:15:24,018 _client.py:1773] HTTP Request: GET https://api.cloud.llamaindex.ai/api/parsing/job/b041e78d-6c29-4b54-9242-f469fa3923a5 "HTTP/1.1 200 OK"
INFO 2024-07-04 15:15:26,974 _client.py:1773] HTTP Request: GET https://api.cloud.llamaindex.ai/api/parsing/job/b041e78d-6c29-4b54-9242-f469fa3923a5 "HTTP/1.1 200 OK"
INFO 2024-07-04 15:15:29,960 _client.py:1773] HTTP Request: GET https://api.cloud.llamaindex.ai/api/parsing/job/b041e78d-6c29-4b54-9242-f469fa3923a5 "HTTP/1.1 200 OK"
INFO 2024-07-04 15:15:32,946 _client.py:1773] HTTP Request: GET https://api.cloud.llamaindex.ai/api/parsing/job/b041e78d-6c29-4b54-9242-f469fa3923a5 "HTTP/1.1 200 OK"
INFO 2024-07-04 15:15:35,947 _client.py:1773] HTTP Request: GET https://api.cloud.llamaindex.ai/api/parsing/job/b041e78d-6c29-4b54-9242-f469fa3923a5 "HTTP/1.1 200 OK"
INFO 2024-07-04 15:15:38,942 _client.py:1773] HTTP Request: GET https://api.cloud.llamaindex.ai/api/parsing/job/b041e78d-6c29-4b54-9242-f469fa3923a5 "HTTP/1.1 200 OK

.

INFO 2024-07-04 15:15:53,981 _client.py:1773] HTTP Request: GET https://api.cloud.llamaindex.ai/api/parsing/job/b041e78d-6c29-4b54-9242-f469fa3923a5 "HTTP/1.1 200 OK"
INFO 2024-07-04 15:15:56,997 _client.py:1773] HTTP Request: GET https://api.cloud.llamaindex.ai/api/parsing/job/b041e78d-6c29-4b54-9242-f469fa3923a5 "HTTP/1.1 200 OK"
INFO 2024-07-04 15:15:59,958 _client.py:1773] HTTP Request: GET https://api.cloud.llamaindex.ai/api/parsing/job/b041e78d-6c29-4b54-9242-f469fa3923a5 "HTTP/1.1 200 OK"
INFO 2024-07-04 15:16:02,932 _client.py:1773] HTTP Request: GET https://api.cloud.llamaindex.ai/api/parsing/job/b041e78d-6c29-4b54-9242-f469fa3923a5 "HTTP/1.1 200 OK"
INFO 2024-07-04 15:16:05,912 _client.py:1773] HTTP Request: GET https://api.cloud.llamaindex.ai/api/parsing/job/b041e78d-6c29-4b54-9242-f469fa3923a5 "HTTP/1.1 200 OK"
INFO 2024-07-04 15:16:08,924 _client.py:1773] HTTP Request: GET https://api.cloud.llamaindex.ai/api/parsing/job/b041e78d-6c29-4b54-9242-f469fa3923a5 "HTTP/1.1 200 OK

.

INFO 2024-07-04 15:16:23,711 _client.py:1773] HTTP Request: GET https://api.cloud.llamaindex.ai/api/parsing/job/b041e78d-6c29-4b54-9242-f469fa3923a5 "HTTP/1.1 200 OK"
INFO 2024-07-04 15:16:26,664 _client.py:1773] HTTP Request: GET https://api.cloud.llamaindex.ai/api/parsing/job/b041e78d-6c29-4b54-9242-f469fa3923a5 "HTTP/1.1 200 OK"
INFO 2024-07-04 15:16:29,648 _client.py:1773] HTTP Request: GET https://api.cloud.llamaindex.ai/api/parsing/job/b041e78d-6c29-4b54-9242-f469fa3923a5 "HTTP/1.1 200 OK"
INFO 2024-07-04 15:16:32,647 _client.py:1773] HTTP Request: GET https://api.cloud.llamaindex.ai/api/parsing/job/b041e78d-6c29-4b54-9242-f469fa3923a5 "HTTP/1.1 200 OK"
INFO 2024-07-04 15:16:35,610 _client.py:1773] HTTP Request: GET https://api.cloud.llamaindex.ai/api/parsing/job/b041e78d-6c29-4b54-9242-f469fa3923a5 "HTTP/1.1 200 OK"
INFO 2024-07-04 15:16:38,577 _client.py:1773] HTTP Request: GET https://api.cloud.llamaindex.ai/api/parsing/job/b041e78d-6c29-4b54-9242-f469fa3923a5 "HTTP/1.1 200 OK

.

INFO 2024-07-04 15:16:53,396 _client.py:1773] HTTP Request: GET https://api.cloud.llamaindex.ai/api/parsing/job/b041e78d-6c29-4b54-9242-f469fa3923a5 "HTTP/1.1 200 OK"
INFO 2024-07-04 15:16:56,366 _client.py:1773] HTTP Request: GET https://api.cloud.llamaindex.ai/api/parsing/job/b041e78d-6c29-4b54-9242-f469fa3923a5 "HTTP/1.1 200 OK"
INFO 2024-07-04 15:16:59,338 _client.py:1773] HTTP Request: GET https://api.cloud.llamaindex.ai/api/parsing/job/b041e78d-6c29-4b54-9242-f469fa3923a5 "HTTP/1.1 200 OK"
INFO 2024-07-04 15:17:02,302 _client.py:1773] HTTP Request: GET https://api.cloud.llamaindex.ai/api/parsing/job/b041e78d-6c29-4b54-9242-f469fa3923a5 "HTTP/1.1 200 OK"
INFO 2024-07-04 15:17:05,281 _client.py:1773] HTTP Request: GET https://api.cloud.llamaindex.ai/api/parsing/job/b041e78d-6c29-4b54-9242-f469fa3923a5 "HTTP/1.1 200 OK"
INFO 2024-07-04 15:17:08,247 _client.py:1773] HTTP Request: GET https://api.cloud.llamaindex.ai/api/parsing/job/b041e78d-6c29-4b54-9242-f469fa3923a5 "HTTP/1.1 200 OK

.

INFO 2024-07-04 15:17:23,244 _client.py:1773] HTTP Request: GET https://api.cloud.llamaindex.ai/api/parsing/job/b041e78d-6c29-4b54-9242-f469fa3923a5 "HTTP/1.1 200 OK"
INFO 2024-07-04 15:17:26,223 _client.py:1773] HTTP Request: GET https://api.cloud.llamaindex.ai/api/parsing/job/b041e78d-6c29-4b54-9242-f469fa3923a5 "HTTP/1.1 200 OK"
INFO 2024-07-04 15:17:29,203 _client.py:1773] HTTP Request: GET https://api.cloud.llamaindex.ai/api/parsing/job/b041e78d-6c29-4b54-9242-f469fa3923a5 "HTTP/1.1 200 OK"
INFO 2024-07-04 15:17:32,167 _client.py:1773] HTTP Request: GET https://api.cloud.llamaindex.ai/api/parsing/job/b041e78d-6c29-4b54-9242-f469fa3923a5 "HTTP/1.1 200 OK"
INFO 2024-07-04 15:17:35,144 _client.py:1773] HTTP Request: GET https://api.cloud.llamaindex.ai/api/parsing/job/b041e78d-6c29-4b54-9242-f469fa3923a5 "HTTP/1.1 200 OK"
INFO 2024-07-04 15:17:38,103 _client.py:1773] HTTP Request: GET https://api.cloud.llamaindex.ai/api/parsing/job/b041e78d-6c29-4b54-9242-f469fa3923a5 "HTTP/1.1 200 OK

.

INFO 2024-07-04 15:17:52,976 _client.py:1773] HTTP Request: GET https://api.cloud.llamaindex.ai/api/parsing/job/b041e78d-6c29-4b54-9242-f469fa3923a5 "HTTP/1.1 200 OK"
INFO 2024-07-04 15:17:55,938 _client.py:1773] HTTP Request: GET https://api.cloud.llamaindex.ai/api/parsing/job/b041e78d-6c29-4b54-9242-f469fa3923a5 "HTTP/1.1 200 OK"
INFO 2024-07-04 15:17:58,902 _client.py:1773] HTTP Request: GET https://api.cloud.llamaindex.ai/api/parsing/job/b041e78d-6c29-4b54-9242-f469fa3923a5 "HTTP/1.1 200 OK"
INFO 2024-07-04 15:18:01,914 _client.py:1773] HTTP Request: GET https://api.cloud.llamaindex.ai/api/parsing/job/b041e78d-6c29-4b54-9242-f469fa3923a5 "HTTP/1.1 200 OK"
INFO 2024-07-04 15:18:04,870 _client.py:1773] HTTP Request: GET https://api.cloud.llamaindex.ai/api/parsing/job/b041e78d-6c29-4b54-9242-f469fa3923a5 "HTTP/1.1 200 OK"
INFO 2024-07-04 15:18:07,821 _client.py:1773] HTTP Request: GET https://api.cloud.llamaindex.ai/api/parsing/job/b041e78d-6c29-4b54-9242-f469fa3923a5 "HTTP/1.1 200 OK

.

INFO 2024-07-04 15:18:22,621 _client.py:1773] HTTP Request: GET https://api.cloud.llamaindex.ai/api/parsing/job/b041e78d-6c29-4b54-9242-f469fa3923a5 "HTTP/1.1 200 OK"
INFO 2024-07-04 15:18:25,586 _client.py:1773] HTTP Request: GET https://api.cloud.llamaindex.ai/api/parsing/job/b041e78d-6c29-4b54-9242-f469fa3923a5 "HTTP/1.1 200 OK"
INFO 2024-07-04 15:18:28,555 _client.py:1773] HTTP Request: GET https://api.cloud.llamaindex.ai/api/parsing/job/b041e78d-6c29-4b54-9242-f469fa3923a5 "HTTP/1.1 200 OK"
INFO 2024-07-04 15:18:31,512 _client.py:1773] HTTP Request: GET https://api.cloud.llamaindex.ai/api/parsing/job/b041e78d-6c29-4b54-9242-f469fa3923a5 "HTTP/1.1 200 OK"
INFO 2024-07-04 15:18:34,476 _client.py:1773] HTTP Request: GET https://api.cloud.llamaindex.ai/api/parsing/job/b041e78d-6c29-4b54-9242-f469fa3923a5 "HTTP/1.1 200 OK"
INFO 2024-07-04 15:18:37,461 _client.py:1773] HTTP Request: GET https://api.cloud.llamaindex.ai/api/parsing/job/b041e78d-6c29-4b54-9242-f469fa3923a5 "HTTP/1.1 200 OK

.

INFO 2024-07-04 15:18:52,378 _client.py:1773] HTTP Request: GET https://api.cloud.llamaindex.ai/api/parsing/job/b041e78d-6c29-4b54-9242-f469fa3923a5 "HTTP/1.1 200 OK"
INFO 2024-07-04 15:18:55,339 _client.py:1773] HTTP Request: GET https://api.cloud.llamaindex.ai/api/parsing/job/b041e78d-6c29-4b54-9242-f469fa3923a5 "HTTP/1.1 200 OK"
INFO 2024-07-04 15:18:58,295 _client.py:1773] HTTP Request: GET https://api.cloud.llamaindex.ai/api/parsing/job/b041e78d-6c29-4b54-9242-f469fa3923a5 "HTTP/1.1 200 OK"
INFO 2024-07-04 15:19:01,266 _client.py:1773] HTTP Request: GET https://api.cloud.llamaindex.ai/api/parsing/job/b041e78d-6c29-4b54-9242-f469fa3923a5 "HTTP/1.1 200 OK"
INFO 2024-07-04 15:19:04,279 _client.py:1773] HTTP Request: GET https://api.cloud.llamaindex.ai/api/parsing/job/b041e78d-6c29-4b54-9242-f469fa3923a5 "HTTP/1.1 200 OK"
INFO 2024-07-04 15:19:07,259 _client.py:1773] HTTP Request: GET https://api.cloud.llamaindex.ai/api/parsing/job/b041e78d-6c29-4b54-9242-f469fa3923a5 "HTTP/1.1 200 OK

.

INFO 2024-07-04 15:19:22,131 _client.py:1773] HTTP Request: GET https://api.cloud.llamaindex.ai/api/parsing/job/b041e78d-6c29-4b54-9242-f469fa3923a5 "HTTP/1.1 200 OK"
INFO 2024-07-04 15:19:25,077 _client.py:1773] HTTP Request: GET https://api.cloud.llamaindex.ai/api/parsing/job/b041e78d-6c29-4b54-9242-f469fa3923a5 "HTTP/1.1 200 OK"
INFO 2024-07-04 15:19:28,068 _client.py:1773] HTTP Request: GET https://api.cloud.llamaindex.ai/api/parsing/job/b041e78d-6c29-4b54-9242-f469fa3923a5 "HTTP/1.1 200 OK"
INFO 2024-07-04 15:19:31,032 _client.py:1773] HTTP Request: GET https://api.cloud.llamaindex.ai/api/parsing/job/b041e78d-6c29-4b54-9242-f469fa3923a5 "HTTP/1.1 200 OK"
INFO 2024-07-04 15:19:34,035 _client.py:1773] HTTP Request: GET https://api.cloud.llamaindex.ai/api/parsing/job/b041e78d-6c29-4b54-9242-f469fa3923a5 "HTTP/1.1 200 OK"
INFO 2024-07-04 15:19:37,011 _client.py:1773] HTTP Request: GET https://api.cloud.llamaindex.ai/api/parsing/job/b041e78d-6c29-4b54-9242-f469fa3923a5 "HTTP/1.1 200 OK

.

INFO 2024-07-04 15:19:51,767 _client.py:1773] HTTP Request: GET https://api.cloud.llamaindex.ai/api/parsing/job/b041e78d-6c29-4b54-9242-f469fa3923a5 "HTTP/1.1 200 OK"
INFO 2024-07-04 15:19:54,720 _client.py:1773] HTTP Request: GET https://api.cloud.llamaindex.ai/api/parsing/job/b041e78d-6c29-4b54-9242-f469fa3923a5 "HTTP/1.1 200 OK"
INFO 2024-07-04 15:19:57,684 _client.py:1773] HTTP Request: GET https://api.cloud.llamaindex.ai/api/parsing/job/b041e78d-6c29-4b54-9242-f469fa3923a5 "HTTP/1.1 200 OK"
INFO 2024-07-04 15:20:00,639 _client.py:1773] HTTP Request: GET https://api.cloud.llamaindex.ai/api/parsing/job/b041e78d-6c29-4b54-9242-f469fa3923a5 "HTTP/1.1 200 OK"
INFO 2024-07-04 15:20:03,610 _client.py:1773] HTTP Request: GET https://api.cloud.llamaindex.ai/api/parsing/job/b041e78d-6c29-4b54-9242-f469fa3923a5 "HTTP/1.1 200 OK"
INFO 2024-07-04 15:20:04,146 _client.py:1773] HTTP Request: GET https://api.cloud.llamaindex.ai/api/parsing/job/b041e78d-6c29-4b54-9242-f469fa3923a5/result/markdown 