In [1]:
import os
from azure.core.credentials import AzureKeyCredential
from azure.ai.documentintelligence import DocumentIntelligenceClient
from azure.ai.documentintelligence.models import AnalyzeResult
from azure.ai.documentintelligence.models import AnalyzeDocumentRequest

In [2]:
import base64

In [3]:
# Azure AI Document Intelligence connection settings.
# Values are taken from environment variables when they are set, so real
# credentials never have to be saved inside the notebook; the placeholder
# strings are only a fallback for quick local edits.
endpoint = os.environ.get("DOCUMENTINTELLIGENCE_ENDPOINT", "Your cognitive service endpoint")
key = os.environ.get("DOCUMENTINTELLIGENCE_API_KEY", "Your congitive service key")

In [4]:
# Build the Document Intelligence client, authenticating with the API key.
di_credential = AzureKeyCredential(key)
document_intelligence_client = DocumentIntelligenceClient(
    endpoint=endpoint,
    credential=di_credential,
)

In [5]:
# Location of the PDF to analyse, relative to the notebook's working directory.
pdf_path = "../pdf/modelcard.pdf"

In [6]:
# Read the PDF as raw bytes, then base64-encode it into a UTF-8 text string.
with open(pdf_path, mode="rb") as pdf_file:
    pdf_bytes = pdf_file.read()
base64_encoded_pdf = base64.b64encode(pdf_bytes).decode("utf-8")

In [7]:
# Request payload carrying the base64-encoded PDF under the "base64Source" key.
analyze_request = {"base64Source": base64_encoded_pdf}

In [8]:
# Start the analysis of the document with the "prebuilt-layout" model and keep
# the returned poller so the result can be fetched from it.
layout_model_id = "prebuilt-layout"
poller = document_intelligence_client.begin_analyze_document(
    layout_model_id,
    analyze_request=analyze_request,
)

In [9]:
# Retrieve the analysis result from the poller.
result: AnalyzeResult = poller.result()

In [10]:
# Display the complete analysis result (the rendered output is very large).
result

{'apiVersion': '2024-07-31-preview', 'modelId': 'prebuilt-layout', 'stringIndexType': 'textElements', 'content': '2024-10-07\nmodel_card.md\nPhi-3.5-Mini-Instruct ONNX models\nThis repository hosts the optimized versions of Phi-3.5-mini-instruct to accelerate inference with ONNX Runtime. Optimized Phi-3.5 Mini models are published here in ONNX format to run with ONNX Runtime on CPU and GPU across devices, including server platforms, Windows, Linux and Mac desktops, and mobile CPUs, with the precision best suited to each of these targets.\nTo easily get started with Phi-3.5, you can use our newly introduced ONNX Runtime Generate() API. See here for instructions on how to run it.\nONNX Models\nHere are some of the optimized configurations we have added:\n1. ONNX model for fp16 CUDA: ONNX model you can use to run for your NVIDIA GPUs.\n2. ONNX model for int4 CUDA: ONNX model for NVIDIA GPUs using int4 quantization via AWQ.\n3. ONNX model for int4 CPU and Mobile: ONNX model for CPU and mob

In [11]:
def _escape_markdown_cell(content):
    """Return cell text that is safe to place inside one Markdown table cell."""
    if content is None:
        return ""
    text = str(content)
    # A raw line break would end the table row early, and a raw pipe would be
    # read as a column separator, so neutralise both.
    text = text.replace("\r\n", " ").replace("\n", " ")
    return text.replace("|", "\\|")


def convert_table_to_markdown(table):
    """Render a table object as a Markdown table string.

    Args:
        table: Object exposing ``cells``, an iterable of cell objects that each
            provide ``row_index``, ``column_index`` and ``content``.

    Returns:
        The Markdown text: the first row as the header, a ``---`` separator
        row, then the remaining rows, each terminated by a newline. Positions
        with no cell are left empty. An empty string is returned when the
        table has no cells.
    """
    cells = list(table.cells or [])
    if not cells:
        # max() below would raise ValueError on an empty sequence.
        return ""

    # Grid size is derived from the largest row/column index seen.
    row_total = max(cell.row_index for cell in cells) + 1
    col_total = max(cell.column_index for cell in cells) + 1

    # Start from an all-empty grid, then drop each cell's text into place.
    grid = [[""] * col_total for _ in range(row_total)]
    for cell in cells:
        grid[cell.row_index][cell.column_index] = _escape_markdown_cell(cell.content)

    def format_row(values):
        return "| " + " | ".join(values) + " |\n"

    # Header row, one separator row, then the body rows.
    lines = [format_row(grid[0]), format_row(["---"] * col_total)]
    lines.extend(format_row(row) for row in grid[1:])
    return "".join(lines)

In [12]:
import json

# Collect every detected table as a JSON string of the form
# {"page_number": ..., "md_content": ...}; later cells decode these strings.
tables = []
page = 0
md_content = ""

for table in result.tables or []:
    regions = table.bounding_regions
    # Recompute the page for every table: when a table has no bounding regions
    # it must fall back to 0 rather than inherit the previous table's page.
    # With several regions the last region's page is used, as before.
    page = regions[-1].page_number if regions else 0
    md_content = convert_table_to_markdown(table)
    json_output = json.dumps({"page_number": page, "md_content": md_content}, ensure_ascii=False)
    print(json_output)
    tables.append(json_output)

# The separator is only printed when the result contained tables.
if result.tables:
    print("----------------------------------------")


{"page_number": 3, "md_content": "| Batch Size, Sequence Length | ONNX RT INT4 | PyTorch Eager INT4 | PyTorch Compile INT4 | Llama.cpp INT4 | INT4 SpeedUp ORT/PyTorch Eager | INT4 SpeedUp ORT/PyTorch Compile | INT4 SpeedUp ORT/Llama.cpp |\n| --- | --- | --- | --- | --- | --- | --- | --- |\n| 1, 16 | 238.97 | 17.75 | 11.36 | 183.17 | 13.46 | 21.04 | 1.30 |\n| 1, 64 | 233.74 | 17.74 | 11.32 | 182.77 | 13.17 | 20.65 | 1.28 |\n| 1, 256 | 208.52 | 17.82 | 11.34 | 182.15 | 11.70 | 18.38 | 1.14 |\n| 1, 1024 | 174.19 | 17.85 | 11.36 | 166.39 | 9.76 | 15.34 | 1.05 |\n| 1, 2048 | 146.10 | 17.96 | 11.35 | 153.50 | 8.14 | 12.87 | 0.95 |\n| 1, 3840 | 112.68 | 17.91 | 11.34 | 141.53 | 6.29 | 9.94 | 0.80 |\n| 4, 16 | 286.73 | 60.90 | 40.89 | 180.82 | 4.71 | 7.01 | 1.59 |\n| 4, 64 | 282.87 | 60.88 | 41.03 | 177.69 | 4.65 | 6.89 | 1.59 |\n| 4, 256 | 268.30 | 60.85 | 40.90 | 166.34 | 4.41 | 6.56 | 1.61 |\n| 4, 1024 | 223.30 | 60.86 | 40.90 | 133.39 | 3.67 | 5.46 | 1.67 |\n| 4, 2048 | 187.62 | 60.80 | 40

In [13]:
# Inspect the collected JSON strings, one per table.
tables

['{"page_number": 3, "md_content": "| Batch Size, Sequence Length | ONNX RT INT4 | PyTorch Eager INT4 | PyTorch Compile INT4 | Llama.cpp INT4 | INT4 SpeedUp ORT/PyTorch Eager | INT4 SpeedUp ORT/PyTorch Compile | INT4 SpeedUp ORT/Llama.cpp |\\n| --- | --- | --- | --- | --- | --- | --- | --- |\\n| 1, 16 | 238.97 | 17.75 | 11.36 | 183.17 | 13.46 | 21.04 | 1.30 |\\n| 1, 64 | 233.74 | 17.74 | 11.32 | 182.77 | 13.17 | 20.65 | 1.28 |\\n| 1, 256 | 208.52 | 17.82 | 11.34 | 182.15 | 11.70 | 18.38 | 1.14 |\\n| 1, 1024 | 174.19 | 17.85 | 11.36 | 166.39 | 9.76 | 15.34 | 1.05 |\\n| 1, 2048 | 146.10 | 17.96 | 11.35 | 153.50 | 8.14 | 12.87 | 0.95 |\\n| 1, 3840 | 112.68 | 17.91 | 11.34 | 141.53 | 6.29 | 9.94 | 0.80 |\\n| 4, 16 | 286.73 | 60.90 | 40.89 | 180.82 | 4.71 | 7.01 | 1.59 |\\n| 4, 64 | 282.87 | 60.88 | 41.03 | 177.69 | 4.65 | 6.89 | 1.59 |\\n| 4, 256 | 268.30 | 60.85 | 40.90 | 166.34 | 4.41 | 6.56 | 1.61 |\\n| 4, 1024 | 223.30 | 60.86 | 40.90 | 133.39 | 3.67 | 5.46 | 1.67 |\\n| 4, 2048 | 187.6

In [14]:
# Decode the first table's JSON record back into a dict and show its Markdown.
first_table_json = tables[0]
table_dict = json.loads(first_table_json)
table_dict["md_content"]

'| Batch Size, Sequence Length | ONNX RT INT4 | PyTorch Eager INT4 | PyTorch Compile INT4 | Llama.cpp INT4 | INT4 SpeedUp ORT/PyTorch Eager | INT4 SpeedUp ORT/PyTorch Compile | INT4 SpeedUp ORT/Llama.cpp |\n| --- | --- | --- | --- | --- | --- | --- | --- |\n| 1, 16 | 238.97 | 17.75 | 11.36 | 183.17 | 13.46 | 21.04 | 1.30 |\n| 1, 64 | 233.74 | 17.74 | 11.32 | 182.77 | 13.17 | 20.65 | 1.28 |\n| 1, 256 | 208.52 | 17.82 | 11.34 | 182.15 | 11.70 | 18.38 | 1.14 |\n| 1, 1024 | 174.19 | 17.85 | 11.36 | 166.39 | 9.76 | 15.34 | 1.05 |\n| 1, 2048 | 146.10 | 17.96 | 11.35 | 153.50 | 8.14 | 12.87 | 0.95 |\n| 1, 3840 | 112.68 | 17.91 | 11.34 | 141.53 | 6.29 | 9.94 | 0.80 |\n| 4, 16 | 286.73 | 60.90 | 40.89 | 180.82 | 4.71 | 7.01 | 1.59 |\n| 4, 64 | 282.87 | 60.88 | 41.03 | 177.69 | 4.65 | 6.89 | 1.59 |\n| 4, 256 | 268.30 | 60.85 | 40.90 | 166.34 | 4.41 | 6.56 | 1.61 |\n| 4, 1024 | 223.30 | 60.86 | 40.90 | 133.39 | 3.67 | 5.46 | 1.67 |\n| 4, 2048 | 187.62 | 60.80 | 40.93 | 106.03 | 3.09 | 4.58 | 1.77

In [15]:
import os
from azure.ai.inference import ChatCompletionsClient
from azure.ai.inference.models import SystemMessage, UserMessage
from azure.core.credentials import AzureKeyCredential

In [16]:
# Chat model connection settings.
# NOTE: `endpoint` is rebound here and no longer holds the Document
# Intelligence endpoint assigned earlier in the notebook.
endpoint = "https://models.inference.ai.azure.com"
model_name = "Phi-3.5-mini-instruct"
# Take the token from the GITHUB_TOKEN environment variable when it is set so
# the secret is not saved in the notebook; the placeholder is only a fallback.
token = os.environ.get("GITHUB_TOKEN", 'Your Github Model token')

In [17]:
# Chat-completions client for the configured endpoint, authenticated with the token.
client = ChatCompletionsClient(endpoint=endpoint, credential=AzureKeyCredential(token))

In [18]:
# Ask the model to analyse the extracted table: the system message sets the
# assistant's role and the user message carries the table's Markdown text.
system_prompt = """You are my markdown table assistant, who can understand all the contents of the table and give analysis."""
chat_messages = [
    SystemMessage(content=system_prompt),
    UserMessage(content=table_dict["md_content"]),
]
response = client.complete(
    model=model_name,
    messages=chat_messages,
    max_tokens=1000,
    temperature=1.0,
    top_p=1.0,
)

In [19]:
# Show the message text of the first returned choice.
response.choices[0].message.content

' The table presents benchmark results for different runtime inference tasks using various configurations and frameworks. The Batch Size and Sequence Length parameters vary from 1 to 3840, with sequence lengths ranging from 16 to 3840. The table compares execution times (in seconds) for five different frameworks/tools: ONNX Runtime (INT4), PyTorch (Eager and Compile), and Llama.cpp.\n\nHere are some observations from the table:\n\n1. As the Batch Size and Sequence Length increase, execution time generally decreases, indicating improved throughput due to parallelization.\n\n2. ONNX Runtime presents a more consistent decrease in execution time compared to PyTorch (Eager and Compile). This consistency might suggest better optimization and scalability in ONNX.\n\n3. PyTorch (Eager) and Compile modes show similar INT4 speedup values. Both approaches benefit from increased batch sizes but display lower performance than ONNX at higher batch sizes, potentially due to data transfer overhead or 