In [1]:
import os
import json
import base64
from PIL import Image
from app.ocr_functions import image_pil_to_data_url
from pydantic import BaseModel
from openai import AzureOpenAI
from azure.identity import DefaultAzureCredential
from azure.keyvault.secrets import SecretClient
credential = DefaultAzureCredential()
from azure.storage.blob import BlobServiceClient
import requests
from azure.core.credentials import AzureKeyCredential
from azure.ai.inference import ChatCompletionsClient
from azure.ai.inference.models import (
    SystemMessage,
    UserMessage,
    TextContentItem,
    ImageContentItem,
    ImageUrl,
    ImageDetailLevel,
)

Test


In [2]:
system_prompt = "Your are an OCR client using your Vision Capabilities to perform your response and provide a clean and structured text without any notes."
user_prompt = """extract the text from the image and provide a clean and structured text. 
                your output format must be in markdown format, with the text extracted from the image. 
                output will be  in 'markdown' text with markdown rules like titles,headings,etc, nothing else.
                
                NOTES:
                1) in the output, don't write ```markdown``` or ```md``` or any other code block.
                
                2) don't add any notes from you, just out the text extracted from the image. without any additions
                
                3) Warning! Don't Add Notice section to tell me that the text is not clear or any other notes.

                4) even if the text is not clear, try to extract as much as possible, and don't provide any extra notes.
                
                5) output the raw text extracted from the image as well. this text will be used for embedding purposes. So, Make sure that you don't lose any essential layouts when extract as raw text.
                
                6) Extract the text in the language it is written in the image.
                
                7) I will provide you multiple images, extract the text from all the images and provide the output in the same format.

                """

def generate_request(path, image_url):
    """
    Generates a request for the Batch API to extract text from an image.

    Args:
        path (str): The absolute file path of the image (used as custom_id).
        image_url (str): The URL or base64-encoded data of the image.
        system_prompt (str): The system prompt for the task.
        user_prompt (str): The user prompt for the task.

    Returns:
        dict: A request in the required JSON format.
    """
    request = {
        "custom_id": path,  # Use the absolute file path as custom_id
        "method": "POST",
        "url": "/chat/completions",
        "body": {
            "model": "archivai-gpt-4o-batch",
            "messages": [
                {
                    "role": "system",
                    "content": system_prompt
                },
                {
                    "role": "user",
                    "content": [
                        {
                            "type": "text",
                            "text": user_prompt
                        },
                        {
                            "type": "image_url",
                            "image_url": {
                                "url": image_url
                            }
                        }
                    ]
                }
            ],
            "response_format": {
                "type": "json_schema",
                "json_schema": {
                    "name": "TextExtractionResponse",
                    "strict": True,
                    "schema": {
                        "type": "object",
                        "properties": {
                            "markdown_text": {
                                "type": "string"
                            },
                            "raw_text": {
                                "type": "string"
                            }
                        },
                        "required": ["markdown_text", "raw_text"],
                        "additionalProperties": False
                    }
                }
            }
        }
    }
    return request

### Generate Requests from Resume Folder Only

In [12]:
resume_folder = "tobacco-dataset\Resume"
output_file = "batch_requests.jsonl"

with open(output_file, "w") as jsonl_file:
    for root, dirs, files in os.walk(resume_folder):
        for file in files:
            if file.endswith(".png") or file.endswith(".jpg"):
                image_path = os.path.join(root, file)
                image = Image.open(image_path)
                image_url = image_pil_to_data_url(image)
                request = generate_request(image_path, image_url)
                jsonl_file.write(json.dumps(request) + "\n")

### Generate Requests from all dataset using data-url
Note: This is not good approach because the request file will be large

In [None]:
root_floder = "tobacco-dataset"

output_file = "batch_requests.jsonl"

with open(output_file, 'w') as jsonl_file:
    for folder in os.listdir(root_floder):
        folder_path = os.path.join(root_floder, folder)
        if os.path.isdir(folder_path):
            for image in os.listdir(folder_path):
                image_path = os.path.join(folder_path, image)
                with Image.open(image_path) as img:
                    image_url = image_pil_to_data_url(img)
                    request = generate_request(image_path, image_url)
                    jsonl_file.write(json.dumps(request) + '\n')

### Generate Requests from dataset images-url
This is the best approachc to reduce the size of requests.jsonl

In [33]:
# Initialize the BlobServiceClient
keyvault_name = "vaultarchivai"
kv_uri = f"https://{keyvault_name}.vault.azure.net"
keys_client = SecretClient(vault_url=kv_uri, credential=credential)

connection_string = keys_client.get_secret("blob-connection-string").value
blob_service_client = BlobServiceClient.from_connection_string(connection_string)

# Specify the container and directory
container_name = "tobacco-dataset"
directory_name = "Resume"

# Get the container client
container_client = blob_service_client.get_container_client(container_name)

# List all blobs in the specified directory
blob_list = container_client.list_blobs()
filterd_blob_list = [blob for blob in blob_list if not blob.name.startswith(directory_name)]
# Generate URLs for each blob
blob_urls = []
for blob in filterd_blob_list:
    blob_path = f"{blob.name}"
    blob_url = f"https://{blob_service_client.account_name}.blob.core.windows.net/{container_name}/{blob.name}"
    tuple_obj = (blob_path, blob_url)
    blob_urls.append(tuple_obj)

In [38]:
output_file = "batch_requests_all-e-Resume.jsonl"
with open(output_file, "w") as jsonl_file:
    for blob_path, blob_url in blob_urls:
        request = generate_request(blob_path, blob_url)
        jsonl_file.write(json.dumps(request) + "\n")

### OCR Invoke

In [2]:
keyvault_name = "vaultarchivai"
kv_uri = f"https://{keyvault_name}.vault.azure.net"
keys_client = SecretClient(vault_url=kv_uri, credential=credential)

# Authenticate to Azure OpenAI
api_base = keys_client.get_secret("archivai-openai-base").value
api_key= keys_client.get_secret("archivaigpt4-key").value
deployment_name = 'archivaigpt4'
api_version = '2024-08-01-preview'
client = AzureOpenAI(
    api_key=api_key,
    api_version=api_version,
    base_url=f"{api_base}/openai/deployments/{deployment_name}"
)

In [3]:
class Page(BaseModel):
    markdown_text: str
    raw_text: str
def ocr_invoke(client, system_prompt, user_prompt, url):
    messages = [
        {"role": "system", "content": system_prompt},
        {
            "role": "user",
            "content": [
                {
                    "type": "text",
                    "text": user_prompt
                },
                # Add one dict per image url
                (
                    {
                        "type": "image_url",
                        "image_url": {"url": url}
                    }
                )
            ],
        }]
    completion = client.beta.chat.completions.parse(
    model=deployment_name,
    messages=messages,
    response_format=Page,
    )
    return completion.choices[0].message.parsed

In [72]:
url = "https://filstrg.blob.core.windows.net/tobacco-dataset/ADVE/86122854.jpg"
result = ocr_invoke(system_prompt, user_prompt, url)

In [79]:
result.dict()['markdown_text']

'# Newport Pleasure!\n\n**FIRE IT UP!**\n\n---\n\n_Canoe Trip._\nGM 2 x 60\nMarch-April 1999\n\n---\n\n86122854'

### Upload folders to blob container


In [32]:
from tqdm import tqdm

# Upload folders to blob container
container_name = "tobacco-dataset"
root_folder = "tobacco-dataset"
sub_folders = ['News','Note', 'Report', 'scientific']

for folder in sub_folders:
    folder_path = os.path.join(root_folder, folder)
    files = os.listdir(folder_path)
    for file in tqdm(files, desc=f'Uploading {folder} files'):
        file_path = os.path.join(folder_path, file)
        blob_path = f"{folder}/{file}"
        with open(file_path, "rb") as data:
            container_client.upload_blob(name=blob_path, data=data)

Uploading News files: 100%|██████████| 189/189 [04:33<00:00,  1.45s/it]
Uploading Note files: 100%|██████████| 202/202 [03:25<00:00,  1.02s/it]
Uploading Report files: 100%|██████████| 266/266 [05:11<00:00,  1.17s/it]
Uploading scientific files: 100%|██████████| 262/262 [10:51<00:00,  2.49s/it]


### Merge 2 requests jsonl files: the Resume Requests, and the rest of folders

In [40]:
# Merge 2 jsonl files
file1 = "resume_batch_requests.jsonl"
file2 = "batch_requests_all-e-Resume.jsonl"

with open(file1, "r") as file:
    data1 = file.read()

with open(file2, "r") as file:
    data2 = file.read()

merged_data = data1 + data2 
with open("batch_requests_all.jsonl", "w") as file:
    file.write(merged_data)

### Merge 2 rersponses jsonl files

In [41]:
# Merge 2 jsonl files
file1 = "resume.jsonl"
file2 = "file-128233ea-7ab4-4a23-9c98-47112cc6e3e9.jsonl"

with open(file1, "r") as file:
    data1 = file.read()

with open(file2, "r") as file:
    data2 = file.read()

merged_data = data1 + data2 
with open("batch_responses_all.jsonl", "w") as file:
    file.write(merged_data)

In [44]:
import jsonlines
file1 = "batch_requests_all.jsonl"
file2 = "batch_responses_all.jsonl"
# Find custom_ids that are in file1 but not in file2
with jsonlines.open(file1) as reader:
    requests = [line for line in reader]

with jsonlines.open(file2) as reader:
    responses = [line for line in reader]

request_ids = {request["custom_id"] for request in requests}
response_ids = {response["custom_id"] for response in responses}

missing_ids = request_ids - response_ids

In [46]:
response_ids - request_ids

set()

### Build The DataFrame

In [70]:
def read_jsonl(file_path):
    with open(file_path, "r") as f:
        return [json.loads(line) for line in f]

# Load the JSONL files
requests_data = read_jsonl("batch_requests_all.jsonl")
responses_data = read_jsonl("batch_responses_all.jsonl")
responses_data[245]

{'custom_id': 'ADVE/86122854.jpg',
 'response': {'body': {'choices': [{'content_filter_results': {'hate': {'filtered': False,
       'severity': 'safe'},
      'self_harm': {'filtered': False, 'severity': 'safe'},
      'sexual': {'filtered': True, 'severity': 'medium'},
      'violence': {'filtered': False, 'severity': 'safe'}},
     'finish_reason': 'content_filter',
     'index': 0,
     'logprobs': None,
     'message': {'refusal': None, 'role': 'assistant', 'content': ''}}],
   'created': 1736052196,
   'id': 'chatcmpl-AmCloQ7iMAMlyliJpH5MXg2mM2JL4',
   'model': 'gpt-4o-2024-08-06',
   'object': 'chat.completion',
   'prompt_filter_results': [{'prompt_index': 0,
     'content_filter_result': {'jailbreak': {'filtered': False,
       'detected': False}}},
    {'prompt_index': 1,
     'content_filter_result': {'sexual': {'filtered': False,
       'severity': 'safe'},
      'violence': {'filtered': False, 'severity': 'safe'},
      'hate': {'filtered': False, 'severity': 'safe'},
    

In [27]:
import json
import pandas as pd
from tqdm import tqdm
# Function to read a JSONL file and return a list of dictionaries
def read_jsonl(file_path):
    with open(file_path, "r") as f:
        return [json.loads(line) for line in f]

# Load the JSONL files
requests_data = read_jsonl("batch_requests_all.jsonl")
responses_data = read_jsonl("batch_responses_all.jsonl")

# Create dictionaries to store the data
requests_dict = {}
responses_dict = {}

# Extract data from the requests JSONL file
for request in tqdm(requests_data, desc="Extracting data from requests"):
    custom_id = request["custom_id"]
    image_url = request["body"]["messages"][1]["content"][1]["image_url"]["url"]
    requests_dict[custom_id] = {"image_url": image_url}

# Extract data from the responses JSONL file
for response in tqdm(responses_data, desc="Extracting data from responses"):
    custom_id = response["custom_id"]
    try:
        content = json.loads(response["response"]["body"]["choices"][0]["message"]["content"])
    except:
        print("Content Filtered, Try to extract it again with Invoke")
        # Get the image url of that custom_id
        image_url = requests_dict[custom_id]["image_url"]
        # Invoke the function to extract the text
        try:
            content = ocr_invoke(system_prompt, user_prompt, image_url).dict()
            print(f"Extracted text from {custom_id}")
        except:
            print(f"Failed to extract text from {custom_id}")
            continue
    markdown_text = content["markdown_text"]
    raw_text = content["raw_text"]
    responses_dict[custom_id] = {"markdown_text": markdown_text, "raw_text": raw_text}

# Combine the data into a single list of dictionaries
combined_data = []
for custom_id, request_info in requests_dict.items():
    if custom_id in responses_dict:
        combined_data.append({
            "custom_id": custom_id,
            "image_url": request_info["image_url"],
            "markdown_text": responses_dict[custom_id]["markdown_text"],
            "raw_text": responses_dict[custom_id]["raw_text"]
        })

# Create a DataFrame
df = pd.DataFrame(combined_data)

# Save the DataFrame to a CSV file (optional)
df.to_csv("extracted_data.csv", index=False)

# Display the DataFrame
df.head()

Extracting data from requests: 100%|██████████| 3476/3476 [00:00<00:00, 869529.47it/s]
Extracting data from responses: 100%|██████████| 3476/3476 [00:00<00:00, 91025.68it/s]


Content Filtered, Try to extract it again with Invoke
Failed to extract text from ADVE/86122854.jpg
Content Filtered, Try to extract it again with Invoke
Failed to extract text from ADVE/87003967_87003968.jpg
Content Filtered, Try to extract it again with Invoke
Failed to extract text from Email/2072705831.jpg
Content Filtered, Try to extract it again with Invoke
Failed to extract text from Email/2064213091c.jpg
Content Filtered, Try to extract it again with Invoke
Failed to extract text from Email/2082564294a.jpg
Content Filtered, Try to extract it again with Invoke
Failed to extract text from Form/2501611413.jpg
Content Filtered, Try to extract it again with Invoke
Failed to extract text from Letter/502266196_502266198.jpg
Content Filtered, Try to extract it again with Invoke
Failed to extract text from Note/2072654561.jpg
Content Filtered, Try to extract it again with Invoke
Failed to extract text from scientific/2028716785.jpg
Content Filtered, Try to extract it again with Invoke
F

Unnamed: 0,custom_id,image_url,markdown_text,raw_text
0,Resume/0000153377.jpg,https://filstrg.blob.core.windows.net/tobacco-...,"## Fitzmaurice, Mary Anne\n\n### Research Biol...","\nFitzmaurice, Mary Anne\nResearch Biologist\n..."
1,Resume/10036815_10036823.jpg,https://filstrg.blob.core.windows.net/tobacco-...,# CURRICULUM VITAE\n\n## Name: \nPeter M. Howl...,"CURRICULUM VITAE\n\nName: Peter M. Howley, M.D..."
2,Resume/10087799_10087801.jpg,https://filstrg.blob.core.windows.net/tobacco-...,![Form No. 1a (For N.I.H. Continuation Grant a...,Form No. 1a (For N.I.H. Continuation Grant app...
3,Resume/10150247_10150256.jpg,https://filstrg.blob.core.windows.net/tobacco-...,### UNIVERSITY OF MIAMI\n\n#### CURRICULUM VIT...,UNIVERSITY OF MIAMI\n\nCURRICULUM VITAE\n\nSta...
4,Resume/11300115-0116.jpg,https://filstrg.blob.core.windows.net/tobacco-...,# CURRICULUM VITAE\n\n## WILLIAM CARSON HINDS\...,CURRICULUM VITAE\n\nWILLIAM CARSON HINDS\n\nBo...


In [None]:
df.read_csv("extracted_data.csv")

In [28]:
df["class"] = df["custom_id"].apply(lambda x: x.split("/")[0])
df["class"].value_counts()

class
Memo          620
Email         596
Letter        566
Form          430
Report        265
scientific    258
ADVE          222
Note          200
News          188
Resume        120
Name: count, dtype: int64

In [29]:
df.head()

Unnamed: 0,custom_id,image_url,markdown_text,raw_text,class
0,Resume/0000153377.jpg,https://filstrg.blob.core.windows.net/tobacco-...,"## Fitzmaurice, Mary Anne\n\n### Research Biol...","\nFitzmaurice, Mary Anne\nResearch Biologist\n...",Resume
1,Resume/10036815_10036823.jpg,https://filstrg.blob.core.windows.net/tobacco-...,# CURRICULUM VITAE\n\n## Name: \nPeter M. Howl...,"CURRICULUM VITAE\n\nName: Peter M. Howley, M.D...",Resume
2,Resume/10087799_10087801.jpg,https://filstrg.blob.core.windows.net/tobacco-...,![Form No. 1a (For N.I.H. Continuation Grant a...,Form No. 1a (For N.I.H. Continuation Grant app...,Resume
3,Resume/10150247_10150256.jpg,https://filstrg.blob.core.windows.net/tobacco-...,### UNIVERSITY OF MIAMI\n\n#### CURRICULUM VIT...,UNIVERSITY OF MIAMI\n\nCURRICULUM VITAE\n\nSta...,Resume
4,Resume/11300115-0116.jpg,https://filstrg.blob.core.windows.net/tobacco-...,# CURRICULUM VITAE\n\n## WILLIAM CARSON HINDS\...,CURRICULUM VITAE\n\nWILLIAM CARSON HINDS\n\nBo...,Resume


### Generate Embeddings from raw_text

In [38]:
import cohere
co_embed = cohere.Client(
    api_key=os.getenv("AZURE_ML_COHERE_EMBED_CREDENTIAL"),
    base_url=os.getenv("AZURE_ML_COHERE_EMBED_ENDPOINT"),
)

In [41]:
# test the embedding
docs = df.head()["raw_text"].tolist()
embeddings = co_embed.embed(
    input_type="classification",
    texts=docs,
).embeddings
len(embeddings)

5

In [42]:
len(embeddings[0])

1024

### Draft

In [4]:
import aisuite as ai
ai_client = ai.Client()
ai_client.configure({"azure" : {
  "api_key": os.environ["AZURE_API_KEY"],
  "base_url": os.environ["AZURE_BASE_URL"]
}})

In [17]:
def vlm_ocr(endpoint, model_deployment, system_prompt, user_prompt, url, key="1cd46458334d4daca5799b36aeec95d2"):
    vlm_client = ChatCompletionsClient(
        endpoint=endpoint,
        credential=AzureKeyCredential(key),
        headers={"azureml-model-deployment": model_deployment},
    )
    response = vlm_client.complete(
        messages=[
            SystemMessage(content=system_prompt),
            UserMessage(content=[
                TextContentItem(text=user_prompt),
                ImageContentItem(
                    image_url=ImageUrl.load(
                        image_file=url,
                        image_format='jpg',
                        detail=ImageDetailLevel.HIGH,
                    )
                )
            ])
        ],
        model=model_deployment
    )
    return response.choices[0].message.content


In [33]:
import tempfile
endpoint = "https://archivai-ai-service.services.ai.azure.com/models"
deployment_name = "Phi-3.5-vision-instruct"
url = "https://filstrg.blob.core.windows.net/tobacco-dataset/ADVE/86122854.jpg"

system_prompt = "Your are an OCR client using your Vision Capabilities to perform your response and provide a clean and structured text without any notes."

vlm_user_prompt = """extract the text from the image and provide a clean and structured text. 
                your output format must be in structred way format, with the text extracted from the image. 
                output will be  in 'markdown' text with markdown rules like titles,headings,etc, nothing else.
                
                NOTES:
                1) in the output, don't write ```markdown``` or ```md``` or any other code block.
                
                2) don't add any notes from you, just out the text extracted from the image. without any additions
                
                3) Warning! Don't Add Notice section to tell me that the text is not clear or any other notes.

                4) even if the text is not clear, try to extract as much as possible, and don't provide any extra notes.
                
                5) output the raw text extracted from the image as well. this text will be used for embedding purposes. So, Make sure that you don't lose any essential layouts when extract as raw text.
                
                6) Extract the text in the language it is written in the image.
                                
                7) Make the output in this format: text: <text extracted from the image>                
                """
image_data = requests.get(url).content
with tempfile.NamedTemporaryFile(delete=False) as temp_file:
    temp_file.write(image_data)
    temp_file_path = temp_file.name
response = vlm_ocr(endpoint, deployment_name, system_prompt, vlm_user_prompt, temp_file_path)