In [None]:
%%capture
!pip install boto3
!pip install Levenshtein
!pip install python-Levenshtein
!pip install azure-cognitiveservices-vision-computervision boto3
!apt-get install -y tesseract-ocr
!pip install pytesseract
!pip install google-cloud-vision
!pip install -qU langchain-google-genai

In [None]:
import os
import zipfile
import json
import random
import time
from azure.cognitiveservices.vision.computervision import ComputerVisionClient
from azure.cognitiveservices.vision.computervision.models import OperationStatusCodes
from msrest.authentication import CognitiveServicesCredentials
import requests
import boto3
from difflib import SequenceMatcher
import logging
import openai
from openai import OpenAI
import Levenshtein
import pytesseract
from PIL import Image
from langchain_google_genai import ChatGoogleGenerativeAI
import re

In [None]:
# Mount Google Drive at /content/drive; the dataset and the result JSON files
# used by the cells below are read from and written to paths under this mount.
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [None]:
folder = "/content/drive/MyDrive/vision_datasets/ICDAR 2019 MLT"

The 10,000 images are ordered in the training set such that: each consecutive 1000 images contain text of one main language (and it may of course contain additional text from 1 or 2 other languages, all from the set of the 10 languages)
00001 - 01000:  Arabic
01001 - 02000:  English
02001 - 03000:  French
03001 - 04000:  Chinese
04001 - 05000:  German
05001 - 06000:  Korean
06001 - 07000:  Japanese
07001 - 08000:  Italian
08001 - 09000:  Bangla
09001 - 10000:  Hindi


In [None]:
# Main language of each consecutive block of 1000 training images, in dataset
# order. Each value is the inclusive (first, last) image number of the block.
languages = {
    name: (block * 1000 + 1, (block + 1) * 1000)
    for block, name in enumerate([
        "Arabic",
        "English",
        "French",
        "Chinese",
        "German",
        "Korean",
        "Japanese",
        "Italian",
        "Bangla",
        "Hindi",
    ])
}

# Images of this many bytes (1 MB) or more are excluded from the selection
max_size = 1024 ** 2

In [None]:
# Positions, in the insertion order of `languages`, of the languages that the
# later cells refer to as "hard".
indices = [0, 3, 5, 6, 9]

hard_languages = [list(languages.keys())[position] for position in indices]

In [None]:
hard_languages

['Arabic', 'Chinese', 'Korean', 'Japanese', 'Hindi']

In [None]:





# Maps each language name to the list of image paths selected for it
image_paths_by_language = {}

# Loop over each language and its inclusive (start, end) image-number range
for lang, (start, end) in languages.items():
    print(f"Processing {lang}...")


    # Generate the candidate file path for every image number in the range
    all_files = [
        os.path.join(folder, f"tr_img_{i:05d}.jpg")
        for i in range(start, end + 1)
    ]

    # Keep only files that exist on disk and are smaller than max_size bytes
    valid_files = [
        file for file in all_files
        if os.path.exists(file) and os.path.getsize(file) < max_size
    ]

    # Take the first 50 images (fewer if not enough valid files were found)
    selected_files = valid_files[:50]

    # Record the selected paths for this language (no files are copied)
    image_paths_by_language[lang] = selected_files

print("Image paths by language dictionary created successfully.")

Processing Arabic...
Processing English...
Processing French...
Processing Chinese...
Processing German...
Processing Korean...
Processing Japanese...
Processing Italian...
Processing Bangla...
Processing Hindi...
Image paths by language dictionary created successfully.


In [None]:
list(languages.keys())[0:5]

['Arabic', 'English', 'French', 'Chinese', 'German']

In [None]:
list(languages.keys())[5:10]

['Korean', 'Japanese', 'Italian', 'Bangla', 'Hindi']

In [None]:
def _is_coordinate(text):
    """Return True when `text` parses as a number (used to recognise coordinate fields)."""
    try:
        float(text)
    except ValueError:
        return False
    return True


def estrai_parole_da_file(percorso_file):
    """Extract the transcriptions from a ground-truth annotation file.

    Lines laid out as ``x1,y1,x2,y2,x3,y3,x4,y4,script,transcription`` are
    split on the first nine commas only, so a transcription that itself
    contains commas (e.g. ``1,000``) is returned whole instead of being cut
    at its last comma. Lines that do not have that layout fall back to the
    previous rule: the text after the last comma, with optional surrounding
    double quotes removed.

    Parameters:
        percorso_file (str): Path to the annotation text file.

    Returns:
        list[str]: Transcriptions in file order. Empty transcriptions and
        the ``###`` placeholder are skipped.
    """
    parole = []
    # Annotations contain non-Latin scripts, so decode explicitly as UTF-8;
    # the "-sig" variant also discards a leading byte-order mark if present.
    with open(percorso_file, 'r', encoding='utf-8-sig') as file:
        for riga in file:
            riga = riga.strip()
            campi = riga.split(',', 9)
            if len(campi) == 10 and all(_is_coordinate(campo) for campo in campi[:8]):
                # Everything after the ninth comma is the transcription
                transcription = campi[9].strip()
                # Keep the earlier behaviour of dropping one pair of wrapping quotes
                if len(transcription) >= 2 and transcription[0] == '"' and transcription[-1] == '"':
                    transcription = transcription[1:-1]
            else:
                # Unknown layout: last word or quoted word after the last comma
                match = re.search(r',\s*"?([^",]+)"?\s*$', riga)
                transcription = match.group(1) if match else ""
            # Ignore empty transcriptions and the ### placeholder
            if transcription and transcription != "###":
                parole.append(transcription)
    return parole

In [None]:
# Annotation file of the first training image, used as a quick sanity check
percorso_file = "/content/drive/MyDrive/vision_datasets/ICDAR 2019 MLT/tr_img_00001.txt"

# Extract the words and show them
parole_estratte = estrai_parole_da_file(percorso_file)
print(parole_estratte)

['CT', 'تليفون:٠١٣٨٩٠٤٠٦٦', 'Systems', 'Division', 'Okaz', 'Security', 'O', 'Telecom', 'HIKVISION', 'ViRDI', 'ZKSoftware', '8904066', 'Tel:013', 'عكاظ', 'للإتصالات', 'E-Guard', 'ELECTRONIC', 'GUARD', 'قسم', 'الأنظمة', 'الأمنية', 'C']


In [None]:
output_file = "/content/drive/MyDrive/image_paths_by_language.json"

In [None]:


# Save the language -> selected image paths dictionary to a JSON file so the
# same selection can be reloaded without scanning the dataset folder again
with open(output_file, "w") as f:
    json.dump(image_paths_by_language, f, indent=4)

print(f"Dictionary saved to {output_file}")

Dictionary saved to /content/drive/MyDrive/image_paths_by_language.json


In [None]:

# Reload the saved selection; this replaces the in-memory dictionary
with open(output_file, "r") as f:
    image_paths_by_language = json.load(f)

In [None]:
len(image_paths_by_language)

10

In [None]:
image_paths_by_language.keys()

dict_keys(['Arabic', 'English', 'French', 'Chinese', 'German', 'Korean', 'Japanese', 'Italian', 'Bangla', 'Hindi'])

# LLM Initialization

In [None]:
import os

# Export the Google key through the environment unless it is already set.
# NOTE(review): GOOGLE_API_KEY is not defined in any visible cell; it must be
# defined before this cell runs, otherwise this line raises NameError.
if "GOOGLE_API_KEY" not in os.environ:
    os.environ["GOOGLE_API_KEY"] = GOOGLE_API_KEY

In [None]:
# Gemini chat model used to post-correct OCR output: temperature 0, no explicit
# token limit or timeout, and up to 2 retries per call.
llm = ChatGoogleGenerativeAI(
    model="gemini-1.5-flash",
    temperature=0,
    max_tokens=None,
    timeout=None,
    max_retries=2
)

In [None]:
prompt_template='''You are an advanced language model specialized in text correction. You will receive text extracted from images using OCR (Optical Character Recognition) models. Your task is to identify and correct any misspelled words or inaccuracies while preserving the context and intended meaning of the text.
### Instructions:
1. Correct spelling errors: Replace any misspelled words with the correct ones.
2. Maintain context: Ensure that the corrected text aligns with the overall meaning and structure of the original input.
3. Handle OCR-specific errors:
   - Fix common OCR mistakes such as incorrect substitutions of similar-looking characters (e.g., "rn" misread as "m").
   - Handle mixed-case errors, such as "tHiS iS" to "This is."
4. Do not alter proper nouns, numbers, or special characters unless they are obviously incorrect.
5. Do not add or remove punctuations

Output the corrected text clearly and concisely.

### Example Inputs and Outputs:

**Input:**
Th1s 1s an exarnple of OCR t3xt.

**Output:**
This is an example of OCR text.

**Input:**
Th@ rn0del w1ll c0rrect err0rs.

**Output:**
The model will correct errors.

Focus on accuracy and consistency. Your goal is to produce clean and coherent text that closely resembles the intended content. Only output the corrected text and nothing else'''

In [None]:
from langchain_core.prompts import ChatPromptTemplate

# Two-message chat prompt: the correction instructions as the system message,
# and the OCR text (template variable `input`) as the human message.
prompt = ChatPromptTemplate.from_messages(
    [
        (
            "system",
            prompt_template,
        ),
        ("human", "{input}"),
    ]
)

In [None]:
from langchain_core.output_parsers.string import StrOutputParser

In [None]:
# Correction chain: fill the prompt, call the LLM, return the reply as a plain string
model=prompt|llm|StrOutputParser()


In [None]:
# Functions for Metrics
def calculate_levenshtein_accuracy(extracted_text, ground_truth):
    """Return the Levenshtein similarity of two texts as a percentage.

    Both texts are stripped and lower-cased first. The edit distance is
    divided by the length of the longer text; the result is 0 when both
    normalised texts are empty.
    """
    candidate = extracted_text.strip().lower()
    reference = ground_truth.strip().lower()
    edit_distance = Levenshtein.distance(candidate, reference)
    longest = max(len(candidate), len(reference))
    if longest > 0:
        return (1 - edit_distance / longest) * 100
    return 0

def calculate_word_accuracy(expected_words, ocr_words):
    """Return the percentage of distinct expected words that the OCR output contains.

    The comparison is case-insensitive and ignores duplicates and word order.
    The result is 0 when there are no expected words.
    """
    expected_set = {word.lower() for word in expected_words}
    ocr_set = {word.lower() for word in ocr_words}
    if not expected_set:
        return 0
    found = expected_set.intersection(ocr_set)
    return (len(found) / len(expected_set)) * 100

def calculate_character_error_rate(extracted_text, ground_truth):
    """Return the character error rate of the OCR text as a percentage.

    The edit distance between the raw texts (no stripping or case folding)
    is divided by the ground-truth length, so the value can exceed 100.
    The result is 0 when the ground truth is empty.
    """
    edit_count = Levenshtein.distance(extracted_text, ground_truth)
    reference_length = len(ground_truth)
    if reference_length > 0:
        return (edit_count / reference_length) * 100
    return 0

# Evaluation Pipeline
def _ocr_metrics(text, ground_truth_words, ground_truth_text):
    """Return (Levenshtein accuracy, word accuracy, character error rate) for `text`."""
    return (
        calculate_levenshtein_accuracy(text, ground_truth_text),
        calculate_word_accuracy(ground_truth_words, text.split()),
        calculate_character_error_rate(text, ground_truth_text),
    )


def evaluate_ocr(ocr_dict, type=None, llm_delay=5):
    """
    Evaluate OCR output against ground truth for each entry in a dictionary.

    For every image the metrics are computed twice: once on the raw OCR text
    and once on the same text after correction by the LLM chain `model`.
    The ground truth is read from the ``.txt`` file in `folder` that has the
    same base name as the image.

    Parameters:
        ocr_dict (dict): Dictionary with keys as image names (with or without
            extension) and values as OCR outputs.
        type (str, optional): 'scene' or 'document'. Accepted so that calls
            passing it keep working; it does not affect the evaluation.
        llm_delay (float, optional): Seconds to wait after each LLM call.
            The default of 5 keeps the calls under the 15-requests-per-minute
            limit of the free plan.

    Returns:
        dict: A dictionary with image names (without extension) as keys and
        evaluation metrics as values.
    """
    evaluation_results = {}

    for image_name, ocr_output in ocr_dict.items():
        ground_truth_file = os.path.join(folder, image_name.rsplit('.', 1)[0] + '.txt')
        ground_truth_words = estrai_parole_da_file(ground_truth_file)
        ground_truth_text = " ".join(ground_truth_words)

        # Baseline: raw OCR output
        lev_acc_baseline, word_acc_baseline, cer_baseline = _ocr_metrics(
            ocr_output, ground_truth_words, ground_truth_text
        )

        # LLM: OCR output after correction
        llm_output = model.invoke({'input': ocr_output})
        time.sleep(llm_delay)  # stay within the LLM rate limit
        lev_acc_llm, word_acc_llm, cer_llm = _ocr_metrics(
            llm_output, ground_truth_words, ground_truth_text
        )

        # Store the results in the evaluation dictionary
        evaluation_results[os.path.splitext(image_name)[0]] = {
            "Levenshtein Accuracy": lev_acc_baseline,
            "Word Accuracy": word_acc_baseline,
            "Character Error Rate": cer_baseline,
            "Levenshtein Accuracy LLM": lev_acc_llm,
            "Word Accuracy LLM": word_acc_llm,
            "Character Error Rate LLM": cer_llm
        }
    return evaluation_results

## AZURE

In [None]:
# Azure Computer Vision client used by extract_text_azure.
# NOTE(review): AZURE_API_KEY is not defined in any visible cell; it must be
# defined before this cell runs.
subscription_key = AZURE_API_KEY
endpoint = "https://patrikbaldon.cognitiveservices.azure.com/"
computervision_client = ComputerVisionClient(endpoint, CognitiveServicesCredentials(subscription_key))

In [None]:
azure_extraction = {}

# Extract the text of one image and add it to the azure_extraction dictionary
def extract_text_azure(image_path):
    """Run the Azure Read operation on one image and record its text.

    The recognised lines are joined with single spaces and stored in the
    module-level `azure_extraction` dict under the image's file name. An
    operation that does not succeed is recorded as an empty string.

    Parameters:
        image_path (str): Path to the image file.

    Returns:
        str: The extracted text (also stored in `azure_extraction`).
    """
    # Submit the image; the file is only needed for this call, so it is
    # closed before polling starts.
    with open(image_path, "rb") as image_file:
        read_response = computervision_client.read_in_stream(image_file, raw=True)
    operation_location = read_response.headers["Operation-Location"]
    operation_id = operation_location.split("/")[-1]

    # Wait for the operation to leave the notStarted/running states
    while True:
        result = computervision_client.get_read_result(operation_id)
        if result.status not in ['notStarted', 'running']:
            break
        time.sleep(1)

    # Collect the text line by line
    extracted_text = ""
    if result.status == OperationStatusCodes.succeeded:
        extracted_text = " ".join(
            line.text
            for page in result.analyze_result.read_results
            for line in page.lines
        ).strip()

    # Save in the dictionary
    azure_extraction[os.path.basename(image_path)] = extracted_text
    return extracted_text

In [None]:
# Keep results from an earlier (partial) run if they are still in memory,
# otherwise start from an empty dict so the cell also works on a fresh kernel.
Azure_Evaluation = globals().get("Azure_Evaluation", {})
for lang in languages:
  # Skip languages that were already evaluated, so the run is resumable
  if lang in Azure_Evaluation:
    continue
  images_paths = image_paths_by_language[lang]
  # Fresh per-language dict; extract_text_azure writes into it
  azure_extraction = {}
  print(f"Extracting text from {lang} images")
  for image_path in images_paths:
    extract_text_azure(image_path)
  print(f"Evaluating model on {lang} images")
  Azure_Evaluation[lang] = evaluate_ocr(azure_extraction)


Extracting text from Korean images
Evaluating model on Korean images
Extracting text from Japanese images
Evaluating model on Japanese images
Extracting text from Italian images
Evaluating model on Italian images
Extracting text from Bangla images
Evaluating model on Bangla images
Extracting text from Hindi images
Evaluating model on Hindi images


In [None]:
len(Azure_Evaluation)

10

In [None]:
len(Azure_Evaluation['German'])

50

In [None]:
# output_file is reassigned here; from this point it refers to the Azure results file
output_file = "/content/drive/MyDrive/Azure_Evaluations.json"

# Save the per-language Azure evaluation dictionary to a JSON file
with open(output_file, "w") as f:
    json.dump(Azure_Evaluation, f, indent=4)

print(f"Dictionary saved to {output_file}")

Dictionary saved to /content/drive/MyDrive/Azure_Evaluations.json


In [None]:
# Reload the saved Azure results; this replaces the in-memory Azure_Evaluation
azure_file="/content/drive/MyDrive/Azure_Evaluations.json"
with open(azure_file, "r") as f:
    Azure_Evaluation = json.load(f)

In [None]:
keys = list(Azure_Evaluation['Arabic'].keys())
values = list(Azure_Evaluation['Arabic'].values())

print(f"First record:\nKey: {keys[0]}\nValue: {values[0]}")

First record:
Key: tr_img_00001
Value: {'Levenshtein Accuracy': 26.630434782608692, 'Word Accuracy': 77.27272727272727, 'Character Error Rate': 82.53012048192771, 'Levenshtein Accuracy LLM': 28.260869565217394, 'Word Accuracy LLM': 77.27272727272727, 'Character Error Rate LLM': 80.72289156626506}


# AWS

In [None]:
# Base names (no directory, no extension) of the selected images per language;
# list_images_in_folder uses them to pick the matching S3 objects.
file_names={}
for lang in languages.keys():
  file_names[lang] = [os.path.splitext(os.path.basename(path))[0] for path in image_paths_by_language[lang]]
print(file_names['Arabic'][0:10])

In [None]:
def list_images_in_folder(bucket,language):
    """List the S3 image objects that belong to the selection for one language.

    All objects under the "ICDAR 2019 MLT/" prefix are listed page by page
    (a single list_objects_v2 call returns at most 1000 keys), restricted to
    .jpg/.png/.jpeg keys, sorted by key, and kept when their base name is in
    `file_names[language]`.

    Parameters:
        bucket (str): Name of the S3 bucket.
        language (str): Key of `file_names` selecting the wanted images.

    Returns:
        list[tuple[str, str]]: (object key, base name without extension)
        pairs, ordered by object key.
    """
    session = boto3.Session(
        aws_access_key_id=AWS_API_KEY,
        aws_secret_access_key=AWS_SECRET,
        region_name='us-east-1'
    )
    s3 = session.client('s3')

    # Set for constant-time membership tests
    wanted_names = set(file_names[language])

    # Walk every result page, not only the first one
    image_keys = []
    paginator = s3.get_paginator('list_objects_v2')
    for page in paginator.paginate(Bucket=bucket, Prefix="ICDAR 2019 MLT/"):
        for obj in page.get('Contents', []):
            if obj['Key'].endswith(('.jpg', '.png', '.jpeg')):
                image_keys.append(obj['Key'])

    # Sort by key, then keep only the images selected for this language
    image_keys.sort()
    selected = []
    for key in image_keys:
        base_name = key.split('/')[-1].split('.')[0]
        if base_name in wanted_names:
            selected.append((key, base_name))
    return selected


In [None]:
def extract_text_textract(bucket, image_name):
    """Detect text in one S3 object with Amazon Textract.

    Parameters:
        bucket (str): Name of the S3 bucket.
        image_name (str): Key of the image object inside the bucket.

    Returns:
        str: The text of all LINE blocks in the response, joined by spaces.
    """
    textract = boto3.Session(
        aws_access_key_id=AWS_API_KEY,
        aws_secret_access_key=AWS_SECRET,
        region_name='us-east-1'
    ).client('textract')
    s3_document = {'S3Object': {'Bucket': bucket, 'Name': image_name}}
    response = textract.detect_document_text(Document=s3_document)
    # Only LINE blocks carry whole lines of text
    line_texts = [
        block['Text']
        for block in response['Blocks']
        if block['BlockType'] == 'LINE'
    ]
    return " ".join(line_texts)

In [None]:
# AWS configuration
region_name = "us-east-1"
bucket_name = "patriksbucket"

aws_extraction = {}

# Quick check on a small sample. The language is named explicitly so the cell
# does not depend on a `lang` variable left over from an earlier loop; the
# sample key shown below (tr_img_00011) lies in the Arabic range.
sample_language = "Arabic"
files = list_images_in_folder(bucket_name, sample_language)


for path, image_name in files[10:20]:
  # Extract text from the image
  aws_extraction[image_name] = extract_text_textract(bucket_name, path)

In [None]:
# AWS configuration. region_name is not read in this cell; the helper
# functions set their own region.
region_name = "us-east-1"
bucket_name = "patriksbucket"

# Per-language Textract evaluation: {language: {image name: metrics}}
Aws_Evaluation = {}
for lang in languages.keys():
  # (S3 key, base name) pairs of the images selected for this language
  images_paths = list_images_in_folder(bucket_name,lang)
  aws_extraction = {}
  print(f"Extracting text from {lang} images")
  for path, image_name in images_paths:
    aws_extraction[image_name] = extract_text_textract(bucket_name, path)
  print(f"Evaluating model on {lang} images")
  Aws_Evaluation[lang] = evaluate_ocr(aws_extraction)

In [None]:
# Evaluate only the images currently held in aws_extraction. The result is a
# flat {image name: metrics} dict, so it is stored under its own name and the
# per-language Aws_Evaluation dictionary is not overwritten.
Aws_Sample_Evaluation = evaluate_ocr(aws_extraction)

In [None]:
keys = list(Aws_Evaluation['Arabic'].keys())
values = list(Aws_Evaluation['Arabic'].values())

print(f"First record:\nKey: {keys[0]}\nValue: {values[0]}")

First record:
Key: tr_img_00011
Value: {'Levenshtein Accuracy': 36.92307692307693, 'Word Accuracy': 33.33333333333333, 'Character Error Rate': 63.07692307692307, 'Levenshtein Accuracy LLM': 36.92307692307693, 'Word Accuracy LLM': 25.0, 'Character Error Rate LLM': 63.07692307692307}


In [None]:
aws_extraction['tr_img_00011']

'asijasi will Fell Citron Violette Lavande 5789'

# OCR_SPACE

In [None]:
#folder="/content/test_data/test"

In [None]:
API_KEY = OCR_SPACE_API_KEY
ocrspace_extraction = {}

In [None]:


def ocr_space_file(image_path, output_path=None, overlay=False, api_key=None, language='eng', timeout=60):
    """
    OCR.space API request with a local file, optionally saving results as JSON.

    Parameters:
        image_path (str): Path to the file.
        output_path (str, optional): Path to save the resulting JSON file.
        overlay (bool, optional): Include OCR overlay in the response.
        api_key (str): Your OCR.space API key.
        language (str): Language code for OCR.
        timeout (float, optional): Seconds to wait for the HTTP request
            before giving up.

    Returns:
        str: Extracted text from the image (newlines replaced by spaces),
        or None if an error occurred.
    """
    payload = {
        'isOverlayRequired': overlay,
        'apikey': api_key,
        'language': language,
        'OCREngine': 2
    }

    try:
        # Open the image file in binary mode
        with open(image_path, 'rb') as f:
            r = requests.post(
                'https://api.ocr.space/parse/image',
                files={os.path.basename(image_path): f},
                data=payload,
                timeout=timeout,
            )
        result = r.json()
    except (OSError, ValueError) as e:
        # OSError covers unreadable files and requests' network errors (they
        # derive from it); ValueError covers a response body that is not JSON.
        print(f"Exception occurred for {image_path}: {e}")
        return None

    # The service can answer with a bare JSON string instead of an object
    if not isinstance(result, dict):
        print(f"Error processing {image_path}: {result}")
        return None

    # Optionally keep the raw response; a failed save must not lose the OCR text
    if output_path is not None:
        try:
            with open(output_path, 'w', encoding='utf-8') as out:
                json.dump(result, out, ensure_ascii=False, indent=4)
        except OSError as e:
            print(f"Could not save OCR response to {output_path}: {e}")

    # No parsed results: report the API's own error message
    parsed_results = result.get("ParsedResults") or []
    if not parsed_results:
        print(f"Error processing {image_path}: {result.get('ErrorMessage', 'Unknown error')}")
        return None

    # Extract the text
    parsed_text = parsed_results[0].get("ParsedText")
    if parsed_text is None:
        print(f"Error processing {image_path}: response contains no ParsedText")
        return None
    return parsed_text.replace('\n', ' ')


In [None]:
hard_languages[3:5]

['Japanese', 'Hindi']

In [None]:
# Load the OCR.space results saved by an earlier run; the extraction loop
# below adds further languages to this dictionary.
output_file = "/content/drive/MyDrive/ocrspace_Evaluations.json"
with open(output_file, "r") as f:
    OcrSpace_Evaluation = json.load(f)

In [None]:
len(OcrSpace_Evaluation)

3

In [None]:
# OcrSpace_Evaluation is expected to already hold the results of the
# previously processed languages; this cell adds the remaining ones.
# NOTE(review): ocr_space_file is called with its default language ('eng')
# for every language here - confirm that this is intended.
for lang in hard_languages[3:5]:
  images_paths = image_paths_by_language[lang]
  ocrspace_extraction = {}
  print(f"Extracting text from {lang} images")
  for image_path in images_paths:
    # API_KEY is the OCR.space key variable defined earlier in the notebook
    text=ocr_space_file(image_path, api_key=API_KEY)
    time.sleep(2)
    # Images whose request failed or returned no text are left out
    if text:
      ocrspace_extraction[os.path.basename(image_path)] =text

  print(f"Evaluating model on {lang} images")
  OcrSpace_Evaluation[lang] = evaluate_ocr(ocrspace_extraction)

Extracting text from Japanese images
Exception occurred for /content/drive/MyDrive/vision_datasets/ICDAR 2019 MLT/tr_img_06001.jpg: list index out of range
Exception occurred for /content/drive/MyDrive/vision_datasets/ICDAR 2019 MLT/tr_img_06004.jpg: list index out of range
Exception occurred for /content/drive/MyDrive/vision_datasets/ICDAR 2019 MLT/tr_img_06006.jpg: Expecting value: line 1 column 1 (char 0)
Exception occurred for /content/drive/MyDrive/vision_datasets/ICDAR 2019 MLT/tr_img_06013.jpg: ('Connection aborted.', ConnectionResetError(104, 'Connection reset by peer'))
Exception occurred for /content/drive/MyDrive/vision_datasets/ICDAR 2019 MLT/tr_img_06015.jpg: ('Connection aborted.', ConnectionResetError(104, 'Connection reset by peer'))
Exception occurred for /content/drive/MyDrive/vision_datasets/ICDAR 2019 MLT/tr_img_06016.jpg: HTTPSConnectionPool(host='api.ocr.space', port=443): Max retries exceeded with url: /parse/image (Caused by ConnectTimeoutError(<urllib3.connecti

In [None]:
# Same file the OCR.space results were loaded from; it is overwritten with
# the updated dictionary.
output_file = "/content/drive/MyDrive/ocrspace_Evaluations.json"

# Save the dictionary to a JSON file
with open(output_file, "w") as f:
    json.dump(OcrSpace_Evaluation, f, indent=4)

print(f"Dictionary saved to {output_file}")

Dictionary saved to /content/drive/MyDrive/ocrspace_Evaluations.json


In [None]:
len(OcrSpace_Evaluation)

5

In [None]:
keys = list(OcrSpace_Evaluation['Arabic'].keys())
values = list(OcrSpace_Evaluation['Arabic'].values())

print(f"First record:\nKey: {keys[0]}\nValue: {values[0]}")

First record:
Key: tr_img_00011
Value: {'Levenshtein Accuracy': 38.46153846153846, 'Word Accuracy': 91.66666666666666, 'Character Error Rate': 61.53846153846154, 'Levenshtein Accuracy LLM': 37.878787878787875, 'Word Accuracy LLM': 91.66666666666666, 'Character Error Rate LLM': 63.07692307692307}


In [None]:
ocrspace_extraction['tr_img_00011.jpg']

'Fell الفل الليمون Citron البنفسج Violette 5 7 الخزامة Lavande 8 9'

# GOOGLE CLOUD VISION API


In [None]:
cloudvisionapikey = CLOUD_VISION_API_KEY
cloudvision_extraction = {}

In [None]:
from google.cloud import vision
from google.auth.transport.requests import Request
from google.auth.credentials import AnonymousCredentials
import requests
import base64 # Import base64

def detect_text(image_path, timeout=60):
    """
    Detect text in an image using the Vision API with an API key.

    The key is read from the module-level `cloudvisionapikey`. The detected
    text is returned with newlines replaced by spaces, so it has the same
    space-separated form as the output of the other OCR helpers in this
    notebook.

    Args:
        image_path (str): Path to the image file.
        timeout (float, optional): Seconds to wait for the HTTP request.

    Returns:
        str: Detected text, or an empty string when no text was detected.
        (An empty string, unlike a message such as "No text detected.",
        is not mistaken for OCR output when the result is scored.)

    Raises:
        requests.HTTPError: If the HTTP request itself fails.
    """
    # Read the image file
    with open(image_path, "rb") as image_file:
        image_content = image_file.read()

    # Encode image content to base64
    image_content_base64 = base64.b64encode(image_content).decode('utf-8')

    # Create the request payload
    url = f"https://vision.googleapis.com/v1/images:annotate?key={cloudvisionapikey}"
    headers = {"Content-Type": "application/json"}
    payload = {
        "requests": [
            {
                "image": {"content": image_content_base64},  # base64 encoded content
                "features": [{"type": "TEXT_DETECTION"}]
            }
        ]
    }

    # Send the request
    response = requests.post(url, json=payload, headers=headers, timeout=timeout)
    response.raise_for_status()

    # Parse the response; a missing or empty "responses" list means no result
    responses = response.json().get("responses") or [{}]
    first_response = responses[0]

    # A per-image error is reported but does not abort a batch run
    if "error" in first_response:
        print(f"Vision API error for {image_path}: {first_response['error']}")

    annotations = first_response.get("textAnnotations") or []
    if not annotations:
        return ""

    # The first annotation holds the full detected text
    detected_text = annotations[0].get("description", "")
    return detected_text.replace('\n', ' ').strip()

In [None]:
# Per-language Cloud Vision evaluation: {language: {image name: metrics}}
OcrCloudVision_Evaluation = {}
for lang in hard_languages:
  images_paths = image_paths_by_language[lang]
  # Fresh per-language dict: image file name -> detected text
  cloudvision_extraction = {}
  print(f"Extracting text from {lang} images")
  for image_path in images_paths:
    cloudvision_extraction[os.path.basename(image_path)] = detect_text(image_path)
  print(f"Evaluating model on {lang} images")
  OcrCloudVision_Evaluation[lang] = evaluate_ocr(cloudvision_extraction)

Extracting text from Arabic images
Evaluating model on Arabic images
Extracting text from Chinese images
Evaluating model on Chinese images
Extracting text from Korean images
Evaluating model on Korean images
Extracting text from Japanese images
Evaluating model on Japanese images
Extracting text from Hindi images
Evaluating model on Hindi images


In [None]:
# output_file is reassigned here; it now refers to the Cloud Vision results file
output_file = "/content/drive/MyDrive/Cloudvision_Evaluations.json"

# Save the per-language Cloud Vision evaluation dictionary to a JSON file
with open(output_file, "w") as f:
    json.dump(OcrCloudVision_Evaluation, f, indent=4)

print(f"Dictionary saved to {output_file}")

Dictionary saved to /content/drive/MyDrive/Cloudvision_Evaluations.json


In [None]:
# Reload the saved Cloud Vision results; this replaces the in-memory dictionary
cloud_file="/content/drive/MyDrive/Cloudvision_Evaluations.json"
with open(cloud_file, "r") as f:
    OcrCloudVision_Evaluation = json.load(f)

In [None]:
keys = list(OcrCloudVision_Evaluation['Arabic'].keys())
values = list(OcrCloudVision_Evaluation['Arabic'].values())

print(f"First record:\nKey: {keys[0]}\nValue: {values[0]}")

First record:
Key: tr_img_00011
Value: {'Levenshtein Accuracy': 27.692307692307693, 'Word Accuracy': 58.333333333333336, 'Character Error Rate': 72.3076923076923, 'Levenshtein Accuracy LLM': 27.692307692307693, 'Word Accuracy LLM': 58.333333333333336, 'Character Error Rate LLM': 72.3076923076923}


In [None]:
cloudvision_extraction['tr_img_00011.jpg']

'Fell\nالفل\nالليمون\nCitron\nالبنفسج\nViolette\nالخزامة\nLavande\n789\n67'

# TESSERACT

In [None]:
tesseract_extraction = {}

In [None]:
print(pytesseract.get_languages(config=''))

['eng', 'osd']


In [None]:
# Function to extract text using Tesseract OCR
def extract_text_tesseract(image_path, lang='eng+ara'):
  """Run Tesseract OCR on one image and return the stripped text.

  Parameters:
      image_path (str): Path to the image file.
      lang (str, optional): Tesseract language specification passed to
          pytesseract; codes are joined with '+'.
          NOTE(review): the language listing printed earlier in this
          notebook shows only 'eng' and 'osd' - confirm that the data for
          every requested language is installed.

  Returns:
      str: Recognised text with leading and trailing whitespace removed.
  """
  # Close the image file as soon as OCR is done
  with Image.open(image_path) as image:
    text = pytesseract.image_to_string(image, lang=lang)
  return text.strip()

In [None]:
# Quick check on a small sample. The images are taken from the saved selection
# explicitly, so the cell does not depend on an `images_paths` variable left
# over from an earlier loop; the sample key shown below (tr_img_00011) lies in
# the Arabic range.
for image_path in image_paths_by_language["Arabic"][10:20]:
  tesseract_extraction[os.path.basename(image_path)] = extract_text_tesseract(image_path)

In [None]:
# Flat {image name: metrics} evaluation of the Tesseract sample
Tesseract_Evaluation = evaluate_ocr(tesseract_extraction)

In [None]:
keys = list(Tesseract_Evaluation.keys())
values = list(Tesseract_Evaluation.values())

print(f"First record:\nKey: {keys[0]}\nValue: {values[0]}")

First record:
Key: tr_img_00011
Value: {'Levenshtein Accuracy': 0.0, 'Word Accuracy': 0.0, 'Character Error Rate': 100.0, 'Levenshtein Accuracy LLM': 13.846153846153841, 'Word Accuracy LLM': 0.0, 'Character Error Rate LLM': 87.6923076923077}


In [None]:
tesseract_extraction['tr_img_00011.jpg']

''

# COMPARE THE EVALUATIONS

In [None]:
def calculate_score(metrics, alpha, beta, gamma):
    """Combine one image's baseline metrics into a single weighted score.

    The score is alpha * Levenshtein accuracy + beta * word accuracy
    - gamma * character error rate; a missing metric counts as 0.
    """
    lev_accuracy = metrics.get("Levenshtein Accuracy", 0)
    word_accuracy = metrics.get("Word Accuracy", 0)
    cer = metrics.get("Character Error Rate", 0)

    total_score = alpha * lev_accuracy + beta * word_accuracy - gamma * cer
    return total_score


def _iter_image_metrics(api_results):
    """Yield one metrics dict per image from flat or per-language results."""
    for entry in api_results.values():
        if entry and all(isinstance(value, dict) for value in entry.values()):
            # `entry` is a {image name: metrics} dict for one language
            yield from entry.values()
        else:
            # `entry` is already the metrics dict of a single image
            yield entry


def evaluate_all_metrics(evaluations, alpha, beta, gamma):
    """Print the average score of every API and return the best one.

    Parameters:
        evaluations (dict): API name -> results. Results may be flat
            ({image name: metrics}) or grouped by language
            ({language: {image name: metrics}}); in the grouped form all
            images of all languages are averaged together.
        alpha (float): Weight for Levenshtein accuracy.
        beta (float): Weight for word accuracy.
        gamma (float): Weight subtracted for character error rate.

    Returns:
        tuple: (name of the best API, its average score). With no APIs the
        result is (None, -inf).
    """
    best_api = None
    best_score = float('-inf')

    print("API Scores:\n")

    for api_name, api_results in evaluations.items():
        scores = [
            calculate_score(metrics, alpha, beta, gamma)
            for metrics in _iter_image_metrics(api_results)
        ]
        # An API without any image gets an average of 0
        avg_score = sum(scores) / len(scores) if scores else 0

        print(f"{api_name} - Average Score: {avg_score:.2f}")

        if avg_score > best_score:
            best_score = avg_score
            best_api = api_name

    return best_api, best_score

In [None]:
# Esempio di dati delle metriche
evaluations = {
    "Azure OCR": Azure_Evaluation,
    #"AWS Textract": Aws_Evaluation,
    "OCR.space": OcrSpace_Evaluation,
    "Google Cloud Vision": OcrCloudVision_Evaluation,
    #"Tesseract": Tesseract_Evaluation
}

# Confronta le metriche per tutte le API
best_api, best_score = evaluate_all_metrics(evaluations, alpha=0.01, beta=1, gamma=0.01)
print(f"\nThe best OCR API is {best_api} with an average score of {best_score:.2f}.")


API Scores:

Azure OCR - Average Score: 13.88
AWS Textract - Average Score: -44.99
OCR.space - Average Score: -1.28
Google Cloud Vision - Average Score: 14.28
Tesseract - Average Score: -119.28

The best OCR API is Google Cloud Vision with an average score of 14.28.


# COMPARE THE EVALUATIONS

In [None]:
import pandas as pd

In [None]:
def calculate_score(metrics, alpha, beta, gamma):
    """Weighted score of one image's baseline (non-LLM) metrics.

    Levenshtein accuracy and word accuracy are added with weights alpha and
    beta; character error rate is subtracted with weight gamma. A metric
    missing from `metrics` counts as 0.
    """
    reward_lev = alpha * metrics.get("Levenshtein Accuracy", 0)
    reward_word = beta * metrics.get("Word Accuracy", 0)
    penalty_cer = gamma * metrics.get("Character Error Rate", 0)
    return reward_lev + reward_word - penalty_cer

def calculate_score_llm(metrics, alpha, beta, gamma):
    """Weighted score of one image's LLM-corrected metrics.

    Same formula as the baseline score, applied to the "... LLM" entries of
    `metrics`. A metric missing from `metrics` counts as 0.
    """
    reward_lev = alpha * metrics.get("Levenshtein Accuracy LLM", 0)
    reward_word = beta * metrics.get("Word Accuracy LLM", 0)
    penalty_cer = gamma * metrics.get("Character Error Rate LLM", 0)
    return reward_lev + reward_word - penalty_cer


def evaluate_all_metrics(evaluations, alpha, beta, gamma):
    """Average every metric per API, save the table to Excel, return the best API.

    Parameters:
        evaluations (dict): API name -> results. Results may be flat
            ({image name: metrics}) or grouped by language
            ({language: {image name: metrics}}); in the grouped form all
            images of all languages are averaged together.
        alpha (float): Weight for Levenshtein accuracy.
        beta (float): Weight for word accuracy.
        gamma (float): Weight subtracted for character error rate.

    Returns:
        tuple: (name of the API with the highest baseline average score,
        that score). The LLM scores are reported but not used for ranking.

    Side effects:
        Prints one line per API and writes
        "multilingual_ocr_evaluation_results.xlsx" to the working directory.
    """
    # Per-image metrics that are averaged for the report
    metric_names = (
        "Levenshtein Accuracy",
        "Word Accuracy",
        "Character Error Rate",
        "Levenshtein Accuracy LLM",
        "Word Accuracy LLM",
        "Character Error Rate LLM",
    )

    def iter_image_metrics(api_results):
        """Yield one metrics dict per image from flat or per-language results."""
        for entry in api_results.values():
            if entry and all(isinstance(value, dict) for value in entry.values()):
                # `entry` is a {image name: metrics} dict for one language
                yield from entry.values()
            else:
                yield entry

    best_api = None
    best_score = float('-inf')
    results=[]

    print("API Scores:\n")

    for api_name, api_results in evaluations.items():
        total_score = 0
        total_score_llm = 0
        totals = dict.fromkeys(metric_names, 0)
        num_images = 0

        for metrics in iter_image_metrics(api_results):
            num_images += 1
            total_score += calculate_score(metrics, alpha, beta, gamma)
            total_score_llm += calculate_score_llm(metrics, alpha, beta, gamma)
            for metric_name in metric_names:
                totals[metric_name] += metrics.get(metric_name, 0)

        # An API without any image gets 0 for every average
        divisor = num_images if num_images > 0 else 1
        avg_score = total_score / divisor
        avg_score_llm = total_score_llm / divisor
        averages = {metric_name: totals[metric_name] / divisor for metric_name in metric_names}

        print(f"{api_name} - Average Score: {avg_score:.2f}, with LLM: {avg_score_llm:.2f}")

        results.append({
            "API Name": api_name,
            "Levenshtein Accuracy": averages["Levenshtein Accuracy"],
            "Levenshtein Accuracy with LLM": averages["Levenshtein Accuracy LLM"],
            "Word Accuracy": averages["Word Accuracy"],
            "Word Accuracy with LLM": averages["Word Accuracy LLM"],
            "Character Error Rate": averages["Character Error Rate"],
            "Character Error Rate with LLM": averages["Character Error Rate LLM"],
            "Average Score": avg_score,
            "Average Score with LLM": avg_score_llm
        })

        if avg_score > best_score:
            best_score = avg_score
            best_api = api_name

    df = pd.DataFrame(results)
    df.to_excel("multilingual_ocr_evaluation_results.xlsx", index=False)

    print("\nResults saved to excel file")
    return best_api, best_score




In [None]:
def evaluate_all_metrics(evaluations, alpha, beta, gamma, hard_languages):
    """
    Rank OCR APIs per language and export one Excel report per language.

    For every language in ``hard_languages``, the per-image metrics of each API
    that has data for that language are averaged and combined into a weighted
    score (once with the plain metrics, once with their "LLM" variants). The
    per-API averages are written to ``<language>_ocr_evaluation_results.xlsx``
    in the current working directory.

    Args:
        evaluations (dict): Maps API name -> {language -> {image name -> metrics dict}}.
                            Metric keys missing from a metrics dict count as 0.
        alpha (float): Weight for Levenshtein Accuracy.
        beta (float): Weight for Word Accuracy.
        gamma (float): Weight for ``1 - Character Error Rate``.
        hard_languages (list): Languages to process, in order.

    Returns:
        dict: ``{language: {"Best API": name, "Best Score": score}}``. The winner is
              picked on the score without LLM; on a tie the first API encountered
              wins. If no API has data for a language, the entry is
              ``{"Best API": None, "Best Score": float('-inf')}``.
    """
    def mean(total, count):
        # An API with zero images averages to 0 instead of dividing by zero.
        return total / count if count > 0 else 0

    summary = {}

    for language in hard_languages:
        rows = []
        top_api, top_score = None, float('-inf')

        print(f"Processing evaluations for {language}...\n")

        for api_name, per_language in evaluations.items():
            # APIs that were never evaluated on this language are left out of the report.
            if language not in per_language:
                continue

            per_image = per_language[language]
            n_images = len(per_image)

            # Running totals, keyed by the column name used in the report.
            sums = {
                "Levenshtein Accuracy": 0,
                "Levenshtein Accuracy with LLM": 0,
                "Word Accuracy": 0,
                "Word Accuracy with LLM": 0,
                "Character Error Rate": 0,
                "Character Error Rate with LLM": 0,
                "Average Score": 0,
                "Average Score with LLM": 0,
            }

            for metrics in per_image.values():
                lev = metrics.get("Levenshtein Accuracy", 0)
                word = metrics.get("Word Accuracy", 0)
                cer = metrics.get("Character Error Rate", 0)

                lev_llm = metrics.get("Levenshtein Accuracy LLM", 0)
                word_llm = metrics.get("Word Accuracy LLM", 0)
                cer_llm = metrics.get("Character Error Rate LLM", 0)

                # NOTE(review): the error-rate term is "1 - CER"; if the metrics are
                # stored as 0-100 percentages this term goes negative -- confirm the
                # intended scale against the metric computation.
                sums["Average Score"] += alpha * lev + beta * word + gamma * (1 - cer)
                sums["Average Score with LLM"] += (
                    alpha * lev_llm + beta * word_llm + gamma * (1 - cer_llm)
                )

                sums["Levenshtein Accuracy"] += lev
                sums["Word Accuracy"] += word
                sums["Character Error Rate"] += cer

                sums["Levenshtein Accuracy with LLM"] += lev_llm
                sums["Word Accuracy with LLM"] += word_llm
                sums["Character Error Rate with LLM"] += cer_llm

            # "API Name" goes first so it becomes the first column of the sheet.
            row = {"API Name": api_name}
            for column, total in sums.items():
                row[column] = mean(total, n_images)

            print(f"{api_name} - {language}: Average Score: {row['Average Score']:.2f}, with LLM: {row['Average Score with LLM']:.2f}")

            rows.append(row)

            # Strict comparison: an equal score does not displace the current leader.
            if row["Average Score"] > top_score:
                top_score = row["Average Score"]
                top_api = api_name

        # Save results to Excel (one workbook per language)
        file_name = f"{language}_ocr_evaluation_results.xlsx"
        pd.DataFrame(rows).to_excel(file_name, index=False)
        print(f"Results for {language} saved to {file_name}\n")

        # Store the best API for this language
        summary[language] = {"Best API": top_api, "Best Score": top_score}

    return summary



In [None]:
# Display the list of languages that get a per-language evaluation report.
hard_languages

['Arabic', 'Chinese', 'Korean', 'Japanese', 'Hindi']

In [None]:
# Inspect the raw Azure evaluation results (nested as language -> image name -> metrics).
Azure_Evaluation

{'Arabic': {'tr_img_00001': {'Levenshtein Accuracy': 26.630434782608692,
   'Word Accuracy': 77.27272727272727,
   'Character Error Rate': 82.53012048192771,
   'Levenshtein Accuracy LLM': 28.260869565217394,
   'Word Accuracy LLM': 77.27272727272727,
   'Character Error Rate LLM': 80.72289156626506},
  'tr_img_00002': {'Levenshtein Accuracy': 95.65217391304348,
   'Word Accuracy': 80.0,
   'Character Error Rate': 9.090909090909092,
   'Levenshtein Accuracy LLM': 95.65217391304348,
   'Word Accuracy LLM': 80.0,
   'Character Error Rate LLM': 9.090909090909092},
  'tr_img_00003': {'Levenshtein Accuracy': 51.162790697674424,
   'Word Accuracy': 66.66666666666666,
   'Character Error Rate': 48.837209302325576,
   'Levenshtein Accuracy LLM': 46.51162790697675,
   'Word Accuracy LLM': 66.66666666666666,
   'Character Error Rate LLM': 53.48837209302325},
  'tr_img_00004': {'Levenshtein Accuracy': 100.0,
   'Word Accuracy': 100.0,
   'Character Error Rate': 0.0,
   'Levenshtein Accuracy LLM':

In [None]:
# Example metric data: map each OCR API's display name to its evaluation results.
# The AWS Textract and Tesseract entries are commented out, so those two APIs
# are left out of this comparison.
evaluations = {
    "Azure OCR": Azure_Evaluation,
    #"AWS Textract": Aws_Evaluation,
    "OCR.space": OcrSpace_Evaluation,
    #"Tesseract": Tesseract_Evaluation,
    "Google Cloud Vision API": OcrCloudVision_Evaluation
}

# beta=1 makes Word Accuracy the dominant term of the score; the Levenshtein
# Accuracy and Character Error Rate terms are each weighted 0.01.
evaluate_all_metrics(evaluations, alpha=0.01, beta=1, gamma=0.01,hard_languages=hard_languages)



Processing evaluations for Arabic...

Azure OCR - Arabic: Average Score: 77.05, with LLM: 68.83
OCR.space - Arabic: Average Score: 62.15, with LLM: 57.58
Google Cloud Vision API - Arabic: Average Score: 71.76, with LLM: 62.32
Results for Arabic saved to Arabic_ocr_evaluation_results.xlsx

Processing evaluations for Chinese...

Azure OCR - Chinese: Average Score: 69.33, with LLM: 52.83
OCR.space - Chinese: Average Score: 45.43, with LLM: 40.63
Google Cloud Vision API - Chinese: Average Score: 63.66, with LLM: 55.36
Results for Chinese saved to Chinese_ocr_evaluation_results.xlsx

Processing evaluations for Korean...

Azure OCR - Korean: Average Score: 65.67, with LLM: 61.76
OCR.space - Korean: Average Score: 60.15, with LLM: 56.14
Google Cloud Vision API - Korean: Average Score: 64.17, with LLM: 59.85
Results for Korean saved to Korean_ocr_evaluation_results.xlsx

Processing evaluations for Japanese...

Azure OCR - Japanese: Average Score: 67.13, with LLM: 53.71
OCR.space - Japanese: Av

{'Arabic': {'Best API': 'Azure OCR', 'Best Score': 77.05279453007532},
 'Chinese': {'Best API': 'Azure OCR', 'Best Score': 69.3299715170022},
 'Korean': {'Best API': 'Azure OCR', 'Best Score': 65.66621922905455},
 'Japanese': {'Best API': 'Azure OCR', 'Best Score': 67.1251686885107},
 'Hindi': {'Best API': 'Google Cloud Vision API',
  'Best Score': 77.98745986681595}}