[Reference](https://medium.com/@johnidouglasmarangon/ocr-tools-my-latest-study-solving-real-world-problems-with-low-quality-images-dce26cbcdf9a)

# Generative AI — OpenAI

In [1]:
from openai import OpenAI # pip install openai
from base64 import b64encode


# Create an API key at https://platform.openai.com/api-keys and paste it here.
OPENAI_API_KEY = ""

image_path = "image.jpg"

# The image is sent inline as a base64 data URL, so read the raw bytes and
# encode them before building the request.
with open(image_path, "rb") as image_file:
    raw_bytes = image_file.read()
base64_image = b64encode(raw_bytes).decode("utf-8")
data_url = f"data:image/jpeg;base64,{base64_image}"

client = OpenAI(api_key=OPENAI_API_KEY)

# A single user message made of two parts: the instruction and the image.
instruction_part = {
    "type": "text",
    "text": "OCR this image. Do not include any markdown or code formatting.",
}
image_part = {
    "type": "image_url",
    "image_url": {"url": data_url},
}

response = client.chat.completions.create(
    model="gpt-4o-mini",
    messages=[{"role": "user", "content": [instruction_part, image_part]}],
)

# Only the first returned choice is printed.
first_choice = response.choices[0]
print(first_choice.message.content)

# Generative AI — Gemini

In [2]:
from PIL import Image # pip install Pillow
import google.generativeai as genai # pip install google.generativeai

# Create an API key at https://aistudio.google.com/apikey and paste it here.
GEMINI_API_KEY = ""

# Request settings: which image to read, which model to ask, and what to ask.
image_path = "image.jpg"  # Path of the image to transcribe
model_name = "gemini-1.5-flash"
prompt = "OCR this image. Do not include any markdown or code formatting."

genai.configure(api_key=GEMINI_API_KEY)
model = genai.GenerativeModel(model_name=model_name)

# The prompt and the PIL image are passed together in one request.
image_file = Image.open(image_path)
response = model.generate_content([prompt, image_file])

print(response.text)

# Vision AI — Google Cloud

In [3]:
import os
from google.cloud import vision # pip install google-cloud-vision


# Path to the credentials JSON file; fill it in before running this cell.
# https://developers.google.com/workspace/guides/create-credentials?hl=en
os.environ["GOOGLE_APPLICATION_CREDENTIALS"] = ""

client = vision.ImageAnnotatorClient()

with open("image.jpg", "rb") as image_file:
    content = image_file.read()

image = vision.Image(content=content)
response = client.text_detection(image=image)

# The response can carry an error status instead of results; surface it so a
# failed request is not mistaken for an image that contains no text.
if response.error.message:
    raise RuntimeError(
        f"Vision API text detection failed: {response.error.message}"
    )

# NOTE(review): per the Vision API docs the first annotation holds the whole
# detected text and the following ones are individual words - confirm before
# relying on the order.
for annotation in response.text_annotations:
    print("Detected Text:", annotation.description)

# Vision AI — Azure

In [4]:
# pip install azure-ai-vision-imageanalysis
from azure.ai.vision.imageanalysis import ImageAnalysisClient
from azure.ai.vision.imageanalysis.models import VisualFeatures
from azure.core.credentials import AzureKeyCredential


# Create an API key at https://azure.microsoft.com/en-us/products/ai-services/ai-vision
AZURE_VISION_API_KEY = ""

image_path = "image.jpg"

with open(image_path, "rb") as f:
    image_data = f.read()

# An Azure Computer Vision AI services resource is required
# (https://portal.azure.com/); replace the placeholder in the endpoint with
# that resource's instance name.
region = "eastus"
endpoint = "https://<instance name>.cognitiveservices.azure.com/"

client = ImageAnalysisClient(
    endpoint=endpoint,
    credential=AzureKeyCredential(AZURE_VISION_API_KEY),
    region=region,
)

# Request only the READ (text extraction) feature.
response = client.analyze(
    image_data,
    visual_features=[VisualFeatures.READ],
)

# Walk every returned block rather than indexing blocks[0]: an empty block
# list (e.g. no text detected) would raise IndexError, and any additional
# blocks would be silently dropped.
if response.read is not None:
    for block in response.read.blocks:
        for line in block.lines:
            print("Detected Text:", line.text)

# Open Source Libraries — Tesseract

In [5]:
import pytesseract # pip install pytesseract
import cv2 # pip install opencv-contrib-python


image = cv2.imread('image.jpg')
# cv2.imread returns None instead of raising when the file cannot be read;
# fail here with a clear message rather than inside cvtColor.
if image is None:
    raise FileNotFoundError("Could not read image file: image.jpg")

gray = cv2.cvtColor(image, cv2.COLOR_BGR2GRAY)

# Apply binarization, converting the image to black-and-white.
_, binary = cv2.threshold(gray, 150, 255, cv2.THRESH_BINARY)

# Tesseract allows you to specify the language of the text and configure settings
# like page segmentation mode (PSM). For low-quality images,
# using --psm 6 (assume a single uniform block of text)
# or --psm 11 (sparse text) can yield better results.
config = "-l por --oem 1 --psm 11"

# OCR the binarized image so the preprocessing above actually takes effect.
text = pytesseract.image_to_string(binary, config=config)
print(text)

# Open Source Libraries — EasyOCR

In [6]:
import easyocr # !pip install easyocr


# Build a reader for Portuguese ('pt') and run it on the image file.
reader = easyocr.Reader(['pt'])
results = reader.readtext('image.jpg')

# Each result is a (bounding box, text, confidence) triple; only the text and
# its confidence are reported.
for detection in results:
    bbox, text, confidence = detection
    print(f"Detected text: {text} (Confidence: {confidence:.2f})")

# Open Source Libraries — Surya

In [7]:
from PIL import Image # pip install Pillow

# pip install surya-ocr
from surya.recognition import RecognitionPredictor
from surya.detection import DetectionPredictor


# Inputs: the image to read and the language hint for recognition.
image_path = "image.jpg"
langs = ["pt"]

image = Image.open(image_path)

recognition_predictor = RecognitionPredictor()
detection_predictor = DetectionPredictor()

# One image and one language list are passed, each wrapped in a list; the
# detection predictor is handed to the recognition predictor as an argument.
predictions = recognition_predictor([image], [langs], detection_predictor)

# Print every recognised text line of every prediction, one per output line.
for page_prediction in predictions:
    for text_line in page_prediction.text_lines:
        print(text_line.text)

# Open Source Libraries — DocTR

In [8]:
from doctr.io import DocumentFile # !pip install "python-doctr[torch]"
from doctr.models import ocr_predictor


images_path = "image.jpg"
doc = DocumentFile.from_images(images_path)

# Pretrained predictor with explicit detection and recognition architectures.
model = ocr_predictor(
    det_arch="db_resnet50",
    reco_arch="crnn_vgg16_bn",
    pretrained=True,
)

result = model(doc)

# Flatten the pages -> blocks -> lines hierarchy lazily, preserving order.
all_lines = (
    line
    for page in result.pages
    for block in page.blocks
    for line in block.lines
)

# Print each line as the list of its word values.
for line in all_lines:
    texts = [word.value for word in line.words]
    print(texts)