In [None]:
# --- Environment setup (written for a Colab-style Debian/Ubuntu runtime) ---
# Every tool is installed exactly once; earlier revisions of this cell repeated
# the same pip/apt installs several times.

# Python packages used by the code cell below. `google` is retained from the
# original cell even though nothing visible in this notebook imports it.
!pip install --quiet gradio langdetect langid pytesseract opencv-python google

# Tesseract engine plus one stock model per language requested for OCR.
!apt-get update --quiet
!apt-get install -y --quiet tesseract-ocr \
  tesseract-ocr-eng tesseract-ocr-hin tesseract-ocr-ben \
  tesseract-ocr-tam tesseract-ocr-tel tesseract-ocr-guj \
  tesseract-ocr-mar tesseract-ocr-kan tesseract-ocr-mal \
  tesseract-ocr-ori tesseract-ocr-asm tesseract-ocr-pan \
  tesseract-ocr-san tesseract-ocr-snd

# Higher-accuracy models backing the "tessdata_best" choice in the UI.
# The directory test makes the cell safe to re-run, and a shallow clone skips
# the repository history, which is not needed to read the model files.
!test -d tessdata_best || git clone --depth 1 https://github.com/tesseract-ocr/tessdata_best.git

# Put the high-accuracy Hindi and English models into the default tessdata
# directory. This runs after the apt install so that the package step cannot
# replace these copies afterwards.
!cp tessdata_best/hin.traineddata /usr/share/tesseract-ocr/4.00/tessdata/
!cp tessdata_best/eng.traineddata /usr/share/tesseract-ocr/4.00/tessdata/


In [None]:
import gradio as gr
import os
import cv2
import pytesseract
from pytesseract import Output
from langdetect import detect
import langid
import numpy as np

# Directory holding the stock Tesseract models installed by the system packages.
DEFAULT_TESSDATA = "/usr/share/tesseract-ocr/4.00/tessdata/"
# Directory holding the high-accuracy models; update this path to wherever
# tessdata_best is located on your system.
BEST_TESSDATA = "/content/tessdata_best/"

# Tesseract language codes, joined with "+" as expected by the `lang` argument.
# "sat" (Santali) used to be listed here, but the setup cell installs no `sat`
# model and upstream tessdata does not appear to ship one, so requesting it only
# asked Tesseract for a language it could not load.
INDIAN_LANGS = "+".join((
    "eng", "hin", "ben", "tam", "tel", "guj", "mar",
    "kan", "mal", "ori", "asm", "pan", "san", "snd",
))

# Maps the codes returned by the language detectors to display names.
# "und" is the placeholder used in this file when detection fails.
LANGUAGE_NAME_MAP = {
    "en": "English",
    "hi": "Hindi",
    "bn": "Bengali",
    "ta": "Tamil",
    "te": "Telugu",
    "ml": "Malayalam",
    "gu": "Gujarati",
    "mr": "Marathi",
    "kn": "Kannada",
    "or": "Odia",
    "as": "Assamese",
    "pa": "Punjabi",
    "sa": "Sanskrit",
    "sat": "Santali",
    "sd": "Sindhi",
    "und": "Unknown",
}

def is_blurry(image, threshold=80.0):
    """Report whether an image looks blurry, using the variance of its Laplacian.

    Args:
        image: Image as a NumPy array. A 2-D array is treated as already
            grayscale; anything else is converted with ``COLOR_BGR2GRAY``.
        threshold: Variance below which the image counts as blurry. The
            default of 80 is the value this function previously hard-coded.

    Returns:
        bool: True when the Laplacian variance is below ``threshold``
        (a lower variance means a blurrier image).
    """
    # cvtColor(BGR2GRAY) rejects single-channel input, so skip it for 2-D arrays.
    gray = image if image.ndim == 2 else cv2.cvtColor(image, cv2.COLOR_BGR2GRAY)
    focus_measure = cv2.Laplacian(gray, cv2.CV_64F).var()
    return bool(focus_measure < threshold)

def enhance_if_blurry(img):
    """Sharpen ``img`` with a 3x3 kernel when ``is_blurry`` flags it.

    Returns the input object unchanged when it is not considered blurry;
    otherwise prints a notice and returns the filtered image.
    """
    if not is_blurry(img):
        return img

    print("Applying blur correction...")
    # Centre weight 5 with -1 on the four direct neighbours; weights sum to 1.
    sharpen_kernel = np.array([
        [0, -1, 0],
        [-1, 5, -1],
        [0, -1, 0],
    ])
    # ddepth=-1 keeps the output depth equal to the input depth.
    return cv2.filter2D(img, -1, sharpen_kernel)

# Words at or below this Tesseract confidence are not labelled.
_MIN_WORD_CONFIDENCE = 60
# Words shorter than this are skipped before language detection.
_MIN_WORD_LENGTH = 3


def _detect_language(word, detection_method):
    """Return the detector's language code for ``word``, or "und" on failure."""
    try:
        if detection_method == "langdetect":
            return detect(word)
        if detection_method == "langid":
            return langid.classify(word)[0]
    except Exception:
        # Deliberately best-effort: a word the detector cannot handle must not
        # abort the whole image, so it is reported as undetermined instead.
        return "und"
    return "und"


def process_image(image, detection_method, tess_model):
    """Run OCR on an uploaded image and label each confident word by language.

    Steps:
      - Select the Tesseract model directory from ``tess_model``.
      - Sharpen the image if it is judged blurry.
      - Run Tesseract for the full text and for per-word boxes.
      - Detect the language of each sufficiently confident word with the
        selected method and draw its box and language name on the image.

    Args:
        image: RGB image as a NumPy array (as delivered by the Gradio image
            input), or None when nothing was uploaded.
        detection_method: "langdetect" or "langid"; any other value labels
            every word as "Unknown".
        tess_model: "tessdata_best" selects ``BEST_TESSDATA``; any other value
            selects ``DEFAULT_TESSDATA``.

    Returns:
        tuple: ``(text, annotated_image, languages)`` where ``text`` is the
        stripped OCR output, ``annotated_image`` is an RGB array with boxes
        and labels (None when no image was given), and ``languages`` is a
        newline-separated "- Name" list of the detected languages, sorted.
    """
    # Submitting the form without an image passes None; return empty outputs
    # instead of failing inside the colour conversion.
    if image is None:
        return "", None, ""

    # os.environ is process-wide, so this choice stays in effect until the
    # next call changes it.
    os.environ["TESSDATA_PREFIX"] = (
        BEST_TESSDATA if tess_model == "tessdata_best" else DEFAULT_TESSDATA
    )

    if detection_method == "langdetect":
        # langdetect is non-deterministic on short text unless its seed is
        # fixed; single words are exactly that case.
        from langdetect import DetectorFactory
        DetectorFactory.seed = 0

    img_bgr = cv2.cvtColor(np.array(image), cv2.COLOR_RGB2BGR)
    img_bgr = enhance_if_blurry(img_bgr)

    # Both OCR passes read the same grayscale image so that the word boxes
    # correspond to the extracted text, and no BGR array is handed to
    # pytesseract (which would interpret the channels as RGB).
    gray = cv2.cvtColor(img_bgr, cv2.COLOR_BGR2GRAY)
    recognized_text = pytesseract.image_to_string(gray, lang=INDIAN_LANGS)
    data = pytesseract.image_to_data(gray, lang=INDIAN_LANGS, output_type=Output.DICT)

    detected_languages = set()
    rows = zip(data["conf"], data["text"], data["left"],
               data["top"], data["width"], data["height"])
    for raw_conf, raw_word, left, top, width, height in rows:
        try:
            conf = float(raw_conf)
        except (TypeError, ValueError):
            continue
        if conf <= _MIN_WORD_CONFIDENCE:
            continue

        word = raw_word.strip()
        if len(word) < _MIN_WORD_LENGTH:
            continue

        lang_code = _detect_language(word, detection_method)
        lang_name = LANGUAGE_NAME_MAP.get(lang_code, "Unknown")
        detected_languages.add(lang_name)

        x, y, w, h = int(left), int(top), int(width), int(height)
        cv2.rectangle(img_bgr, (x, y), (x + w, y + h), (0, 255, 0), 2)
        # Keep the label baseline inside the image for boxes near the top edge.
        label_y = max(y - 10, 15)
        cv2.putText(img_bgr, lang_name, (x, label_y), cv2.FONT_HERSHEY_SIMPLEX,
                    0.6, (255, 0, 0), 2)

    annotated_img = cv2.cvtColor(img_bgr, cv2.COLOR_BGR2RGB)
    languages_bullets = "\n".join("- " + lang for lang in sorted(detected_languages))
    return recognized_text.strip(), annotated_img, languages_bullets

# Input widgets: the image to analyse plus the two processing options.
image_input = gr.Image(label="Upload Image (even blurry)", type="numpy")
detection_dropdown = gr.Dropdown(
    label="Select Language Detection Method",
    choices=["langdetect", "langid"],
    value="langdetect",
)
tess_model_dropdown = gr.Dropdown(
    label="Select Tesseract Model",
    choices=["default", "tessdata_best"],
    value="default",
)

# Output widgets, in the same order as the tuple returned by process_image.
output_components = [
    gr.Textbox(label="Extracted Text"),
    gr.Image(label="Annotated Image", type="numpy"),
    gr.Textbox(label="Detected Languages"),
]

app_description = (
    "Extracts text from images (even blurry) and detects languages using "
    "either langdetect or langid. Select the Tesseract model to use for OCR accuracy. "
    "For better accuracy, choose tessdata_best if available."
)

# Wire the widgets to process_image.
iface = gr.Interface(
    fn=process_image,
    inputs=[image_input, detection_dropdown, tess_model_dropdown],
    outputs=output_components,
    title="Multilingual OCR with Tesseract Model Support",
    description=app_description,
)

# Start the app with the debug and PWA options enabled.
iface.launch(debug=True, pwa=True)
