In [6]:
# Install necessary libraries
!pip install transformers torch easyocr gradio



In [7]:
# Import the necessary libraries
import easyocr
import gradio as gr
import re

In [12]:
# Create the single shared EasyOCR reader used by every OCR request,
# configured for English ('en') and Hindi ('hi').
reader = easyocr.Reader(lang_list=['en', 'hi'])

def clean_and_format_text(text_pieces):
    """
    Join OCR text fragments into a single tidy string.

    The fragments are joined with single spaces, whitespace in front of a
    punctuation mark (? . ! , ") that is itself followed by whitespace or the
    end of the text is removed, every run of whitespace is collapsed to one
    space, and the first character is upper-cased.

    Args:
        text_pieces: Iterable of strings (the extracted text fragments).

    Returns:
        The cleaned string; an empty string when there is no text.
    """
    # Combine text into a single string
    combined_text = ' '.join(text_pieces)

    # Attach punctuation to the preceding word.
    #  * `\s+` removes *all* whitespace before the mark; a single `\s` left one
    #    space behind whenever a fragment ended with a space ("hello  ." -> "hello .").
    #  * The lookahead checks for the following whitespace/end without consuming
    #    it, so consecutive marks ("a . .") are each matched and fixed.
    combined_text = re.sub(r'\s+([?.!,"])(?=\s|$)', r'\1', combined_text)

    # Collapse remaining whitespace runs and trim both ends
    combined_text = re.sub(r'\s+', ' ', combined_text).strip()

    # Capitalize the first character, leaving the rest of the text untouched
    if combined_text:
        combined_text = combined_text[0].upper() + combined_text[1:]

    return combined_text

def highlight_keywords(text, keyword):
    """
    Return `text` as HTML with every case-insensitive occurrence of `keyword`
    wrapped in a <mark> tag.

    The keyword is matched literally (regex metacharacters such as ".", "+"
    or "(" have no special meaning), and the text itself is HTML-escaped so
    characters like "<" and "&" are displayed instead of being parsed as markup.

    Args:
        text: Plain text to search.
        keyword: Literal string to highlight. An empty/None keyword highlights
            nothing.

    Returns:
        An HTML string: the escaped text with matches wrapped in <mark>...</mark>.
    """
    import html  # local import: this function is the only user in the file

    if not keyword:
        # An empty pattern matches between every character and would litter
        # the output with empty <mark></mark> pairs.
        return html.escape(text, quote=False)

    # re.escape makes the keyword a literal pattern; without it "c++" raises
    # re.error and "." would match every character.
    pattern = re.compile(f"({re.escape(keyword)})", flags=re.IGNORECASE)

    # Splitting on a pattern with one capturing group yields alternating
    # [non-match, match, non-match, ...] pieces, so odd indices are the hits.
    # Escaping piece by piece keeps the <mark> tags out of the escaping and
    # prevents the keyword from matching inside an HTML entity.
    pieces = pattern.split(text)
    return ''.join(
        f"<mark>{html.escape(piece, quote=False)}</mark>" if index % 2
        else html.escape(piece, quote=False)
        for index, piece in enumerate(pieces)
    )

def ocr_and_search(image, keyword):
    """
    Perform OCR on the uploaded image, optionally highlight a keyword, and
    return the result as an HTML string.

    Args:
        image: The image to read, passed straight to `reader.readtext`
            (the Gradio interface supplies a file path). `None` means no
            image was provided.
        keyword: Optional search term. None, empty, or whitespace-only values
            disable highlighting.

    Returns:
        The formatted text as a string meant for an HTML output: the result of
        `highlight_keywords` when a keyword is given, otherwise the
        HTML-escaped formatted text.

    Raises:
        gr.Error: If no image was provided, so the UI shows a readable message
            instead of an OCR-library traceback.
    """
    import html  # local import: only needed for the no-keyword branch

    if image is None:
        raise gr.Error("Please upload an image before running OCR.")

    # detail=0 makes EasyOCR return only the recognised text fragments
    text_pieces = reader.readtext(image, detail=0)

    # Clean and format the extracted text into proper sentences
    formatted_text = clean_and_format_text(text_pieces)

    # Treat whitespace-only input as "no keyword"; otherwise every space in
    # the text would be highlighted.
    keyword = (keyword or "").strip()
    if keyword:
        return highlight_keywords(formatted_text, keyword)

    # The result is rendered as HTML, so escape markup characters that the
    # OCR may have read from the image ("<", ">", "&").
    return html.escape(formatted_text, quote=False)




In [13]:
# Assemble the Gradio web UI: an uploaded image plus an optional keyword go in,
# the extracted (and optionally highlighted) text comes out as HTML.
interface = gr.Interface(
    title="OCR with Search and Keyword Highlighting",
    description="Upload an image to extract and format text into proper sentences. Supports Hindi and English. You can search for a specific keyword in the extracted text, and the keyword will be highlighted.",
    # Callback invoked with (image, keyword) on every submission
    fn=ocr_and_search,
    inputs=[
        # The callback receives the uploaded image as a file path
        gr.Image(type="filepath"),
        # Free-text keyword; may be left empty
        gr.Textbox(label="Enter keyword to search (optional)"),
    ],
    # Rendered as HTML so <mark> highlights are displayed
    outputs=gr.HTML(label="Extracted Text with Search Highlights"),
)

In [14]:
# Start the Gradio app. In the recorded Colab run below, Gradio enabled
# share=True automatically and served the app at a temporary public URL;
# pass share=False explicitly to keep it local, or debug=True to surface errors.
interface.launch()

Setting queue=True in a Colab notebook requires sharing enabled. Setting `share=True` (you can turn this off by setting `share=False` in `launch()` explicitly).

Colab notebook detected. To show errors in colab notebook, set debug=True in launch()
Running on public URL: https://917b3d54a65b3bee2b.gradio.live

This share link expires in 72 hours. For free permanent hosting and GPU upgrades, run `gradio deploy` from Terminal to deploy to Spaces (https://huggingface.co/spaces)


