In [None]:
# Install necessary packages
!apt-get update
!apt-get install -y libgl1-mesa-glx
!pip install easyocr pyspellchecker nltk torch transformers vosk

# Imports


In [4]:

# Import required libraries
import easyocr
from spellchecker import SpellChecker
import nltk
from nltk.tokenize import sent_tokenize, word_tokenize
from nltk.corpus import stopwords
import re
import torch
from transformers import AutoTokenizer, AutoModelForSequenceClassification
from vosk import Model, KaldiRecognizer
import wave
import json
import os



In [5]:

# Download NLTK resources.
# 'punkt' serves older NLTK releases; newer releases load the sentence/word
# tokenizer models from 'punkt_tab' instead, so fetch both. Requesting a package
# that the installed NLTK version does not know only logs an error and returns False.
nltk.download('punkt')
nltk.download('punkt_tab')
nltk.download('stopwords')

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\dweep\AppData\Roaming\nltk_data...
[nltk_data]   Unzipping tokenizers\punkt.zip.
[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\dweep\AppData\Roaming\nltk_data...
[nltk_data]   Unzipping corpora\stopwords.zip.


True

# Initialize


In [6]:
# Load-once helpers shared by the text-processing functions below.
# English stop words are kept in a set for fast membership tests.
english_stopwords = stopwords.words('english')
stop_words = set(english_stopwords)

# Default pyspellchecker dictionary.
spell = SpellChecker()

# English-only EasyOCR reader (fetches its detection/recognition models on first use).
ocr_reader = easyocr.Reader(['en'])

Downloading detection model, please wait. This may take several minutes depending upon your network connection.


Progress: |██████████████████████████████████████████████████| 100.0% Complete

Downloading recognition model, please wait. This may take several minutes depending upon your network connection.


Progress: |██████████████████████████████████████████████████| 100.0% Complete

In [8]:
import os
import urllib.request
import zipfile

# Name of the Vosk model; it is also the directory name inside the archive.
VOSK_MODEL_NAME = 'vosk-model-small-en-us-0.15'
VOSK_MODEL_ZIP = VOSK_MODEL_NAME + '.zip'
VOSK_MODEL_URL = 'https://alphacephei.com/vosk/models/' + VOSK_MODEL_ZIP

# Only download/extract when the model directory is missing, so re-running the
# cell (or the whole notebook) does not fetch the archive again.
if not os.path.isdir(VOSK_MODEL_NAME):
    if not os.path.exists(VOSK_MODEL_ZIP):
        # Download to a temporary name first so an interrupted transfer never
        # leaves a truncated archive behind under the final name.
        partial_zip = VOSK_MODEL_ZIP + '.part'
        urllib.request.urlretrieve(VOSK_MODEL_URL, partial_zip)
        os.replace(partial_zip, VOSK_MODEL_ZIP)

    # Unzip into the current directory using Python's zipfile module
    with zipfile.ZipFile(VOSK_MODEL_ZIP, 'r') as zip_ref:
        zip_ref.extractall('.')

# Load the model, failing loudly here instead of with a NameError on
# `vosk_model` in a later cell.
if os.path.isdir(VOSK_MODEL_NAME):
    from vosk import Model
    vosk_model = Model(VOSK_MODEL_NAME)
    print("Model loaded successfully!")
else:
    raise RuntimeError("Model extraction failed.")


Model loaded successfully!


In [9]:
# Intent classifier: tokenizer plus a sequence-classification model with one
# output per entry in `intent_labels`.
# NOTE: the load warning printed below reports that the classifier weights are
# newly initialised for this checkpoint, so predictions are not meaningful
# until the model has been fine-tuned on labelled intent data.
intent_labels = ["read text", "identify object"]
intent_checkpoint = 'distilbert-base-cased'

intent_tokenizer = AutoTokenizer.from_pretrained(intent_checkpoint)
intent_model = AutoModelForSequenceClassification.from_pretrained(
    intent_checkpoint,
    num_labels=len(intent_labels),
)

tokenizer_config.json:   0%|          | 0.00/49.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/465 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/213k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/436k [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/263M [00:00<?, ?B/s]

Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-cased and are newly initialized: ['classifier.bias', 'classifier.weight', 'pre_classifier.bias', 'pre_classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


# Core Code


In [16]:
# Minimum confidence required before spell_check_and_format applies a correction.
# The confidence is a frequency ratio in [0, 1]: one word's dictionary frequency
# divided by the summed frequency of all correction candidates.
SPELL_CHECK_THRESHOLD = 0.8  # Confidence threshold (80%)

# Characters OCR commonly confuses with letters, and the letter each stands for.
_OCR_LOOKALIKES = str.maketrans({'1': 'l', '$': 'S', '0': 'o'})

# Punctuation wedged between two word characters, EXCEPT between two digits
# (so decimals, times and thousands separators such as 3.14 / 10:30 / 1,000 survive).
_PUNCT_INSIDE_WORD = re.compile(
    r'(?<=[^\W\d])[,.!?;:](?=\w)|(?<=\d)[,.!?;:](?=[^\W\d])'
)


def _repair_ocr_token(match):
    """Swap look-alike characters for letters, but only in tokens that contain a letter."""
    token = match.group(0)
    if any(ch.isalpha() for ch in token):
        return token.translate(_OCR_LOOKALIKES)
    # Purely numeric/symbolic tokens (years, prices, ...) are real numbers, not OCR slips.
    return token


def clean_text(text):
    """Cleans up OCR text by fixing common look-alike errors and stripping stray symbols.

    Args:
        text: Raw text string, typically the joined output of the OCR reader.

    Returns:
        The cleaned string. '1', '0' and '$' are turned into 'l', 'o' and 'S' only
        inside tokens that also contain a letter, characters other than word
        characters, whitespace and ``.,!?;:`` are removed, whitespace runs are
        collapsed to one space, and punctuation stuck inside a word is dropped
        (punctuation between two digits is kept).
    """
    # Replace common OCR errors, token by token, leaving real numbers untouched
    text = re.sub(r'\S+', _repair_ocr_token, text)

    # Remove unnecessary characters or symbols
    text = re.sub(r'[^\w\s.,!?;:]', '', text)  # Keep only alphanumeric characters and basic punctuation
    text = re.sub(r'\s+', ' ', text)  # Replace multiple spaces with a single space
    text = _PUNCT_INSIDE_WORD.sub('', text)  # Remove misplaced punctuation within words

    return text

# Stems that NLTK's word tokenizer leaves behind when it splits an irregular
# negation ("can't" -> "ca" + "n't"), mapped to the full auxiliary verb.
_NEGATION_STEMS = {"ca": "can", "wo": "will", "sha": "shall"}

# Punctuation wedged between two word characters, except between two digits
# (keeps decimals and times such as 3.14 / 10:30 intact).
_INNER_PUNCTUATION = re.compile(
    r'(?<=[^\W\d])[,.!?;:](?=\w)|(?<=\d)[,.!?;:](?=[^\W\d])'
)


def _expand_negations(tokens):
    """Expand "n't" and "'re" tokens into full words, repairing irregular stems."""
    expanded = []
    for token in tokens:
        lowered = token.lower()
        if lowered == "n't":
            stem = expanded[-1] if expanded else ""
            full_verb = _NEGATION_STEMS.get(stem.lower())
            if full_verb:
                # "ca" + "n't" -> "can" + "not", keeping a leading capital
                expanded[-1] = full_verb.capitalize() if stem[0].isupper() else full_verb
            expanded.append("not")
        elif lowered == "'re":
            expanded.append("are")
        else:
            expanded.append(token)
    return expanded


def _correct_token(word):
    """Return the spell-checked form of one token, or the token itself if unsure."""
    # Handle words with embedded punctuation wrongly interpreted
    word = _INNER_PUNCTUATION.sub('', word)

    # Leave proper nouns (capitalized), punctuation/number tokens and stop words alone
    if word.istitle() or not word.isalpha() or word.lower() in stop_words:
        return word

    # candidates() may return None/empty when nothing plausible is found
    candidates = spell.candidates(word)
    if not candidates:
        return word

    # Most frequent candidate; sorted() makes ties deterministic
    best = max(sorted(candidates), key=lambda cand: spell.word_frequency[cand])
    if best.lower() == word.lower():
        # Already a known word: keep the original spelling and casing
        return word

    # Confidence = share of the candidates' total frequency held by the best one
    total_freq = sum(spell.word_frequency[cand] for cand in candidates)
    confidence_score = spell.word_frequency[best] / total_freq if total_freq > 0 else 0

    # Apply correction only if confidence score is above the threshold
    return best if confidence_score >= SPELL_CHECK_THRESHOLD else word


def spell_check_and_format(text):
    """Performs spell checking with a confidence threshold and improved punctuation handling.

    Each sentence is tokenized, negations and "'re" are expanded ("don't" ->
    "do not", "can't" -> "can not"), and every remaining alphabetic,
    non-capitalized, non-stop-word token is replaced by its most frequent
    spelling candidate when that candidate holds at least
    ``SPELL_CHECK_THRESHOLD`` of the candidates' combined frequency.
    "'s" is re-attached to its word rather than expanded, because it may be a
    possessive ("John's book").

    Args:
        text: Text to correct, e.g. the output of ``clean_text``.

    Returns:
        The corrected text with normalized spacing around punctuation and a
        capitalized standalone "i". An empty input yields an empty string.
    """
    corrected_sentences = []

    for sentence in sent_tokenize(text):
        tokens = _expand_negations(word_tokenize(sentence))

        # Reconstruct the sentence from the individually checked tokens
        corrected_sentence = ' '.join(_correct_token(token) for token in tokens)

        # Fix spacing before punctuation
        corrected_sentence = re.sub(r'\s([?.!"](?:\s|$))', r'\1', corrected_sentence)
        # Glue the tokenizer's detached "'s" back onto the preceding word
        corrected_sentence = re.sub(r"(?<=\w) ('[sS])(?!\w)", r'\1', corrected_sentence)

        corrected_sentences.append(corrected_sentence)

    # Join sentences into the final corrected text
    corrected_text = ' '.join(corrected_sentences)

    # Advanced formatting: fixing spacing and ensuring proper punctuation
    formatted_text = re.sub(r'\s+([,.!?;:])', r'\1', corrected_text)  # No space before punctuation
    formatted_text = re.sub(r'\s+', ' ', formatted_text)  # Remove extra spaces
    formatted_text = re.sub(r'(\bi\b)', 'I', formatted_text)  # Capitalize the letter "I"

    return formatted_text

def _reading_order(results):
    """Order OCR results line by line (top to bottom), left to right within a line.

    Each result's first element is indexed as a list of (x, y) corner points with
    the top-left corner first. The fourth point is assumed to be the bottom-left
    corner -- TODO confirm against the EasyOCR output format.
    """
    lines = []  # each entry: [anchor_top, anchor_height, members]
    for item in sorted(results, key=lambda res: res[0][0][1]):
        box = item[0]
        top = box[0][1]
        height = abs(box[3][1] - top)
        # Boxes whose top edge lies within half a box height of the line's first
        # box belong to the same text line; a strict sort on y would shuffle
        # words that sit a pixel or two higher or lower than their neighbours.
        if lines and top - lines[-1][0] <= lines[-1][1] / 2:
            lines[-1][2].append(item)
        else:
            lines.append([top, height, [item]])

    ordered = []
    for _, _, members in lines:
        ordered.extend(sorted(members, key=lambda res: res[0][0][0]))
    return ordered


def detect_text(image_path, output_path='corrected_texts.txt'):
    """Detects text from an image using EasyOCR, sorts it by layout, and applies spell checking and formatting for TTS.

    Args:
        image_path: Image to read, passed straight to ``ocr_reader.readtext``.
        output_path: File the corrected text is written to (overwritten each call).

    Returns:
        The spell-checked, formatted text (also written to ``output_path`` and printed).
    """
    # Perform OCR on the image
    results = ocr_reader.readtext(image_path)

    # Arrange the text boxes in reading order
    sorted_results = _reading_order(results)
    raw_texts = [text for _, text, _ in sorted_results]

    # Combine text in the correct reading order
    combined_text = " ".join(raw_texts)

    # Clean the text before spell checking
    cleaned_text = clean_text(combined_text)

    # Apply spell checking and formatting
    spell_checked_text = spell_check_and_format(cleaned_text)

    # Save the corrected and formatted text; explicit UTF-8 so the result does
    # not depend on the platform's default encoding
    with open(output_path, 'w', encoding='utf-8') as text_file:
        text_file.write(spell_checked_text + '\n')

    print("Raw Detected Texts:\n", "\n".join(raw_texts))
    print("Cleaned Text:\n", cleaned_text)
    print("Spell Checked and Formatted Text for TTS:\n", spell_checked_text)

    return spell_checked_text

def speech_to_text_vosk(audio_file='mc38.wav'):
    """Converts speech input to text using Vosk.

    Args:
        audio_file: Path of the WAV file to transcribe. Defaults to the bundled
            test recording; live microphone capture is not implemented here.

    Returns:
        The lower-cased transcript of the whole file ('' if nothing was recognized).
    """
    # Record audio (need to run separately in an actual audio input environment)
    print("Recording...")

    segments = []
    # The context manager closes the file even if recognition raises
    with wave.open(audio_file, "rb") as wf:
        rec = KaldiRecognizer(vosk_model, wf.getframerate())
        while True:
            data = wf.readframes(4000)
            if len(data) == 0:
                break
            # AcceptWaveform returns True at the end of each utterance; collect
            # every utterance instead of stopping at the first one
            if rec.AcceptWaveform(data):
                segments.append(json.loads(rec.Result()).get('text', ''))
        # Flush whatever audio is still buffered in the recognizer
        segments.append(json.loads(rec.FinalResult()).get('text', ''))

    recognized_text = ' '.join(segment for segment in segments if segment)
    print("Recognized Text:", recognized_text)
    return recognized_text.lower()

def classify_intent(text):
    """Classifies the intent of the text as either 'read text' or 'identify object'.

    Args:
        text: The utterance to classify.

    Returns:
        The entry of ``intent_labels`` with the highest predicted probability.
    """
    # Truncate so utterances longer than the model's maximum length do not crash
    inputs = intent_tokenizer(text, return_tensors="pt", truncation=True)

    # Inference only: skip building the autograd graph
    with torch.no_grad():
        outputs = intent_model(**inputs)

    probs = torch.nn.functional.softmax(outputs.logits, dim=-1)
    # One input sequence, so argmax over the class axis yields a single index
    intent_idx = torch.argmax(probs, dim=-1).item()
    intent = intent_labels[intent_idx]
    print(f"Identified Intent: {intent}")
    return intent




In [17]:
def _collect_image_paths():
    """Get image paths from an upload helper named `files` if one is loaded, else prompt."""
    # `files` is not imported by this notebook; use it only when the user has
    # made it available (e.g. Colab's upload helper), otherwise ask for a path.
    uploader = globals().get('files')
    if uploader is not None and hasattr(uploader, 'upload'):
        return list(uploader.upload().keys())

    answer = input("Path of the image to read text from: ").strip()
    return [answer] if answer else []


def main(image_paths=None):
    """Runs the voice-driven pipeline: speech -> intent -> action.

    Args:
        image_paths: Optional image path, or iterable of paths, to read text
            from when the intent is "read text". When omitted, the images are
            obtained interactively.
    """
    # Step 1: Speech to Text
    user_input = speech_to_text_vosk()

    if not user_input:
        print("No valid input detected.")
        return

    # Step 2: Identify Intent
    intent = classify_intent(user_input)

    # Step 3: Perform action based on intent
    if "read text" in intent:
        if image_paths is None:
            image_paths = _collect_image_paths()
        elif isinstance(image_paths, str):
            # A bare string is one path, not an iterable of characters
            image_paths = [image_paths]

        image_paths = list(image_paths)
        if not image_paths:
            print("No image provided.")
        for image_name in image_paths:
            detect_text(image_name)
    elif "identify object" in intent:
        print("Object identification not implemented in this example.")
        # Implement object detection code here
    else:
        print("Intent not recognized.")



In [18]:
# Run the main function
# The guard keeps main() from running if this code is ever imported as a module
# instead of being executed directly.
if __name__ == "__main__":
    main()

Recording...
Recognized Text: traffic is moving so slowly in this line
Identified Intent: identify object
Object identification not implemented in this example.
