<a href="https://colab.research.google.com/github/ARJUN108-verma/Elite_Tech_internship/blob/main/Speech_Recognition_system.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

Speech Recognition System


In [None]:
!pip install SpeechRecognition
import os
import wave
from typing import Optional, Tuple

import numpy as np
import speech_recognition as sr
from transformers import Wav2Vec2Processor, Wav2Vec2ForCTC

class SpeechToTextSystem:
    """
    A basic speech recognition system with two approaches:
    1. Using SpeechRecognition library (online/offline)
    2. Using Wav2Vec2 model (offline)

    Both ``transcribe_*`` methods report problems through their return value,
    a ``(success, text_or_error_message)`` tuple, instead of raising.
    """

    # Sample rate (Hz) that audio is converted to before it is handed to the
    # Wav2Vec2 processor.
    WAV2VEC_SAMPLE_RATE = 16000

    def __init__(self):
        """
        Initialize the speech recognition system.

        The Wav2Vec model is not loaded here; call ``load_wav2vec_model``
        before using ``transcribe_with_wav2vec``.
        """
        self.recognizer = sr.Recognizer()
        self.wav2vec_processor = None
        self.wav2vec_model = None

    def load_wav2vec_model(self, model_name: str = "facebook/wav2vec2-base-960h"):
        """
        Load the Wav2Vec model for offline transcription.

        Args:
            model_name: Name of the pre-trained Wav2Vec model

        Raises:
            RuntimeError: If the processor or the model cannot be loaded. The
                original exception is attached as ``__cause__``.
        """
        try:
            self.wav2vec_processor = Wav2Vec2Processor.from_pretrained(model_name)
            self.wav2vec_model = Wav2Vec2ForCTC.from_pretrained(model_name)
            print(f"Loaded Wav2Vec model: {model_name}")
        except Exception as e:
            # Chain the cause so the underlying traceback is not lost.
            raise RuntimeError(f"Failed to load Wav2Vec model: {str(e)}") from e

    def transcribe_with_speechrecognition(
        self,
        audio_file: str,
        engine: str = "google",
        language: str = "en-US"
    ) -> Tuple[bool, str]:
        """
        Transcribe audio using SpeechRecognition library.

        Args:
            audio_file: Path to audio file (WAV format recommended)
            engine: Recognition engine to use ("google" or "sphinx")
            language: Language code for recognition

        Returns:
            Tuple of (success, transcription). On failure the second element
            is a human-readable error message.
        """
        if not os.path.exists(audio_file):
            return False, f"Audio file not found: {audio_file}"

        # Reject unknown engines before spending time decoding the file.
        if engine not in ("google", "sphinx"):
            return False, f"Unsupported engine: {engine}"

        try:
            # The recorded audio is held in memory, so the file can be closed
            # before recognition starts.
            with sr.AudioFile(audio_file) as source:
                audio_data = self.recognizer.record(source)

            if engine == "google":
                text = self.recognizer.recognize_google(audio_data, language=language)
            else:
                text = self.recognizer.recognize_sphinx(audio_data, language=language)

            return True, text
        except sr.UnknownValueError:
            return False, "Could not understand audio"
        except sr.RequestError as e:
            return False, f"Recognition service error: {str(e)}"
        except Exception as e:
            return False, f"Error during transcription: {str(e)}"

    def transcribe_with_wav2vec(self, audio_file: str) -> Tuple[bool, str]:
        """
        Transcribe audio using Wav2Vec model (offline).

        The file is decoded to mono floating-point samples and resampled to
        ``WAV2VEC_SAMPLE_RATE`` when its sample rate differs.

        Args:
            audio_file: Path to audio file (WAV format, 16kHz recommended)

        Returns:
            Tuple of (success, transcription). On failure the second element
            is a human-readable error message.
        """
        if self.wav2vec_model is None or self.wav2vec_processor is None:
            return False, "Wav2Vec model not loaded"

        if not os.path.exists(audio_file):
            return False, f"Audio file not found: {audio_file}"

        try:
            # Imported here so the rest of the class works without torch and
            # so this method does not rely on a module-level name that only
            # exists when the file is run as a script.
            import torch

            audio_array, sample_rate = self._load_wav_mono(audio_file)
            if audio_array.size == 0:
                return False, "Audio file contains no audio samples"

            # Resample if needed (Wav2Vec expects 16kHz)
            target_rate = self.WAV2VEC_SAMPLE_RATE
            if sample_rate != target_rate:
                audio_array = self._resample_audio(audio_array, sample_rate, target_rate)

            # Process audio
            inputs = self.wav2vec_processor(
                audio_array,
                sampling_rate=target_rate,
                return_tensors="pt",
                padding=True
            )

            # Perform inference (no gradients are needed for decoding)
            with torch.no_grad():
                logits = self.wav2vec_model(inputs.input_values).logits

            # Decode output: pick the highest-scoring token per frame
            predicted_ids = torch.argmax(logits, dim=-1)
            transcription = self.wav2vec_processor.batch_decode(predicted_ids)[0]

            return True, transcription
        except Exception as e:
            return False, f"Error during Wav2Vec transcription: {str(e)}"

    def _load_wav_mono(self, audio_file: str) -> Tuple[np.ndarray, int]:
        """
        Read a PCM WAV file and return mono float32 samples with its sample rate.

        Integer samples are scaled to the range [-1.0, 1.0). Multi-channel
        audio is down-mixed by averaging the channels of each frame.

        Args:
            audio_file: Path to the WAV file

        Returns:
            Tuple of (samples, sample_rate)

        Raises:
            ValueError: If the sample width is not 1, 2 or 4 bytes.
        """
        with wave.open(audio_file, "rb") as wav_file:
            n_channels = wav_file.getnchannels()
            sample_width = wav_file.getsampwidth()
            sample_rate = wav_file.getframerate()
            frames = wav_file.readframes(wav_file.getnframes())

        # WAV PCM data is little-endian; 8-bit samples are unsigned.
        if sample_width == 1:
            raw = np.frombuffer(frames, dtype=np.uint8).astype(np.float32)
            samples = (raw - 128.0) / 128.0
        elif sample_width == 2:
            samples = np.frombuffer(frames, dtype="<i2").astype(np.float32) / 32768.0
        elif sample_width == 4:
            samples = np.frombuffer(frames, dtype="<i4").astype(np.float32) / 2147483648.0
        else:
            raise ValueError(f"Unsupported sample width: {sample_width} bytes")

        if n_channels > 1:
            # Samples are interleaved per frame: one row per frame, one column
            # per channel.
            samples = samples.reshape(-1, n_channels).mean(axis=1)

        return samples, sample_rate

    def _resample_audio(
        self,
        audio_array: np.ndarray,
        orig_sr: int,
        target_sr: int
    ) -> np.ndarray:
        """
        Simple resampling of audio data using linear interpolation.

        Output sample ``j`` is taken at time ``j / target_sr``, i.e. at the
        fractional input index ``j * orig_sr / target_sr``. Positions past the
        last input sample take the value of the last sample. No low-pass
        filtering is applied before downsampling.

        Args:
            audio_array: Input audio data (one-dimensional)
            orig_sr: Original sample rate
            target_sr: Target sample rate

        Returns:
            Resampled audio data

        Raises:
            ValueError: If either sample rate is not positive.
        """
        if orig_sr <= 0 or target_sr <= 0:
            raise ValueError("Sample rates must be positive")

        n_samples = len(audio_array)
        if n_samples == 0:
            # Nothing to interpolate; np.interp rejects empty sample points.
            return np.asarray(audio_array)

        duration = n_samples / orig_sr
        new_length = int(duration * target_sr)
        positions = np.arange(new_length) * (orig_sr / target_sr)
        return np.interp(positions, np.arange(n_samples), audio_array)


if __name__ == "__main__":
    # Demo driver: runs both transcription back ends against one sample file
    # and prints either the transcription or the reported error.
    import torch  # Import here to make installation optional if not using Wav2Vec

    print("Speech Recognition System\n-------------------------\n")

    # Build the recognizer wrapper
    stt = SpeechToTextSystem()

    # Sample recording to transcribe (point this at your own file)
    audio_file = "example.wav"

    # Back end 1: SpeechRecognition via the Google Web API (needs internet)
    print("\nUsing SpeechRecognition (Google Web API):")
    success, transcription = stt.transcribe_with_speechrecognition(audio_file)
    print(f"Transcription:\n{transcription}" if success else f"Error: {transcription}")

    # Back end 2: Wav2Vec running locally (model is downloaded on first use)
    try:
        print("\nLoading Wav2Vec model (this may take a while first time)...")
        stt.load_wav2vec_model()

        print("\nUsing Wav2Vec (Offline):")
        success, transcription = stt.transcribe_with_wav2vec(audio_file)
        print(f"Transcription:\n{transcription}" if success else f"Error: {transcription}")
    except Exception as e:
        # Any failure here is reported together with an installation hint.
        print(
            f"Couldn't use Wav2Vec: {e}",
            "Make sure you have torch and transformers installed:",
            "pip install torch transformers",
            sep="\n",
        )

ERROR: Could not find a version that satisfies the requirement speech_recognition (from versions: none)
ERROR: No matching distribution found for speech_recognition
Speech Recognition System
-------------------------


Using SpeechRecognition (Google Web API):
Error: Audio file not found: example.wav

Loading Wav2Vec model (this may take a while first time)...


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


preprocessor_config.json:   0%|          | 0.00/159 [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/163 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/1.60k [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/291 [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/85.0 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/378M [00:00<?, ?B/s]

Some weights of Wav2Vec2ForCTC were not initialized from the model checkpoint at facebook/wav2vec2-base-960h and are newly initialized: ['wav2vec2.masked_spec_embed']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Loaded Wav2Vec model: facebook/wav2vec2-base-960h

Using Wav2Vec (Offline):
Error: Audio file not found: example.wav
