In [1]:
!pip install -U openai-whisper
!pip install jiwer

Collecting openai-whisper
  Downloading openai-whisper-20240930.tar.gz (800 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m800.5/800.5 kB[0m [31m4.8 MB/s[0m eta [36m0:00:00[0ma [36m0:00:01[0m
[?25h  Installing build dependencies ... [?25ldone
[?25h  Getting requirements to build wheel ... [?25ldone
[?25h  Preparing metadata (pyproject.toml) ... [?25ldone
Collecting tiktoken (from openai-whisper)
  Downloading tiktoken-0.9.0-cp311-cp311-macosx_11_0_arm64.whl.metadata (6.7 kB)
Downloading tiktoken-0.9.0-cp311-cp311-macosx_11_0_arm64.whl (1.0 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.0/1.0 MB[0m [31m13.7 MB/s[0m eta [36m0:00:00[0m00:01[0m00:01[0m
[?25hBuilding wheels for collected packages: openai-whisper
  Building wheel for openai-whisper (pyproject.toml) ... [?25ldone
[?25h  Created wheel for openai-whisper: filename=openai_whisper-20240930-py3-none-any.whl size=803404 sha256=ba85d2fe224c65fe07f65fc4c3638184ca639eb4c7

In [2]:
import whisper
from jiwer import wer  # For Word Error Rate

# Whisper checkpoint to load. Kept as a named constant so the model size can be
# changed in one place without hunting through the cell.
WHISPER_MODEL_NAME = "small"

# Load the Whisper model once at module level; speech_to_text() below reads
# this global `model`.
model = whisper.load_model(WHISPER_MODEL_NAME)

def speech_to_text(audio_path: str):
    """
    Transcribe the speech in an audio file with the module-level Whisper ``model``.

    The full recording is transcribed through ``model.transcribe``. The
    earlier low-level path (``whisper.pad_or_trim`` followed by a single
    ``whisper.decode``) only ever looked at the first 30 seconds of audio, so
    longer files were silently truncated and any WER computed from the result
    was misleading. ``model.transcribe`` also performs language detection
    itself, so no separate ``detect_language`` call is needed.

    Args:
        audio_path: Path to the audio file to transcribe.

    Returns:
        The transcript with leading/trailing whitespace stripped, or ``None``
        if loading or transcription failed. Failures are printed rather than
        raised, so callers must check for ``None``.
    """
    try:
        # fp16=False is carried over from the original decoding options
        # (half precision disabled).
        result = model.transcribe(audio_path, fp16=False)

        # `language` is the language code the transcription was run with
        # (auto-detected, because no language is passed in).
        print(f"Detected language: {result['language']}")

        return result["text"].strip()

    except Exception as e:
        # Deliberate best-effort: report the problem and signal failure with
        # None so the calling cell can branch on it instead of crashing.
        print("Error during speech-to-text:", e)
        return None

def calculate_wer(reference_text: str, recognized_text: str):
    """
    Score a transcript against its ground truth using Word Error Rate.

    Both strings are handed to ``jiwer.wer`` exactly as received; this
    function applies no normalization of its own.

    Args:
        reference_text: The ground-truth transcript.
        recognized_text: The transcript produced by the recognizer.

    Returns:
        Whatever ``jiwer.wer`` returns for the pair (the word error rate).
    """
    error_rate = wer(reference_text, recognized_text)
    return error_rate

# ======= USAGE =======

# Audio clip to evaluate and its hand-written ground-truth transcript.
audio_path = 'test2.mp3'
reference_text = "சரித்திரத்தை ஒரு நிமிஷம் பாருங்கள். அது நமக்கு கற்றுக் கொடுத்தது ஒன்னு தான். நாம வாழணும்னா யார வேணாலும், எத்தனை பேர வேணாலும் கொல்லலாம்."

recognized_text = speech_to_text(audio_path)

# speech_to_text() returns None only when transcription failed. An empty
# string is a legitimate transcript (e.g. silent audio) and must still be
# scored, so test against None instead of relying on truthiness.
if recognized_text is not None:
    print("Recognized Text:", recognized_text)
    error_rate = calculate_wer(reference_text, recognized_text)
    print("Word Error Rate:", error_rate)
else:
    print("Error: Could not transcribe the audio.")


Detected language: ta
Recognized Text: சரித்துரத்தை ஒரு நவிஷயம் பாருங்கள். அது நாம் கத்துக் கொடுத்தது ஒன்றுதான். நாம் வாழ்க்கவேண்டும் என்றால் யார வேண்டாம்? எத்தனை பெரு வேண்டாம் கொல்லலாம்.
Word Error Rate: 0.6111111111111112
