In [1]:
# Colab-only helper used to attach Google Drive to this runtime
from google.colab import drive
# Mount Drive at /content/drive; the audio path used later in the notebook lives under this mount point
drive.mount('/content/drive')

Mounted at /content/drive


In [2]:
! pip install whisperx pydub soundfile

Collecting whisperx
  Downloading whisperx-3.1.6-py3-none-any.whl.metadata (13 kB)
Collecting pydub
  Downloading pydub-0.25.1-py2.py3-none-any.whl.metadata (1.4 kB)
Collecting faster-whisper (from whisperx)
  Downloading faster_whisper-1.0.3-py3-none-any.whl.metadata (15 kB)
Collecting pyannote.audio==3.1.1 (from whisperx)
  Downloading pyannote.audio-3.1.1-py2.py3-none-any.whl.metadata (9.3 kB)
Collecting asteroid-filterbanks>=0.4 (from pyannote.audio==3.1.1->whisperx)
  Downloading asteroid_filterbanks-0.4.0-py3-none-any.whl.metadata (3.3 kB)
Collecting lightning>=2.0.1 (from pyannote.audio==3.1.1->whisperx)
  Downloading lightning-2.4.0-py3-none-any.whl.metadata (38 kB)
Collecting omegaconf<3.0,>=2.1 (from pyannote.audio==3.1.1->whisperx)
  Downloading omegaconf-2.3.0-py3-none-any.whl.metadata (3.9 kB)
Collecting pyannote.core>=5.0.0 (from pyannote.audio==3.1.1->whisperx)
  Downloading pyannote.core-5.0.0-py3-none-any.whl.metadata (1.4 kB)
Collecting pyannote.database>=5.0.1 (from 

In [1]:
import os
import re
import whisperx
from pydub import AudioSegment

class AudioTooLongError(Exception):
    """Signals that an audio recording exceeds the allowed duration."""

# Mapping of spelled-out digits to their numeric forms
digit_mapping = {
    'zero': '0', 'one': '1', 'two': '2', 'three': '3', 'four': '4',
    'five': '5', 'six': '6', 'seven': '7', 'eight': '8', 'nine': '9'
}

def clean_transcription(transcribed_text):
    """Normalise digits in a transcription.

    Steps, in order:
      1. Split the text on whitespace and replace every token that is a
         spelled-out digit ('zero' .. 'nine', case-insensitive) with its
         numeric character. Punctuation attached to the token is kept, so
         'nine.' becomes '9.' and '(two)' becomes '(2)'.
      2. Re-join the tokens with single spaces (runs of whitespace collapse).
      3. Remove single spaces that sit directly between two digits.
      4. Remove dashes that sit directly between two digits
         (e.g. '230-880-252' -> '230880252').

    Args:
        transcribed_text: The raw transcription string.

    Returns:
        The cleaned string.
    """
    cleaned_words = []

    for word in transcribed_text.split():
        # Peel off leading/trailing punctuation so that a digit word at the end
        # of a sentence or inside brackets is still recognised. Tokens with
        # punctuation in the middle (e.g. "someone's", "one-two") do not match
        # and are left untouched.
        match = re.fullmatch(r'(\W*)(\w+)(\W*)', word)
        if match and match.group(2).lower() in digit_mapping:
            prefix, core, suffix = match.groups()
            cleaned_words.append(prefix + digit_mapping[core.lower()] + suffix)
        else:
            cleaned_words.append(word)

    # Join the cleaned words back into a sentence
    cleaned_text = ' '.join(cleaned_words)

    # Remove spaces between digits so digit sequences read as one number
    cleaned_text = re.sub(r'(?<=\d) (?=\d)', '', cleaned_text)

    # Remove dashes between digits (e.g., '230-880-252' -> '230880252')
    cleaned_text = re.sub(r'(?<=\d)-(?=\d)', '', cleaned_text)

    return cleaned_text

def transcribe_audio(file_path, model_name="large-v3", device="cpu",
                     compute_type="float32", language="en", max_duration=40):
    """Transcribe a short audio file with WhisperX and clean up the digits.

    Args:
        file_path: Path to an .mp3 or .wav file (extension matched
            case-insensitively).
        model_name: WhisperX model to load.
        device: Device passed to ``whisperx.load_model``.
        compute_type: Compute type passed to ``whisperx.load_model``.
        language: Language code used both when loading the model and when
            transcribing.
        max_duration: Maximum accepted audio length in seconds.

    Returns:
        The text of all segments joined together and passed through
        ``clean_transcription``.

    Raises:
        ValueError: If the file extension is not .mp3 or .wav.
        AudioTooLongError: If the audio is longer than ``max_duration`` seconds.
        RuntimeError: If the transcription result has no segments.
    """
    # Compare case-insensitively so files such as 'CLIP.MP3' are accepted
    if not file_path.lower().endswith(('.mp3', '.wav')):
        raise ValueError("Unsupported file format. Please use .mp3 or .wav files.")

    # Measure the duration first so over-long files are rejected before the
    # model is loaded
    audio = AudioSegment.from_file(file_path)
    duration = len(audio) / 1000  # Duration in seconds

    if duration > max_duration:
        raise AudioTooLongError("Response voice is too long.")

    # Load the WhisperX model. Passing the language here as well avoids the
    # "No language specified" warning emitted at load time.
    hotwords = []
    model = whisperx.load_model(
        model_name,
        device=device,
        compute_type=compute_type,
        language=language,
        asr_options={"hotwords": hotwords},
    )

    # Load and transcribe the audio
    audio_data = whisperx.load_audio(file_path)
    transcription = model.transcribe(audio_data, language=language)

    # Debugging: Print the transcription output
    print(transcription)

    # A missing or empty segment list is treated as a failed transcription
    segments = transcription.get('segments')
    if not segments:
        raise RuntimeError("Transcription failed or output format has changed.")

    # Combine all segments' text, then normalise digits
    transcribed_text = ' '.join(segment['text'] for segment in segments)
    return clean_transcription(transcribed_text)

# Demo run: transcribe a single recording stored on the mounted Drive
file_path = '/content/drive/MyDrive/drive-download-20241116T142637Z-001/Audio_Files/Mp3/RVN3.mp3'  # Point this at your own audio file

try:
    result = transcribe_audio(file_path)
    print(f"Transcribed Text: {result}")
except (AudioTooLongError, ValueError, RuntimeError) as err:
    # These failure modes are reported as a one-line message instead of a traceback
    print(f"Error: {err}")


  torchaudio.set_audio_backend("soundfile")
  backend = torchaudio.get_audio_backend()
  from speechbrain.pretrained import (
  torchaudio.set_audio_backend(backend)
  from torchaudio.backend.common import AudioMetaData
The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


No language specified, language will be first be detected for each audio file (increases inference time).


100%|█████████████████████████████████████| 16.9M/16.9M [00:02<00:00, 6.43MiB/s]
INFO:pytorch_lightning.utilities.migration.utils:Lightning automatically upgraded your loaded checkpoint from v1.5.4 to v2.4.0. To apply the upgrade to your files permanently, run `python -m pytorch_lightning.utilities.upgrade_checkpoint ../root/.cache/torch/whisperx-vad-segmentation.bin`


Model was trained with pyannote.audio 0.0.1, yours is 3.1.1. Bad things might happen unless you revert pyannote.audio to 0.x.
Model was trained with torch 1.10.0+cu102, yours is 2.5.1+cu121. Bad things might happen unless you revert torch to 1.x.
{'segments': [{'text': ' Provide the freight term and type linked with house bill AZN2020486. Provide the house bill numbers that originated from AEDXB and are heavier than 250 kg. Provide the house bill numbers', 'start': 1.101, 'end': 30.691}, {'text': ' that originated from CDMAT and are heavier than 250 kg.', 'start': 31.152, 'end': 39.65}], 'language': 'en'}
Transcribed Text: Provide the freight term and type linked with house bill AZN2020486. Provide the house bill numbers that originated from AEDXB and are heavier than 250 kg. Provide the house bill numbers that originated from CDMAT and are heavier than 250 kg.
