In [1]:
# Mount Google Drive at /content/drive (Colab-only helper from google.colab).
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [None]:
pip install git+https://github.com/m-bain/whisperx.git

In [3]:
import whisperx
import os
import json

  torchaudio.set_audio_backend("soundfile")
  torchaudio.set_audio_backend("soundfile")


# WhisperX Audio Transcription and Diarization

## Parameters Initialization

In [4]:
# Execution device: "cuda" when running in Google Colab, "cpu" when running locally
device = "cuda"

# Batch size used for transcription; lower it if GPU memory is tight
batch_size = 16

# Numeric precision; switch to "int8" if GPU memory is tight (may reduce accuracy)
compute_type = "float16"

# Language code of the recordings (German)
language = "de"

## Loading the Large-v2 model from WhisperX

In [5]:
# Load the WhisperX "large-v2" model on the configured device,
# using the configured compute type and language (German).
model = whisperx.load_model(
    "large-v2",
    device,
    compute_type=compute_type,
    language=language,
)

INFO:pytorch_lightning.utilities.migration.utils:Lightning automatically upgraded your loaded checkpoint from v1.5.4 to v2.1.2. To apply the upgrade to your files permanently, run `python -m pytorch_lightning.utilities.upgrade_checkpoint ../root/.cache/torch/whisperx-vad-segmentation.bin`


Model was trained with pyannote.audio 0.0.1, yours is 3.1.0. Bad things might happen unless you revert pyannote.audio to 0.x.
Model was trained with torch 1.10.0+cu102, yours is 2.1.0+cu121. Bad things might happen unless you revert torch to 1.x.


## Transcribing, Aligning & Diarizing the Audio with the Original Whisper (batched)

In [6]:
def transcribe_audio_diarization(device, audio_file, batch_size, language="de",
                                 min_speakers=2, max_speakers=5, hf_token=None):
    """Transcribe one audio file, align the transcript and attach speaker labels.

    The transcription step uses the notebook-level ``model`` object, so the
    model-loading cell must have been run before this function is called.

    Args:
        device: Device string forwarded to the alignment model, the alignment
            call and the diarization pipeline.
        audio_file: Path of the audio file to process.
        batch_size: Batch size forwarded to ``model.transcribe``.
        language: Language code forwarded to ``model.transcribe``
            (defaults to "de", the value that used to be hard-coded).
        min_speakers: Lower bound on the number of speakers given to the
            diarization pipeline (defaults to the previous hard-coded 2).
        max_speakers: Upper bound on the number of speakers given to the
            diarization pipeline (defaults to the previous hard-coded 5).
        hf_token: Hugging Face access token for the diarization pipeline.
            When omitted, a notebook-level ``YOUR_HF_TOKEN`` variable is used
            if one is defined, otherwise the ``HF_TOKEN`` environment variable.
            If none of these is set, ``None`` is forwarded and authentication
            is left to the library.

    Returns:
        The value returned by ``whisperx.assign_word_speakers``: the aligned
        transcript combined with the diarization segments.
    """
    # Resolve the token without ever referencing an undefined name:
    # explicit argument > notebook global > environment variable.
    if hf_token is None:
        hf_token = globals().get("YOUR_HF_TOKEN") or os.environ.get("HF_TOKEN")

    # 1. transcribe the audio (before alignment)
    audio = whisperx.load_audio(audio_file)
    result_before_align = model.transcribe(audio, batch_size=batch_size, language=language)

    # 2. align the transcript, using the language reported by the transcription result
    # NOTE(review): the alignment model and the diarization pipeline are re-created on
    # every call; if this is slow in a long loop, consider loading them once - confirm.
    model_a, metadata = whisperx.load_align_model(
        language_code=result_before_align["language"], device=device
    )
    result_after_align = whisperx.align(
        result_before_align["segments"],
        model_a,
        metadata,
        audio,
        device,
        return_char_alignments=False,
    )

    # 3. run speaker diarization within the given speaker-count bounds
    diarize_model = whisperx.DiarizationPipeline(use_auth_token=hf_token, device=device)
    diarize_segments = diarize_model(audio, min_speakers=min_speakers, max_speakers=max_speakers)

    # 4. final result: transcript with alignment and speaker labels
    return whisperx.assign_word_speakers(diarize_segments, result_after_align)

## Creating a JSON File for each Transcript

In [None]:
# Root folder of the project data (relative to the current working directory)
base_path = 'drive/MyDrive/ml4science'

# Folder holding the input audio files
audio_directory = f'{base_path}/wavs/'

# Folder receiving the JSON transcripts
json_directory = f'{base_path}/json/'

# Keep only regular files from the audio folder, sorted by name
audio_files = sorted(
    entry
    for entry in os.listdir(audio_directory)
    if os.path.isfile(os.path.join(audio_directory, entry))
)

# Show how many files were found
print(len(audio_files))


In [None]:
# Transcribe every audio file and write one JSON transcript per file.
# Notes from earlier runs (V100), processing the list in slices:
#   audio_files[0:29]   - 1h19 min 55 s
#   audio_files[29:58]  - 1h09 min 11 s
#   audio_files[58:87]  - 1h11 min 33 s
#   audio_files[87:116] - 1h13 min 48 s
# To process only part of the list, iterate over a slice such as audio_files[0:29].

# Accepted audio file extensions (defined once: the list does not change per file)
valid_audio_extensions = ['.wav']  # Add more extensions if needed
audio_suffixes = tuple(valid_audio_extensions)

# Make sure the output folder exists before the first transcript is written
os.makedirs(json_directory, exist_ok=True)

for audio_file in audio_files:

    # Skip files that do not carry a valid audio extension (case-insensitive)
    if not audio_file.lower().endswith(audio_suffixes):
        continue

    # Full path of the current audio file
    audio_file_path = os.path.join(audio_directory, audio_file)

    # File name without its extension
    file_name = os.path.splitext(os.path.basename(audio_file))[0]

    # Destination of the JSON transcript
    json_file_path = os.path.join(json_directory, f'transcript_{file_name}.json')

    # Report the file being processed
    print(f"Processing: {audio_file}")

    # Transcribe, align and diarize the current audio file
    result = transcribe_audio_diarization(device=device, audio_file=audio_file_path, batch_size=batch_size)

    # Write the result to the JSON file (explicit encoding so the output does not depend on the locale)
    with open(json_file_path, 'w', encoding='utf-8') as jsonfile:
        json.dump(result, jsonfile, indent=2)

    # Report that the current file has been processed
    print(f"Done Processing: {audio_file}")
