In [1]:
from datasets import (
    load_dataset,
    DatasetDict,
)
from tqdm import tqdm
from transformers import pipeline
import numpy as np
import os
import librosa
import torch
import gc
from concurrent.futures import ThreadPoolExecutor
os.environ["CUDA_VISIBLE_DEVICES"] = "2" # if you have access to a GPU, otherwise comment this line. If you have a single GPU, you can set it to "0". If you have multiple GPUs, you can set it to "0,1" for example.

  from .autonotebook import tqdm as notebook_tqdm


# Transcription

In [24]:
def load_pipeline(model_id):
    return pipeline("automatic-speech-recognition", model=model_id)

def resample_audio(audio_signals, sample_rates, target_sr=16000):
    """Resample audio signals in parallel."""
    def resample(signal, sr):
        if sr != target_sr:
            return librosa.resample(signal, orig_sr=sr, target_sr=target_sr)
        return signal

    with ThreadPoolExecutor() as executor:
        resampled_signals = list(executor.map(resample, audio_signals, sample_rates))
    return resampled_signals

def batch_transcription(audios, transcriptor, model_name, batch_size=4):
    """Optimized batch transcription function for batch inputs."""
    
    # Check if audios is a list or single dictionary, make sure it works with batch_size=1
    if isinstance(audios, dict):
        audios = [audios]  # Convert to a list for consistency
    
    # Extract audio signals and sampling rates
    audio_signals = np.array([audio['array'] for audio in audios], dtype=np.float32)
    sample_rates = [audio['sampling_rate'] for audio in audios]

    # Normalize audio signals
    max_values = np.abs(audio_signals).max(axis=1, keepdims=True)
    audio_signals = np.where(max_values > 1.0, audio_signals / 32768.0, audio_signals) # Normalize int16 to float32

    # Resample all audio signals to 16kHz in parallel
    audio_signals = resample_audio(audio_signals, sample_rates, target_sr=16000)

    # Transcribe the batch
    results = transcriptor(audio_signals, batch_size=batch_size)

    # Extract the transcriptions
    transcriptions = [result["text"] for result in results]

    return {f'{model_name}': transcriptions}


# Load Eval Dataset

In [3]:
dataset = load_dataset("atlasia/Moroccan-Darija-Youtube-Commons-Eval", split="validation")

In [4]:
dataset

Dataset({
    features: ['audio', 'transcription', 'language', 'dataset_source', 'duration'],
    num_rows: 105
})

# Run evaluation

In [14]:
# Models paths
MODEL_PATHS_AND_BATCH_SIZE_DICT = {
    "BounharAbdelaziz/Morocco-Darija-STT-tiny"          : 4,
    "BounharAbdelaziz/Morocco-Darija-STT-small"         : 4,
    "BounharAbdelaziz/Morocco-Darija-STT-large-v1.2"    : 4,
}

In [25]:
for model_name, batch_size in tqdm(MODEL_PATHS_AND_BATCH_SIZE_DICT.items(), desc="Processing Models"): 
    
    print(f"[INFO] Transcribing using model: {model_name}...")
    transcriber =  load_pipeline(model_name)

    # Apply transcription with batching
    dataset= dataset.map(
        lambda examples: batch_transcription(examples['audio'], transcriber, model_name, batch_size),
        batched=True,  # Enable batching
        batch_size=batch_size,
        desc="Transcribing...",
    )
    
    print(f'[INFO] Finished Transcribing with Model: {model_name}...')
    
    # Free memory
    gc.collect()
    torch.cuda.empty_cache()

Processing Models:   0%|          | 0/3 [00:00<?, ?it/s]

[INFO] Transcribing using model: BounharAbdelaziz/Morocco-Darija-STT-tiny...


Device set to use cuda:0
Transcribing...:   0%|          | 0/105 [00:00<?, ? examples/s]
Processing Models:   0%|          | 0/3 [00:02<?, ?it/s]


ValueError: setting an array element with a sequence. The requested array has an inhomogeneous shape after 1 dimensions. The detected shape was (4,) + inhomogeneous part.

In [12]:
dataset

Dataset({
    features: ['audio', 'transcription', 'language', 'dataset_source', 'duration', 'BounharAbdelaziz/Morocco-Darija-STT-tiny', 'BounharAbdelaziz/Morocco-Darija-STT-small', 'BounharAbdelaziz/Morocco-Darija-STT-large-v1.2'],
    num_rows: 105
})

In [13]:
dataset.push_to_hub( "atlasia/Moroccan-Darija-Youtube-Commons-Evaluated")

Map: 100%|██████████| 105/105 [00:00<00:00, 3488.80 examples/s]t/s]
Creating parquet from Arrow format: 100%|██████████| 2/2 [00:00<00:00, 86.75ba/s]
Uploading the dataset shards: 100%|██████████| 1/1 [00:01<00:00,  1.96s/it]


CommitInfo(commit_url='https://huggingface.co/datasets/atlasia/Moroccan-Darija-Youtube-Commons-Evaluated/commit/b210b1811df31f3709b960fb619e9a5a49b621d8', commit_message='Upload dataset', commit_description='', oid='b210b1811df31f3709b960fb619e9a5a49b621d8', pr_url=None, repo_url=RepoUrl('https://huggingface.co/datasets/atlasia/Moroccan-Darija-Youtube-Commons-Evaluated', endpoint='https://huggingface.co', repo_type='dataset', repo_id='atlasia/Moroccan-Darija-Youtube-Commons-Evaluated'), pr_revision=None, pr_num=None)