In [None]:
! pip install git+https://github.com/openai/whisper.git

Collecting git+https://github.com/openai/whisper.git
  Cloning https://github.com/openai/whisper.git to /tmp/pip-req-build-twcogm6k
  Running command git clone --filter=blob:none --quiet https://github.com/openai/whisper.git /tmp/pip-req-build-twcogm6k
  Resolved https://github.com/openai/whisper.git to commit dd985ac4b90cafeef8712f2998d62c59c3e62d22
  Installing build dependencies ... [?25l[?25hdone
  Getting requirements to build wheel ... [?25l[?25hdone
  Preparing metadata (pyproject.toml) ... [?25l[?25hdone
Collecting nvidia-cuda-nvrtc-cu12==12.4.127 (from torch->openai-whisper==20240930)
  Downloading nvidia_cuda_nvrtc_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cuda-runtime-cu12==12.4.127 (from torch->openai-whisper==20240930)
  Downloading nvidia_cuda_runtime_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cuda-cupti-cu12==12.4.127 (from torch->openai-whisper==20240930)
  Downloading nvidia_cuda_c

In [None]:
import io
import os
import numpy as np

try:
    import tensorflow  # required in Colab to avoid protobuf compatibility issues
except ImportError:
    pass

import torch
import pandas as pd
import urllib
import tarfile
import whisper
import torchaudio

from scipy.io import wavfile
from tqdm.notebook import tqdm


# Notebook-wide pandas display limits and the compute device used as the dataset default.
pd.set_option("display.max_rows", 100)
pd.set_option("display.max_colwidth", 1000)
DEVICE = "cpu" if not torch.cuda.is_available() else "cuda"

drive  sample_data


In [None]:
# Let's list the languages available in Whisper's tokenizer
print(whisper.tokenizer.LANGUAGES)

{'en': 'english', 'zh': 'chinese', 'de': 'german', 'es': 'spanish', 'ru': 'russian', 'ko': 'korean', 'fr': 'french', 'ja': 'japanese', 'pt': 'portuguese', 'tr': 'turkish', 'pl': 'polish', 'ca': 'catalan', 'nl': 'dutch', 'ar': 'arabic', 'sv': 'swedish', 'it': 'italian', 'id': 'indonesian', 'hi': 'hindi', 'fi': 'finnish', 'vi': 'vietnamese', 'he': 'hebrew', 'uk': 'ukrainian', 'el': 'greek', 'ms': 'malay', 'cs': 'czech', 'ro': 'romanian', 'da': 'danish', 'hu': 'hungarian', 'ta': 'tamil', 'no': 'norwegian', 'th': 'thai', 'ur': 'urdu', 'hr': 'croatian', 'bg': 'bulgarian', 'lt': 'lithuanian', 'la': 'latin', 'mi': 'maori', 'ml': 'malayalam', 'cy': 'welsh', 'sk': 'slovak', 'te': 'telugu', 'fa': 'persian', 'lv': 'latvian', 'bn': 'bengali', 'sr': 'serbian', 'az': 'azerbaijani', 'sl': 'slovenian', 'kn': 'kannada', 'et': 'estonian', 'mk': 'macedonian', 'br': 'breton', 'eu': 'basque', 'is': 'icelandic', 'hy': 'armenian', 'ne': 'nepali', 'mn': 'mongolian', 'bs': 'bosnian', 'kk': 'kazakh', 'sq': 'alb

Wolof is not in this list. Our goal is to make it available for transcription tasks.

In [None]:
import ipywidgets as widgets

# Maps a locale code (the value later passed to the Fleurs dataset as `lang`, where it is
# used to build the archive URL) to the human-readable name shown in the dropdown.
languages = {"af_za": "Afrikaans", "am_et": "Amharic", "ar_eg": "Arabic", "as_in": "Assamese", "az_az": "Azerbaijani", "be_by": "Belarusian",
             "bg_bg": "Bulgarian", "bn_in": "Bengali", "bs_ba": "Bosnian", "ca_es": "Catalan", "cmn_hans_cn": "Chinese", "cs_cz": "Czech",
             "cy_gb": "Welsh", "da_dk": "Danish", "de_de": "German", "el_gr": "Greek", "en_us": "English", "es_419": "Spanish", "et_ee": "Estonian",
             "fa_ir": "Persian", "fi_fi": "Finnish", "fil_ph": "Tagalog", "fr_fr": "French", "gl_es": "Galician", "gu_in": "Gujarati", "ha_ng": "Hausa",
             "he_il": "Hebrew", "hi_in": "Hindi", "hr_hr": "Croatian", "hu_hu": "Hungarian", "hy_am": "Armenian", "id_id": "Indonesian",
             "is_is": "Icelandic", "it_it": "Italian", "ja_jp": "Japanese", "jv_id": "Javanese", "ka_ge": "Georgian", "kk_kz": "Kazakh", "km_kh": "Khmer",
             "kn_in": "Kannada", "ko_kr": "Korean", "lb_lu": "Luxembourgish", "ln_cd": "Lingala", "lo_la": "Lao", "lt_lt": "Lithuanian", "lv_lv": "Latvian",
             "mi_nz": "Maori", "mk_mk": "Macedonian", "ml_in": "Malayalam", "mn_mn": "Mongolian", "mr_in": "Marathi", "ms_my": "Malay", "mt_mt": "Maltese",
             "my_mm": "Myanmar", "nb_no": "Norwegian", "ne_np": "Nepali", "nl_nl": "Dutch", "oc_fr": "Occitan", "pa_in": "Punjabi", "pl_pl": "Polish",
             "ps_af": "Pashto", "pt_br": "Portuguese", "ro_ro": "Romanian", "ru_ru": "Russian", "sd_in": "Sindhi", "sk_sk": "Slovak", "sl_si": "Slovenian",
             "sn_zw": "Shona", "so_so": "Somali", "sr_rs": "Serbian", "sv_se": "Swedish", "sw_ke": "Swahili", "ta_in": "Tamil", "te_in": "Telugu",
             "tg_tj": "Tajik", "th_th": "Thai", "tr_tr": "Turkish", "uk_ua": "Ukrainian", "ur_pk": "Urdu", "uz_uz": "Uzbek", "vi_vn": "Vietnamese",
             "yo_ng": "Yoruba","wo_sn":"Wolof"}
# Language picker. The default is Wolof ("wo_sn"), the language this notebook fine-tunes
# on, so that "Restart & Run All" reproduces the results without a manual selection.
selection = widgets.Dropdown(
    options=[("Select language", None), ("----------", None)] + sorted([(f"{v} ({k})", k) for k, v in languages.items()]),
    value="wo_sn",
    description='Language:',
    disabled=False,
)

# Display the widget as the cell output.
selection

Dropdown(description='Language:', index=39, options=(('Select language', None), ('----------', None), ('Afrika…

In [None]:
# Read the dropdown value and validate it *before* using it as a dictionary key:
# the placeholder entries carry the value None, which would otherwise surface as an
# unhelpful KeyError instead of the message below.
lang = selection.value
assert lang is not None, "Please select a language"

language = languages[lang]
print(f"Selected language: {language} ({lang})")

Selected language: Wolof (wo_sn)


# Downloading the Wolof data
Since Wolof ASR data is available in datasets such as FLEURS, ALFFA Public, and GalsenAI on Hugging Face, we can easily download one.

In [None]:
def download(url: str, target_path: str):
    """Stream ``url`` into ``target_path`` while showing a tqdm progress bar.

    The payload is first written to ``target_path + ".part"`` and only renamed to
    ``target_path`` once the transfer has finished. An interrupted or failed download
    therefore never leaves a truncated file at ``target_path`` that a later
    existence check could mistake for a complete one.

    Args:
        url: address of the resource to fetch.
        target_path: destination file path; its parent directory must already exist.

    Raises:
        Whatever ``urllib.request.urlopen`` or the file operations raise; the partial
        file is removed before the exception propagates.
    """
    # A bare `import urllib` does not guarantee that the `request` submodule is loaded.
    import urllib.request

    partial_path = f"{target_path}.part"
    try:
        with urllib.request.urlopen(url) as source, open(partial_path, "wb") as output:
            # Servers may omit Content-Length; tqdm accepts total=None and then shows
            # a running byte counter without a percentage.
            content_length = source.info().get("Content-Length")
            total = int(content_length) if content_length is not None else None

            with tqdm(total=total, ncols=80, unit='iB', unit_scale=True, unit_divisor=1024) as loop:
                while True:
                    buffer = source.read(8192)
                    if not buffer:
                        break

                    output.write(buffer)
                    loop.update(len(buffer))

        # Publish the file only after it has been completely written and closed.
        os.replace(partial_path, target_path)
    except BaseException:
        # Also covers KeyboardInterrupt, i.e. interrupting the cell mid-download.
        if os.path.exists(partial_path):
            os.remove(partial_path)
        raise


class Fleurs(torch.utils.data.Dataset):
    """
    A simple class to wrap Fleurs and subsample a portion of the dataset as needed.

    The archive for ``lang`` is downloaded once to ``~/.cache/fleurs/{lang}.tgz``; every
    ``.wav`` file of the requested split is then decoded and kept in memory.
    """
    def __init__(self, lang, split="test", subsample_rate=1, device=DEVICE):
        """
        Args:
            lang: locale code used to build the archive URL and the cache file name.
            split: split name; selects the archive member whose name ends in
                ``{split}.tsv`` and the ``.wav`` files located under ``/{split}/``.
            subsample_rate: keep every ``subsample_rate``-th labelled record.
            device: stored as ``self.device``; this class does not otherwise use it.

        Raises:
            ValueError: if the archive contains no ``{split}.tsv`` label file.
        """
        url = f"https://storage.googleapis.com/xtreme_translations/FLEURS102/{lang}.tar.gz"
        tar_path = os.path.expanduser(f"~/.cache/fleurs/{lang}.tgz")
        os.makedirs(os.path.dirname(tar_path), exist_ok=True)

        # The archive is only fetched when no cached copy exists.
        if not os.path.exists(tar_path):
            download(url, tar_path)

        labels = None
        all_audio = {}
        with tarfile.open(tar_path, "r:gz") as tar:
            for member in tar.getmembers():
                name = member.name
                if name.endswith(f"{split}.tsv"):
                    labels = pd.read_table(tar.extractfile(member), names=("id", "file_name", "raw_transcription", "transcription", "_", "num_samples", "gender"))

                if f"/{split}/" in name and name.endswith(".wav"):
                    audio_bytes = tar.extractfile(member).read()
                    # wavfile.read returns (sample_rate, data); only the samples are kept.
                    all_audio[os.path.basename(name)] = wavfile.read(io.BytesIO(audio_bytes))[1]

        # Without this check a missing label file would surface as an UnboundLocalError.
        if labels is None:
            raise ValueError(f"No label file ending in '{split}.tsv' was found in {tar_path}")

        self.labels = labels.to_dict("records")[::subsample_rate]
        self.all_audio = all_audio
        self.device = device

    def __len__(self):
        """Return the number of labelled records kept after subsampling."""
        return len(self.labels)

    def __getitem__(self, item):
        """Return the ``(audio, text)`` pair of record ``item``.

        ``audio`` is a tensor built from a copy of the cached samples, so the cached
        array is not shared with the returned tensor; ``text`` is the record's
        ``transcription`` field.
        """
        record = self.labels[item]
        audio = torch.from_numpy(self.all_audio[record["file_name"]].copy())
        text = record["transcription"]

        return (audio, text)

In [None]:
dataset = Fleurs(lang, subsample_rate=10) # keep every 10th record of the default "test" split (~10%) for a quick demo

  0%|                                              | 0.00/1.71G [00:00<?, ?iB/s]

In [None]:
# Inspect the first item: an (audio tensor, transcription) pair.
dataset[0]

(tensor([ 0.0000,  0.0000,  0.0000,  ..., -0.0031, -0.0046, -0.0052]),
 'misigu pyramid ak lumiyeer bu nuy wone mooy mbir yu gëna am solo ci diwaan gi ngir xale yi')

In [None]:
# Show the class of the dataset object.
type(dataset)

In [None]:
# Print the audio tensor shape and the transcription of every item in the dataset.
for idx in range(len(dataset)):
  a, t = dataset[idx]
  print(a.shape, t)


torch.Size([211200]) misigu pyramid ak lumiyeer bu nuy wone mooy mbir yu gëna am solo ci diwaan gi ngir xale yi
torch.Size([213120]) jëfandikoo ay gaal ngir yóbbu ay marsandiss mooy anam bi gëna baax ngir jàllale nit ñu bare ak marsandiis yu bare ci géej yi
torch.Size([112320]) lii yoon bu am solo la ngir rañélé verbs yi ak objects yi
torch.Size([105600]) xalaatal dem def eski nduruna ak doxantu
torch.Size([186240]) mbalit yiii amoon nanu solo ngir defar ay bangu suuf ak ak tefes yu don nek dëkuway mala yi
torch.Size([375360]) sundarbans bookna ci barab yi unesco di woowe barab yu ànduna bi yépp bokk wàllu àll bi bokk ci réewum inde lañuy woowe park national bu sundarbans yi
torch.Size([351360]) soo bëggee gis taaru hong kong danga fay joge nga dem ci kowloon nekk ci géej gi mu jàkkarlool dinga gis taar bi
torch.Size([250560]) ci misaal nataal bu gënna siiw ci wàllu foto ak tolluwaayam ci àdduna mooy 35mm mu nekkoon taayu film bi gënana am doole ci wetu jamono filmu analog bi
torch.Siz

In [None]:
from datasets import Dataset
import torchaudio

# Materialise every (audio, transcription) pair of the dataset.
fleurs_data = list(dataset)

# Assuming a default sampling rate of 16000 Hz
formatted_data = []
for waveform, transcription in fleurs_data:
    audio_entry = {"array": waveform.cpu().numpy(), "sampling_rate": 16000}
    formatted_data.append({"audio": audio_entry, "text": transcription})

# Wrap the records in a Hugging Face Dataset with "audio" and "text" columns.
hf_dataset = Dataset.from_list(formatted_data)


In [None]:
from transformers import WhisperProcessor, WhisperForConditionalGeneration

model_name = "openai/whisper-small"  # or another Whisper model
# Load the pretrained processor and the sequence-to-sequence model for this checkpoint.
processor = WhisperProcessor.from_pretrained(model_name)
model = WhisperForConditionalGeneration.from_pretrained(model_name)

# Make sure the processor is prepared for transcription
# NOTE(review): this notebook states that Wolof is missing from Whisper's language table,
# so "wol" is presumably not a recognised language code — confirm that this call has the
# intended effect and does not fail later, once special tokens are actually generated.
processor.tokenizer.set_prefix_tokens(language="wol", task="transcribe")

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


preprocessor_config.json:   0%|          | 0.00/185k [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/283k [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/836k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/2.48M [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/494k [00:00<?, ?B/s]

normalizer.json:   0%|          | 0.00/52.7k [00:00<?, ?B/s]

added_tokens.json:   0%|          | 0.00/34.6k [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/2.19k [00:00<?, ?B/s]

config.json:   0%|          | 0.00/1.97k [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/967M [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/3.87k [00:00<?, ?B/s]

In [None]:

def prepare_example(example):
    """Turn one ``{"audio": ..., "text": ...}`` record into model inputs.

    Returns a dict with ``"input_features"`` (the first and only entry of the
    processor output for the audio array) and ``"labels"`` (token ids of the text).
    """
    audio = example["audio"]
    features = processor(audio["array"], sampling_rate=audio["sampling_rate"], return_tensors="pt")

    # Tokenize the transcription without any special tokens.
    # NOTE(review): the labels therefore carry no language/task prefix and no
    # end-of-text token — confirm this is intended for fine-tuning.
    token_ids = processor.tokenizer(example["text"], add_special_tokens=False).input_ids

    return {"input_features": features.input_features[0], "labels": token_ids}

# Apply the preparation to every record of the dataset.
processed_dataset = hf_dataset.map(prepare_example)


Map:   0%|          | 0/38 [00:00<?, ? examples/s]

In [None]:
from dataclasses import dataclass
from typing import Any, Dict, List, Union
import torch

@dataclass
class WhisperDataCollatorWithPadding:
    """Collate prepared examples into one padded training batch.

    Audio features are padded by ``processor.feature_extractor`` and label ids by
    ``processor.tokenizer``; padded label positions are set to -100 so that the
    loss ignores them.
    """
    # Object exposing `.feature_extractor.pad(...)` and `.tokenizer.pad(...)`.
    processor: Any
    # Tensor format forwarded to both `pad` calls.
    return_tensors: str = "pt"

    def __call__(self, features: List[Dict[str, Union[List[int], torch.Tensor]]]) -> Dict[str, torch.Tensor]:
        """Build a batch from ``features``.

        Args:
            features: items that each provide ``"input_features"`` and ``"labels"``.

        Returns:
            The padded feature batch with an added ``"labels"`` entry.
        """
        # The two modalities are padded separately because they use different padders.
        input_features = [{"input_features": f["input_features"]} for f in features]
        label_features = [{"input_ids": f["labels"]} for f in features]

        batch = self.processor.feature_extractor.pad(
            input_features,
            return_tensors=self.return_tensors,
        )

        labels_batch = self.processor.tokenizer.pad(
            label_features,
            return_tensors=self.return_tensors,
        )

        # Replace padding positions of the labels by -100 so they are ignored by the loss.
        # The attention mask identifies the positions that `pad` added. Comparing ids with
        # pad_token_id would also hide genuine label tokens that share the pad id (for
        # Whisper checkpoints the pad token typically doubles as the end-of-text token).
        padding_positions = labels_batch["attention_mask"].ne(1)
        batch["labels"] = labels_batch["input_ids"].masked_fill(padding_positions, -100)

        return batch


In [None]:
# Collator instance handed to the Trainer; it pads with the processor loaded earlier.
data_collator = WhisperDataCollatorWithPadding(processor=processor)

In [None]:
from transformers import TrainingArguments, Trainer

# Fine-tuning configuration for the small, subsampled demo dataset.
training_args = TrainingArguments(
    output_dir="./whisper_wolof",
    per_device_train_batch_size=4,
    num_train_epochs=3,
    # The recorded run finished after 30 optimizer steps in total, so a logging interval
    # of 100 never fired and the loss table stayed empty; log every 10 steps instead.
    logging_steps=10,
    save_steps=500,
    learning_rate=1e-5,
    # Mixed precision requires a CUDA device; fall back to full precision on CPU runtimes.
    fp16=torch.cuda.is_available(),
    # Do not pin host memory in the DataLoader.
    dataloader_pin_memory=False,
)

# NOTE(review): the recorded run emitted a warning on this Trainer(...) call; presumably
# `tokenizer=` is deprecated in the installed transformers release — confirm the
# replacement argument before upgrading.
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=processed_dataset,
    data_collator=data_collator,
    tokenizer=processor.feature_extractor
)

trainer.train()


  trainer = Trainer(


<IPython.core.display.Javascript object>

[34m[1mwandb[0m: Logging into wandb.ai. (Learn how to deploy a W&B server locally: https://wandb.me/wandb-server)
[34m[1mwandb[0m: You can find your API key in your browser here: https://wandb.ai/authorize?ref=models
wandb: Paste an API key from your profile and hit enter:

 ··········


[34m[1mwandb[0m: No netrc file found, creating one.
[34m[1mwandb[0m: Appending key for api.wandb.ai to your netrc file: /root/.netrc
[34m[1mwandb[0m: Currently logged in as: [33mmbabou[0m to [32mhttps://api.wandb.ai[0m. Use [1m`wandb login --relogin`[0m to force relogin


Passing a tuple of `past_key_values` is deprecated and will be removed in Transformers v4.43.0. You should pass an instance of `EncoderDecoderCache` instead, e.g. `past_key_values=EncoderDecoderCache.from_legacy_cache(past_key_values)`.


Step,Training Loss




TrainOutput(global_step=30, training_loss=3.4873700459798176, metrics={'train_runtime': 113.9205, 'train_samples_per_second': 1.001, 'train_steps_per_second': 0.263, 'total_flos': 3.289873563648e+16, 'train_loss': 3.4873700459798176, 'epoch': 3.0})

## Saving the model

In [None]:
# Project folder used for saving the model; presumably a mounted Google Drive — confirm the mount exists.
project_path = 'drive/My Drive/Speech Recognition project/'

In [None]:
# Save the model and the processor into the directory that the listing and evaluation
# cells read from ("whisper-wolof_babou"). The previous name used an underscore
# ("whisper_wolof_babou"), so a fresh run would have saved to one folder and loaded from another.
save_dir = os.path.join(project_path, "whisper-wolof_babou")
trainer.save_model(save_dir)
processor.save_pretrained(save_dir)

[]

In [None]:
# List the contents of the model directory on Drive.
!ls 'drive/My Drive/Speech Recognition project/whisper-wolof_babou'

added_tokens.json	model.safetensors	  tokenizer_config.json
config.json		normalizer.json		  training_args.bin
generation_config.json	preprocessor_config.json  vocab.json
merges.txt		special_tokens_map.json


## Evaluation and test on new data sample

In [None]:
from transformers import WhisperProcessor, WhisperForConditionalGeneration

# Directory from which the fine-tuned model and its processor are reloaded.
model_path = "drive/My Drive/Speech Recognition project/whisper-wolof_babou"  # or your own path
# Rebind `processor` and `model` to the versions loaded from disk.
processor = WhisperProcessor.from_pretrained(model_path)
model = WhisperForConditionalGeneration.from_pretrained(model_path)

# Put the model in evaluation mode; the returned module is displayed as the cell output.
model.eval()

WhisperForConditionalGeneration(
  (model): WhisperModel(
    (encoder): WhisperEncoder(
      (conv1): Conv1d(80, 768, kernel_size=(3,), stride=(1,), padding=(1,))
      (conv2): Conv1d(768, 768, kernel_size=(3,), stride=(2,), padding=(1,))
      (embed_positions): Embedding(1500, 768)
      (layers): ModuleList(
        (0-11): 12 x WhisperEncoderLayer(
          (self_attn): WhisperSdpaAttention(
            (k_proj): Linear(in_features=768, out_features=768, bias=False)
            (v_proj): Linear(in_features=768, out_features=768, bias=True)
            (q_proj): Linear(in_features=768, out_features=768, bias=True)
            (out_proj): Linear(in_features=768, out_features=768, bias=True)
          )
          (self_attn_layer_norm): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
          (activation_fn): GELUActivation()
          (fc1): Linear(in_features=768, out_features=3072, bias=True)
          (fc2): Linear(in_features=3072, out_features=768, bias=True)
        

In [None]:
import torch
# Use the GPU when one is available, otherwise stay on the CPU.
device = "cuda" if torch.cuda.is_available() else "cpu"
# Move the model to the chosen device.
model.to(device)

WhisperForConditionalGeneration(
  (model): WhisperModel(
    (encoder): WhisperEncoder(
      (conv1): Conv1d(80, 768, kernel_size=(3,), stride=(1,), padding=(1,))
      (conv2): Conv1d(768, 768, kernel_size=(3,), stride=(2,), padding=(1,))
      (embed_positions): Embedding(1500, 768)
      (layers): ModuleList(
        (0-11): 12 x WhisperEncoderLayer(
          (self_attn): WhisperSdpaAttention(
            (k_proj): Linear(in_features=768, out_features=768, bias=False)
            (v_proj): Linear(in_features=768, out_features=768, bias=True)
            (q_proj): Linear(in_features=768, out_features=768, bias=True)
            (out_proj): Linear(in_features=768, out_features=768, bias=True)
          )
          (self_attn_layer_norm): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
          (activation_fn): GELUActivation()
          (fc1): Linear(in_features=768, out_features=3072, bias=True)
          (fc2): Linear(in_features=3072, out_features=768, bias=True)
        

In [None]:
# Same project folder as in the saving section; presumably redefined so that the
# evaluation part can be run on its own — confirm.
project_path = 'drive/My Drive/Speech Recognition project/'

In [None]:
# Load the table that pairs each transcription ("text") with its recording ("path_wav").
wavtext_csv = os.path.join(project_path, "Wavtext_dataset2.csv")
df = pd.read_csv(wavtext_csv)
df.head()

Unnamed: 0,text,path_wav
0,Nu ngi indee woon jiwu ji sapoN,recordings/200702-231930_wol_8ec_elicit/200702-231930_wol_8ec_elicit_0.wav
1,Nu ngi tollu dagana,recordings/200702-231930_wol_8ec_elicit/200702-231930_wol_8ec_elicit_1.wav
2,Nu ngi wàkkirlu ci yàlla,recordings/200702-231930_wol_8ec_elicit/200702-231930_wol_8ec_elicit_2.wav
3,Nu ñow nag def ko ci kër gi,recordings/200702-231930_wol_8ec_elicit/200702-231930_wol_8ec_elicit_3.wav
4,Nuggaayu néeg bee ko fi téye,recordings/200702-231930_wol_8ec_elicit/200702-231930_wol_8ec_elicit_4.wav


In [None]:
# Build the full path of the recording in row 1 to use as a test sample.
relative_wav_path = df.loc[1, "path_wav"]
exemple = project_path + relative_wav_path
exemple

'drive/My Drive/Speech Recognition project/recordings/200702-231930_wol_8ec_elicit/200702-231930_wol_8ec_elicit_1.wav'

In [None]:
import torchaudio

# Load audio; torchaudio returns a (channels, samples) tensor and the file's sampling rate.
waveform, sample_rate = torchaudio.load(exemple)

# Resample if needed
TARGET_SAMPLE_RATE = 16000  # rate assumed for the training audio earlier in this notebook
if sample_rate != TARGET_SAMPLE_RATE:
    resampler = torchaudio.transforms.Resample(orig_freq=sample_rate, new_freq=TARGET_SAMPLE_RATE)
    waveform = resampler(waveform)
    # Keep the (waveform, sample_rate) pair consistent after resampling.
    sample_rate = TARGET_SAMPLE_RATE

# Whisper expects a 1D array. Average over the channel axis rather than squeezing:
# squeeze() only drops the axis for mono files and would leave stereo input 2-D.
# For mono audio the result is identical to the squeezed waveform.
audio_input = waveform.mean(dim=0).numpy()