In [1]:
# Check hardware env
import os
import torch

# Report the installed PyTorch build.
print(f'PyTorch version: {torch.__version__}')

# Ask for CUDA availability once and reuse the answer for the report and the
# device choice.
cuda_ready = torch.cuda.is_available()

if not cuda_ready:
    print("CUDA is not available")
else:
    gpu_total = torch.cuda.device_count()
    print(f"Number of GPUs: {gpu_total}")
    for gpu_index in range(gpu_total):
        print(f"GPU {gpu_index}: {torch.cuda.get_device_name(gpu_index)}")

# Use the GPU when one is present, otherwise stay on the CPU.
device = torch.device("cuda") if cuda_ready else torch.device("cpu")
print(f'Using device: {device}')

PyTorch version: 2.4.1+cu121
Number of GPUs: 2
GPU 0: NVIDIA A100-PCIE-40GB
GPU 1: NVIDIA A100-PCIE-40GB
Using device: cuda


In [2]:
from datasets import load_dataset, DatasetDict

# Read the local parquet file through the generic "parquet" loader. The result
# is indexed by split name in the next cell (data_dict['train']).
data_dict = load_dataset("parquet", data_files="../data/fr.parquet")

In [3]:
# Hold out 20% of the rows for evaluation. The fixed seed makes the shuffle,
# and therefore the membership of each split, reproducible across kernel
# restarts.
# NOTE: this rebinds data_dict, so re-running this cell on its own would split
# the previous train split again; re-run the loading cell first.
data_dict = data_dict['train'].train_test_split(test_size=0.2, seed=42)

In [4]:
# Display the splits with their column names and row counts.
data_dict

DatasetDict({
    train: Dataset({
        features: ['Id', 'audio', 'sentence'],
        num_rows: 240
    })
    test: Dataset({
        features: ['Id', 'audio', 'sentence'],
        num_rows: 60
    })
})

In [5]:
from transformers import WhisperProcessor

# Load the processor of the "openai/whisper-small" checkpoint, configured for
# French ("fr") transcription. prepare_dataset below passes it both the audio
# samples and the target sentence.
processor = WhisperProcessor.from_pretrained(
    "openai/whisper-small", language="fr", task="transcribe"
)

In [6]:
# Display the column schema of the train split (including the audio sampling rate).
data_dict["train"].features

{'Id': Value(dtype='string', id=None),
 'audio': Audio(sampling_rate=16000, mono=True, decode=True, id=None),
 'sentence': Value(dtype='string', id=None)}

In [7]:
def prepare_dataset(example):
    """Turn one raw example into processor outputs plus its clip duration.

    Reads ``example["audio"]`` (its ``"array"`` and ``"sampling_rate"``
    entries) and ``example["sentence"]``, passes them to the module-level
    ``processor``, and stores the clip length in seconds under
    ``"input_length"`` on the processor's output before returning it.
    """
    clip = example["audio"]
    waveform, rate = clip["array"], clip["sampling_rate"]

    features = processor(audio=waveform, sampling_rate=rate, text=example["sentence"])

    # Length in seconds: number of samples divided by samples per second.
    features["input_length"] = len(waveform) / rate
    return features

In [8]:
# Run prepare_dataset over every example of every split in a single process.
# The columns listed for the train split are dropped, so only the values
# returned by prepare_dataset remain.
data_dict = data_dict.map(
    prepare_dataset, remove_columns=data_dict.column_names["train"], num_proc=1
)

Map:   0%|          | 0/240 [00:00<?, ? examples/s]

Map:   0%|          | 0/60 [00:00<?, ? examples/s]

In [9]:
# Upper bound, in seconds, applied to the "input_length" column by the filter.
max_input_length = 30.0


def is_audio_in_length_range(length):
    """Return True when ``length`` is strictly below ``max_input_length``."""
    return max_input_length > length

In [10]:
def count_samples(data_dict: DatasetDict) -> int:
    """Return the combined number of rows of the 'train' and 'test' splits."""
    return sum(data_dict[split].num_rows for split in ("train", "test"))

In [11]:
# Total rows over both splits before filtering.
before = count_samples(data_dict)
print(f"Before filter: {before}")

# Keep only train examples whose "input_length" passes
# is_audio_in_length_range; the test split is left unfiltered.
data_dict["train"] = data_dict["train"].filter(
    is_audio_in_length_range,
    input_columns=["input_length"],
)

# Total rows afterwards. In the recorded run both totals are 300, i.e. no
# train example was removed.
after = count_samples(data_dict)
print(f"After filter: {after}")

Before filter: 300


Filter:   0%|          | 0/240 [00:00<?, ? examples/s]

After filter: 300


In [12]:
from dataclasses import dataclass
from typing import List, Dict, Any, Union
import torch

@dataclass
class DataCollatorSpeechSeq2SeqWithPadding:
    """Collate pre-processed speech examples into one padded batch.

    Every feature must provide ``"input_features"`` (whose first element is the
    per-example feature array) and ``"labels"`` (token ids). Audio features and
    labels are padded separately because they have different lengths and use
    different padding routines.

    Attributes:
        processor: Object exposing ``feature_extractor.pad`` and
            ``tokenizer.pad``.
        decoder_start_token_id: Optional id of the token that is added again in
            front of the labels later on. When left as ``None`` it is looked up
            from the tokenizer as ``"<|startoftranscript|>"``.
    """

    processor: Any
    # Defaulted so existing ``DataCollatorSpeechSeq2SeqWithPadding(processor=...)``
    # calls keep working unchanged.
    decoder_start_token_id: Union[int, None] = None

    def _leading_token_ids(self) -> List[int]:
        """Return the token ids that should be cut from the front of the labels."""
        tokenizer = self.processor.tokenizer
        start_id = self.decoder_start_token_id
        if start_id is None:
            lookup = getattr(tokenizer, "convert_tokens_to_ids", None)
            if lookup is not None:
                start_id = lookup("<|startoftranscript|>")
                # An unknown token resolves to the unk id; never strip on that.
                if start_id == getattr(tokenizer, "unk_token_id", None):
                    start_id = None
        # The original bos check is kept, so everything that was stripped
        # before is still stripped.
        candidates = [tokenizer.bos_token_id, start_id]
        return [token_id for token_id in candidates if isinstance(token_id, int)]

    def __call__(
        self, features: List[Dict[str, Union[List[int], torch.Tensor]]]
    ) -> Dict[str, torch.Tensor]:
        """Pad audio features and labels and return them as one batch.

        Args:
            features: Per-example dicts with ``"input_features"`` and ``"labels"``.

        Returns:
            The padded feature batch with an added ``"labels"`` entry in which
            padded positions are set to -100.
        """
        # Audio inputs: take the first element of each example's features and
        # let the feature extractor return torch tensors.
        input_features = [
            {"input_features": feature["input_features"][0]} for feature in features
        ]
        batch = self.processor.feature_extractor.pad(input_features, return_tensors="pt")

        # Labels: pad the tokenized sequences to the longest one in the batch.
        label_features = [{"input_ids": feature["labels"]} for feature in features]
        labels_batch = self.processor.tokenizer.pad(label_features, return_tensors="pt")

        # Replace padding with -100 so those positions are ignored by the loss.
        labels = labels_batch["input_ids"].masked_fill(
            labels_batch.attention_mask.ne(1), -100
        )

        # If every sequence starts with the same start token, cut it here since
        # it is added again later. Whisper labels presumably start with
        # <|startoftranscript|> rather than the tokenizer's bos token, so both
        # are checked. Zero-width labels are left alone.
        if labels.shape[1] > 0:
            first_column = labels[:, 0]
            for token_id in self._leading_token_ids():
                if (first_column == token_id).all().cpu().item():
                    labels = labels[:, 1:]
                    break

        batch["labels"] = labels

        return batch

In [14]:
# Build the batch collator around the shared processor; it is handed to the
# trainer further down.
data_collator = DataCollatorSpeechSeq2SeqWithPadding(processor=processor)

In [15]:
import evaluate

# Load the "wer" metric from the evaluate library.
metric = evaluate.load("wer")

In [16]:
import evaluate

def _ids_to_decodable_rows(ids, pad_id):
    """Convert a batch of token ids to nested lists with -100 replaced by ``pad_id``."""
    # NOTE(review): predictions may arrive as a tuple whose first entry holds
    # the ids; confirm against the trainer in use.
    if isinstance(ids, tuple):
        ids = ids[0]
    if isinstance(ids, torch.Tensor):
        ids = ids.detach().cpu().tolist()
    elif hasattr(ids, "tolist"):
        # e.g. numpy arrays
        ids = ids.tolist()
    # Keep the batch structure (one row per example); -100 is not a real token
    # id and cannot be decoded.
    return [[pad_id if token == -100 else token for token in row] for row in ids]


def compute_metrics(pred):
    """Compute the word error rate, as a percentage, for one evaluation pass.

    Args:
        pred: Object with ``predictions`` and ``label_ids`` attributes holding
            one row of token ids per example (tensors, arrays or nested lists).

    Returns:
        ``{"wer": value}`` where ``value`` is 100 times the score returned by
        the module-level ``metric``.
    """
    pad_id = processor.tokenizer.pad_token_id

    # The collator marks ignored label positions with -100; swap them for the
    # pad token so that skip_special_tokens can drop them while decoding.
    pred_ids = _ids_to_decodable_rows(pred.predictions, pad_id)
    label_ids = _ids_to_decodable_rows(pred.label_ids, pad_id)

    # Decode predictions and references row by row.
    pred_str = processor.batch_decode(pred_ids, skip_special_tokens=True)
    label_str = processor.batch_decode(label_ids, skip_special_tokens=True)

    # The metric object is bound to the name `metric` when it is loaded.
    return {"wer": 100 * metric.compute(predictions=pred_str, references=label_str)}

In [17]:
from transformers import WhisperForConditionalGeneration

# Load the "openai/whisper-small" checkpoint as a conditional-generation model.
model = WhisperForConditionalGeneration.from_pretrained("openai/whisper-small")

In [18]:
# Move the model to the device selected in the first cell; the returned module
# is echoed as the cell output.
model.to(device)

WhisperForConditionalGeneration(
  (model): WhisperModel(
    (encoder): WhisperEncoder(
      (conv1): Conv1d(80, 768, kernel_size=(3,), stride=(1,), padding=(1,))
      (conv2): Conv1d(768, 768, kernel_size=(3,), stride=(2,), padding=(1,))
      (embed_positions): Embedding(1500, 768)
      (layers): ModuleList(
        (0-11): 12 x WhisperEncoderLayer(
          (self_attn): WhisperSdpaAttention(
            (k_proj): Linear(in_features=768, out_features=768, bias=False)
            (v_proj): Linear(in_features=768, out_features=768, bias=True)
            (q_proj): Linear(in_features=768, out_features=768, bias=True)
            (out_proj): Linear(in_features=768, out_features=768, bias=True)
          )
          (self_attn_layer_norm): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
          (activation_fn): GELUActivation()
          (fc1): Linear(in_features=768, out_features=3072, bias=True)
          (fc2): Linear(in_features=3072, out_features=768, bias=True)
        

In [19]:
from functools import partial

# Disable use_cache on the model config for training, since it is incompatible
# with gradient checkpointing.
model.config.use_cache = False

# Rebind model.generate so that every generation call defaults to French
# transcription with the cache switched back on.
# NOTE: re-running this cell wraps the already-wrapped callable in another partial.
model.generate = partial(
    model.generate, language="fr", task="transcribe", use_cache=True
)

In [23]:
from transformers import Seq2SeqTrainingArguments

# Training configuration: logging, evaluation and checkpointing all happen
# every 150 steps, at most two checkpoints are kept, and the best one is
# reloaded when training ends.
training_args = Seq2SeqTrainingArguments(
    output_dir="../models/fr",
    num_train_epochs=40,
    per_device_train_batch_size=4,
    warmup_steps=10,
    weight_decay=0.01,
    logging_dir='logs/',
    logging_steps=150,
    eval_strategy="steps",
    eval_steps=150,
    save_strategy="steps",
    save_steps=150,
    save_total_limit=2,
    load_best_model_at_end=True,
    # compute_metrics decodes token ids, so evaluation has to run generate()
    # instead of returning raw logits.
    predict_with_generate=True,
)

In [24]:
from transformers import Seq2SeqTrainer

# Wire the model, the train/test splits, the padding collator and the WER
# callback into one trainer.
# NOTE(review): the recorded output shows a warning raised at this call whose
# message was not captured. It is presumably a deprecation notice (possibly for
# the tokenizer= argument); confirm against the installed transformers version.
trainer = Seq2SeqTrainer(
    args=training_args,
    model=model,
    train_dataset=data_dict["train"],
    eval_dataset=data_dict["test"],
    data_collator=data_collator,
    compute_metrics=compute_metrics,
    tokenizer=processor,
)

  trainer = Seq2SeqTrainer(
Detected kernel version 4.18.0, which is below the recommended minimum of 5.5.0; this can cause the process to hang. It is recommended to upgrade the kernel to the minimum version or higher.


In [25]:
# Start the training run configured above. The recorded run ended with a
# KeyboardInterrupt before any step was logged.
trainer.train()



Step,Training Loss,Validation Loss


KeyboardInterrupt: 