In [1]:
from huggingface_hub import notebook_login

# Show the Hugging Face login widget. Later cells pass `use_auth_token=True`
# when loading the dataset and push the trained model to the Hub.
# NOTE(review): the final push recorded in this notebook failed with
# "Authorization error"; presumably the token entered here needs write
# access to the target repo — confirm before re-running.
notebook_login()

VBox(children=(HTML(value='<center> <img\nsrc=https://huggingface.co/front/assets/huggingface_logo-noborder.sv…

In [2]:
from datasets import load_dataset, DatasetDict

# Gather the train and test splits of the "vivos" dataset into a single
# DatasetDict keyed by split name, then show its summary.
vivos = DatasetDict(
    {
        split_name: load_dataset("vivos", split=split_name, use_auth_token=True)
        for split_name in ("train", "test")
    }
)

print(vivos)


Found cached dataset vivos (/home/tesla/.cache/huggingface/datasets/vivos/default/1.1.0/ab59078eb266c1a0ea856786ba56b5b8d56f29b42dfb37d92115cf81a7b1a5e0)
Found cached dataset vivos (/home/tesla/.cache/huggingface/datasets/vivos/default/1.1.0/ab59078eb266c1a0ea856786ba56b5b8d56f29b42dfb37d92115cf81a7b1a5e0)


DatasetDict({
    train: Dataset({
        features: ['speaker_id', 'path', 'audio', 'sentence'],
        num_rows: 11660
    })
    test: Dataset({
        features: ['speaker_id', 'path', 'audio', 'sentence'],
        num_rows: 760
    })
})


In [3]:
# Drop the two metadata columns; of the four features listed above only
# "audio" and "sentence" remain, which are the ones prepare_dataset reads.
vivos_clean = vivos.remove_columns(["speaker_id", "path"])

In [4]:
from transformers import WhisperFeatureExtractor

# Feature extractor of the "openai/whisper-base" checkpoint; prepare_dataset
# calls it on raw audio arrays to obtain `input_features`.
feature_extractor = WhisperFeatureExtractor.from_pretrained("openai/whisper-base")


In [5]:
from transformers import WhisperTokenizer

# Tokenizer of the same checkpoint, configured for Vietnamese transcription.
# The round-trip check in the next cell shows the special tokens it adds
# around each transcript.
tokenizer = WhisperTokenizer.from_pretrained("openai/whisper-base", language="Vietnamese", task="transcribe")


In [6]:
# Round-trip sanity check: encode the first training transcript, decode it
# back with and without special tokens, and report whether the plain decode
# matches the original text.
input_str = vivos_clean["train"][0]["sentence"]
labels = tokenizer(input_str).input_ids
decoded_with_special, decoded_str = (
    tokenizer.decode(labels, skip_special_tokens=skip) for skip in (False, True)
)

print(
    f"Input:                 {input_str}",
    f"Decoded w/ special:    {decoded_with_special}",
    f"Decoded w/out special: {decoded_str}",
    f"Are equal:             {input_str == decoded_str}",
    sep="\n",
)


Input:                 TÌNH YÊU THƯƠNG THẬT SỰ SỰ KIÊN TRÌ VÀ LÍ TƯỞNG TỐT ĐẸP NHẤT ĐỊNH SẼ CHIẾN THẮNG TẤT CẢ TRONG ĐÓ CÓ CẢ ĐÓI NGHÈO VÀ LẠC HẬU
Decoded w/ special:    <|startoftranscript|><|vi|><|transcribe|><|notimestamps|>TÌNH YÊU THƯƠNG THẬT SỰ SỰ KIÊN TRÌ VÀ LÍ TƯỞNG TỐT ĐẸP NHẤT ĐỊNH SẼ CHIẾN THẮNG TẤT CẢ TRONG ĐÓ CÓ CẢ ĐÓI NGHÈO VÀ LẠC HẬU<|endoftext|>
Decoded w/out special: TÌNH YÊU THƯƠNG THẬT SỰ SỰ KIÊN TRÌ VÀ LÍ TƯỞNG TỐT ĐẸP NHẤT ĐỊNH SẼ CHIẾN THẮNG TẤT CẢ TRONG ĐÓ CÓ CẢ ĐÓI NGHÈO VÀ LẠC HẬU
Are equal:             True


In [7]:
from transformers import WhisperProcessor

# Processor for the same checkpoint and language/task settings. The data
# collator defined below reads `.feature_extractor` and `.tokenizer` from it.
processor = WhisperProcessor.from_pretrained("openai/whisper-base", language="Vietnamese", task="transcribe")


In [8]:
from datasets import Audio

# Re-declare the "audio" column as Audio with a 16 kHz sampling rate.
# NOTE(review): presumably the audio is then decoded/resampled to 16 kHz when
# an example is accessed — confirm against the `datasets.Audio` docs.
vivos_clean = vivos_clean.cast_column("audio", Audio(sampling_rate=16000))

In [9]:
def prepare_dataset(batch):
    """Convert one raw example into the inputs and labels used for training.

    Reads ``batch["audio"]`` (a mapping with ``"array"`` and
    ``"sampling_rate"``) and ``batch["sentence"]``. Stores the first entry of
    the feature extractor's ``input_features`` under ``"input_features"`` and
    the tokenizer's ``input_ids`` under ``"labels"``.

    The same ``batch`` object is updated in place and returned. Uses the
    module-level ``feature_extractor`` and ``tokenizer``.
    """
    sample = batch["audio"]
    waveform, rate = sample["array"], sample["sampling_rate"]

    # Input side: features computed from the raw waveform.
    extracted = feature_extractor(waveform, sampling_rate=rate)
    batch["input_features"] = extracted.input_features[0]

    # Target side: token ids of the transcript.
    encoded = tokenizer(batch["sentence"])
    batch["labels"] = encoded.input_ids
    return batch


In [10]:
# Run prepare_dataset over every example of both splits with 4 worker
# processes. The columns present in the train split before mapping are
# removed, so only the columns prepare_dataset adds are kept.
vivos_clean = vivos_clean.map(prepare_dataset, remove_columns=vivos_clean.column_names["train"], num_proc=4)

Loading cached processed dataset at /home/tesla/.cache/huggingface/datasets/vivos/default/1.1.0/ab59078eb266c1a0ea856786ba56b5b8d56f29b42dfb37d92115cf81a7b1a5e0/cache-fc2a97bfe4f2483e_*_of_00004.arrow
Loading cached processed dataset at /home/tesla/.cache/huggingface/datasets/vivos/default/1.1.0/ab59078eb266c1a0ea856786ba56b5b8d56f29b42dfb37d92115cf81a7b1a5e0/cache-b7f6150f2727bae7_*_of_00004.arrow


In [11]:
import torch

from dataclasses import dataclass
from typing import Any, Dict, List, Union

@dataclass
class DataCollatorSpeechSeq2SeqWithPadding:
    """Collate per-example features into one padded training batch.

    ``processor`` must expose ``feature_extractor`` and ``tokenizer``
    attributes, each with a ``pad(items, return_tensors="pt")`` method; the
    tokenizer must also expose ``bos_token_id``.
    """

    processor: Any

    def __call__(self, features: List[Dict[str, Union[List[int], torch.Tensor]]]) -> Dict[str, torch.Tensor]:
        """Pad audio inputs and labels separately and merge them into one batch.

        Each element of ``features`` provides ``"input_features"`` and
        ``"labels"``. The returned batch is whatever the feature extractor's
        ``pad`` produced, with an extra ``"labels"`` entry in which padded
        positions are set to -100.
        """
        extractor = self.processor.feature_extractor
        label_tokenizer = self.processor.tokenizer

        # Inputs and labels differ in length and padding rules, so each side
        # goes through its own pad() call. Audio side first.
        audio_items = [{"input_features": example["input_features"]} for example in features]
        batch = extractor.pad(audio_items, return_tensors="pt")

        # Label side: pad the token-id sequences to a common length.
        label_items = [{"input_ids": example["labels"]} for example in features]
        padded_labels = label_tokenizer.pad(label_items, return_tensors="pt")

        # Positions whose attention mask is not 1 are padding; mark them with
        # -100 so they are ignored by the loss.
        is_padding = padded_labels.attention_mask != 1
        labels = padded_labels["input_ids"].masked_fill(is_padding, -100)

        # If every row already starts with the BOS token (added during
        # tokenization), strip that first column because it is added again later.
        every_row_starts_with_bos = bool((labels[:, 0] == label_tokenizer.bos_token_id).all())
        if every_row_starts_with_bos:
            labels = labels[:, 1:]

        batch["labels"] = labels
        return batch


In [12]:
# Collator instance built around the Whisper processor; it is handed to the
# trainer further down.
data_collator = DataCollatorSpeechSeq2SeqWithPadding(processor=processor)

In [13]:
import os

# Restrict this process to GPU index 0.
# NOTE(review): `torch` was already imported by an earlier cell; presumably
# this still takes effect as long as CUDA has not been initialised before this
# line runs — confirm, or move it to the top of the notebook.
os.environ["CUDA_VISIBLE_DEVICES"] = "0"  

Train


In [14]:
import evaluate

# "wer" metric loaded through `evaluate`; compute_metrics below calls its
# `compute(predictions=..., references=...)`.
metric = evaluate.load("wer")


def compute_metrics(pred):
    """Compute the word error rate, as a percentage, for one evaluation pass.

    Args:
        pred: object exposing ``predictions`` (predicted token ids) and
            ``label_ids`` (reference token ids in which ignored positions
            are marked with -100).

    Returns:
        A dict ``{"wer": value}`` where ``value`` is 100 times the score
        returned by the module-level ``metric``.

    ``pred.label_ids`` is left untouched: the -100 markers are replaced in a
    new array rather than in place, so the caller's labels are not silently
    rewritten. Uses the module-level ``tokenizer`` and ``metric``.
    """
    # Local import keeps this cell self-contained; numpy is not imported at
    # the top of the notebook.
    import numpy as np

    pred_ids = pred.predictions

    # -100 is not a real token id, so swap it for the pad token id before
    # decoding. np.where builds a fresh array instead of mutating the input.
    label_ids = np.where(pred.label_ids == -100, tokenizer.pad_token_id, pred.label_ids)

    # Decode both sides to text with special tokens skipped.
    # NOTE(review): presumably the substituted pad ids are dropped here as
    # special tokens — confirm for this tokenizer.
    pred_str = tokenizer.batch_decode(pred_ids, skip_special_tokens=True)
    label_str = tokenizer.batch_decode(label_ids, skip_special_tokens=True)

    wer = 100 * metric.compute(predictions=pred_str, references=label_str)

    return {"wer": wer}


In [15]:
from transformers import WhisperForConditionalGeneration

# Load the pretrained "openai/whisper-base" weights as the starting point for
# fine-tuning.
model = WhisperForConditionalGeneration.from_pretrained("openai/whisper-base")


In [16]:

# Clear two generation-related settings on the model config: no decoder ids
# are forced and the token-suppression list is emptied.
# NOTE(review): presumably this changes what `generate` does during
# evaluation (language/task are no longer forced) — confirm this is intended
# for the installed transformers version.
model.config.forced_decoder_ids = None
model.config.suppress_tokens = []

In [17]:
from transformers import Seq2SeqTrainingArguments

# Training configuration, grouped by purpose. Values are unchanged; keyword
# order does not matter to the constructor.
training_args = Seq2SeqTrainingArguments(
    # --- where results go ---
    output_dir="./whisper-base-vi",  # change to a repo name of your choice
    push_to_hub=True,
    report_to=["tensorboard"],
    # --- optimisation schedule ---
    learning_rate=1e-5,
    warmup_steps=500,
    max_steps=4000,
    # --- batch size and memory ---
    per_device_train_batch_size=16,
    gradient_accumulation_steps=1,  # double this each time the batch size is halved
    gradient_checkpointing=True,
    fp16=True,
    # --- evaluation with generation ---
    evaluation_strategy="steps",
    eval_steps=1000,
    per_device_eval_batch_size=8,
    predict_with_generate=True,
    generation_max_length=225,
    # --- logging and checkpoints ---
    logging_steps=25,
    save_steps=1000,
    # --- best-checkpoint selection: lowest "wer" wins ---
    load_best_model_at_end=True,
    metric_for_best_model="wer",
    greater_is_better=False,
)


In [18]:
from transformers import Seq2SeqTrainer

# Wire the model, the processed train/test splits, the padding collator and
# the WER callback into a Seq2SeqTrainer.
# With push_to_hub=True in the training arguments, constructing the trainer
# also touches the local clone of the Hub repo (see the message printed below).
trainer = Seq2SeqTrainer(
    args=training_args,
    model=model,
    train_dataset=vivos_clean["train"],
    eval_dataset=vivos_clean["test"],
    data_collator=data_collator,
    compute_metrics=compute_metrics,
    # The feature extractor (not the text tokenizer) is passed in the
    # `tokenizer` slot.
    # NOTE(review): presumably so its config is saved with checkpoints — confirm.
    tokenizer=processor.feature_extractor,
)


/media/tesla/New Volume1/DEMO/DUY/Vietnamese_ASR/./whisper-base-vi is already a clone of https://huggingface.co/DuyTa/whisper-base-vi. Make sure you pull the latest changes with `repo.git_pull()`.


In [19]:
import torch

# Print the currently allocated and peak allocated CUDA memory (in MB) for
# every device visible to this process.
device_count = torch.cuda.device_count()

for device in range(device_count):
    # The device index is passed explicitly to both queries, so there is no
    # need to switch the current device. The former bare
    # `torch.cuda.device(device)` call only built a context manager that was
    # never entered and therefore did nothing.
    allocated_memory = torch.cuda.memory_allocated(device)
    peak_memory = torch.cuda.max_memory_allocated(device)
    print(f"Device {device}:")
    print(f"  Currently allocated memory: {allocated_memory / 1024**2} MB")
    print(f"  Peak memory usage: {peak_memory / 1024**2} MB")



Device 0:
  Currently allocated memory: 277.625 MB
  Peak memory usage: 277.625 MB


In [20]:
# List the name and total memory (in MB) of every CUDA device visible to this
# process.
device_count = torch.cuda.device_count()

for device in range(device_count):
    properties = torch.cuda.get_device_properties(device)
    print(
        f"Device {device}:",
        f"  Name: {properties.name}",
        f"  Max Memory: {properties.total_memory / 1024**2} MB",
        sep="\n",
    )


Device 0:
  Name: Tesla T4
  Max Memory: 14966.375 MB


In [21]:
# Start fine-tuning. With the arguments above this runs for 4000 optimisation
# steps, evaluating and saving every 1000 steps and logging every 25.
trainer.train()



  0%|          | 0/4000 [00:00<?, ?it/s]

`use_cache = True` is incompatible with gradient checkpointing. Setting `use_cache = False`...


{'loss': 3.3127, 'learning_rate': 4.2000000000000006e-07, 'epoch': 0.03}
{'loss': 2.9406, 'learning_rate': 9.200000000000001e-07, 'epoch': 0.07}
{'loss': 2.4168, 'learning_rate': 1.42e-06, 'epoch': 0.1}
{'loss': 1.866, 'learning_rate': 1.9200000000000003e-06, 'epoch': 0.14}
{'loss': 1.4583, 'learning_rate': 2.42e-06, 'epoch': 0.17}
{'loss': 1.1891, 'learning_rate': 2.92e-06, 'epoch': 0.21}
{'loss': 1.0304, 'learning_rate': 3.4200000000000007e-06, 'epoch': 0.24}
{'loss': 0.8943, 'learning_rate': 3.920000000000001e-06, 'epoch': 0.27}
{'loss': 0.7826, 'learning_rate': 4.42e-06, 'epoch': 0.31}
{'loss': 0.733, 'learning_rate': 4.92e-06, 'epoch': 0.34}
{'loss': 0.652, 'learning_rate': 5.420000000000001e-06, 'epoch': 0.38}
{'loss': 0.6061, 'learning_rate': 5.92e-06, 'epoch': 0.41}
{'loss': 0.5485, 'learning_rate': 6.42e-06, 'epoch': 0.45}
{'loss': 0.5148, 'learning_rate': 6.92e-06, 'epoch': 0.48}
{'loss': 0.4782, 'learning_rate': 7.420000000000001e-06, 'epoch': 0.51}
{'loss': 0.4294, 'learnin

  0%|          | 0/95 [00:00<?, ?it/s]

{'eval_loss': 0.29494285583496094, 'eval_wer': 32.038332038332044, 'eval_runtime': 216.8264, 'eval_samples_per_second': 3.505, 'eval_steps_per_second': 0.438, 'epoch': 1.37}
{'loss': 0.2101, 'learning_rate': 8.511428571428571e-06, 'epoch': 1.41}
{'loss': 0.194, 'learning_rate': 8.44e-06, 'epoch': 1.44}
{'loss': 0.2033, 'learning_rate': 8.36857142857143e-06, 'epoch': 1.47}
{'loss': 0.2314, 'learning_rate': 8.297142857142859e-06, 'epoch': 1.51}
{'loss': 0.214, 'learning_rate': 8.225714285714288e-06, 'epoch': 1.54}
{'loss': 0.1909, 'learning_rate': 8.154285714285715e-06, 'epoch': 1.58}
{'loss': 0.2083, 'learning_rate': 8.082857142857144e-06, 'epoch': 1.61}
{'loss': 0.2016, 'learning_rate': 8.011428571428573e-06, 'epoch': 1.65}
{'loss': 0.1957, 'learning_rate': 7.94e-06, 'epoch': 1.68}
{'loss': 0.1874, 'learning_rate': 7.86857142857143e-06, 'epoch': 1.71}
{'loss': 0.1851, 'learning_rate': 7.797142857142858e-06, 'epoch': 1.75}
{'loss': 0.1925, 'learning_rate': 7.725714285714286e-06, 'epoch'

  0%|          | 0/95 [00:00<?, ?it/s]

{'eval_loss': 0.25480595231056213, 'eval_wer': 26.85832685832686, 'eval_runtime': 213.9548, 'eval_samples_per_second': 3.552, 'eval_steps_per_second': 0.444, 'epoch': 2.74}


Several commits (2) will be pushed upstream.


{'loss': 0.1212, 'learning_rate': 5.654285714285714e-06, 'epoch': 2.78}
{'loss': 0.1435, 'learning_rate': 5.582857142857143e-06, 'epoch': 2.81}
{'loss': 0.1351, 'learning_rate': 5.511428571428572e-06, 'epoch': 2.85}
{'loss': 0.128, 'learning_rate': 5.4400000000000004e-06, 'epoch': 2.88}
{'loss': 0.1225, 'learning_rate': 5.368571428571429e-06, 'epoch': 2.91}
{'loss': 0.1394, 'learning_rate': 5.297142857142858e-06, 'epoch': 2.95}
{'loss': 0.13, 'learning_rate': 5.225714285714286e-06, 'epoch': 2.98}
{'loss': 0.1185, 'learning_rate': 5.154285714285715e-06, 'epoch': 3.02}
{'loss': 0.0865, 'learning_rate': 5.082857142857144e-06, 'epoch': 3.05}
{'loss': 0.1001, 'learning_rate': 5.011428571428571e-06, 'epoch': 3.09}
{'loss': 0.0945, 'learning_rate': 4.94e-06, 'epoch': 3.12}
{'loss': 0.0959, 'learning_rate': 4.868571428571429e-06, 'epoch': 3.16}
{'loss': 0.0893, 'learning_rate': 4.797142857142857e-06, 'epoch': 3.19}
{'loss': 0.0925, 'learning_rate': 4.725714285714286e-06, 'epoch': 3.22}
{'loss'

  0%|          | 0/95 [00:00<?, ?it/s]

{'eval_loss': 0.254891961812973, 'eval_wer': 25.343175343175346, 'eval_runtime': 212.1705, 'eval_samples_per_second': 3.582, 'eval_steps_per_second': 0.448, 'epoch': 4.12}


Several commits (3) will be pushed upstream.


{'loss': 0.0785, 'learning_rate': 2.797142857142857e-06, 'epoch': 4.15}
{'loss': 0.0732, 'learning_rate': 2.725714285714286e-06, 'epoch': 4.18}
{'loss': 0.0749, 'learning_rate': 2.654285714285714e-06, 'epoch': 4.22}
{'loss': 0.0742, 'learning_rate': 2.582857142857143e-06, 'epoch': 4.25}
{'loss': 0.0766, 'learning_rate': 2.5114285714285718e-06, 'epoch': 4.29}
{'loss': 0.0746, 'learning_rate': 2.4400000000000004e-06, 'epoch': 4.32}
{'loss': 0.08, 'learning_rate': 2.3685714285714285e-06, 'epoch': 4.36}
{'loss': 0.0778, 'learning_rate': 2.297142857142857e-06, 'epoch': 4.39}
{'loss': 0.0677, 'learning_rate': 2.2257142857142857e-06, 'epoch': 4.42}
{'loss': 0.0818, 'learning_rate': 2.1542857142857147e-06, 'epoch': 4.46}
{'loss': 0.0733, 'learning_rate': 2.0828571428571433e-06, 'epoch': 4.49}
{'loss': 0.0807, 'learning_rate': 2.0114285714285715e-06, 'epoch': 4.53}
{'loss': 0.0811, 'learning_rate': 1.94e-06, 'epoch': 4.56}
{'loss': 0.0671, 'learning_rate': 1.8685714285714289e-06, 'epoch': 4.6}


  0%|          | 0/95 [00:00<?, ?it/s]

{'eval_loss': 0.2565304934978485, 'eval_wer': 25.058275058275058, 'eval_runtime': 212.5224, 'eval_samples_per_second': 3.576, 'eval_steps_per_second': 0.447, 'epoch': 5.49}


Several commits (4) will be pushed upstream.


{'train_runtime': 12408.1607, 'train_samples_per_second': 5.158, 'train_steps_per_second': 0.322, 'train_loss': 0.25067593112587927, 'epoch': 5.49}


TrainOutput(global_step=4000, training_loss=0.25067593112587927, metrics={'train_runtime': 12408.1607, 'train_samples_per_second': 5.158, 'train_steps_per_second': 0.322, 'train_loss': 0.25067593112587927, 'epoch': 5.49})

In [22]:
# Metadata forwarded to `trainer.push_to_hub` for the generated model card.
kwargs = {
    "dataset_tags": "vivos",
    "dataset": "Vivos",  # a 'pretty' name for the training dataset
    # Model-card `language` metadata is expected to be an ISO 639 code, not an
    # English language name, so "vi" is used rather than "Vietnamese".
    "language": "vi",
    "model_name": "Whisper Base Vi - Duy Ta",  # a 'pretty' name for your model
    "finetuned_from": "openai/whisper-base",
    "tasks": "automatic-speech-recognition",
    "tags": "hf-asr-leaderboard",
}


In [23]:
# Push the trained model to the Hub, passing the model-card metadata defined
# in `kwargs`.
# NOTE(review): the recorded run failed here with "Authorization error";
# presumably the stored token lacks write access to the target repo — confirm
# and log in again with a write-scoped token before retrying.
trainer.push_to_hub(**kwargs)

Several commits (5) will be pushed upstream.
The progress bars may be unreliable.
batch response: Authorization error.
error: failed to push some refs to 'https://huggingface.co/DuyTa/whisper-base-vi'



OSError: batch response: Authorization error.
error: failed to push some refs to 'https://huggingface.co/DuyTa/whisper-base-vi'
