In [2]:
!pip install datasets
!pip install evaluate
!pip install hopsworks
!pip install jiwer
!pip install transformers[torch]
!pip install accelerate -U

Collecting datasets
  Downloading datasets-2.15.0-py3-none-any.whl (521 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m521.2/521.2 kB[0m [31m10.2 MB/s[0m eta [36m0:00:00[0m
Collecting pyarrow-hotfix (from datasets)
  Downloading pyarrow_hotfix-0.6-py3-none-any.whl (7.9 kB)
Collecting dill<0.3.8,>=0.3.0 (from datasets)
  Downloading dill-0.3.7-py3-none-any.whl (115 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m115.3/115.3 kB[0m [31m17.3 MB/s[0m eta [36m0:00:00[0m
Collecting multiprocess (from datasets)
  Downloading multiprocess-0.70.15-py310-none-any.whl (134 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m134.8/134.8 kB[0m [31m20.7 MB/s[0m eta [36m0:00:00[0m
Installing collected packages: pyarrow-hotfix, dill, multiprocess, datasets
Successfully installed datasets-2.15.0 dill-0.3.7 multiprocess-0.70.15 pyarrow-hotfix-0.6
Collecting evaluate
  Downloading evaluate-0.4.1-py3-none-any.whl (84 kB)
[2K     [90m

Collecting jiwer
  Downloading jiwer-3.0.3-py3-none-any.whl (21 kB)
Collecting rapidfuzz<4,>=3 (from jiwer)
  Downloading rapidfuzz-3.5.2-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (3.3 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m3.3/3.3 MB[0m [31m44.5 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: rapidfuzz, jiwer
Successfully installed jiwer-3.0.3 rapidfuzz-3.5.2
Collecting accelerate>=0.20.3 (from transformers[torch])
  Downloading accelerate-0.25.0-py3-none-any.whl (265 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m265.7/265.7 kB[0m [31m6.1 MB/s[0m eta [36m0:00:00[0m
Installing collected packages: accelerate
Successfully installed accelerate-0.25.0


In [3]:
import torch
import os
from dataclasses import dataclass
from typing import Any, Dict, List, Union
from datasets import DatasetDict, load_dataset
from transformers import WhisperProcessor
from transformers import WhisperTokenizer
from transformers import WhisperForConditionalGeneration
from transformers import Seq2SeqTrainingArguments
from transformers import Seq2SeqTrainer
import evaluate
import hopsworks
from huggingface_hub import login

In [4]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [5]:
with open("/content/drive/MyDrive/token.txt", "r") as file:
    token = file.readline().strip()

login(token=token, add_to_git_credential=True)
#project = hopsworks.login()

Token is valid (permission: write).
Your token has been saved in your configured git credential helpers (store).
Your token has been saved to /root/.cache/huggingface/token
Login successful


In [6]:
@dataclass
class DataCollatorSpeechSeq2SeqWithPadding:
    processor: Any
    def __call__(self, features: List[Dict[str, Union[List[int], torch.Tensor]]]) -> Dict[str, torch.Tensor]:
        # split inputs and labels since they have to be of different lengths and need different padding methods
        # first treat the audio inputs by simply returning torch tensors
        input_features = [{"input_features": feature["input_features"]} for feature in features]
        batch = self.processor.feature_extractor.pad(input_features, return_tensors="pt")

        # get the tokenized label sequences
        label_features = [{"input_ids": feature["labels"]} for feature in features]
        # pad the labels to max length
        labels_batch = self.processor.tokenizer.pad(label_features, return_tensors="pt")

        # replace padding with -100 to ignore loss correctly
        labels = labels_batch["input_ids"].masked_fill(labels_batch.attention_mask.ne(1), -100)

        # if bos token is appended in previous tokenization step,
        # cut bos token here as it's append later anyways
        if (labels[:, 0] == self.processor.tokenizer.bos_token_id).all().cpu().item():
            labels = labels[:, 1:]

        batch["labels"] = labels

        return batch

metric = evaluate.load("wer")
def compute_metrics(pred):
    pred_ids = pred.predictions
    label_ids = pred.label_ids

    # replace -100 with the pad_token_id
    label_ids[label_ids == -100] = tokenizer.pad_token_id

    # we do not want to group tokens when computing the metrics
    pred_str = tokenizer.batch_decode(pred_ids, skip_special_tokens=True)
    label_str = tokenizer.batch_decode(label_ids, skip_special_tokens=True)

    wer = 100 * metric.compute(predictions=pred_str, references=label_str)

    return {"wer": wer}

Downloading builder script:   0%|          | 0.00/4.49k [00:00<?, ?B/s]

In [7]:
# Hopsworks
# def createDirectoryIfNotExists(directory_name):
#     if not os.path.exists(directory_name):
#         os.mkdir(directory_name)
#
# createDirectoryIfNotExists("common_voice")
# createDirectoryIfNotExists("common_voice/train")
# createDirectoryIfNotExists("common_voice/test")
#
# dataset_api = project.get_dataset_api()
# common_voice = dataset_api.download(
#     "Lab1_ID2222_Training_Datasets/common_voice/train/data-00000-of-00001.arrow", local_path=os.path.dirname(os.getcwd())+"/training/common_voice/train/", overwrite=True)
# common_voice = dataset_api.download(
#     "Lab1_ID2222_Training_Datasets/common_voice/train/dataset_info.json", local_path=os.path.dirname(os.getcwd())+"/training/common_voice/train/", overwrite=True)
# common_voice = dataset_api.download(
#     "Lab1_ID2222_Training_Datasets/common_voice/train/state.json", local_path=os.path.dirname(os.getcwd())+"/training/common_voice/train/", overwrite=True)
#
# common_voice = dataset_api.download(
#     "Lab1_ID2222_Training_Datasets/common_voice/test/data-00000-of-00001.arrow", local_path=os.path.dirname(os.getcwd())+"/training/common_voice/test/", overwrite=True)
# common_voice = dataset_api.download(
#     "Lab1_ID2222_Training_Datasets/common_voice/test/dataset_info.json", local_path=os.path.dirname(os.getcwd())+"/training/common_voice/test/", overwrite=True)
# common_voice = dataset_api.download(
#     "Lab1_ID2222_Training_Datasets/common_voice/test/state.json", local_path=os.path.dirname(os.getcwd())+"/training/common_voice/test/", overwrite=True)
# common_voice = dataset_api.download(
#     "Lab1_ID2222_Training_Datasets/common_voice/dataset_dict.json", local_path=os.path.dirname(os.getcwd())+"/training/common_voice/", overwrite=True)
#
# common_voice = DatasetDict.load_from_disk("common_voice")

In [8]:
#HuggingFace
common_voice = DatasetDict()
common_voice["train"] = load_dataset("dacavi/hindi-dataset", split="train", use_auth_token=True)
common_voice["test"] = load_dataset("dacavi/hindi-dataset", split="test", use_auth_token=True)
print(common_voice)



Downloading readme:   0%|          | 0.00/472 [00:00<?, ?B/s]

Downloading data files:   0%|          | 0/2 [00:00<?, ?it/s]

Downloading data:   0%|          | 0.00/79.3M [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/74.4M [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/98.9M [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/102M [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/91.0M [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/71.8M [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/64.1M [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/67.4M [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/79.9M [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/78.3M [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/86.8M [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/81.7M [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/78.2M [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/85.6M [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/82.5M [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/74.7M [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/82.3M [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/76.9M [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/77.3M [00:00<?, ?B/s]

Extracting data files:   0%|          | 0/2 [00:00<?, ?it/s]

Generating train split:   0%|          | 0/6540 [00:00<?, ? examples/s]

Generating test split:   0%|          | 0/2894 [00:00<?, ? examples/s]

DatasetDict({
    train: Dataset({
        features: ['input_features', 'labels'],
        num_rows: 6540
    })
    test: Dataset({
        features: ['input_features', 'labels'],
        num_rows: 2894
    })
})


In [9]:
processor = WhisperProcessor.from_pretrained("openai/whisper-small", language="Hindi", task="transcribe")
tokenizer = WhisperTokenizer.from_pretrained("openai/whisper-small", language="Hindi", task="transcribe")
data_collator = DataCollatorSpeechSeq2SeqWithPadding(processor=processor)

preprocessor_config.json:   0%|          | 0.00/185k [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/805 [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/836k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/2.48M [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/494k [00:00<?, ?B/s]

normalizer.json:   0%|          | 0.00/52.7k [00:00<?, ?B/s]

added_tokens.json:   0%|          | 0.00/34.6k [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/2.08k [00:00<?, ?B/s]

In [10]:
model = WhisperForConditionalGeneration.from_pretrained("openai/whisper-small")
model.config.forced_decoder_ids = None
model.config.suppress_tokens = []


config.json:   0%|          | 0.00/1.97k [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/967M [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/3.84k [00:00<?, ?B/s]

In [14]:
training_args = Seq2SeqTrainingArguments(
    num_train_epochs=1,
    output_dir="dacavi/whisper-small-hi",  # change to a repo name of your choice
    per_device_train_batch_size=16,
    gradient_accumulation_steps=1,  # increase by 2x for every 2x decrease in batch size
    learning_rate=1e-5,
    warmup_steps=50,
    max_steps=1100,
    gradient_checkpointing=True,
    fp16=True,
    evaluation_strategy="steps",
    per_device_eval_batch_size=8,
    predict_with_generate=True,
    generation_max_length=225,
    save_strategy ="steps", #
    save_steps=100, #
    eval_steps=100,
    logging_steps=25,
    report_to=["tensorboard"],
    load_best_model_at_end=True,
    metric_for_best_model="wer",
    greater_is_better=False,
    push_to_hub=True,
    overwrite_output_dir=False, #
)

trainer = Seq2SeqTrainer(
    args=training_args,
    model=model,
    train_dataset=common_voice["train"],
    eval_dataset=common_voice["test"],
    data_collator=data_collator,
    compute_metrics=compute_metrics,
    tokenizer=processor.feature_extractor,

)

In [15]:
processor.save_pretrained(training_args.output_dir)

In [16]:
print("training is starting")
trainer.train(resume_from_checkpoint=True)

training is starting


ValueError: ignored

In [None]:
kwargs = {
    "dataset_tags": "mozilla-foundation/common_voice_11_0",
    "dataset": "Common Voice 11.0",  # a 'pretty' name for the training dataset
    "dataset_args": "config: hi, split: test",
    "language": "hi",
    "model_name": "Whisper Small Hi - Swedish",  # a 'pretty' name for our model
    "finetuned_from": "openai/whisper-small",
    "tasks": "automatic-speech-recognition",
    "tags": "hf-asr-leaderboard",
}

trainer.push_to_hub(**kwargs)

model.safetensors:   0%|          | 0.00/967M [00:00<?, ?B/s]

training_args.bin:   0%|          | 0.00/4.73k [00:00<?, ?B/s]

Upload 2 LFS files:   0%|          | 0/2 [00:00<?, ?it/s]