In [None]:
# import os, sys 
# #to be able to interact with Google Drive's operating system
# from google.colab import drive 
# #drive is a module that allows us to use Python to interact with Google Drive
# drive.mount('/content/gdrive') 
# #mounting google drive allows us to work with its contents
# nb_path = '/content/notebooks'
# os.symlink('/content/gdrive/My Drive/Colab Notebooks', nb_path)
# sys.path.insert(0, nb_path)  # or append(nb_path)
# #The last three lines are what changes the path of the file.

In [None]:
%pip install transformers datasets evaluate jiwer torch-audiomentations

In [None]:
# %cat /proc/cpuinfo

In [None]:
from huggingface_hub import notebook_login

# Authenticate this notebook session with the Hugging Face Hub
# (presumably required for the `push_to_hub` calls further down — confirm).
notebook_login()

In [None]:
# Identifier of the pretrained checkpoint handed to the `from_pretrained` calls below.
model_path = "facebook/wav2vec2-base-960h"
# Local directory used as the Trainer's `output_dir`.
my_model_path = "./my_ASR_model"

# Number of worker processes passed to the `Dataset.map(..., num_proc=...)` calls.
num_proc = 4

## Load dataset

In [None]:
from datasets import load_dataset, Audio

# Load only the first 100 rows of the en-US configuration of MInDS-14.
minds = load_dataset("PolyAI/minds14", name="en-US", split="train[:100]")
# Hold out 20% of the rows as the "test" split. The fixed seed makes the
# train/test membership identical on every Restart & Run All.
minds = minds.train_test_split(test_size=0.2, seed=42)

In [None]:
minds

## Preprocess

In [None]:
# Drop metadata columns that no later cell reads (only "audio" and
# "transcription" are used from here on).
# NOTE(review): this cell reassigns `minds`, so re-running it asks for columns
# that have already been removed — presumably an error; confirm.
minds = minds.remove_columns(["english_transcription", "intent_class", "lang_id"])

In [None]:
minds["train"][0]

### resampling

In [None]:
# Re-declare the audio column at 16 kHz, the same rate the augmentation
# transforms below are configured with.
minds = minds.cast_column("audio", Audio(sampling_rate=16_000))
# Display one training example after the cast.
minds["train"][0]

### data augmentation

In [None]:
import numpy as np
from torch_audiomentations import (
    Compose, Gain, PitchShift, AddColoredNoise )
from torch import tensor

# Each augmentation lives in its own single-transform Compose so the three
# stages can be called one after another in `apply_augment_dataset`.

# Volume change between -18 dB and +6 dB, configured with p=0.4.
_gain_transform = Gain(p=0.4, min_gain_in_db=-18.0, max_gain_in_db=6.0)
apply_gain = Compose(transforms=[_gain_transform])

# Pitch shift of up to four semitones in either direction, configured with p=0.4.
_pitch_transform = PitchShift(
    p=0.4,
    min_transpose_semitones=-4,
    max_transpose_semitones=4,
    sample_rate=16000,
)
apply_pitch = Compose(transforms=[_pitch_transform])

# Coloured noise at a signal-to-noise ratio between 8 dB and 10 dB, configured with p=0.6.
_noise_transform = AddColoredNoise(
    p=0.6,
    min_snr_in_db=8,
    max_snr_in_db=10,
    sample_rate=16000,
)
apply_noise = Compose(transforms=[_noise_transform])

def apply_augment_dataset(batch):
    """Apply gain, pitch-shift and coloured-noise augmentation to one example.

    Args:
        batch: A single dataset example whose ``batch["audio"]["array"]`` holds
            the mono waveform as a 1-D array.

    Returns:
        The same ``batch`` object with ``batch["audio"]["array"]`` replaced by
        the augmented waveform as a 1-D float32 NumPy array.
    """
    # Force float32 so the transforms always see the same dtype, regardless of
    # the dtype the dataset decodes audio to.
    waveform = np.asarray(batch["audio"]["array"], dtype=np.float32)
    # The transforms are called with a (batch, channels, samples) tensor.
    audio = tensor(waveform[np.newaxis, np.newaxis, :])

    for transform in (apply_gain, apply_pitch, apply_noise):
        result = transform(audio)
        # Depending on the installed torch-audiomentations version / its
        # `output_type` setting, a transform returns either a tensor or a
        # dict-like object carrying the tensor under `samples`. Unwrap it so
        # the next stage always receives a tensor.
        audio = result if isinstance(result, type(audio)) else result.samples

    batch["audio"]["array"] = audio[0, 0, :].detach().cpu().numpy()
    return batch


# NOTE(review): `augmented_minds` is not read by any later cell in this
# notebook (the encoding step maps over `minds`), and this call augments every
# split, including "test". Confirm whether training was meant to use it.
augmented_minds = minds.map(apply_augment_dataset, num_proc=num_proc)

In [None]:
from transformers import AutoProcessor, AutoTokenizer

# Load the processor (feature extractor + tokenizer) that belongs to the checkpoint.
# `num_proc` is a `Dataset.map` option, not a processor setting, so it is not
# forwarded to `from_pretrained`.
processor = AutoProcessor.from_pretrained(model_path)
tokenizer = processor.tokenizer

In [None]:
processor

In [None]:
processor.tokenizer

In [None]:
# Write the tokenizer files to a local directory.
# NOTE(review): this directory is separate from `my_model_path`, the Trainer's
# output_dir, and is not read again in this notebook.
pt_save_directory = "./pt_save_pretrained"
tokenizer.save_pretrained(pt_save_directory)

In [None]:
def uppercase(example):
    """Return the example's transcription converted to upper case.

    Only the ``"transcription"`` key is returned, as a one-item dict.
    """
    original_text = example["transcription"]
    return dict(transcription=original_text.upper())


# Replace every transcription with its upper-cased form
# (presumably to match the casing of the tokenizer's vocabulary — confirm).
minds = minds.map(uppercase)

In [None]:
def prepare_dataset(batch):
    """Turn one raw example into the processor's output.

    The processor receives the waveform with its sampling rate plus the
    transcription text. The length of the first ``input_values`` entry is
    stored next to its output under ``input_length``.
    """
    clip = batch["audio"]
    encoded = processor(
        clip["array"],
        sampling_rate=clip["sampling_rate"],
        text=batch["transcription"],
    )
    encoded["input_length"] = len(encoded["input_values"][0])
    return encoded


# Encode every example and drop the original columns, so only the fields
# returned by `prepare_dataset` remain.
encoded_minds = minds.map(
    prepare_dataset,
    remove_columns=minds.column_names["train"],
    num_proc=num_proc,
)

In [None]:
encoded_minds

In [None]:
import torch

from dataclasses import dataclass, field
from typing import Any, Dict, List, Optional, Union


@dataclass
class DataCollatorCTCWithPadding:
    """Pad a list of encoded examples into a single batch.

    Audio inputs and label ids are padded separately through
    ``processor.pad`` because they have different lengths; padded label
    positions are then overwritten with -100 so the loss ignores them.
    """

    # Processor whose `pad` method is used for both inputs and labels.
    processor: AutoProcessor
    # Padding strategy forwarded unchanged to `processor.pad`.
    padding: Union[bool, str] = "longest"

    def __call__(self, features: List[Dict[str, Union[List[int], torch.Tensor]]]) -> Dict[str, torch.Tensor]:
        audio_inputs = []
        label_inputs = []
        for example in features:
            # Only the first entry of each example's input_values is used.
            audio_inputs.append({"input_values": example["input_values"][0]})
            label_inputs.append({"input_ids": example["labels"]})

        batch = self.processor.pad(audio_inputs, padding=self.padding, return_tensors="pt")
        padded_labels = self.processor.pad(labels=label_inputs, padding=self.padding, return_tensors="pt")

        # Positions whose attention mask is not 1 are padding -> mark with -100.
        is_padding = padded_labels.attention_mask.ne(1)
        batch["labels"] = padded_labels["input_ids"].masked_fill(is_padding, -100)
        return batch

In [None]:
# Collator instance passed to the Trainer below as `data_collator`.
data_collator = DataCollatorCTCWithPadding(processor=processor, padding="longest")

## Evaluate

In [None]:
import evaluate

# Word-error-rate metric loaded through the `evaluate` library.
wer = evaluate.load("wer")


def compute_metrics(pred):
    """Compute the word error rate for one evaluation pass.

    Args:
        pred: Object exposing ``predictions`` (logits) and ``label_ids``.

    Returns:
        dict: ``{"wer": value}`` where ``value`` is what the WER metric returns
        for the decoded predictions against the decoded labels.
    """
    # Most likely token id at each position.
    pred_ids = np.argmax(pred.predictions, axis=-1)

    # The data collator marks padded label positions with -100; swap them for
    # the pad token id so the labels can be decoded. `np.where` builds a new
    # array, so the caller's `pred.label_ids` is left unmodified.
    label_ids = np.where(
        pred.label_ids == -100,
        processor.tokenizer.pad_token_id,
        pred.label_ids,
    )

    pred_str = processor.batch_decode(pred_ids)
    # Labels are decoded with group_tokens=False, unlike the predictions.
    label_str = processor.batch_decode(label_ids, group_tokens=False)

    return {"wer": wer.compute(predictions=pred_str, references=label_str)}

## Train

In [None]:
from transformers import AutoModelForCTC, TrainingArguments, Trainer, DistilBertConfig

# Load the pretrained CTC model.
# A model class (not a configuration class) has to be used here:
# `from_pretrained` on a config class returns a configuration object, which
# the Trainer cannot train.
model = AutoModelForCTC.from_pretrained(
    model_path,
    ctc_loss_reduction="mean",
    pad_token_id=processor.tokenizer.pad_token_id,
)
# Keep `config` defined for the inspection cell that displays it.
config = model.config

In [None]:
config

In [None]:
training_args = TrainingArguments(
    output_dir=my_model_path,
    # Batching: 16 examples per device, gradients accumulated over 2 steps.
    per_device_train_batch_size=16,
    per_device_eval_batch_size=16,
    gradient_accumulation_steps=2,
    gradient_checkpointing=True,
    fp16=True,
    # Optimisation schedule.
    learning_rate=1e-5,
    warmup_steps=500,
    max_steps=2000,
    # Logging / evaluation / checkpoint cadence (all in steps).
    logging_steps=25,
    evaluation_strategy="steps",
    eval_steps=100,
    save_steps=500,
    # `prepare_dataset` stores each example's length under "input_length";
    # point the length-grouping sampler at that column, since the default
    # column name it looks for is "length".
    group_by_length=True,
    length_column_name="input_length",
    # Keep the checkpoint with the lowest word error rate.
    load_best_model_at_end=True,
    metric_for_best_model="wer",
    greater_is_better=False,
    push_to_hub=True,
)

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=encoded_minds["train"],
    eval_dataset=encoded_minds["test"],
    # Pass the full processor rather than only its feature extractor, so that
    # saved checkpoints and the Hub upload contain the tokenizer files as well.
    # The inference pipeline below needs both to load the model by name.
    tokenizer=processor,
    data_collator=data_collator,
    compute_metrics=compute_metrics,
)

# Run fine-tuning, then upload the result to the Hugging Face Hub.
trainer.train()
trainer.push_to_hub()

## Inference

In [None]:
from datasets import load_dataset, Audio
from transformers import pipeline

# Train split of the "th" configuration of Common Voice 11.0, with the audio
# column declared at 16 kHz.
# NOTE(review): the checkpoint fine-tuned above was trained on en-US data,
# while "th" is presumably Thai — confirm this evaluation set is intended.
dataset = load_dataset(
    "mozilla-foundation/common_voice_11_0", "th", split="train"
).cast_column("audio", Audio(sampling_rate=16000))
sampling_rate = dataset.features["audio"].sampling_rate

# Speech-recognition pipeline loaded by Hub repository name.
transcriber = pipeline(task="automatic-speech-recognition", model="DylanonWic/my_ASR_model")

In [None]:
# Pick a random example; the upper bound follows the dataset size instead of
# a hard-coded row count.
index = np.random.randint(0, len(dataset))
# Fetch the row once and reuse it for both the waveform and the reference text.
sample = dataset[index]
audio_arr = sample["audio"]["array"].flatten()

# Common Voice keeps the reference text under "sentence"; fall back to
# "transcription" for datasets (such as MInDS-14) that use that column name.
reference = sample["sentence"] if "sentence" in sample else sample["transcription"]
print(f"ind:{index}\n {reference}")

In [None]:
from IPython import display
# Play the clip at the rate declared on the dataset's audio column instead of
# repeating the literal.
display.Audio(audio_arr, rate=sampling_rate)

In [None]:
# Transcribe the selected clip with the pipeline and display what it returns.
predict = transcriber(audio_arr)
predict