# Prepare Environment

In [1]:
gpu_info = !nvidia-smi
gpu_info = '\n'.join(gpu_info)
if gpu_info.find('failed') >= 0:
  print('Not connected to a GPU')
else:
  print(gpu_info)

Sun May 19 14:06:14 2024       
+---------------------------------------------------------------------------------------+
| NVIDIA-SMI 535.104.05             Driver Version: 535.104.05   CUDA Version: 12.2     |
|-----------------------------------------+----------------------+----------------------+
| GPU  Name                 Persistence-M | Bus-Id        Disp.A | Volatile Uncorr. ECC |
| Fan  Temp   Perf          Pwr:Usage/Cap |         Memory-Usage | GPU-Util  Compute M. |
|                                         |                      |               MIG M. |
|   0  Tesla T4                       Off | 00000000:00:04.0 Off |                    0 |
| N/A   56C    P8              10W /  70W |      0MiB / 15360MiB |      0%      Default |
|                                         |                      |                  N/A |
+-----------------------------------------+----------------------+----------------------+
                                                                    

# Install and Import Dependencies

In [2]:
# !pip install --upgrade --quiet pip
!pip install --upgrade --quiet datasets[audio] transformers accelerate evaluate jiwer tensorboard gradio

[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m542.0/542.0 kB[0m [31m7.1 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m9.1/9.1 MB[0m [31m60.3 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m302.6/302.6 kB[0m [31m15.1 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m84.1/84.1 kB[0m [31m9.2 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m5.5/5.5 MB[0m [31m60.0 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m12.3/12.3 MB[0m [31m36.5 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m116.3/116.3 kB[0m [31m12.9 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m194.1/194.1 kB[0m [31m18.8 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━

In [36]:
from huggingface_hub import notebook_login

notebook_login()

VBox(children=(HTML(value='<center> <img\nsrc=https://huggingface.co/front/assets/huggingface_logo-noborder.sv…

# Import Library

In [4]:
import os
import nltk
import torch
import librosa
import warnings
import evaluate
import numpy as np
import librosa.display
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt

from transformers import AutoConfig
from dataclasses import dataclass
from datasets import load_dataset, Audio
from typing import Any, Dict, List, Union
from transformers import WhisperForConditionalGeneration
from transformers import AutoConfig, WhisperTokenizerFast
from transformers import Seq2SeqTrainingArguments, Seq2SeqTrainer
from transformers import WhisperFeatureExtractor, WhisperTokenizer, WhisperProcessor

warnings.filterwarnings('ignore')
nltk.download('punkt')

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.


True

In [5]:
curr_path = os.getcwd()
print(curr_path)

/content


# Load Dataset

In [7]:
minds_14 = load_dataset("PolyAI/minds14", "en-US")
# to download all data for multi-lingual fine-tuning uncomment following line

Downloading builder script:   0%|          | 0.00/5.90k [00:00<?, ?B/s]

Downloading readme:   0%|          | 0.00/5.29k [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/471M [00:00<?, ?B/s]

Generating train split: 0 examples [00:00, ? examples/s]

In [8]:
# see structure
print(minds_14)

DatasetDict({
    train: Dataset({
        features: ['path', 'audio', 'transcription', 'english_transcription', 'intent_class', 'lang_id'],
        num_rows: 563
    })
})


# Dataset & Model Preparation

In [9]:
minds_14 = minds_14['train']
only_us_eng = minds_14.remove_columns(['path', 'english_transcription', 'intent_class', 'lang_id'])
only_us_eng = only_us_eng.train_test_split(test_size=0.2)
only_us_eng

## Load Model without Load the Weights

In [12]:
model = "openai/whisper-tiny"
lang = "english"

In [15]:
feature_extractor = WhisperFeatureExtractor.from_pretrained(model)

# Create a configuration object for Whisper model
config = AutoConfig.from_pretrained("openai/whisper-tiny")

# Manually set any additional configurations if necessary
config.language = "English"
config.task = "transcribe"

# Initialize the tokenizer
tokenizer = WhisperTokenizerFast.from_pretrained("openai/whisper-tiny", config=config, use_fast=True)

# Now you can use the tokenizer
text = "Hello, how are you?"
tokens = tokenizer(text)

print(tokens)

config.json:   0%|          | 0.00/1.98k [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/283k [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/836k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/2.48M [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/494k [00:00<?, ?B/s]

normalizer.json:   0%|          | 0.00/52.7k [00:00<?, ?B/s]

added_tokens.json:   0%|          | 0.00/34.6k [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/2.19k [00:00<?, ?B/s]

Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


{'input_ids': [50258, 50363, 15947, 11, 577, 366, 291, 30, 50257], 'attention_mask': [1, 1, 1, 1, 1, 1, 1, 1, 1]}


In [16]:
input_str = only_us_eng["train"][0]["transcription"]
labels = tokenizer(input_str).input_ids
decoded_with_special = tokenizer.decode(labels, skip_special_tokens=False)
decoded_str = tokenizer.decode(labels, skip_special_tokens=True)

print(f"Input:                 {input_str}")
print(f"Decoded w/ special:    {decoded_with_special}")
print(f"Decoded w/out special: {decoded_str}")
print(f"Are equal:             {input_str == decoded_str}")

Input:                 can you tell me how I can deposit money into my bank account
Decoded w/ special:    <|startoftranscript|><|notimestamps|>can you tell me how I can deposit money into my bank account<|endoftext|>
Decoded w/out special: can you tell me how I can deposit money into my bank account
Are equal:             True


In [18]:
processor = WhisperProcessor.from_pretrained(model, language=lang, task="transcribe")

preprocessor_config.json:   0%|          | 0.00/185k [00:00<?, ?B/s]

Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


## Dataset Preparation

In [19]:
print(only_us_eng["train"][0])

{'audio': {'path': '/root/.cache/huggingface/datasets/downloads/extracted/28aa727f91fee90575c34956bab09d1716cfaf460c6afcba86a10f04a7d58b83/en-US~CASH_DEPOSIT/602b9ab55f67b421554f6392.wav', 'array': array([0.        , 0.        , 0.00024414, ..., 0.        , 0.00024414,
       0.        ]), 'sampling_rate': 8000}, 'transcription': 'can you tell me how I can deposit money into my bank account'}


In [20]:
only_us_eng = only_us_eng.cast_column("audio", Audio(sampling_rate=16000))

In [21]:
only_us_eng["train"][0]

{'audio': {'path': '/root/.cache/huggingface/datasets/downloads/extracted/28aa727f91fee90575c34956bab09d1716cfaf460c6afcba86a10f04a7d58b83/en-US~CASH_DEPOSIT/602b9ab55f67b421554f6392.wav',
  'array': array([ 1.08841778e-05, -2.66862298e-05, -1.14246996e-05, ...,
          1.60163938e-04,  1.14277545e-05, -5.50200966e-05]),
  'sampling_rate': 16000},
 'transcription': 'can you tell me how I can deposit money into my bank account'}

In [22]:
def prepare_dataset(batch):
    audio = batch["audio"]

    batch["input_features"] = feature_extractor(audio["array"], sampling_rate=audio["sampling_rate"]).input_features[0]

    batch["labels"] = tokenizer(batch["transcription"]).input_ids
    return batch

In [25]:
only_us_eng = only_us_eng.map(prepare_dataset)

Map:   0%|          | 0/450 [00:00<?, ? examples/s]

Map:   0%|          | 0/113 [00:00<?, ? examples/s]

In [26]:
only_us_eng

DatasetDict({
    train: Dataset({
        features: ['audio', 'transcription', 'input_features', 'labels'],
        num_rows: 450
    })
    test: Dataset({
        features: ['audio', 'transcription', 'input_features', 'labels'],
        num_rows: 113
    })
})

# Training from scratch

In [37]:
model_hf = WhisperForConditionalGeneration.from_pretrained(model)
model_hf.generation_config.language = "english"
model_hf.generation_config.task = "transcribe"

model_hf.generation_config.forced_decoder_ids = None

In [38]:
@dataclass
class DataCollatorSpeechSeq2SeqWithPadding:
    processor: Any
    decoder_start_token_id: int

    def __call__(self, features: List[Dict[str, Union[List[int], torch.Tensor]]]) -> Dict[str, torch.Tensor]:
        input_features = [{"input_features": feature["input_features"]} for feature in features]
        batch = self.processor.feature_extractor.pad(input_features, return_tensors="pt")

        label_features = [{"input_ids": feature["labels"]} for feature in features]
        labels_batch = self.processor.tokenizer.pad(label_features, return_tensors="pt")

        labels = labels_batch["input_ids"].masked_fill(labels_batch.attention_mask.ne(1), -100)

        if (labels[:, 0] == self.decoder_start_token_id).all().cpu().item():
            labels = labels[:, 1:]

        batch["labels"] = labels

        return batch

In [39]:
data_collator = DataCollatorSpeechSeq2SeqWithPadding(
    processor=processor,
    decoder_start_token_id=model_hf.config.decoder_start_token_id,
)

In [40]:
metric = evaluate.load("wer")

In [41]:
def compute_metrics(pred):
    pred_ids = pred.predictions
    label_ids = pred.label_ids

    label_ids[label_ids == -100] = tokenizer.pad_token_id

    pred_str = tokenizer.batch_decode(pred_ids, skip_special_tokens=True)
    label_str = tokenizer.batch_decode(label_ids, skip_special_tokens=True)

    wer = 100 * metric.compute(predictions=pred_str, references=label_str)

    return {"wer": wer}

In [45]:
training_args = Seq2SeqTrainingArguments(
    output_dir="./whisper-tiny-fine-tune",
    per_device_train_batch_size=16,
    gradient_accumulation_steps=1,
    learning_rate=1e-5,
    warmup_steps=500,
    max_steps=500,
    gradient_checkpointing=True,
    fp16=True,
    evaluation_strategy="steps",
    per_device_eval_batch_size=8,
    predict_with_generate=True,
    generation_max_length=225,
    save_steps=1000,
    eval_steps=1000,
    logging_steps=25,
    report_to=["tensorboard"],
    load_best_model_at_end=True,
    metric_for_best_model="wer",
    greater_is_better=False,
    push_to_hub=True,
)

In [46]:
trainer = Seq2SeqTrainer(
    args=training_args,
    model=model_hf,
    train_dataset=only_us_eng["train"],
    eval_dataset=only_us_eng["test"],
    data_collator=data_collator,
    compute_metrics=compute_metrics,
    tokenizer=processor.feature_extractor,
)

max_steps is given, it will override any value given in num_train_epochs


In [47]:
trainer.train()

Step,Training Loss,Validation Loss


TrainOutput(global_step=500, training_loss=1.1694563052654265, metrics={'train_runtime': 1275.6867, 'train_samples_per_second': 6.271, 'train_steps_per_second': 0.392, 'total_flos': 1.9109178630144e+17, 'train_loss': 1.1694563052654265, 'epoch': 17.24137931034483})

# Inferencing

In [50]:
from transformers import pipeline, set_seed

In [63]:
# Access the trained model
trained_model = trainer.model

# Save the trained model
trained_model.save_pretrained("./trained_whisper_model")
processor.save_pretrained("./trained_whisper_model")

model = WhisperForConditionalGeneration.from_pretrained("./trained_whisper_model")
processor = WhisperProcessor.from_pretrained("./trained_whisper_model")

# Create ASR pipeline
asr_pipeline = pipeline("automatic-speech-recognition", model=model, tokenizer=processor.tokenizer, feature_extractor=processor.feature_extractor)

# Perform inference
audio_path = minds_14['path'][0]
result = asr_pipeline(audio_path)

print(f"Transcription: {result['text']}")

Non-default generation parameters: {'max_length': 448, 'suppress_tokens': [1, 2, 7, 8, 9, 10, 14, 25, 26, 27, 28, 29, 31, 58, 59, 60, 61, 62, 63, 90, 91, 92, 93, 359, 503, 522, 542, 873, 893, 902, 918, 922, 931, 1350, 1853, 1982, 2460, 2627, 3246, 3253, 3268, 3536, 3846, 3961, 4183, 4667, 6585, 6647, 7273, 9061, 9383, 10428, 10929, 11938, 12033, 12331, 12562, 13793, 14157, 14635, 15265, 15618, 16553, 16604, 18362, 18956, 20075, 21675, 22520, 26130, 26161, 26435, 28279, 29464, 31650, 32302, 32470, 36865, 42863, 47425, 49870, 50254, 50258, 50358, 50359, 50360, 50361, 50362], 'begin_suppress_tokens': [220, 50257]}
Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


Transcription:  I would like to set up a joint account with my partner how do I proceed with doing that


In [65]:
minds_14['transcription'][0]

'I would like to set up a joint account with my partner'

In [71]:
# Perform inference
audio_path = only_us_eng['test']['audio'][0]
result = asr_pipeline(audio_path)

print(f"Predicted Transcription: {result['text']}")

Predicted Transcription:  I like to know how soon the large payment can go through after I use the SMS code


In [70]:
print(f"Real Transcription: {only_us_eng['test']['transcription'][0]}")

Real Transcription: I like to know how to send a large payment can go through after I use


# Evaluation

In [72]:
test_set = only_us_eng['test'].to_pandas()

In [87]:
predicted_transcription = list()
for idx in test_set.index:
  audio = only_us_eng['test']['audio'][idx]
  result = asr_pipeline(audio)
  predicted_transcription.append(result['text'])

In [75]:
import editdistance

def calculate_wer(predicted_text, reference_text):
    distance = editdistance.eval(predicted_text.split(), reference_text.split())
    wer = distance / len(reference_text.split())
    return wer

def calculate_cer(predicted_text, reference_text):
    distance = editdistance.eval(predicted_text, reference_text)
    cer = distance / len(reference_text)
    return cer

from nltk.translate.bleu_score import sentence_bleu
from nltk.tokenize import word_tokenize

def calculate_bleu_score(predicted_text, reference_text):
    predicted_tokens = word_tokenize(predicted_text)
    reference_tokens = word_tokenize(reference_text)
    bleu_score = sentence_bleu([reference_tokens], predicted_tokens)
    return bleu_score

In [88]:
test_set['predicted_transcription'] = predicted_transcription

In [89]:
data_metrics = test_set[['predicted_transcription', 'transcription']]

## Word Error Rate

In [90]:
data_metrics

Unnamed: 0,predicted_transcription,transcription
0,I like to know how soon the large payment can...,I like to know how to send a large payment can...
1,can you please show me my latest transactions,can you please tell me my latest transactions
2,TF is not working,the app is not working
3,how do I change my address,how do I change my address
4,I lost my card and need to freeze my account,I lost my card and need to freeze my account
...,...,...
108,I would like to make a bill payment,I would like to make a down payment
109,show my latest transactions,show my latest transactions
110,hi I received a new card on a banking card an...,is it a new card and banking card and the card...
111,how much money can I withdraw per day from ATM,how much money can I withdraw per day from the...


In [91]:
data_metrics['wer'] = data_metrics.apply(lambda row: calculate_wer(row['predicted_transcription'], row['transcription']), axis=1)
print("WER:")
print(np.mean(data_metrics['wer']))

WER:
0.29420617060016735


## Character Error Rate

In [93]:
data_metrics['cer'] = data_metrics.apply(lambda row: calculate_cer(row['predicted_transcription'], row['transcription']), axis=1)
print("\nCER:")
print(np.mean(data_metrics['cer']))


CER:
0.24799302180001284


## Bleu Score

In [94]:
data_metrics['bleu_score'] = data_metrics.apply(lambda row: calculate_bleu_score(row['predicted_transcription'], row['transcription']), axis=1)
print("\nBLEU Score:")
print(np.mean(data_metrics['bleu_score']))


BLEU Score:
0.6920112764455526


# Base Model No Training

In [95]:
# Access the trained model
model = "openai/whisper-tiny"
lang = "english"

feature_extractor = WhisperFeatureExtractor.from_pretrained(model)
tokenizer = WhisperTokenizer.from_pretrained(model, language=lang, task="transcribe")
processor = WhisperProcessor.from_pretrained(model, language=lang, task="transcribe")

# Create ASR pipeline
asr_pipeline = pipeline("automatic-speech-recognition", model=model, tokenizer= tokenizer, feature_extractor=feature_extractor)

# Perform inference
audio_path = minds_14['path'][0]
result = asr_pipeline(audio_path)

print(f"Transcription: {result['text']}")

Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.
Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.
Due to a bug fix in https://github.com/huggingface/transformers/pull/28687 transcription using a multilingual Whisper will default to language detection followed by transcription instead of translation to English.This might be a breaking change for your use case. If you want to instead always translate your audio to English, make sure to pass `language='en'`.


Transcription:  I would like to set up a joint account with my partner. How do I proceed with doing that?


In [96]:
predicted_transcription = list()
for idx in test_set.index:
  audio = only_us_eng['test']['audio'][idx]
  result = asr_pipeline(audio)
  predicted_transcription.append(result['text'])

In [97]:
test_set['predicted_transcription_no_training'] = predicted_transcription
data_metrics = test_set[['predicted_transcription_no_training', 'transcription']]

In [98]:
data_metrics['wer'] = data_metrics.apply(lambda row: calculate_wer(row['predicted_transcription_no_training'], row['transcription']), axis=1)
print("WER:")
print(np.mean(data_metrics['wer']))

WER:
0.5132905390834379


In [99]:
data_metrics['cer'] = data_metrics.apply(lambda row: calculate_cer(row['predicted_transcription_no_training'], row['transcription']), axis=1)
print("\nCER:")
print(np.mean(data_metrics['cer']))


CER:
0.35944764376067057


In [100]:
data_metrics['bleu_score'] = data_metrics.apply(lambda row: calculate_bleu_score(row['predicted_transcription_no_training'], row['transcription']), axis=1)
print("\nBLEU Score:")
print(np.mean(data_metrics['bleu_score']))


BLEU Score:
0.5207995817748612
