# Wav2vec model using mozilla commonvoice [EN] dataset

### Loading data

In [1]:
import os
import re
import random
import json
import pandas as pd
from datasets import load_dataset
import IPython.display as ipd
from IPython.display import HTML

#### Load audio dataset

In [2]:
cv_corpus_dataset = load_dataset("csv", data_files="../../data/cv-corpus-13.0-delta-2023-03-09/en/validated.tsv", sep="\t", header=0)
ds_train_test = cv_corpus_dataset["train"].train_test_split(test_size=0.2)
ds_train_test = ds_train_test.remove_columns(["client_id" ,"up_votes", "down_votes", "age", "gender", "accents", "variant", "locale", "segment"])

In [3]:
ds_train_test

DatasetDict({
    train: Dataset({
        features: ['path', 'sentence'],
        num_rows: 5586
    })
    test: Dataset({
        features: ['path', 'sentence'],
        num_rows: 1397
    })
})

#### Convert text to lowercase

In [4]:
def show_random_elements(dataset, num_examples=10):
    assert num_examples <= len(dataset), "Can't pick more elements than there are in the dataset."
    picks = []
    for _ in range(num_examples):
        pick = random.randint(0, len(dataset)-1)
        while pick in picks:
            pick = random.randint(0, len(dataset)-1)
        picks.append(pick)
    
    df = pd.DataFrame(dataset[picks])
    ipd.display(HTML(df.to_html()))

show_random_elements(ds_train_test["train"].remove_columns(["path"]))

Unnamed: 0,sentence
0,"And another album, ""Back in Love Again""."
1,Radcliff has two public secondary schools within its city limits.
2,Sessions form the bulk of the activity at the Congress.
3,This provided more passenger seating and comfort.
4,A portion of the feast is set aside for them.
5,"He picked up four podiums, but no win."
6,They are gendered female.
7,There are several benefits to this technique.
8,He must abstain from every other act that involves personal impurity.
9,This man is the most powerful person in the village.


#### Remove special characters

In [5]:
chars_to_ignore_regex = r'[()“”‘’",.;:?!‑–—\-]'

def remove_special_characters(batch):
    batch["sentence"] = re.sub(chars_to_ignore_regex, '', batch["sentence"]).lower()
    return batch

ds_train_test = ds_train_test.map(remove_special_characters)

Map:   0%|          | 0/5586 [00:00<?, ? examples/s]

Map:   0%|          | 0/1397 [00:00<?, ? examples/s]

In [6]:
show_random_elements(ds_train_test["train"].remove_columns(["path"]))

Unnamed: 0,sentence
0,bruno mathsson and dan ihreborn are well known for their furniture designs
1,conjecturally in each of the low temperature extremal states the truncated correlations decay algebraically
2,any member of the community may try for a spot on the schedule
3,only one single rise was officially released and had a music video
4,the show failed terribly and was cancelled after only four episodes
5,outgassing can occur from two sources surfaces and bulk materials
6,she is horrified
7,licensed to sault ste
8,it would be very prejudicial to his dignity and holiness to touch the ground
9,there have been four reported stellar occultation events by irene


#### Extract all characters

In [7]:
def extract_all_chars(batch):
    all_text = " ".join(batch["sentence"])
    vocab = list(set(all_text))
    return {"vocab": [vocab], "all_text": [all_text]}


def extract_all_chars_to_json(dataset):
    vocabs = dataset.map(extract_all_chars, batched=True, batch_size=-1, keep_in_memory=True, remove_columns=dataset.column_names["train"])
    vocab_list = list(set(vocabs["train"]["vocab"][0]) | set(vocabs["test"]["vocab"][0]))
    vocab_list.sort()
    vocab_dict = {v: k for k, v in enumerate(vocab_list)}
    vocab_dict["|"] = vocab_dict[" "]
    del vocab_dict[" "]
    vocab_dict["[UNK]"] = len(vocab_dict)
    vocab_dict["[PAD]"] = len(vocab_dict)
    print("Number of characters:", len(vocab_dict))
    print("Characters:", vocab_dict.keys())
    with open("../../modules/models/vocabs/wav2vec_cvcorpus_en.json", 'w') as vocab_file:
        json.dump(vocab_dict, vocab_file)

extract_all_chars_to_json(ds_train_test)

Map:   0%|          | 0/5586 [00:00<?, ? examples/s]

Map:   0%|          | 0/1397 [00:00<?, ? examples/s]

Number of characters: 30
Characters: dict_keys(["'", 'a', 'b', 'c', 'd', 'e', 'f', 'g', 'h', 'i', 'j', 'k', 'l', 'm', 'n', 'o', 'p', 'q', 'r', 's', 't', 'u', 'v', 'w', 'x', 'y', 'z', '|', '[UNK]', '[PAD]'])


#### Convert file path to local absolute path

In [24]:
path_to_audio = os.path.abspath("../../data/cv-corpus-13.0-delta-2023-03-09/en/clips")
def abs_path_to_file(batch):
    batch["path"] = os.path.join(path_to_audio,batch["path"])
    return batch

ds_train_test = ds_train_test.map(abs_path_to_file)

Map:   0%|          | 0/5586 [00:00<?, ? examples/s]

Map:   0%|          | 0/1397 [00:00<?, ? examples/s]

### Set up processor

In [27]:
from transformers import Wav2Vec2CTCTokenizer
from transformers import Wav2Vec2FeatureExtractor
from transformers import Wav2Vec2Processor

#### Tokenizer

In [28]:
tokenizer = Wav2Vec2CTCTokenizer(
    "../../modules/models/vocabs/wav2vec_cvcorpus_en.json",
    unk_token="[UNK]", pad_token="[PAD]", word_delimiter_token="|")

#### Feature extractor

In [29]:
feature_extractor = Wav2Vec2FeatureExtractor(
    feature_size=1, sampling_rate=16000, 
    padding_value=0.0, do_normalize=True, 
    return_attention_mask=False)

#### Processor

In [30]:
processor = Wav2Vec2Processor(
    feature_extractor=feature_extractor, 
    tokenizer=tokenizer)

### Audio preview

In [34]:
import torch
import torchaudio
from torchaudio.functional import resample

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model_sample_rate = 16_000

#### Load random audio file from dataset

In [37]:
random_index = random.randint(0, len(ds_train_test["train"]) - 1)
speech_file = ds_train_test["train"]["path"][random_index]
print("Sentence:", ds_train_test["train"]["sentence"][random_index])
ipd.Audio(speech_file)

Sentence: the party leader in both periods was jeanpaul poulin


#### Resampling to match model sample rate

In [38]:
waveform, sample_rate = torchaudio.load(speech_file)
waveform = waveform.to(device)

if sample_rate != model_sample_rate :
    waveform = torchaudio.functional.resample(waveform, sample_rate, model_sample_rate)
ipd.Audio(data=waveform[0].tolist(), rate=model_sample_rate)

### Preprocessing

In [39]:
def prepare_dataset(batch):
    # Get path to audio file
    audio_path = batch["path"]

    # Convert audio to waveform
    waveform, sample_rate = torchaudio.load(audio_path)

    # Resample audio to match model's
    if sample_rate != model_sample_rate :
        waveform = torchaudio.functional.resample(waveform, sample_rate, model_sample_rate)
   
    # Pass audio data to preprocessor
    batch["input_values"] = processor(waveform[0].tolist(), sampling_rate=model_sample_rate).input_values[0]
    
    with processor.as_target_processor():
        batch["labels"] = processor(batch["sentence"]).input_ids
    return batch

In [40]:
ds_train_test = ds_train_test.map(prepare_dataset, remove_columns=ds_train_test.column_names["train"])

Map:   0%|          | 0/5586 [00:00<?, ? examples/s]



Map:   0%|          | 0/1397 [00:00<?, ? examples/s]

### Train the model

#### Data Collator CTC

In [42]:
from dataclasses import dataclass
from typing import Dict, List, Optional, Union

@dataclass
class DataCollatorCTCWithPadding:
    processor: Wav2Vec2Processor
    padding: Union[bool, str] = True
    max_length: Optional[int] = None
    max_length_labels: Optional[int] = None
    pad_to_multiple_of: Optional[int] = None
    pad_to_multiple_of_labels: Optional[int] = None

    def __call__(self, features: List[Dict[str, Union[List[int], torch.Tensor]]]) -> Dict[str, torch.Tensor]:
        # split inputs and labels since they have to be of different lengths and need
        # different padding methods
        input_features = [{"input_values": feature["input_values"]} for feature in features]
        label_features = [{"input_ids": feature["labels"]} for feature in features]

        batch = self.processor.pad(
            input_features,
            padding=self.padding,
            max_length=self.max_length,
            pad_to_multiple_of=self.pad_to_multiple_of,
            return_tensors="pt",
        )
        with self.processor.as_target_processor():
            labels_batch = self.processor.pad(
                label_features,
                padding=self.padding,
                max_length=self.max_length_labels,
                pad_to_multiple_of=self.pad_to_multiple_of_labels,
                return_tensors="pt",
            )

        # replace padding with -100 to ignore loss correctly
        labels = labels_batch["input_ids"].masked_fill(labels_batch.attention_mask.ne(1), -100)

        batch["labels"] = labels

        return batch

In [45]:
data_collator = DataCollatorCTCWithPadding(processor=processor, padding=True)

#### Word Error Rate metric

In [49]:
import numpy as np
import evaluate

wer_metric = evaluate.load("wer", trust_remote_code=True)

def compute_metrics(pred):
    pred_logits = pred.predictions
    pred_ids = np.argmax(pred_logits, axis=-1)
    pred.label_ids[pred.label_ids == -100] = processor.tokenizer.pad_token_id
    pred_str = processor.batch_decode(pred_ids)

    # we do not want to group tokens when computing the metrics
    label_str = processor.batch_decode(pred.label_ids, group_tokens=False)
    wer = wer_metric.compute(predictions=pred_str, references=label_str)
    return {"wer": wer}

In [29]:
from transformers import Wav2Vec2ForCTC

model = Wav2Vec2ForCTC.from_pretrained(
    "facebook/wav2vec2-base", 
    ctc_loss_reduction="mean", 
    pad_token_id=processor.tokenizer.pad_token_id,
)
model.freeze_feature_encoder()

Some weights of Wav2Vec2ForCTC were not initialized from the model checkpoint at facebook/wav2vec2-base and are newly initialized: ['lm_head.bias', 'lm_head.weight', 'wav2vec2.encoder.pos_conv_embed.conv.parametrizations.weight.original0', 'wav2vec2.encoder.pos_conv_embed.conv.parametrizations.weight.original1']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


#### Setting up Trainer

In [None]:
from transformers import TrainingArguments

training_args = TrainingArguments(
    output_dir="../models",
    group_by_length=True,
    per_device_train_batch_size=32,
    eval_strategy="steps",
    num_train_epochs=30,
    fp16=True,
    gradient_checkpointing=True, 
    save_steps=500,
    eval_steps=500,
    logging_steps=500,
    learning_rate=1e-4,
    weight_decay=0.005,
    warmup_steps=1000,
    save_total_limit=2,
)

In [32]:
from transformers import Trainer

trainer = Trainer(
    model=model,
    data_collator=data_collator,
    args=training_args,
    compute_metrics=compute_metrics,
    train_dataset=ds_train_test["train"],
    eval_dataset=ds_train_test["test"],
    tokenizer=processor.feature_extractor,
)

In [33]:
trainer.train()



Step,Training Loss,Validation Loss,Wer
500,4.1336,2.944428,1.0
1000,1.2733,0.547736,0.420211
1500,0.3584,0.474222,0.321155
2000,0.221,0.474642,0.294961
2500,0.16,0.494002,0.287172
3000,0.1226,0.522622,0.27504
3500,0.1015,0.528169,0.270628
4000,0.0826,0.570273,0.259668
4500,0.0732,0.55685,0.259875
5000,0.0637,0.566321,0.255394




TrainOutput(global_step=5250, training_loss=0.6305151594252814, metrics={'train_runtime': 29939.8589, 'train_samples_per_second': 5.597, 'train_steps_per_second': 0.175, 'total_flos': 8.496986566585836e+18, 'train_loss': 0.6305151594252814, 'epoch': 30.0})

#### Save model

In [35]:
model_dir_path = "../models/wav2vec_cvcorpus_en"
if not os.path.isdir(model_dir_path):
    os.mkdir(model_dir_path)
trainer.save_model("../models/wav2vec_cvcorpus_en")
processor.save_pretrained("../models/wav2vec_cvcorpus_en")

### Load trained model

In [None]:
from transformers import AutoModelForCTC, Wav2Vec2Processor

model = AutoModelForCTC.from_pretrained("../models/wav2vec_cvcorpus_en").to(device)
processor = Wav2Vec2Processor.from_pretrained("../models/wav2vec_cvcorpus_en")

In [None]:
def map_to_result(batch):
    with torch.no_grad():
        input_values = torch.tensor(batch["input_values"], device="cuda").unsqueeze(0)
        logits = model(input_values).logits
        pred_ids = torch.argmax(logits, dim=-1)
        batch["pred_str"] = processor.batch_decode(pred_ids)[0]
        batch["text"] = processor.decode(batch["labels"], group_tokens=False)
    
    return batch

results = ds_train_test["test"].map(map_to_result, remove_columns=ds_train_test["test"].column_names)

In [None]:
print(f"Test WER: {wer_metric.compute(predictions=results["pred_str"], references=results["text"]):.3f}")

In [None]:
show_random_elements(results)