# Objective
In this notebook I am going to try to build a tool that can help me identify the flaws in my accent.
To do this, I'll need some labelled data: pairs of (voice recording, accent).

On HuggingFace I found this dataset https://huggingface.co/datasets/westbrook/English_Accent_DataSet which seems to be exactly what I need!

Now let's see if we can classify some clips!

In [1]:
import torch as t
import torch.nn as nn
import torch.nn.functional as F

import pandas as pd
import numpy as np
import random

# Run on the GPU whenever torch can see one, otherwise fall back to the CPU.
if t.cuda.is_available():
    device = 'cuda'
else:
    device = 'cpu'
print(f"using device: {device}")

# Reproducibility: seed every random number generator used in this notebook
# (numpy, the standard library and torch) with the same fixed value.
np.random.seed(42)
random.seed(42)
t.manual_seed(42)

# Release any cached, unused GPU memory before the models are loaded.
t.cuda.empty_cache()

using device: cuda


In [2]:
from datasets import load_dataset
from encodec import EncodecModel
from encodec.utils import convert_audio
from torch.utils.data import Dataset
from sklearn.preprocessing import LabelEncoder
from IPython.display import Audio
from functools import lru_cache

class EnglishAccentDataset(Dataset):
    """Torch dataset serving EnCodec token sequences together with accent labels.

    Rows come from the HuggingFace dataset ``westbrook/English_Accent_DataSet``.
    Every clip is encoded on the fly by a frozen 24 kHz EnCodec model that is
    created once, when the class body runs, and shared by all instances.
    """

    # List of accents, DO NOT CHANGE THE ORDER (taken from huggingface)
    accents = ['unknown', 'Dutch', 'German', 'Polish', 'French', 'Hungarian', 'Finnish', 'Romanian', 'Slovak', 'Spanish', 'Italian', 'Estonian', 'Lithuanian', 'Croatian', 'Slovene', 'English', 'Scottish', 'Irish', 'NorthernIrish', 'Indian', 'Vietnamese', 'Canadian', 'American']

    encodec = EncodecModel.encodec_model_24khz()
    encodec = encodec.to(device)
    encodec.eval()
    encodec.set_target_bandwidth(6.0)
    # Frozen Encodec Model
    for parameter in encodec.parameters():
        parameter.requires_grad_(False)

    def __init__(self, split = None):
        """Load one split of the dataset.

        Args:
            split: 'train', 'validation' or 'test'. ``None`` is forwarded to
                ``load_dataset`` unchanged.
        """
        super().__init__()
        if split:
            assert split in ['train', 'validation', 'test']
        self.hf_dataset = load_dataset("westbrook/English_Accent_DataSet", split=split).with_format('torch')
        # Per-instance cache filled lazily by get_class_weights().
        self._class_counts = None

    def __len__(self):
        """Return the number of rows in the underlying HuggingFace dataset."""
        return len(self.hf_dataset)

    def __getitem__(self, index):
        """Return ``(codes, target)`` for the clip at ``index``.

        ``codes`` is the first EnCodec frame's code tensor with its leading
        batch axis removed, moved to the CPU; ``target`` is the row's
        ``'accent'`` value as stored in the dataset.

        Raises:
            ValueError: if the clip contains no audio samples.
        """
        # Look the row up once instead of three times.
        row = self.hf_dataset[index]
        audio = row['audio']
        wav = audio['array']
        sr = audio['sampling_rate'].item()
        target = row['accent']

        # NOTE(review): the training run recorded in this notebook stopped with
        # "cannot reshape tensor of 0 elements"; a zero-length clip is a plausible
        # cause - confirm. Fail early with the offending index so it can be found.
        if wav.numel() == 0:
            raise ValueError(f"audio clip at index {index} is empty and cannot be encoded")

        with t.no_grad():
            wav = wav.unsqueeze(0).unsqueeze(0)
            wav = convert_audio(wav, sr, EnglishAccentDataset.encodec.sample_rate, EnglishAccentDataset.encodec.channels)
            wav = wav.to(device)
            frames = EnglishAccentDataset.encodec.encode(wav)
            codes = frames[0][0]
        # squeeze(0) removes only the batch axis; a bare squeeze() would also
        # drop the time axis whenever a clip encodes to a single frame.
        return codes.squeeze(0).cpu(), target

    def _labels(self):
        """Yield the integer accent label of every row, reading only the 'accent' column."""
        for row in self.hf_dataset.select_columns(['accent']):
            yield int(row['accent'])

    def get_class_weights(self):
        """Return a dict mapping accent name to the number of rows with that accent.

        Despite the name, the values are raw counts, not weights. The result is
        computed on first use and cached on the instance, so each split keeps its
        own cache; the same dict object is returned on later calls.
        """
        counts = getattr(self, '_class_counts', None)
        if counts is None:
            counts = {}
            for label in self._labels():
                accent = EnglishAccentDataset.get_accent_from_label(label)
                counts[accent] = counts.get(accent, 0) + 1
            self._class_counts = counts
        return counts

    def get_examples_from_class(self, accent):
        """Return the row indices whose label corresponds to the accent name ``accent``."""
        index = EnglishAccentDataset.get_label_from_accent(accent)
        return [i for i, label in enumerate(self._labels()) if label == index]

    @staticmethod
    def get_accent_from_label(label: int):
        """Translate an integer label into its accent name."""
        return EnglishAccentDataset.accents[label]

    @staticmethod
    def get_label_from_accent(accent: str):
        """Translate an accent name into its integer label."""
        return EnglishAccentDataset.accents.index(accent)

    @staticmethod
    def decode_sequence(sequence):
        """Decode a code tensor back to audio and wrap it in an IPython ``Audio`` player."""
        sequence = sequence.to(device)
        with t.no_grad():
            wav = EnglishAccentDataset.encodec.decode([(sequence, None)])
        wav = wav.squeeze().cpu().numpy()
        return Audio(wav, rate=EnglishAccentDataset.encodec.sample_rate)

# One dataset object per split, created in the order train, validation, test.
train_dataset, valid_dataset, test_dataset = (
    EnglishAccentDataset(split_name) for split_name in ('train', 'validation', 'test')
)

# Sanity check on the first test clip: the code matrix, its shape and its largest code value.
codes, label = test_dataset[0]
print(codes, codes.shape, codes.max(), sep="\n")

  WeightNorm.apply(module, name, dim)
Repo card metadata block was not found. Setting CardData to empty.


Resolving data files:   0%|          | 0/28 [00:00<?, ?it/s]

Loading dataset shards:   0%|          | 0/18 [00:00<?, ?it/s]

Repo card metadata block was not found. Setting CardData to empty.


Resolving data files:   0%|          | 0/28 [00:00<?, ?it/s]

Repo card metadata block was not found. Setting CardData to empty.


Resolving data files:   0%|          | 0/28 [00:00<?, ?it/s]

tensor([[491, 837, 613,  ..., 408, 408, 408],
        [199, 722,  46,  ..., 518, 518, 913],
        [732, 908, 369,  ...,  36, 937, 937],
        ...,
        [ 64, 568, 455,  ..., 939, 435, 939],
        [356, 874, 725,  ..., 853, 570, 570],
        [334, 969, 145,  ..., 899, 948, 948]])
torch.Size([8, 933])
tensor(1023)


In [3]:
from torch.utils.data import DataLoader

# TODO: add padding and masking to allow batch sizes > 1
batch_size = 1
num_workers = 0

# The three loaders are configured identically (shuffling included); only the dataset differs.
loader_options = dict(batch_size=batch_size, shuffle=True, num_workers=num_workers)
train_dataloader = DataLoader(train_dataset, **loader_options)
valid_dataloader = DataLoader(valid_dataset, **loader_options)
test_dataloader = DataLoader(test_dataset, **loader_options)

In [4]:
from IPython.display import display

# Preview three training clips: label tensor, accent name, code shape and an audio player.
for sample_no in range(3):
    # A new iterator is created on every pass, so each pass draws from a fresh shuffle.
    batch = next(iter(train_dataloader))
    codes, target_label = batch
    accent_name = EnglishAccentDataset.get_accent_from_label(target_label)
    print(target_label, accent_name, codes.shape, sep="\n")
    display(EnglishAccentDataset.decode_sequence(codes))


tensor([15])
English
torch.Size([1, 8, 135])


tensor([1])
Dutch
torch.Size([1, 8, 356])


tensor([15])
English
torch.Size([1, 8, 321])


In [5]:
import random

# Listen to one randomly chosen 'Italian' clip from the validation split.
ds = valid_dataset

examples = ds.get_examples_from_class('Italian')
choice = random.choice(examples)
codes, target = ds[choice]
print(target, codes.shape, sep="\n")

# Add a leading axis of size 1 so the codes have the batched shape the dataloader produces.
batched_codes = codes.unsqueeze(0)
display(EnglishAccentDataset.decode_sequence(batched_codes))

tensor(10)
torch.Size([8, 870])


# The model

For the model, we opt for a transformer architecture.

For this particular case, we only need the encoder, much like the **BERT** architecture.
We then take the encoder output at the final time step and feed it to a classifier that predicts the accent class.

![transformer Architecture](./images/transformer.png "Transformer Architecture")

In [6]:
class AccentRecogniser(nn.Module):
    """Transformer-encoder classifier over multi-codebook token sequences.

    Each codebook has its own embedding table. The per-codebook embeddings are
    summed, passed through a stack of transformer encoder layers, and the output
    at the last time step is fed to a small fully connected classifier.

    Args:
        input_dim: embedding width, also the transformer model dimension.
        num_classes: number of output logits.
        num_heads: attention heads per encoder layer.
        num_layers: number of encoder layers.
        ff_dim: hidden width of each encoder layer's feed-forward network.
        dropout: dropout probability inside the encoder layers.
        device: device the sub-modules are moved to and inputs are sent to.
        num_codebooks: number of codebooks (K) expected in the input.
        codebook_size: number of distinct codes per codebook.
    """

    def __init__(self, input_dim, num_classes, num_heads=16, num_layers=12, ff_dim=512, dropout=0.2, device = 'cpu',
                 num_codebooks=8, codebook_size=1024):
        super().__init__()

        self.device = device
        self.input_dim = input_dim
        # One embedding table per codebook.
        self.embedders = nn.ModuleList(
            [nn.Embedding(codebook_size, input_dim) for _ in range(num_codebooks)]
        ).to(device)

        # Transformer Encoder Layer
        # Kept as an attribute exactly as before, so its parameters stay in the
        # state dict and in size(); forward() only goes through transformer_encoder.
        self.encoder_layer = nn.TransformerEncoderLayer(
            d_model=input_dim, nhead=num_heads, dim_feedforward=ff_dim, dropout=dropout, batch_first=True
        )
        self.transformer_encoder = nn.TransformerEncoder(
            self.encoder_layer, num_layers=num_layers
        ).to(device)

        # Classifier head (fully connected layers)
        self.fc = nn.Sequential(
            nn.Linear(input_dim, input_dim // 2),
            nn.ReLU(),
            nn.Linear(input_dim // 2, num_classes)
        ).to(device)

    def forward(self, x):
        """Classify a batch of code sequences.

        Args:
            x: integer tensor of shape [B, K, T] (batch, codebooks, time steps).

        Returns:
            Tensor of shape [B, num_classes] holding unnormalised class scores.

        Raises:
            ValueError: if ``x`` is not 3-dimensional, if K differs from the
                number of embedding tables, or if the sequence is empty.
        """
        # Note: For now the transformer has "infinite" window size because we are passing it the whole sequences.
        # NOTE(review): no positional information is added to the embeddings before
        # the encoder - confirm whether that is intended.
        x = x.to(self.device)

        if x.dim() != 3:
            raise ValueError(f"expected input of shape [B, K, T], got {tuple(x.shape)}")
        B, K, T = x.shape
        if K != len(self.embedders):
            raise ValueError(f"expected {len(self.embedders)} codebooks, got {K}")
        if T == 0:
            raise ValueError("cannot classify an empty sequence (T == 0)")

        # Remove Codebook Dimension: sum the per-codebook embeddings directly
        # instead of first materialising a [B, K, T, input_dim] buffer.
        embedded = None
        for k, embedder in enumerate(self.embedders):
            codebook_embedding = embedder(x[:, k])  # [B, T, input_dim]
            embedded = codebook_embedding if embedded is None else embedded + codebook_embedding

        encoded = self.transformer_encoder(embedded)  # [B, T, input_dim]

        # The output at the last time step is used as the summary of the whole sequence.
        last_step = encoded[:, -1, :]  # [B, input_dim]

        return self.fc(last_step)  # [B, num_classes]

    def size(self):
        """Return the total parameter count as a short string such as '77.21M'."""
        suffixes = ["", "K", "M", "G", "T", "P"]

        def human_format(num):
            magnitude = 0
            # Stop at the largest known suffix rather than indexing past the list.
            while abs(num) >= 1000 and magnitude < len(suffixes) - 1:
                magnitude += 1
                num /= 1000.0
            return "%.2f%s" % (num, suffixes[magnitude])

        return human_format(sum(p.numel() for p in self.parameters()))

# Embedding width of 1024 and one output logit per accent in the dataset's label list.
num_accents = len(EnglishAccentDataset.accents)
model = AccentRecogniser(input_dim=1024, num_classes=num_accents, device=device)
print(model.size())

77.21M


In [7]:
def run_step(model, loss_fn, input, target):
    """Run one forward pass and compute the loss.

    Args:
        model: callable mapping ``input`` to a tensor of logits.
        loss_fn: callable taking ``(logits, target)`` and returning the loss.
        input: batch passed to ``model`` unchanged.
        target: tensor of targets for the batch.

    Returns:
        Tuple ``(loss, logits)``.
    """
    logits = model(input)

    # Put the target on the device the logits are on, rather than relying on a
    # module-level `device` variable that could disagree with the model.
    target = target.to(logits.device)
    loss = loss_fn(logits, target)
    return loss, logits

In [8]:
# Number of training clips per accent, keyed by accent name.
class_counts = train_dataset.get_class_weights()
s = sum(class_counts.values())

# Inverse-frequency weight per class: total / (number of classes * class count),
# listed in the same order as EnglishAccentDataset.accents.
weights = [
    s / (len(EnglishAccentDataset.accents) * class_counts[accent_name])
    for accent_name in EnglishAccentDataset.accents
]

for i, (c, class_weight) in enumerate(zip(EnglishAccentDataset.accents, weights)):
    print(f'{i}: {c} = {class_weight}')

class_weight_tensor = t.tensor(weights, dtype=t.float32).to(device)
loss_fn = t.nn.CrossEntropyLoss(weight=class_weight_tensor)
learning_rate = 0.000001
optimizer = t.optim.Adam(model.parameters(), lr=learning_rate)

# Smoke test on a single validation batch: forward pass, loss, backward pass.
codes, targets = next(iter(valid_dataloader))
loss, predictions = run_step(model, loss_fn, codes, targets)
print(f'Example distribution on example 0: {F.softmax(predictions[0], dim = -1)}')
print(f'Actual target for value 0: {targets[0]}')
optimizer.zero_grad()
loss.backward()

# Global L2 norm of the gradient: square root of the summed squared per-parameter L2 norms.
squared_norms = (
    p.grad.norm(2).item() ** 2 for p in model.parameters() if p.grad is not None
)
grad_norm = sum(squared_norms, 0.0) ** 0.5

print("Gradient Norm:", grad_norm)
loss

0: unknown = 1.9081199818209362
1: Dutch = 2.049131654939602
2: German = 2.3378033501925666
3: Polish = 2.5530556400121616
4: French = 2.8975155279503104
5: Hungarian = 3.417350607067761
6: Finnish = 3.8362902611741414
7: Romanian = 3.961160468590298
8: Slovak = 5.689666854884246
9: Spanish = 1.8532332818362391
10: Italian = 3.055120975077315
11: Estonian = 8.235044132069303
12: Lithuanian = 12.517267080745341
13: Croatian = 18.254347826086956
14: Slovene = 35.330995792426364
15: English = 0.1621287646458763
16: Scottish = 0.29048159914208127
17: Irish = 0.5256831627382853
18: NorthernIrish = 0.8886497927506835
19: Indian = 1.6214076529462877
20: Vietnamese = 3.890802378562051
21: Canadian = 0.7318816368628248
22: American = 0.26230651887563583
Example distribution on example 0: tensor([0.0333, 0.0317, 0.0453, 0.0593, 0.0398, 0.0377, 0.0379, 0.0550, 0.0500,
        0.0279, 0.0462, 0.0421, 0.0488, 0.0494, 0.0495, 0.0489, 0.0461, 0.0466,
        0.0357, 0.0354, 0.0473, 0.0356, 0.0506], d

tensor(3.0670, device='cuda:0', grad_fn=<NllLossBackward0>)

In [9]:
# Per-accent clip counts of the held-out splits: validation first, then test.
for held_out_split in (valid_dataset, test_dataset):
    print(held_out_split.get_class_weights())

{'English': 299, 'Polish': 18, 'Indian': 32, 'Italian': 14, 'unknown': 19, 'Spanish': 31, 'Scottish': 152, 'Irish': 75, 'Vietnamese': 13, 'Romanian': 10, 'American': 165, 'NorthernIrish': 43, 'Canadian': 68, 'Estonian': 10, 'Slovak': 7, 'German': 18, 'French': 11, 'Dutch': 17, 'Hungarian': 17, 'Croatian': 5, 'Slovene': 3, 'Finnish': 8, 'Lithuanian': 1}
{'English': 442, 'Polish': 31, 'Indian': 51, 'Italian': 29, 'Dutch': 27, 'unknown': 35, 'Spanish': 46, 'Scottish': 235, 'Irish': 152, 'Vietnamese': 14, 'Romanian': 23, 'French': 26, 'American': 263, 'NorthernIrish': 73, 'Canadian': 90, 'Hungarian': 18, 'Finnish': 11, 'Slovak': 17, 'Croatian': 2, 'German': 19, 'Estonian': 13, 'Slovene': 1, 'Lithuanian': 2}


In [10]:
import os


def get_unique_experiment_name():
    """Return '<class name>_<parameter count>' for the notebook-level ``model``."""
    class_name = type(model).__name__
    return f"{class_name}_{model.size()}"


models_dir = "/home/andreacacioli/Documents/github/MachineLearning/Projects/EnglishAccentDetection/models/"
unique_name = get_unique_experiment_name()

# Checkpoints are written as '<unique_name>_epoch-<N>.pt'.
_checkpoint_prefix = unique_name + "_epoch-"
_checkpoint_suffix = ".pt"


def _checkpoint_epoch(filename):
    """Return the epoch encoded in a checkpoint file name, or None if it is not one of ours."""
    if not (filename.startswith(_checkpoint_prefix) and filename.endswith(_checkpoint_suffix)):
        return None
    try:
        return int(filename[len(_checkpoint_prefix) : -len(_checkpoint_suffix)])
    except ValueError:
        return None


# A missing directory simply means there is nothing to resume from.
_available_files = os.listdir(models_dir) if os.path.isdir(models_dir) else []
# Ignore unrelated files instead of failing while parsing their names.
checkpoints = [f for f in _available_files if _checkpoint_epoch(f) is not None]
if len(checkpoints) == 0:
    print("Starting from fresh: no models to load")
    epoch = 0
else:
    # Newest (highest epoch) checkpoint first.
    checkpoints.sort(key=_checkpoint_epoch, reverse=True)

    epoch = _checkpoint_epoch(checkpoints[0])
    checkpoint_path = os.path.join(models_dir, checkpoints[0])
    print(f"Loading from {checkpoint_path}")
    # map_location lets weights saved on one device be loaded on the current one.
    model.load_state_dict(t.load(checkpoint_path, weights_only=True, map_location=device))
    print(f"Loaded epoch {epoch}")

Starting from fresh: no models to load


In [11]:
# Use Wandb
import wandb

wandb.login()
wandb.init(
    project="English Accent Recognizer - Transformer Encoder",
    config={
        "total_parameters": model.size(),
        "backbone_size": "m",
        "learning_rate": learning_rate,
        "emb_size": model.input_dim,
        # The dataset this notebook actually loads (the previous value named an unrelated dataset).
        "dataset": "westbrook/English_Accent_DataSet",
        "batch_size": batch_size,
        "classes": len(EnglishAccentDataset.accents),
        "device": device
    },
)
# Ask wandb to track the model with log='all' at every step (log_freq=1).
wandb.watch(model, log='all', log_freq=1)

[34m[1mwandb[0m: Using wandb-core as the SDK backend.  Please refer to https://wandb.me/wandb-core for more information.
[34m[1mwandb[0m: Currently logged in as: [33mandrea-cacioli[0m ([33mandrea-cacioli-education[0m). Use [1m`wandb login --relogin`[0m to force relogin


In [12]:
from torcheval.metrics import MulticlassAccuracy, MulticlassAUROC, MulticlassRecall, MulticlassPrecision
from torchmetrics.classification import MulticlassConfusionMatrix
from tqdm import tqdm


def get_metrics(num_classes, dataloader, verbose = True):
    """Evaluate the notebook-level ``model`` on ``dataloader``.

    Uses the notebook-level ``model``, ``loss_fn`` and ``run_step``. The model is
    switched to eval mode and left in eval mode on return.

    Args:
        num_classes: number of classes passed to every metric.
        dataloader: iterable of ``(codes, targets)`` batches.
        verbose: wrap the dataloader in a tqdm progress bar when True.

    Returns:
        Tuple ``(all_logits, all_targets, all_losses, metrics, confusion)``:
        per-sample logits and targets as Python lists, one loss value per batch,
        the dict of updated (not yet computed) metric objects, and the updated
        confusion-matrix object.
    """
    metrics =  {
        'accuracy': MulticlassAccuracy(num_classes=num_classes),
        'precision' : MulticlassPrecision(num_classes=num_classes),
        'recall': MulticlassRecall(num_classes=num_classes),
        'AUC': MulticlassAUROC(num_classes=num_classes),
        }
    confusion = MulticlassConfusionMatrix(num_classes=num_classes)
    all_logits = []
    all_targets = []
    all_losses = []
    with t.no_grad():
        model.eval()
        it = tqdm(dataloader) if verbose else dataloader
        for codes, targets in it:
            loss, logits = run_step(model, loss_fn, codes, targets)
            for metric in metrics.values():
                metric.update(logits, targets)
            # Record each batch exactly once. These updates used to sit inside the
            # metric loop, which counted every batch once per metric.
            confusion.update(logits.cpu(), targets.cpu())
            all_logits += logits.tolist()
            all_targets += targets.tolist()
            all_losses.append(loss.item())
    return all_logits, all_targets, all_losses, metrics, confusion

In [None]:
from tqdm import tqdm
from IPython.display import clear_output
import copy


EVAL_EVERY = 10000 # Only running evaluation step every EVAL_EVERY training steps
epochs = epoch + 2 # Set this to the number of training epochs you want to perform

# Start from "no evaluation seen yet": the first full validation pass always becomes
# the best so far. (Seeding this with the loss of one validation sample compared a
# single-sample loss against full-split means and could leave no best model at all.)
best_eval_loss = float("inf")
best_epoch = -1
best_state = None  # CPU copy of the best weights seen so far

model.train()

while epoch < epochs:
    i = 1
    print(f'Epoch: {epoch}')
    for codes, targets in tqdm(train_dataloader):
        #Training Step
        optimizer.zero_grad()
        loss, _ = run_step(model, loss_fn, codes, targets)
        loss.backward()
        optimizer.step()

        wandb.log({"train_loss": loss.item()})

        # i wraps around to 0 once every EVAL_EVERY steps.
        if i == 0:
            #Validation Step
            print(f"Evaluating: Epoch {epoch}")
            all_logits, all_targets, all_losses, metrics, confusion = get_metrics(len(EnglishAccentDataset.accents), valid_dataloader)
            eval_loss = sum(all_losses) / len(all_losses)
            metrics = {metric_name: metric.compute() for metric_name, metric in metrics.items()}
            wandb.log({"eval_loss": eval_loss})
            wandb.log(metrics)
            if eval_loss < best_eval_loss: # Getting the best model
                best_eval_loss = eval_loss
                # Snapshot only the weights, on the CPU. Restoring them later with
                # load_state_dict keeps `model` the object the optimizer is tracking.
                best_state = {
                    name: tensor.detach().cpu().clone()
                    for name, tensor in model.state_dict().items()
                }
                best_epoch = epoch
            # get_metrics leaves the model in eval mode.
            model.train()
            clear_output()
            print(f"eval_loss: {eval_loss}")

        i = (i + 1) % EVAL_EVERY


    clear_output()
    print(f"Current Best Epoch: {best_epoch}: Validation Loss: {best_eval_loss}")
    epoch += 1

if best_state is None:
    # No validation pass ran, so there is nothing to roll back to.
    print("No evaluation was run during training: keeping the final weights")
else:
    print(f"Going back to epoch {best_epoch}: validation loss: {best_eval_loss}")
    model.load_state_dict(best_state)
    epoch = best_epoch
wandb.finish()

Epoch: 0


  0%|          | 45/50382 [00:11<3:25:44,  4.08it/s]


RuntimeError: cannot reshape tensor of 0 elements into shape [-1, 0] because the unspecified dimension size -1 can be any value and is ambiguous

In [None]:
# Write the checkpoint as '<experiment name>_epoch-<N>.pt' into the same folder the
# loading cell reads from, creating the folder first if it does not exist yet.
os.makedirs(models_dir, exist_ok=True)
t.save(
    model.state_dict(),
    os.path.join(models_dir, f"{get_unique_experiment_name()}_epoch-{epoch}.pt"),
)

# Testing

In [None]:
# Run the evaluation pass over the held-out test split.
num_accents = len(EnglishAccentDataset.accents)
all_logits, all_targets, all_losses, metrics, confusion = get_metrics(num_accents, test_dataloader)

100%|██████████| 1620/1620 [02:59<00:00,  9.04it/s]


In [None]:
# compute() turns the updates accumulated during evaluation into the final value of each metric.
for metric_name in metrics:
    print(f'{metric_name}: {metrics[metric_name].compute()}')

accuracy: 0.035185184329748154
precision: 0.035185184329748154
recall: 0.035185184329748154
AUC: 0.5294334292411804
