In [2]:
import pandas as pd
import numpy as np
import seaborn as sns

import matplotlib.pyplot as plt
%matplotlib inline

In [4]:
import IPython

import torch
import torch.nn as nn
import torch.nn.functional as F
from torch.utils.data import Dataset, DataLoader

from torch.cuda.amp import autocast, GradScaler

import torchaudio
import torchaudio.functional as TAF
import torchaudio.transforms as T

from torchtext.data import get_tokenizer
from torchtext.data import load_sp_model, generate_sp_model

from IPython.display import Audio, display

from utils.dataset import CommonVoice
from utils.audio_utils import plot_waveform, play_audio
from utils.batch_utls import Collator
from utils.preprocess import Preprocessing

In [3]:
from typing import List, Dict, Union, Optional, Tuple

In [4]:
from torchvision import models

In [5]:
from transformers import AutoTokenizer

In [6]:
import os 
import pkbar

In [7]:
print(torch.__version__)
print(torchaudio.__version__)

1.11.0
0.11.0


In [8]:
os.environ['CUDA_LAUNCH_BLOCKING'] = '1'

In [9]:
device = 'cuda' if torch.cuda.is_available() else 'cpu'

In [10]:
datasetPATH = 'data/external/cv-corpus-8.0-2022-01-19/en/'
clipsPATH = os.path.join(datasetPATH, 'clips')

In [14]:
try: ##Check if tokenizer is defined
    tokenizer
except NameError as e: ## If tokenizer is not defined then initialize it
    tokenizer = AutoTokenizer.from_pretrained("roberta-base")
finally:
    blank_token = "<|blank|>"
    tokenizer.add_tokens(blank_token)
    
    blank_token_id = tokenizer.vocab[blank_token]
    
    vocab_size = len(tokenizer)

In [15]:
train_data = CommonVoice(dataset_path = datasetPATH, split_type = 'train', tokenizer = tokenizer, out_channels = 1)
train_data


    CommonVoice Dataset
    -------------------
    
    Loading train.tsv from /home/ashim/Projects/DeepSpeech/data/external/cv-corpus-8.0-2022-01-19/en directory.
        
    Number of Examples: 864448
    
    Args:
        Sampling Rate: 32000
        Output Channels: 1
    

In [16]:
class Encoder(nn.Module):
    
    def __init__(self, input_dim: int = 128, num_heads: int = 4, ffn_dim: int = 128, num_layers: int = 4, depthwise_conv_kernel_size: int = 31, dropout: float = 0.3):
        
        super(Encoder, self).__init__()
        
        self.model = torchaudio.models.Conformer(input_dim = input_dim,
                                                 num_heads = num_heads,
                                                 ffn_dim = ffn_dim,
                                                 num_layers = num_layers,
                                                 depthwise_conv_kernel_size = depthwise_conv_kernel_size,
                                                 dropout = dropout)
        
    def forward(self, x: torch.Tensor, x_len: torch.Tensor) -> torch.Tensor:
        
        x, _ = self.model.forward(x, x_len)
        
        return x

In [17]:
class LSTMDecoder(nn.Module):
    
    def __init__(self, input_dim: int = 128, hidden_size: int = 256, num_layers: int = 2, bidirectional: bool = False, output_dim: int = None, padding_idx: int = None):
        
        super(LSTMDecoder, self).__init__()
        
        if output_dim == None:
            raise ValueError("Please specify the output size of the vocab.")
            
        directions = 2 if bidirectional == True else 1
            
        self.model = nn.GRU(input_size = input_dim, hidden_size = hidden_size, num_layers = num_layers, batch_first = True)
        self.ffn = nn.Linear(in_features = hidden_size * directions, out_features = output_dim)
                                
    def forward(self, x: torch.Tensor, hidden_state: torch.Tensor = None) -> torch.Tensor: 
        """
        Hidden state is needed, either in the form of encoder_hidden_state or decoder_hidden_state
        """
        
        if hidden_state == None:
            outputs, hidden_state = self.model(x, hidden_state)
        
        else:
            outputs, hidden_state = self.model(x, hidden_state)
        
        outputs = F.glu(self.ffn(outputs))

        return outputs, hidden_state

In [18]:
class Model(nn.Module):
    
    def __init__(self, encoder_input_dim: int = 128,
                encoder_num_heads: int = 4, 
                encoder_ffn_dim: int = 128, 
                encoder_num_layers: int = 4, 
                encoder_depthwise_conv_kernel_size: int = 31, 
                decoder_hidden_size:int = 128,
                decoder_num_layers: int = 2,
                bidirectional_decoder: bool = False,
                vocab_size: int = None,
                padding_idx: int = None,
                sos_token_id: int = None):
        
        super(Model, self).__init__()
        
        self.encoder = Encoder(input_dim = encoder_input_dim,
                              num_heads = encoder_num_heads,
                              ffn_dim = encoder_ffn_dim,
                              depthwise_conv_kernel_size = encoder_depthwise_conv_kernel_size)
        
        self.decoder = LSTMDecoder(input_dim = encoder_input_dim,
                                  hidden_size = decoder_hidden_size,
                                  num_layers = decoder_num_layers,
                                  bidirectional = bidirectional_decoder,
                                  output_dim = vocab_size)
        
        self.sos_token_id = sos_token_id
        
    def forward(self, x: torch.Tensor, x_lens: torch.Tensor):
        
        decoded = []
        
        bsz, msl, hdz = x.shape ##batch_size, max sequence length, hidden dimension size

        encoder_outputs = self.encoder(x, x_lens)
                
        decoder_inputs = encoder_outputs
        
        ## Start with the <sos> token
        x = torch.LongTensor([self.sos_token_id]).repeat(bsz).reshape(bsz, 1).to(device)

        for t in range(msl):
            
            if t == 0:
                decoder_output, decoder_hidden_state = self.decoder(x = decoder_inputs)            
            else:
                decoder_output, decoder_hidden_state = self.decoder(x = decoder_inputs, hidden_state = decoder_hidden_state)
            
            word = F.log_softmax(decoder_output, dim = -1) ## have to do log_softmax for CTC Loss
            
            topv, topi = decoder_output.topk(1)
            
            x = topv.squeeze().detach()
            
            decoded.append(topv)
            
        return encoder_outputs, torch.stack(decoded)

In [19]:
model_params = {
    'encoder_num_heads': 4,
    'encoder_ffn_dim': 64,
    'encoder_num_layers': 3,
    'decoder_num_layers': 1,
    'decoder_hidden_size': 64,
    'padding_idx': tokenizer.pad_token_id,
    'sos_token_id': tokenizer.bos_token_id
}

collator = Collator(tokenizer)
BATCH_SIZE = 1
train_loader = DataLoader(train_data, batch_size = BATCH_SIZE, collate_fn=collator, shuffle=True)

fp16 = True
scaler = GradScaler()

In [20]:
## CTC loss should be computed after the encoder outputs the probabilities

## Decoding part is usually decoupled from encoding part

In [21]:
model = Model(**model_params, vocab_size=vocab_size).to(device)

In [22]:
# model = Model(**model_params, vocab_size=vocab_size).to(device)
EPOCHS = 1
num_batches = len(train_loader)

enc_optim = torch.optim.AdamW(model.encoder.parameters(), lr = 3e-4)
loss_fn = nn.CTCLoss(blank = tokenizer.pad_token_id)

for epoch in range(EPOCHS):
    
    kbar = pkbar.Kbar(target = num_batches, epoch = epoch, num_epochs=EPOCHS, width = 10, always_stateful=False)
    
    enc_optim.zero_grad(set_to_none=True)
    
    for idx, batch in enumerate(train_loader):
        
        # waveforms = batch['waveforms']
        # waveforms_lengths = batch['waveforms_lengths']

        sentences = batch['sentences'].to(device)
        sentence_lengths = batch['sentence_lengths'].to(device)

        melspecs = batch['melspecs'].to(device)
        melspecs_lengths = batch['melspecs_lengths'].to(device)

        melspecs = torch.transpose(melspecs, -1, -2) ## Changing to (batch, channel, time, n_mels) from (batch, channel, n_mels, time)

        decoded = []

        bsz, msl, hdz = melspecs.shape ##batch_size, max sequence length, hidden dimension size

        encoder_outputs = model.encoder(melspecs, melspecs_lengths)

        ctc_loss = loss_fn(log_probs = encoder_outputs.transpose(1, 0), 
                           targets = sentences, 
                           input_lengths = melspecs_lengths.type(torch.int32), 
                           target_lengths=sentence_lengths.type(torch.int32))

        ctc_loss.backward()

        enc_optim.step()
        enc_optim.zero_grad(set_to_none=True)

        kbar.update(idx, values = [("ctc_loss", ctc_loss.detach().cpu().item())])

Epoch: 1/1
    14/864448 [..........] - ETA: 11:52:21 - ctc_loss: nan

RuntimeError: CUDA error: an illegal memory access was encountered

In [None]:
collator = Collator(tokenizer)
BATCH_SIZE = 1
train_loader = DataLoader(train_data, batch_size = BATCH_SIZE, collate_fn=collator, shuffle=True)

In [None]:
model_params = {
    'encoder_num_heads': 4,
    'encoder_ffn_dim': 128,
    'encoder_num_layers': 3,
    'decoder_num_layers': 1,
    'decoder_hidden_size': 128,
    'padding_idx': tokenizer.pad_token_id,
    'sos_token_id': tokenizer.bos_token_id
}

In [None]:
model = Model(**model_params, vocab_size=vocab_size).to(device)

In [None]:
optimizer = torch.optim.AdamW(model.parameters(), lr = 3e-4)
loss_fn = nn.CTCLoss(blank = tokenizer.pad_token_id)

In [None]:
fp16 = True
scaler = GradScaler()

In [None]:
# def train_step(batch):

#     for t in range(msl):                        

#         outputs = F.glu(self.ffn(outputs))

#         word = F.log_softmax(outputs, dim = -1) ## have to do log_softmax for CTC Loss

#         decoder_inputs = outputs

#         decoded.append(word)


In [None]:
def train_step(loader, kbar):
    
    optimizer.zero_grad(set_to_none=True)
    
    for idx, batch in enumerate(loader):
        
        waveforms = batch['waveforms']
        waveforms_lengths = batch['waveforms_lengths']

        sentences = batch['sentences'].to(device)
        sentence_lengths = batch['sentence_lengths'].to(device)

        melspecs = batch['melspecs'].to(device)
        melspecs_lengths = batch['melspecs_lengths'].to(device)

        melspecs = torch.transpose(melspecs, -1, -2) ## Changing to (batch, channel, time, n_mels) from (batch, channel, n_mels, time)
        
        if fp16:
            
            with autocast():

                y = model.forward(melspecs, melspecs_lengths)

                ## CTC loss requires int32 and (T, B, L) shape for log_probabilities from decoder

                loss = loss_fn(log_probs = y.transpose(1, 0).type(torch.float32), 
                                  targets = sentences, 
                                  input_lengths = melspecs_lengths.type(torch.int32), 
                                  target_lengths=sentence_lengths.type(torch.int32))
                
                scaler.scale(loss).backward()
                scaler.step(optimizer)
                scaler.update()
                
        else:
            
            y = model.forward(melspecs, melspecs_lengths)

            ## CTC loss requires int32 and (T, B, L) shape for log_probabilities from decoder
            
            loss = F.ctc_loss(log_probs = y.transpose(1, 0), 
                              targets = sentences, 
                              input_lengths = melspecs_lengths.type(torch.int32), 
                              target_lengths=sentence_lengths.type(torch.int32))
            
            loss.backward()

            optimizer.step()

        optimizer.zero_grad(set_to_none=True)
        
        kbar.update(idx, values = [("loss", loss.detach().cpu().item())])
        

In [21]:
EPOCHS = 2

In [22]:
num_batches = len(train_loader)

for epoch in range(EPOCHS):
    
    kbar = pkbar.Kbar(target = num_batches, epoch = epoch, num_epochs=EPOCHS, width = 10, always_stateful=False)
    
    train_step(train_loader, kbar)

Epoch: 1/2


NameError: name 'train_step' is not defined

In [None]:
sample = [train_data.__getitem__(1), train_data.__getitem__(2)]

In [None]:
sample

In [None]:
melspec = sample['melspec'].to(device)
melspec_len = torch.Tensor([melspec.shape[-1]]).to(device)

melspec= melspec.unsqueeze(0)
melspec = melspec.transpose(2, 1)

In [None]:
with torch.no_grad():
    y_preds = model(melspec, melspec_len)

In [None]:
y_preds.shape

In [None]:
topv, topi = y_preds.topk(1, dim = -1)

In [None]:
topi

In [None]:
topi.shape

In [None]:
with torch.no_grad():
    y_preds = model(melspec, melspec_len)
    y_preds = y_pred.argmax(dim = -1)
    y_preds = torch.unique_consecutive(y_preds)

In [None]:
y_preds

In [None]:
tokenizer(sample['sentence'], return_attention_mask=False)

In [None]:
tokenizer.decode(y_preds)

In [None]:
tokenizer.convert_ids_to_tokens(y_preds)

In [None]:
sample['sentence']

In [None]:
# melspec.shape

In [None]:
# melspec_len