In [1]:
import torch
import torch.nn as nn
import torch.nn.functional as F

from torch.optim.lr_scheduler import ExponentialLR, ReduceLROnPlateau, StepLR
from torch.optim import RAdam

from torch.utils.data import Dataset, DataLoader

from torch.nn.utils import clip_grad_norm_

from torch.cuda.amp import autocast, GradScaler

from torch.nn.utils.rnn import pad_packed_sequence, pack_padded_sequence, PackedSequence

import torchaudio

from src.model.model import *

from src.utils.dataset import CommonVoice
from src.utils.audio_utils import plot_waveform, play_audio
from src.utils.collate import Collator
from src.utils.tokenizer import get_tokenizer

from transformers import PreTrainedTokenizerFast

from datetime import datetime

ModuleNotFoundError: No module named 'src'

In [2]:
from typing import Tuple, List, Dict

In [None]:
from src.utils.misc import get_summary, get_writer
from src.utils.grad_flow import *

ImportError: attempted relative import with no known parent package

In [4]:
import os
import random
import pkbar

In [5]:
seed = 0 

g = torch.Generator()
g.manual_seed(seed)

torch.manual_seed(seed)

random.seed(seed)

In [6]:
os.environ['CUDA_LAUNCH_BLOCKING'] = '1'
os.environ['TOKENIZERS_PARALLELISM'] = 'true'
device = 'cuda' if torch.cuda.is_available() else 'cpu'

In [7]:
dataset_dir = 'data/external/cv-corpus-8.0-2022-01-19/en/'

tokenizer_file = 'data/tokenizer/trained_tokenizer.json'

# trimmed_train_path = 'data/internal/sample_train.tsv'
trimmed_train_path = 'data/internal/train_trimmed.tsv'

In [8]:
tokenizer = get_tokenizer(tokenizer_file_path=tokenizer_file)

blank_token_id = tokenizer.vocab["[BLANK]"]
bos_token_id = tokenizer.vocab["[BOS]"]
eos_token_id = tokenizer.vocab["[EOS]"]
vocab_size = tokenizer.vocab_size

In [9]:
train_data = CommonVoice(dataset_dir = dataset_dir, subset_path = trimmed_train_path, tokenizer = tokenizer, out_channels = 1)
# train_data = CommonVoice(dataset_dir = dataset_dir, subset_name = 'train', tokenizer = tokenizer, out_channels = 1)

dev_data = CommonVoice(dataset_dir = dataset_dir, subset_name = 'dev', tokenizer = tokenizer, out_channels = 1)

print(train_data)

print(dev_data)


    CommonVoice Dataset
    -------------------
    
    Loading None.tsv from /home/ashim/Projects/DeepSpeech/data/external/cv-corpus-8.0-2022-01-19/en directory.
        
    Number of Examples: 864308
    
    Args:
        Sampling Rate: 16000
        Output Channels: 1
    

    CommonVoice Dataset
    -------------------
    
    Loading dev.tsv from /home/ashim/Projects/DeepSpeech/data/external/cv-corpus-8.0-2022-01-19/en directory.
        
    Number of Examples: 16326
    
    Args:
        Sampling Rate: 16000
        Output Channels: 1
    


In [10]:
model_params = {
    'encoder_input_size': 80,
    'conformer_num_heads': 4,
    'conformer_ffn_size': 512,
    'conformer_num_layers': 16,
    'conformer_conv_kernel_size': 31,
    'encoder_rnn_hidden_size': 1024,
    'encoder_rnn_num_layers': 1,
    'encoder_rnn_bidirectional': True,
    'decoder_embedding_size': 300,
    'decoder_hidden_size': 1024,
    'decoder_num_layers': 1,
    'decoder_attn_size': 144,
    'dropout': 0.3,
    'padding_idx': tokenizer.pad_token_id,
    'sos_token_id': tokenizer.bos_token_id,
    'eos_token_id': tokenizer.eos_token_id,
    'vocab_size': vocab_size,
    'batch_first': True,
    'device': device,
}

In [11]:
collator = Collator(tokenizer, special_tokens = False)
BATCH_SIZE = 64

train_loader = DataLoader(train_data, 
                          batch_size = BATCH_SIZE, 
                          collate_fn=collator, 
                          shuffle=True, 
                          pin_memory = False, 
                          num_workers = 6, 
                          worker_init_fn = collator.seed_worker, 
                          generator = g)

fp16 = False
scaler = GradScaler()

In [12]:
class CTCModel(nn.Module):
    
    def __init__(self, vocab_size: int):
        
        super(CTCModel, self).__init__()
        
        self.encoder = torchaudio.models.Conformer(input_dim = 80, num_heads = 16, ffn_dim = 512, num_layers = 12, depthwise_conv_kernel_size=31)
        
        self.rnn = nn.GRU(input_size = 80, hidden_size = vocab_size)
        
    def forward(self, x, x_lens) -> torch.Tensor:
        
        x, x_lens = self.encoder(x, x_lens)
        
        x, hidden = self.rnn(F.relu(x))
        x = F.log_softmax(x, dim = -1)## ctc loss requires log_softmax
        
        return x

In [13]:
model = CTCModel(vocab_size=vocab_size).to(device)
# model = Model(**model_params).to(device)

In [14]:
# get_summary(encoder, dataloader = train_loader)

In [15]:
## CTC loss should be computed after the encoder outputs the probabilities

## Decoding part is usually decoupled from encoding part

In [16]:
base_log_dir = 'logs/'
writer = get_writer(base_log_dir=base_log_dir, comment = "CTC Loss")
# writer = None

In [17]:
def predict_one_batch(model: torch.nn.Module, batch: Dict, max_len: int = 50):
    
    model.eval()
    
    melspecs = batch['melspecs'].to(device).squeeze(0)
    melspecs_lengths = batch['melspecs_lengths'].to(device, dtype = torch.int32)
    
    sentences = batch['sentences'].to(device)
    sentence_lengths = batch['sentence_lengths'].to(device=device, dtype = torch.int32)    
    
    with torch.no_grad():
        sort_indices = torch.argsort(sentence_lengths, descending=True)
        
        melspecs = melspecs[sort_indices]
        melspecs_lengths = melspecs_lengths[sort_indices]
        
        sentences = sentences[sort_indices]
        sentence_lengths = sentence_lengths[sort_indices]
        
        melspecs = torch.transpose(melspecs, -1, -2) ## Changing to (batch, channel, time, n_mels) from (batch, channel, n_mels, time)

        y_preds = model.forward(melspecs, melspecs_lengths)

        y_ids = y_preds.argmax(dim = -1)
        y_pred = torch.unique_consecutive(y_ids, dim = 1)
        
        y_pred = tokenizer.batch_decode(y_pred)
    
    y_true = tokenizer.batch_decode(sentences)
    return y_true, y_pred

In [18]:
%time
try:
    samples
except NameError as e:
    samples = next(iter(train_loader))

CPU times: user 1 µs, sys: 0 ns, total: 1 µs
Wall time: 3.58 µs


In [19]:
EPOCHS = 200
lr = 0.01 # learning rate

MAX_NORM = 0.5

num_batches = len(train_loader)

# optimizer = torch.optim.SGD(model.parameters(), lr = lr)
optimizer = torch.optim.RAdam(model.parameters(), lr = lr)
# optimizer = torch.optim.RAdam(model.parameters())

epoch_end_scheduler = ReduceLROnPlateau(optimizer, mode = 'min', patience = 2)

cawr_scheduler = torch.optim.lr_scheduler.CosineAnnealingWarmRestarts(optimizer, T_0=2, T_mult=2)

batch_end_scheduler = StepLR(optimizer, 1.0, gamma=0.95)

criterion = nn.CTCLoss(blank = tokenizer.vocab['[BLANK]'], 
                       zero_infinity=True,
                       reduction = 'mean')

In [20]:
def train_step(batch: List[Dict[str, torch.Tensor]], n_iter: int, MAX_NORM: float = 0.5, plot_gradients: bool = True):
    
    model.train()
    optimizer.zero_grad(set_to_none = True)
    
    sentences = batch['sentences'].to(device)
    sentence_lengths = batch['sentence_lengths'].to(device, dtype = torch.int32)

    melspecs = batch['melspecs'].to(device)
    melspecs_lengths = batch['melspecs_lengths'].to(device, dtype = torch.int32)

    sort_indices = torch.argsort(sentence_lengths, descending=True)

    melspecs = melspecs[sort_indices]
    melspecs_lengths = melspecs_lengths[sort_indices]

    sentences = sentences[sort_indices]
    sentence_lengths = sentence_lengths[sort_indices]

    melspecs = torch.transpose(melspecs, -1, -2) ## Changing to (batch, channel, time, n_mels) from (batch, channel, n_mels, time)

    y_preds = model.forward(melspecs, melspecs_lengths)
    loss = criterion(y_preds.permute(1, 0, 2), sentences, melspecs_lengths, sentence_lengths)
    
    loss.backward()

    ## Plot Gradients every 10 steps
    if n_iter % 10 == 0 and plot_gradients == True:

        grad_flow_fig = plot_grad_flow_v2(model.named_parameters())
    
    else:
        grad_flow_fig = None
    
    ## Gradient Clipping for exploding gradients
    clip_grad_norm_(model.parameters(), max_norm = MAX_NORM)

    ## Step the optimizers
    optimizer.step()

    ## Step the schedulers
    batch_end_scheduler.step()

    return loss.detach().cpu().item(), grad_flow_fig

In [21]:
# with torch.no_grad():
#     for idx, batch in enumerate(train_loader):
#         batch = batch
#         sentences = batch['sentences'].to(device)
#         sentence_lengths = batch['sentence_lengths'].to(device, dtype = torch.int32)

#         melspecs = batch['melspecs'].to(device)
#         melspecs_lengths = batch['melspecs_lengths'].to(device, dtype = torch.int32)
                
#         sort_indices = torch.argsort(sentence_lengths, descending=True)
        
#         melspecs = melspecs[sort_indices]
#         melspecs_lengths = melspecs_lengths[sort_indices]
        
#         sentences = sentences[sort_indices]
#         sentence_lengths = sentence_lengths[sort_indices]
        
#         melspecs = torch.transpose(melspecs, -1, -2) ## Changing to (batch, channel, time, n_mels) from (batch, channel, n_mels, time)

#         y_preds = model.forward(melspecs, melspecs_lengths)
#         loss = criterion(y_preds.permute(1, 0, 2), sentences, melspecs_lengths, sentence_lengths)
                
#         break

In [22]:
n_iter = 0

for epoch in range(EPOCHS):
    
    kbar = pkbar.Kbar(target = num_batches, epoch = epoch, num_epochs=EPOCHS, width = 8, always_stateful=False)
    
    for idx, batch in enumerate(train_loader):
        
        optimizer.zero_grad(set_to_none=True)

        loss, grad_flow_fig = train_step(batch, n_iter, plot_gradients=True)
        
        ## Write how sample is being predicted
        ##predict_one_batch uses no grad
        sample_true, sample_pred = predict_one_batch(model, samples)
        writer.add_text('sentence predictions', f'true sentence: {sample_true[0]}, predicted sentence: {sample_pred[0]}', global_step = n_iter)
        
        writer.add_scalar('CE Loss/train', loss, n_iter)
        
        if grad_flow_fig != None:
            
            writer.add_figure('Average Gradients/Model', grad_flow_fig, global_step = n_iter, close = True)

        kbar.update(idx, values = [("loss", loss)])

        n_iter += 1
        
    
    ## At epoch end
    
    # cawr_scheduler.step() ##cosine annealing with warm restarts
    epoch_end_scheduler.step(loss)
    
    print("\n")

Epoch: 1/200
 4219/13505 [=>......] - ETA: 42:29 - loss: 10.0525parameter shape is: torch.Size([80, 1, 31]), parameter name is: encoder.conformer_layers.5.conv_module.sequential.2.weight


Exception ignored in: <function _MultiProcessingDataLoaderIter.__del__ at 0x7ff1215cef70>
Traceback (most recent call last):
  File "/home/ashim/miniconda3/envs/speech/lib/python3.9/site-packages/torch/utils/data/dataloader.py", line 1358, in __del__
    self._shutdown_workers()
  File "/home/ashim/miniconda3/envs/speech/lib/python3.9/site-packages/torch/utils/data/dataloader.py", line 1322, in _shutdown_workers
    w.join(timeout=_utils.MP_STATUS_CHECK_INTERVAL)
  File "/home/ashim/miniconda3/envs/speech/lib/python3.9/multiprocessing/process.py", line 149, in join
    res = self._popen.wait(timeout)
  File "/home/ashim/miniconda3/envs/speech/lib/python3.9/multiprocessing/popen_fork.py", line 40, in wait
    if not wait([self.sentinel], timeout):
  File "/home/ashim/miniconda3/envs/speech/lib/python3.9/multiprocessing/connection.py", line 936, in wait
    ready = selector.select(timeout)
  File "/home/ashim/miniconda3/envs/speech/lib/python3.9/selectors.py", line 416, in select
    fd_