In [1]:
import pandas as pd
import numpy as np
import seaborn as sns

import matplotlib.pyplot as plt
%matplotlib inline

In [2]:
import IPython

import torch
import torch.nn as nn
import torch.nn.functional as F
from torch.utils.data import Dataset, DataLoader

import torchaudio
import torchaudio.functional as TAF
import torchaudio.transforms as T

from torchtext.data import get_tokenizer
from torchtext.data import load_sp_model, generate_sp_model

from IPython.display import Audio, display

# from data_utils import CommonVoice
from utils.audio_utils import plot_waveform, play_audio

In [128]:
from typing import Dict, List, Tuple, Union

In [3]:
from torchvision import models

In [4]:
from transformers import AutoTokenizer

In [5]:
import os 

In [6]:
## Record the library versions this notebook was run with (one per line).
print(torch.__version__, torchaudio.__version__, sep='\n')

1.11.0
0.11.0


In [7]:
## Load the metadata table of the training split (tab-separated) into a DataFrame.
train_df = pd.read_csv('data/external/cv-corpus-8.0-2022-01-19/en/train.tsv', sep='\t')

In [8]:
## Preview the first rows to see which columns the split provides.
train_df.head()

Unnamed: 0,client_id,path,sentence,up_votes,down_votes,age,gender,accents,locale,segment
0,4225d310188cfd17c5901d29ca5d9685eac936287ed275...,common_voice_en_28449980.mp3,Thereafter the class was highly respected.,2,0,,,,en,
1,4225d310188cfd17c5901d29ca5d9685eac936287ed275...,common_voice_en_28449981.mp3,Banaras Hindu University is a Central Universi...,2,0,,,,en,
2,4225d310188cfd17c5901d29ca5d9685eac936287ed275...,common_voice_en_28449984.mp3,"On display are home furnishings, pioneer tools...",2,0,,,,en,
3,4225d310188cfd17c5901d29ca5d9685eac936287ed275...,common_voice_en_28449986.mp3,Eleva and Strum each house an elementary school.,2,1,,,,en,
4,425089f4d7e24cdf6861d0130323ec2e41bfc19e35bce5...,common_voice_en_20293200.mp3,The eastern portion of the county lies within ...,2,0,,,,en,


In [9]:
## Summary statistics of the numeric columns.
train_df.describe()

Unnamed: 0,up_votes,down_votes,segment
count,864448.0,864448.0,0.0
mean,2.140156,0.173734,
std,0.519669,0.408876,
min,2.0,0.0,
25%,2.0,0.0,
50%,2.0,0.0,
75%,2.0,0.0,
max,94.0,6.0,


## Findings

1. `train.tsv`, `test.tsv` and `dev.tsv` are the main splits, to be used for training, testing and validation respectively.
2. There are `gender`, `age` and `accents` columns, but they are sparsely populated.
3. Segment column is always empty.

## Tasks

1. Will need dataset-loading functionality. The `path` column can be used to locate each audio clip.
2. Will need a Language Tokenizer and Encoder.
3. Will need a speech encoder.

In [132]:
## Directory of the English Common Voice release used throughout the notebook.
datasetPATH = 'data/external/cv-corpus-8.0-2022-01-19/en/'
## `clips` sub-directory of the dataset directory.
clipsPATH = os.path.join(datasetPATH, 'clips')

In [149]:
class CommonVoice(Dataset):
    """
    Map-style dataset over one split of the CommonVoice corpus.

    Every item is a dict with three entries:
        'waveform': the audio after channel / sampling-rate standardisation,
        'sentence': the raw transcript string taken from the split's TSV file,
        'melspec':  the Mel spectrogram computed from the standardised waveform.

    Relies on the `Preprocessing` class defined in this notebook, which must
    exist by the time an instance is created.
    """

    ## Split names for which a `<split>.tsv` file is looked up inside `dataset_path`.
    _SPLITS = ('train', 'test', 'dev')

    def __init__(self, dataset_path: str, split_type: str = 'train', out_channels: int = 2, out_sampling_rate: int = 32000, tokenizer = None):
        """
        Iterate over a split of CommonVoice dataset.

        Parameters
        ----------
        dataset_path: str
            The path where train.tsv, test.tsv and dev.tsv are located.

        split_type: str [train, test, dev]
            Loads one of train.tsv, test.tsv or dev.tsv

        out_channels: int
            Number of output channels for audio.
            Mono = 1, Stereo = 2.

        out_sampling_rate: int
            sampling_rate (in Hz) used for standardizing.

        tokenizer: transformers.PreTrainedTokenizerFast
            tokenizer from huggingface used for tokenizing.

        Raises
        ------
        AssertionError
            If out_sampling_rate is not an int.

        ValueError
            If out_channels is not 1 or 2, dataset_path does not exist,
            split_type is unknown, or tokenizer is None.
        """

        ## Zero-argument super(): `super(CommonVoice).__init__()` only created an unbound
        ## super object and initialised that, so the base class __init__ was never invoked.
        super().__init__()

        self.split_type = split_type
        self.dataset_path = dataset_path

        ## Check that out_sampling_rate is an integer number of Hz.
        assert isinstance(out_sampling_rate, int), "out_sampling_rate must be an int (Hz)."
        self.out_sampling_rate = out_sampling_rate

        ## Check how many output channels are needed.
        if out_channels not in [1, 2]:
            raise ValueError("Only Mono (out_channels = 1) and Stereo (out_channels = 2) Supported.")
        self.out_channels = out_channels

        ## Check that dataset exists in the path specified and add clips path.
        if not os.path.exists(dataset_path):
            raise ValueError(f"{dataset_path} doesn't exist, please provide a valid path to the dataset.")
        self.clips_path = os.path.join(dataset_path, 'clips')

        ## Check that split_type is one of train, test, dev.
        if split_type not in self._SPLITS:
            raise ValueError("split_type must be one of train, test or dev")
        fullpath = os.path.join(dataset_path, split_type + '.tsv')

        ## Validate the tokenizer *before* reading the (potentially very large) TSV so that
        ## a missing tokenizer fails immediately instead of after the file has been parsed.
        if tokenizer is None:
            raise ValueError("tokenizer cannot be None.")
        self.tokenizer = tokenizer

        ## Load the dataframe
        self.dataframe = pd.read_csv(fullpath, sep = '\t')

        ## Initialize Preprocessing
        self.preprocessing = Preprocessing(out_channels = self.out_channels, out_sampling_rate = self.out_sampling_rate, tokenizer = self.tokenizer)

    @staticmethod
    def _normalize_index(idx) -> int:
        """
        Convert a scalar index (python int, numpy scalar/array or torch tensor) to a plain int.

        Raises
        ------
        TypeError
            If a tensor / array holding more than one element is given; this dataset
            only supports fetching a single example per call.
        """

        if isinstance(idx, torch.Tensor):
            count = idx.numel()
        elif isinstance(idx, np.ndarray):
            count = idx.size
        else:
            return int(idx)

        if count != 1:
            raise TypeError(f"CommonVoice only supports scalar indices, got an index with {count} elements.")

        return int(idx.item())

    def __len__(self) -> int:
        """Number of rows in the loaded split."""
        return len(self.dataframe)

    def __getitem__(self, idx: Union[int, torch.Tensor, np.ndarray]) -> Dict[str, torch.Tensor]:
        """
        Load, standardise and featurise the example at position `idx`.

        Returns a dict with keys 'waveform', 'sentence' and 'melspec'.
        """

        ## `list(idx)` on a tensor either failed (0-d tensor) or selected several rows at once,
        ## which the code below cannot handle; reduce the index to a single integer instead.
        row = self.dataframe.iloc[self._normalize_index(idx)]

        audio_file_path = os.path.join(self.clips_path, row['path'])

        waveform, source_sampling_rate = torchaudio.load(audio_file_path)
        waveform, _ = self.preprocessing.preprocess_waveform(waveform, source_sampling_rate)

        melspec = self.preprocessing.extract_features(waveform)

        return {'waveform': waveform, 'sentence': row['sentence'], 'melspec': melspec}

    def __repr__(self):
        """Human readable summary: split, location, size and audio settings."""
        return f"""
    CommonVoice Dataset
    -------------------
    
    Loading {self.split_type}.tsv from {os.path.abspath(self.dataset_path)} directory.
        
    Number of Examples: {len(self)}
    
    Args:
        Sampling Rate: {self.out_sampling_rate}
        Output Channels: {self.out_channels}
    """
    

In [243]:
class Preprocessing:
    """
    Preprocessing contains utilities for transforming audio and text.

    Audio: channel standardisation, resampling to a common rate and Mel spectrogram
    extraction. Text: tokenisation through the tokenizer supplied at construction.

    The torchaudio transforms are reached through the notebook-level alias
    `T` (`import torchaudio.transforms as T`).
    """

    def __init__(self, out_channels: int = 2 , out_sampling_rate: int = 32000, n_fft : int = 1024, n_mels: int = 128, tokenizer = None):
        """
        Args:
            out_channels: number of channels every waveform is converted to.
            out_sampling_rate: sampling rate (Hz) every waveform is resampled to.
            n_fft: FFT size handed to the Mel spectrogram transform.
            n_mels: number of Mel bins handed to the Mel spectrogram transform.
            tokenizer: object exposing `encode(sentence)`; only needed for `tokenize_sentence`.
        """

        self.out_channels = out_channels
        self.out_sampling_rate = out_sampling_rate
        self.tokenizer = tokenizer

        self.n_fft = n_fft
        self.n_mels = n_mels

        self.mel_spec_transform = T.MelSpectrogram(sample_rate = self.out_sampling_rate,
                                                   n_fft = self.n_fft,
                                                   n_mels = self.n_mels)

        ## One resampler per source sampling rate, created on first use and then reused,
        ## so the resampling kernel is not rebuilt for every clip.
        self._resamplers = {}

    def standardize_channels(self, waveform: torch.Tensor) -> torch.Tensor:
        """
        Standardizes number of channels in a waveform.

        Args:
            waveform: torch.Tensor (channel, timesteps)

        Returns:
            waveform: torch.Tensor (out_channels, timesteps)

        Raises:
            TypeError: if the waveform has more than 2 channels (and that is not already
                the requested channel count).
            ValueError: if no conversion exists for the channel combination.
        """

        num_channels = waveform.shape[0]

        if num_channels == self.out_channels:
            return waveform

        if num_channels == 1:
            ## Mono source, multi-channel target: replicate the single channel.
            return torch.cat([waveform] * self.out_channels, dim = 0)

        if num_channels > 2:
            raise TypeError(f"Audio with more than 2 channels are not supported. Wanted maximum of 2 channels, found {num_channels} channels.")

        if self.out_channels == 1:
            ## Down-mix by averaging over the channel axis only. The earlier
            ## `torch.sum(waveform) / num_channels` summed over *every* element and
            ## returned a single scalar instead of a (1, timesteps) waveform.
            return waveform.mean(dim = 0, keepdim = True)

        ## Previously this fell through and silently returned None.
        raise ValueError(f"Cannot convert a waveform with {num_channels} channels to {self.out_channels} channels.")

    def standardize_sampling_rate(self, waveform: torch.Tensor, sampling_rate: int) -> Tuple[torch.Tensor, int]:
        """
        Standardize Sampling Rate

        Args:
            waveform: torch.Tensor (channel, timesteps)
            sampling_rate: sampling rate (Hz) of `waveform`

        Returns:
            waveform: torch.Tensor (channel, timesteps) at `out_sampling_rate`
            sampling_rate: the output sampling rate, i.e. `out_sampling_rate`
        """

        sampling_rate = int(sampling_rate)

        ## Nothing to do when the clip is already at the target rate.
        if sampling_rate == self.out_sampling_rate:
            return waveform, self.out_sampling_rate

        resampler = self._resamplers.get(sampling_rate)
        if resampler is None:
            resampler = T.Resample(sampling_rate, self.out_sampling_rate)
            self._resamplers[sampling_rate] = resampler

        ## Resample acts on the last (time) axis, so all channels are handled in one call and
        ## the channel axis is kept. The earlier mono branch indexed `waveform[0, :]` and
        ## therefore returned a 1-D tensor, unlike the multi-channel branch.
        return resampler(waveform), self.out_sampling_rate

    def extract_features(self, waveform: torch.Tensor) -> torch.Tensor:
        """
        Applies Feature Extraction Transforms to a waveform.

        Args: waveform

            waveform: The input for the feature extraction.

        Returns: melspec

            melspec: The Mel Spectrogram constructed from the input waveform of shape (channel, n_mels, time).

        """

        return self.mel_spec_transform(waveform)

    def preprocess_waveform(self, waveform: torch.Tensor, sampling_rate: int) -> Tuple[torch.Tensor, int]:
        """
        Preprocess Waveforms. Standardizes channels and sampling rate.

        Args: waveform, sampling_rate

            waveform: torch.Tensor (channel, timesteps)

            sampling_rate: sampling rate (Hz) of `waveform`

        Returns:

            waveform: torch.Tensor (out_channels, timesteps) at `out_sampling_rate`

            sampling_rate: the output sampling rate
        """

        waveform = self.standardize_channels(waveform)

        return self.standardize_sampling_rate(waveform, sampling_rate)

    def tokenize_sentence(self, sentence: str):
        """
        Tokenizes a given sentence.

        Args:
            sentence: string to be tokenized

        Returns:
            tokens: whatever `tokenizer.encode(sentence)` returns
                (presumably a list of token ids for a HuggingFace tokenizer)

        Raises:
            ValueError: if no tokenizer was supplied at construction.
        """

        if self.tokenizer is None:
            raise ValueError("tokenizer cannot be None.")

        return self.tokenizer.encode(sentence)


## TODO

### Collate Function

1. Pad every sample to the longest length in the batch

### Transforms
1. Tokenize and Apply Embedding to Sentences

In [244]:
class Collator:
    """

    Utility Class for Collation of Batch. Intended to be used as part of the PyTorch DataLoader.

    """

    def __init__(self, tokenizer = None):
        """
        Initializes the Collator.

        Args: tokenizer

            tokenizer: Should be a HuggingFace PreTrained Tokenizer. It is called with the
                list of sentences and must return a mapping containing 'input_ids'.

        """

        self.tokenizer = tokenizer

    def pad(self, tensor: torch.Tensor, target_length: int) -> torch.Tensor:

        """

        Right-pads the last dimension of a tensor with zeros up to the given length.

        Args:
            tensor: torch.Tensor
                tensor to be padded

            target_length: int
                target length of the last dimension

        Returns:
            tensor: torch.Tensor
                padded tensor; returned unchanged when it is already at least
                `target_length` long (it is never truncated)

        Raises:
            ValueError: if `target_length` is not an int.
        """

        if not isinstance(target_length, int):
            raise ValueError(f"target_length must be an integer. Wanted {int}, have {type(target_length)}")

        missing = target_length - tensor.shape[-1]

        if missing <= 0:
            return tensor

        return F.pad(tensor, (0, missing), "constant", 0)

    def __call__(self, batch: List[Dict[str, torch.Tensor]]) -> Dict[str, torch.Tensor]:

        """

        Transforms lists of inputs in the batch to Tensors.

            1. Applies Tokenization to Sentences.

            2. Pads Waveforms and Mel Specs to the maximum lengths in the batch.

        Args: batch

            Batch containing the list of dicts from the CommonVoice Dataset. Each dict
            needs the keys 'waveform', 'sentence' and 'melspec'.

        Returns: batch

            Dict of Tensors.

            dict = {"waveforms": *padded waveforms*, // (batch, *waveform dims*, max time)
                "waveforms_lengths": *unpadded waveform lengths*, // (batch,) int64
                "sentences": *padded and tokenized sentences*, // 'input_ids' from the tokenizer
                "melspecs": *padded mel spectrograms*, // (batch, *melspec dims*, max timeframes)
                "melspecs_lengths": *unpadded mel spectrogram lengths*} // (batch,) int64

        Raises:
            ValueError: if the batch is empty or no tokenizer was supplied.
        """

        if len(batch) == 0:
            raise ValueError("Cannot collate an empty batch.")

        if self.tokenizer is None:
            raise ValueError("tokenizer cannot be None.")

        waveforms = [sample['waveform'] for sample in batch]
        sentences = [sample['sentence'] for sample in batch]
        melspecs = [sample['melspec'] for sample in batch]

        ## Lengths are counts, so keep them as integers. `torch.Tensor([...])` produced
        ## float32 values, which cannot represent large sample counts exactly.
        waveform_lengths = torch.tensor([waveform.shape[-1] for waveform in waveforms], dtype = torch.long)

        melspecs_lengths = torch.tensor([melspec.shape[-1] for melspec in melspecs], dtype = torch.long)

        max_len_waveform = int(waveform_lengths.max())

        max_len_melspecs = int(melspecs_lengths.max())

        padded_sentences = self.tokenizer(sentences, padding=True, return_tensors = 'pt', return_attention_mask=False)

        padded_waveforms = torch.stack([self.pad(waveform, max_len_waveform) for waveform in waveforms])

        padded_mel_specs = torch.stack([self.pad(melspec, max_len_melspecs) for melspec in melspecs])

        return {"waveforms": padded_waveforms, "waveforms_lengths": waveform_lengths, "sentences": padded_sentences['input_ids'], "melspecs": padded_mel_specs, "melspecs_lengths": melspecs_lengths}

In [247]:
## This cell sits above the cell that creates `tokenizer`, so on a fresh kernel
## (Restart & Run All) the name does not exist yet. Create it here when missing,
## using the same checkpoint as the rest of the notebook.
try:
    tokenizer
except NameError:
    tokenizer = AutoTokenizer.from_pretrained("roberta-base")

collator = Collator(tokenizer)

In [64]:
## Load the pretrained tokenizer only when the kernel does not already hold one,
## so re-running this cell does not reload it.
if 'tokenizer' not in globals():
    tokenizer = AutoTokenizer.from_pretrained("roberta-base")

In [65]:
## Training split with the dataset's default audio settings; the bare name below shows its repr.
train_data = CommonVoice(
    dataset_path=datasetPATH,
    split_type='train',
    tokenizer=tokenizer,
)
train_data


    CommonVoice Dataset
    -------------------
    
    Loading train.tsv from /home/ashim/Projects/DeepSpeech/data/external/cv-corpus-8.0-2022-01-19/en directory.
        
    Number of Examples: 864448
    
    Args:
        Sampling Rate: 32000
        Output Channels: 2
    

In [263]:
class Encoder(nn.Module):
    """
    Speech encoder: a thin wrapper around `torchaudio.models.Conformer`.

    All constructor arguments are forwarded unchanged to the Conformer.
    """

    def __init__(self, input_dim: int = 128, num_heads: int = 4, ffn_dim: int = 128, num_layers: int = 4, depthwise_conv_kernel_size: int = 31, dropout: float = 0.3):
        """
        Args:
            input_dim: feature size of every input frame.
            num_heads: number of attention heads per Conformer layer.
            ffn_dim: hidden size of the feed-forward modules.
            num_layers: number of Conformer layers.
            depthwise_conv_kernel_size: kernel size of the depthwise convolution.
            dropout: dropout probability.
        """

        super().__init__()

        self.model = torchaudio.models.Conformer(input_dim = input_dim,
                                                 num_heads = num_heads,
                                                 ffn_dim = ffn_dim,
                                                 num_layers = num_layers,
                                                 depthwise_conv_kernel_size = depthwise_conv_kernel_size,
                                                 dropout = dropout)

    def forward(self, x: torch.Tensor, x_len: torch.Tensor) -> torch.Tensor:
        """
        Encode a padded batch of feature sequences.

        Args:
            x: input features; the wrapped Conformer documents (batch, time, input_dim).
            x_len: number of valid frames per batch element, shape (batch,).

        Returns:
            The first output of the wrapped model (the encoded frames); the output
            lengths it also returns are discarded.
        """

        ## Call the module rather than `.forward` directly so registered hooks run.
        encoded, _ = self.model(x, x_len)

        return encoded

In [309]:
class LSTMDecoder(nn.Module):
    """
    LSTM decoder that maps every encoder frame to a vocabulary entry.

    An LSTM runs over the time axis of the encoded sequence and a linear layer
    projects each LSTM output to `output_dim` vocabulary scores.
    """

    def __init__(self, input_dim: int = 128, hidden_size: int = 256, num_layers: int = 2, bidirectional: bool = False, output_dim: int = None):
        """
        Args:
            input_dim: feature size of every encoder frame.
            hidden_size: hidden size of the LSTM.
            num_layers: number of stacked LSTM layers.
            bidirectional: run the LSTM in both time directions.
            output_dim: size of the output vocabulary (required).

        Raises:
            ValueError: if `output_dim` is not given.
        """

        super().__init__()

        if output_dim is None:
            raise ValueError("Please specify the output size of the vocab.")

        ## `bidirectional` has to reach the LSTM as well: before, only the linear layer was
        ## widened to 2 * hidden_size while the LSTM stayed unidirectional, so
        ## bidirectional=True failed with a shape mismatch in forward.
        self.model = nn.LSTM(input_size = input_dim,
                             hidden_size = hidden_size,
                             num_layers = num_layers,
                             batch_first = True,
                             bidirectional = bidirectional)

        directions = 2 if bidirectional else 1

        self.ffn = nn.Linear(in_features = hidden_size * directions, out_features = output_dim)

    def forward(self, x: torch.Tensor, return_logits: bool = False) -> torch.Tensor:
        """
        Decode a batch of encoded sequences.

        Args:
            x: encoded frames of shape (batch, time, input_dim).
            return_logits: when True return the raw vocabulary scores
                (batch, time, output_dim), e.g. for computing a loss; the default
                returns the arg-max token ids.

        Returns:
            Token ids of shape (batch, time), or logits when `return_logits` is True.
        """

        ## Run the LSTM over the whole sequence in one call. The former per-timestep loop
        ## passed 2-D slices `x[:, t, :]`, which nn.LSTM interprets as a single *unbatched*
        ## sequence, so the batch axis was processed as if it were time and the samples
        ## of a batch influenced each other.
        output, _ = self.model(x)

        logits = self.ffn(output)

        if return_logits:
            return logits

        ## softmax is monotonic, so the arg-max of the logits equals the arg-max of the probabilities.
        return logits.argmax(dim = -1)

In [310]:
## Size of the tokenizer vocabulary; used below as the decoder's output dimension.
vocab_size = tokenizer.vocab_size

In [311]:
## Both models use their constructor defaults (input_dim = 128 for each);
## the decoder emits one score per vocabulary entry.
encoder = Encoder()
decoder = LSTMDecoder(output_dim = vocab_size)

In [312]:
## Shuffled mini-batches over the training split, assembled by the custom collator.
BATCH_SIZE = 12
train_loader = DataLoader(
    dataset=train_data,
    batch_size=BATCH_SIZE,
    shuffle=True,
    collate_fn=collator,
)

In [313]:
## Push a single batch through encoder and decoder as a shape smoke test;
## the unconditional `break` stops after the first batch.
for idx, batch in enumerate(train_loader):
    waveforms = batch['waveforms']
    sentence = batch['sentences']
    melspecs = batch['melspecs']

    melspecs_lengths = batch['melspecs_lengths']
    waveforms_lengths = batch['waveforms_lengths']

    ## The collator stacks per-clip spectrograms into (batch, channel, n_mels, time), but the
    ## Conformer takes (batch, time, input_dim). Average the channels away (a simple
    ## down-mix choice) so the encoder receives a 3-D tensor.
    if melspecs.dim() == 4:
        melspecs = melspecs.mean(dim = 1)

    melspecs = torch.transpose(melspecs, -1, -2) ## (batch, n_mels, time) -> (batch, time, n_mels)

    ## Call the modules instead of `.forward` so that module hooks are honoured.
    encoded_x = encoder(melspecs, melspecs_lengths)

    decoded_y = decoder(encoded_x)

    break