In [1]:
import torch
import torchaudio
from torch.utils.data import Dataset

import numpy as np
import os # for file path manipulation

import csv # for reading tsv files


# custom dataset class
class SpeechDataset(Dataset):
    """Dataset over one or more Common Voice style TSV metadata files.

    Each TSV is expected to sit next to a ``clips`` directory holding the
    audio files named in its ``path`` column. Only the requested ``columns``
    are kept for every row. When ``'path'`` is one of them, the value is
    rewritten to ``<tsv dir>/clips/<path>`` and the decoded (and, if needed,
    resampled) audio tensor is appended as the last element of each sample.

    Args:
        tsvs: iterable of TSV file paths. ``None`` (default) means no files.
        sample_rate: target sample rate; audio loaded at another rate is
            resampled to this one.
        transform: optional callable applied to the finished sample list.
        columns: TSV column names to keep, in order. ``None`` (default)
            means ``['path']``.

    Raises:
        OSError: if a TSV file cannot be opened.
        KeyError: if a requested column is missing from a TSV row.
    """

    def __init__(self, tsvs=None, sample_rate=16000, transform=None, columns=None):
        # None sentinels replace the former mutable list defaults, which were
        # shared between every instance created without explicit arguments.
        self.tsvs = list(tsvs) if tsvs is not None else []
        self.sample_rate = sample_rate
        self.transform = transform
        self.columns = list(columns) if columns is not None else ['path']
        self.data = []

        # load metadata
        self._load_metadata()

    def _load_metadata(self):
        """Read every TSV, keep the requested columns and shuffle the rows."""
        rows = []
        # position of the path column is the same for every row, so look it up once
        path_idx = self.columns.index('path') if 'path' in self.columns else None

        for tsv in self.tsvs:
            clips_dir = os.path.join(os.path.dirname(tsv), 'clips')

            # commonvoice columns:
            # client_id	path	sentence	up_votes	down_votes	age	gender	accents	variant	locale	segment
            #
            # Explicit UTF-8 so non-ASCII sentences decode the same way on
            # every platform; newline='' is what the csv module asks for.
            with open(tsv, 'r', encoding='utf-8', newline='') as f:
                reader = csv.DictReader(f, delimiter='\t')
                for row in reader:
                    record = [row[col] for col in self.columns]
                    if path_idx is not None:
                        # point the path at the clip inside the sibling clips directory
                        record[path_idx] = os.path.join(clips_dir, record[path_idx])
                    rows.append(record)

        # shuffle data (in place, driven by numpy's global RNG)
        np.random.shuffle(rows)
        self.data = rows

    def get_column_names(self):
        """Return the names of the elements of a sample, in order.

        If ``'path'`` is included, the last element is the audio tensor that
        ``__getitem__`` loads, reported here as ``'audio'``.
        """
        if 'path' in self.columns:
            return self.columns + ['audio']
        return list(self.columns)

    def __len__(self):
        """Number of metadata rows loaded from all TSV files."""
        return len(self.data)

    def __getitem__(self, idx):
        """Return the sample at ``idx`` as a new list.

        The stored metadata row is copied before the audio is appended, so
        repeated access to the same index always yields a sample of the same
        length and the dataset does not retain decoded audio.
        """
        if torch.is_tensor(idx):
            idx = idx.tolist()

        # copy: appending to the stored row would grow it on every access
        sample = list(self.data[idx])

        # load audio (if path is in sample)
        if 'path' in self.columns:
            audio, sample_rate = torchaudio.load(sample[self.columns.index('path')])

            # resample audio if necessary
            if sample_rate != self.sample_rate:
                resampler = torchaudio.transforms.Resample(sample_rate, self.sample_rate)
                audio = resampler(audio)

            # add audio to sample
            sample.append(audio)

        # apply transform if necessary
        if self.transform:
            sample = self.transform(sample)

        return sample


In [2]:
import random

from IPython.display import Audio

# Build the TSV paths with os.path.join so they resolve on any OS
# (on Windows this yields exactly the backslash-separated paths used before).
CORPUS_DIR = os.path.join('commonvoice', 'cv-corpus-16.0-delta-2023-12-06')
LOCALES = ['en', 'de', 'ja']

dataset = SpeechDataset(
    tsvs=[os.path.join(CORPUS_DIR, locale, 'validated.tsv') for locale in LOCALES],
    columns=['path', 'sentence'],
)

print('Dataset length:', len(dataset))
print('Dataset columns:', dataset.get_column_names())

# pick a random sample; randrange excludes len(dataset), whereas
# randint(0, len(dataset)) could return an out-of-range index
sample = dataset[random.randrange(len(dataset))]
print('Sample:', sample)

# get audio from sample (last element when 'path' is among the columns)
audio = sample[-1]

# Play the audio using IPython's Audio widget, at the rate the dataset resampled to
audio_widget = Audio(data=audio.numpy(), rate=dataset.sample_rate)
display(audio_widget)

FileNotFoundError: [Errno 2] No such file or directory: 'commonvoice\\cv-corpus-16.0-delta-2023-12-06\\de\\validated.tsv'

In [None]:

# baseline models
import torch.nn as nn

# Default sample rate; used as the default value of BaselineEmbedder's `sample_rate` argument.
SAMPLE_RATE = 16000

def pad_batch(batch):
    """Zero-pad every tensor in ``batch`` on the right to the longest length.

    Length is measured along the last dimension, which is also the dimension
    ``torch.nn.functional.pad`` extends, so 1-D ``(samples,)`` tensors and
    multi-dimensional ``(..., samples)`` tensors are both handled.

    Args:
        batch: iterable of tensors that may differ in their last dimension.

    Returns:
        A list of tensors, in the input order, all sharing the same last
        dimension. An empty input gives an empty list.
    """
    # materialise once so one-shot iterables survive the two passes below
    batch = list(batch)
    if not batch:
        return []

    max_length = max(sample.shape[-1] for sample in batch)
    # (0, n) pads nothing on the left and n zeros on the right of the last dim
    return [
        torch.nn.functional.pad(sample, (0, max_length - sample.shape[-1]))
        for sample in batch
    ]

class BaselineEmbedder(nn.Module):
    """Baseline embedder: a 3-layer LSTM run directly over raw audio samples.

    Every audio sample is fed to the LSTM as a one-feature timestep, and the
    top layer's output at the final timestep is returned as the embedding.

    Args:
        sample_rate: stored on the module; not used by ``forward``.
        embedding_dim: LSTM hidden size, which is also the embedding width.
    """

    def __init__(self, sample_rate = SAMPLE_RATE, embedding_dim=32):
        super().__init__()
        self.sample_rate = sample_rate
        self.embedding_dim = embedding_dim

        # stacked LSTM consuming one scalar per timestep
        self.lstm = nn.LSTM(
            input_size=1,
            hidden_size=embedding_dim,
            num_layers=3,
            batch_first=True,
        )

    def forward(self, x):
        """Embed a batch of audio clips.

        Args:
            x: tensor of shape (batch_size, samples); clips are padded to the
                longest sequence in the batch.

        Returns:
            Tensor of shape (batch_size, embedding_dim).
        """
        # (batch_size, samples) -> (batch_size, samples, 1)
        timesteps = x.unsqueeze(2)

        # nn.LSTM returns (outputs, (h_n, c_n)); only the outputs are needed
        outputs, _ = self.lstm(timesteps)

        # output at the final timestep
        # NOTE(review): for clips shorter than the batch maximum this timestep
        # lies in the padded region - confirm that is acceptable for a baseline.
        final_step = outputs[:, -1, :]
        return final_step.reshape(-1, self.embedding_dim)
    


In [None]:
baseline = BaselineEmbedder()
print(baseline)

# Draw 16 random clips. randrange excludes len(dataset), whereas
# randint(0, len(dataset)) could return an out-of-range index.
clips = [dataset[random.randrange(len(dataset))][-1] for _ in range(16)]
# keep the last row of each audio tensor so every clip is a 1-D waveform
waveforms = [clip[-1] for clip in clips]
# pad to the longest clip and stack into a single (batch, samples) tensor
batch = torch.stack(pad_batch(waveforms))

print('Input shape:', batch.shape)

# get embeddings
embeddings = baseline(batch)
print('Embeddings shape:', embeddings.shape)


BaselineEmbedder(
  (lstm): LSTM(1, 32, num_layers=3, batch_first=True)
)
Input shape: torch.Size([16, 122112])
Embeddings shape: torch.Size([16, 32])
