Tacotron 2 from NVIDA


In [3]:
# Clone the Tacotron2 repository 
import os
if not os.path.exists("Tacotron2"):
    !git clone https://github.com/NVIDIA/tacotron2.git

# Install requirements if needed (this may include additional dependencies)
!pip install -r Tacotron2/requirements.txt


Collecting dllogger (from -r Tacotron2/requirements.txt (line 7))
  Cloning https://github.com/NVIDIA/dllogger (to revision v0.1.0) to /tmp/pip-install-lebf6wcd/dllogger_b92ef5a75e7e4306b909a499054a2436
  Running command git clone --filter=blob:none --quiet https://github.com/NVIDIA/dllogger /tmp/pip-install-lebf6wcd/dllogger_b92ef5a75e7e4306b909a499054a2436
  Running command git checkout -q 26a0f8f1958de2c0c460925ff6102a4d2486d6cc
  Resolved https://github.com/NVIDIA/dllogger to commit 26a0f8f1958de2c0c460925ff6102a4d2486d6cc
  Installing build dependencies ... [?25ldone
[?25h  Getting requirements to build wheel ... [?25ldone
[?25h  Preparing metadata (pyproject.toml) ... [?25ldone


LJ Speech dataset


In [5]:
import os

dataset_url = "https://data.keithito.com/data/speech/LJSpeech-1.1.tar.bz2"
dataset_tar = "LJSpeech-1.1.tar.bz2"
dataset_dir = "LJSpeech-1.1"

# Download dataset if not already downloaded
# if not os.path.exists(dataset_tar):
#     !wget --no-check-certificate {dataset_url}

# Extract dataset if not already extracted
if not os.path.exists(dataset_dir):
    !tar -xjf {dataset_tar}

print("Dataset ready:", os.listdir(dataset_dir))

Dataset ready: ['wavs', 'metadata.csv', 'README']


In [6]:
import sys
from os.path import abspath, dirname

# Add the common directory to PATH so that tacotron2_common modules can be found
current_dir = dirname(abspath("__file__"))
common_dir = abspath(current_dir + '/Tacotron2/tacotron2_common')
if common_dir not in sys.path:
    sys.path.insert(0, common_dir)

# Import model components
try:
    from Tacotron2.tacotron2.model import Tacotron2 as Tacotron2Model
    from Tacotron2.tacotron2.model import LocationLayer, Attention, Prenet, Postnet, Encoder, Decoder
except ImportError as e:
    print("ImportError:", e)
    # Optionally adjust the path if needed:
    # sys.path.append(abspath("Tacotron2"))
    # from tacotron2.model import Tacotron2 as Tacotron2Model

# Import PyTorch and other dependencies
import torch
from torch import nn
from torch import optim
import numpy as np


In [7]:

# Instantiate the model with dummy hyperparameters for testing
# Note: Replace these with appropriate hyperparameters for your setup.
dummy_hparams = {
    "mask_padding": True,
    "n_mel_channels": 80,
    "n_symbols": 148,  # This should correspond to the actual vocabulary size
    "symbols_embedding_dim": 512,
    "encoder_kernel_size": 5,
    "encoder_n_convolutions": 3,
    "encoder_embedding_dim": 512,
    "attention_rnn_dim": 256,
    "attention_dim": 128,
    "attention_location_n_filters": 32,
    "attention_location_kernel_size": 31,
    "n_frames_per_step": 1,
    "decoder_rnn_dim": 256,
    "prenet_dim": 256,
    "max_decoder_steps": 1000,
    "gate_threshold": 0.5,
    "p_attention_dropout": 0.1,
    "p_decoder_dropout": 0.1,
    "postnet_embedding_dim": 512,
    "postnet_kernel_size": 5,
    "postnet_n_convolutions": 5,
    "decoder_no_early_stopping": False
}

model_instance = Tacotron2Model(
    mask_padding=dummy_hparams["mask_padding"],
    n_mel_channels=dummy_hparams["n_mel_channels"],
    n_symbols=dummy_hparams["n_symbols"],
    symbols_embedding_dim=dummy_hparams["symbols_embedding_dim"],
    encoder_kernel_size=dummy_hparams["encoder_kernel_size"],
    encoder_n_convolutions=dummy_hparams["encoder_n_convolutions"],
    encoder_embedding_dim=dummy_hparams["encoder_embedding_dim"],
    attention_rnn_dim=dummy_hparams["attention_rnn_dim"],
    attention_dim=dummy_hparams["attention_dim"],
    attention_location_n_filters=dummy_hparams["attention_location_n_filters"],
    attention_location_kernel_size=dummy_hparams["attention_location_kernel_size"],
    n_frames_per_step=dummy_hparams["n_frames_per_step"],
    decoder_rnn_dim=dummy_hparams["decoder_rnn_dim"],
    prenet_dim=dummy_hparams["prenet_dim"],
    max_decoder_steps=dummy_hparams["max_decoder_steps"],
    gate_threshold=dummy_hparams["gate_threshold"],
    p_attention_dropout=dummy_hparams["p_attention_dropout"],
    p_decoder_dropout=dummy_hparams["p_decoder_dropout"],
    postnet_embedding_dim=dummy_hparams["postnet_embedding_dim"],
    postnet_kernel_size=dummy_hparams["postnet_kernel_size"],
    postnet_n_convolutions=dummy_hparams["postnet_n_convolutions"],
    decoder_no_early_stopping=dummy_hparams["decoder_no_early_stopping"]
)

print(model_instance)


Tacotron2(
  (embedding): Embedding(148, 512)
  (encoder): Encoder(
    (convolutions): ModuleList(
      (0-2): 3 x Sequential(
        (0): ConvNorm(
          (conv): Conv1d(512, 512, kernel_size=(5,), stride=(1,), padding=(2,))
        )
        (1): BatchNorm1d(512, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
      )
    )
    (lstm): LSTM(512, 256, batch_first=True, bidirectional=True)
  )
  (decoder): Decoder(
    (prenet): Prenet(
      (layers): ModuleList(
        (0): LinearNorm(
          (linear_layer): Linear(in_features=80, out_features=256, bias=False)
        )
        (1): LinearNorm(
          (linear_layer): Linear(in_features=256, out_features=256, bias=False)
        )
      )
    )
    (attention_rnn): LSTMCell(768, 256)
    (attention_layer): Attention(
      (query_layer): LinearNorm(
        (linear_layer): Linear(in_features=256, out_features=128, bias=False)
      )
      (memory_layer): LinearNorm(
        (linear_layer): Linear(in_featu

Dataset and Dataloader set up


In [None]:
import sys
import os
import csv
import torch
import numpy as np
import librosa  # for audio loading
from torch.utils.data import Dataset, DataLoader
import torch.nn.functional as F
from torch.nn.utils.rnn import pad_sequence

# Set the path to your dataset
DATASET_DIR = "./LJSpeech-1.1/"
METADATA_PATH = os.path.join(DATASET_DIR, "metadata.csv")
WAVS_DIR = os.path.join(DATASET_DIR, "wavs/")

# Define a simple character vocabulary.
# You can adjust the characters based on your needs.
characters = "abcdefghijklmnopqrstuvwxyz '!,?."
# Create a mapping from character to an index.
# Reserve 0 for padding.
char2idx = {ch: idx+1 for idx, ch in enumerate(characters)}

def text_to_sequence(text):
    """
    Convert text into a sequence of indices using a character-based mapping.
    Only characters in the vocabulary are used; all others are ignored.
    """
    text = text.lower()
    sequence = [char2idx.get(ch, 0) for ch in text if ch in char2idx]
    return sequence

# Custom function for computing mel spectrograms.
# Replace this dummy implementation with your actual mel computation function.
def compute_mel(wav_path, sr=22050, n_mel_channels=80):
    # Load the audio file with librosa.
    y, _ = librosa.load(wav_path, sr=sr)
    # For demonstration: create a dummy mel spectrogram with random values.
    # In practice, implement your real mel spectrogram computation here.
    T = np.random.randint(200, 400)  # random time dimension between 200 and 400 frames.
    mel = np.random.rand(n_mel_channels, T).astype(np.float32)
    return mel

# Define the LJ Speech Dataset class.
class LJSpeechDataset(Dataset):
    def __init__(self, metadata_path, wavs_dir, max_samples=None):
        self.wavs_dir = wavs_dir
        self.samples = []
        # Read metadata.csv where each line is: ID|normalized text|original text.
        with open(metadata_path, 'r', encoding='utf-8') as f:
            reader = csv.reader(f, delimiter='|')
            for row in reader:
                if len(row) >= 2:
                    file_id = row[0].strip()
                    text = row[1].strip()
                    wav_path = os.path.join(wavs_dir, file_id + ".wav")
                    self.samples.append((file_id, text, wav_path))
                    if max_samples is not None and len(self.samples) >= max_samples:
                        break

    def __len__(self):
        return len(self.samples)

    def __getitem__(self, index):
        file_id, text, wav_path = self.samples[index]
        # Tokenize the text using our character-level tokenizer.
        sequence = text_to_sequence(text)
        text_tensor = torch.LongTensor(sequence)
        text_length = torch.LongTensor([len(sequence)])
        
        # Compute the mel spectrogram using the custom function.
        mel = compute_mel(wav_path)
        mel_tensor = torch.FloatTensor(mel)  # shape: (n_mel_channels, T)
        mel_length = torch.LongTensor([mel_tensor.shape[1]])
        
        # Create a dummy gate signal.
        # In a full implementation, the gate target indicates the end of the sequence.
        gate_tensor = torch.FloatTensor([0])
        
        return {
            'text': text_tensor,
            'text_length': text_length,
            'mel': mel_tensor,
            'mel_length': mel_length,
            'gate': gate_tensor
        }

# Define the collate function to pad sequences in a batch.
def lj_collate_fn(batch):
    """
    Custom collate function for the LJSpeech dataset.

    Each sample in the batch is a dictionary with keys:
      - 'text': 1D LongTensor of tokenized text.
      - 'text_length': 1D LongTensor containing length of the text.
      - 'mel': 2D FloatTensor of shape (n_mel_channels, T) (variable T).
      - 'mel_length': 1D LongTensor containing number of frames in mel spectrogram.
      - 'gate': 1D FloatTensor (dummy signal; last frame 1, others 0).

    This function:
      1. Sorts the batch by text length in descending order.
      2. Pads the text sequences into one tensor.
      3. Pads the mel spectrograms along the time dimension.
      4. Creates a padded tensor for gate signals.
    """
    # Sort batch by text length (descending order)
    batch.sort(key=lambda x: x['text'].size(0), reverse=True)

    # Extract the fields
    texts = [sample['text'] for sample in batch]
    text_lengths = torch.tensor([sample['text'].size(0) for sample in batch], dtype=torch.long)

    mels = [sample['mel'].transpose(0, 1) for sample in batch]  # transpose so shape becomes (T, n_mel_channels)
    mel_lengths = torch.tensor([mel.size(0) for mel in mels], dtype=torch.long)
    n_mel_channels = batch[0]['mel'].size(0)

    # Pad text sequences (padding value 0 for padding token)
    texts_padded = pad_sequence(texts, batch_first=True, padding_value=0)

    # Pad mel spectrograms; note that these are variable in time dimension (T)
    # We'll pad along the time axis so that each spectrogram becomes (T_max, n_mel_channels)
    mels_padded = pad_sequence(mels, batch_first=True, padding_value=0)  # shape: (batch, T_max, n_mel_channels)
    mels_padded = mels_padded.transpose(1, 2)  # shape: (batch, n_mel_channels, T_max)

    # Create padded gate signals.
    # For demonstration: all zeros, with the last valid frame set to 1.
    max_mel_len = mels_padded.size(2)
    gates = []
    for sample in batch:
        mel_len = sample['mel'].size(1)
        # Create a gate tensor: zeros for frames 0...mel_len-2, and 1 for the final valid frame.
        gate = torch.zeros(mel_len)
        if mel_len > 0:
            gate[-1] = 1
        gates.append(gate)
    gates_padded = pad_sequence(gates, batch_first=True, padding_value=0)

    return {
        'text': texts_padded,          # LongTensor with shape (batch, max_text_length)
        'text_length': text_lengths,     # LongTensor with shape (batch)
        'mel': mels_padded,              # FloatTensor with shape (batch, n_mel_channels, max_mel_length)
        'mel_length': mel_lengths,       # LongTensor with shape (batch)
        'gate': gates_padded             # FloatTensor with shape (batch, max_mel_length)
    }

# Instantiate the dataset and dataloader.
dataset = LJSpeechDataset(METADATA_PATH, WAVS_DIR, max_samples=100)
dataloader = DataLoader(dataset, batch_size=4, shuffle=True, collate_fn=lj_collate_fn)

# Check one batch to verify the setup.
batch = next(iter(dataloader))
print("Batch keys:", batch.keys())
print("Text batch shape:", batch['text'].shape)
print("Mel batch shape:", batch['mel'].shape)
print("Gate batch shape:", batch['gate'].shape)
print(batch['mel_length'])


Batch keys: dict_keys(['text', 'text_length', 'mel', 'mel_length', 'gate'])
Text batch shape: torch.Size([4, 137])
Mel batch shape: torch.Size([4, 80, 315])
Gate batch shape: torch.Size([4, 315])
tensor([236, 256, 315, 298])
