Tacotron 2 from NVIDIA


In [2]:
# Clone the Tacotron2 repository 
import os
if not os.path.exists("Tacotron2"):
    !git clone https://github.com/NVIDIA/tacotron2.git

# Install requirements if needed (this may include additional dependencies)
!pip install -r Tacotron2/requirements.txt


Collecting dllogger (from -r Tacotron2/requirements.txt (line 7))
  Cloning https://github.com/NVIDIA/dllogger (to revision v0.1.0) to /tmp/pip-install-z1ipp8ty/dllogger_e3abc3b8a37e4fdaae00f1df4f60ebaa
  Running command git clone --filter=blob:none --quiet https://github.com/NVIDIA/dllogger /tmp/pip-install-z1ipp8ty/dllogger_e3abc3b8a37e4fdaae00f1df4f60ebaa
  Running command git checkout -q 26a0f8f1958de2c0c460925ff6102a4d2486d6cc
  Resolved https://github.com/NVIDIA/dllogger to commit 26a0f8f1958de2c0c460925ff6102a4d2486d6cc
  Installing build dependencies ... [?25ldone
[?25h  Getting requirements to build wheel ... [?25ldone
[?25h  Preparing metadata (pyproject.toml) ... [?25ldone
Collecting inflect (from -r Tacotron2/requirements.txt (line 3))
  Downloading inflect-7.5.0-py3-none-any.whl.metadata (24 kB)
Collecting librosa (from -r Tacotron2/requirements.txt (line 4))
  Downloading librosa-0.11.0-py3-none-any.whl.metadata (8.7 kB)
Collecting resampy==0.3.1 (from -r Tacotron2/r

LJ Speech dataset


In [3]:
import os

dataset_url = "https://data.keithito.com/data/speech/LJSpeech-1.1.tar.bz2"
dataset_tar = "LJSpeech-1.1.tar.bz2"
dataset_dir = "LJSpeech-1.1"

# Download dataset if not already downloaded
if not os.path.exists(dataset_tar):
    !wget --no-check-certificate {dataset_url}

# Extract dataset if not already extracted
if not os.path.exists(dataset_dir):
    !tar -xjf {dataset_tar}

print("Dataset ready:", os.listdir(dataset_dir))

--2025-04-12 15:08:37--  https://data.keithito.com/data/speech/LJSpeech-1.1.tar.bz2
Resolving data.keithito.com (data.keithito.com)... 185.93.1.245, 2400:52e0:1a00::1207:2
Connecting to data.keithito.com (data.keithito.com)|185.93.1.245|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 2748572632 (2.6G) [text/plain]
Saving to: ‘LJSpeech-1.1.tar.bz2’


2025-04-12 15:09:28 (51.7 MB/s) - ‘LJSpeech-1.1.tar.bz2’ saved [2748572632/2748572632]

Dataset ready: ['wavs', 'metadata.csv', 'README']


In [12]:
import sys
from os.path import abspath, dirname

# Notebooks have no `__file__`, so paths are resolved against the kernel's
# current working directory (this is what the previous
# `dirname(abspath("__file__"))` evaluated to as well).
current_dir = abspath(".")
repo_dir = abspath(current_dir + '/Tacotron2')
common_dir = abspath(repo_dir + '/tacotron2_common')

# Keep the helper directory itself on sys.path so its modules stay importable
# by their bare names, as before.
if common_dir not in sys.path:
    sys.path.insert(0, common_dir)

# Package-qualified imports such as `tacotron2_common.<module>` need the
# *parent* of `tacotron2_common` on sys.path. Appended (not inserted first) so
# that files in the repository root cannot shadow installed packages.
if repo_dir not in sys.path:
    sys.path.append(repo_dir)

# Import model components.
# A failed import is re-raised: the following cells cannot work without
# `Tacotron2Model`, and printing-and-continuing would only surface later as a
# confusing NameError.
try:
    from Tacotron2.tacotron2.model import Tacotron2 as Tacotron2Model
    from Tacotron2.tacotron2.model import LocationLayer, Attention, Prenet, Postnet, Encoder, Decoder
except ImportError as e:
    raise ImportError(
        "Could not import the Tacotron 2 model from ./Tacotron2. Make sure the "
        "repository is present in the current working directory "
        f"({current_dir}) and that its requirements are installed."
    ) from e

# Import PyTorch and other dependencies
import torch
from torch import nn
from torch import optim
import numpy as np


In [13]:

# Build a Tacotron 2 model from placeholder hyperparameters (smoke test only).
# Note: swap these values for the ones that match your real setup.
# Every key below is spelled exactly like the constructor keyword it feeds,
# so the whole mapping can be unpacked straight into the constructor.
dummy_hparams = dict(
    mask_padding=True,
    n_mel_channels=80,
    n_symbols=148,  # This should correspond to the actual vocabulary size
    symbols_embedding_dim=512,
    # Encoder
    encoder_kernel_size=5,
    encoder_n_convolutions=3,
    encoder_embedding_dim=512,
    # Attention
    attention_rnn_dim=256,
    attention_dim=128,
    attention_location_n_filters=32,
    attention_location_kernel_size=31,
    # Decoder
    n_frames_per_step=1,
    decoder_rnn_dim=256,
    prenet_dim=256,
    max_decoder_steps=1000,
    gate_threshold=0.5,
    p_attention_dropout=0.1,
    p_decoder_dropout=0.1,
    # Postnet
    postnet_embedding_dim=512,
    postnet_kernel_size=5,
    postnet_n_convolutions=5,
    decoder_no_early_stopping=False,
)

model_instance = Tacotron2Model(**dummy_hparams)

print(model_instance)


Tacotron2(
  (embedding): Embedding(148, 512)
  (encoder): Encoder(
    (convolutions): ModuleList(
      (0-2): 3 x Sequential(
        (0): ConvNorm(
          (conv): Conv1d(512, 512, kernel_size=(5,), stride=(1,), padding=(2,))
        )
        (1): BatchNorm1d(512, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
      )
    )
    (lstm): LSTM(512, 256, batch_first=True, bidirectional=True)
  )
  (decoder): Decoder(
    (prenet): Prenet(
      (layers): ModuleList(
        (0): LinearNorm(
          (linear_layer): Linear(in_features=80, out_features=256, bias=False)
        )
        (1): LinearNorm(
          (linear_layer): Linear(in_features=256, out_features=256, bias=False)
        )
      )
    )
    (attention_rnn): LSTMCell(768, 256)
    (attention_layer): Attention(
      (query_layer): LinearNorm(
        (linear_layer): Linear(in_features=256, out_features=128, bias=False)
      )
      (memory_layer): LinearNorm(
        (linear_layer): Linear(in_featu

Dataset and DataLoader setup


In [None]:
from torch.utils.data import Dataset, DataLoader

class DummyLJDataset(Dataset):
    """A dummy dataset for illustration. Replace with actual processing code.

    Every sample pairs the same fixed sentence with a random mel spectrogram;
    nothing is read from ``dataset_dir``.

    Each sample returns
    ``(text_tensor, text_length, mel_tensor, gate_tensor, mel_length)``.

    Args:
        dataset_dir: Path to the dataset; stored on the instance but not read.
        max_samples: Number of synthetic samples to generate.
        n_mel_channels: Number of rows of each random mel spectrogram.
        n_frames: Number of columns (frames) of each random mel spectrogram.
        seed: Optional seed. When given, the spectrograms are drawn from a
            dedicated ``np.random.RandomState(seed)`` and are reproducible.
            When ``None`` (default) NumPy's global random state is used, as in
            earlier versions of this class.
    """

    # Text shared by every synthetic sample.
    SAMPLE_TEXT = "This is a sample sentence."

    def __init__(self, dataset_dir, max_samples=100, n_mel_channels=80, n_frames=400, seed=None):
        super().__init__()
        self.dataset_dir = dataset_dir
        self.max_samples = max_samples
        self.n_mel_channels = n_mel_channels
        self.n_frames = n_frames
        # `np.random` (module) and `RandomState` both expose `.rand`, so the
        # default path keeps drawing from the global generator.
        rng = np.random if seed is None else np.random.RandomState(seed)
        # In practice, you would parse the metadata file (e.g., metadata.csv) in LJ Speech
        # and create a list of samples with file paths and corresponding text.
        self.samples = [
            (self.SAMPLE_TEXT, rng.rand(n_mel_channels, n_frames).astype(np.float32))
            for _ in range(max_samples)
        ]

    def __len__(self):
        """Return the number of synthetic samples."""
        return len(self.samples)

    def __getitem__(self, idx):
        """Return sample ``idx`` as a tuple of five tensors.

        Returns:
            text_tensor: int64, one id per character of the text.
            text_length: int64, shape ``(1,)``, number of character ids.
            mel_tensor: float32, shape ``(n_mel_channels, n_frames)``.
            gate_tensor: float32, shape ``(1,)``, a placeholder zero.
            mel_length: int64, shape ``(1,)``, number of mel frames.
        """
        text, mel = self.samples[idx]
        # Dummy tokenization: each character becomes its Unicode code point.
        # In practice, use a proper tokenizer and mapping; code points of
        # non-ASCII characters can be arbitrarily large.
        text_tensor = torch.tensor([ord(c) for c in text], dtype=torch.long)
        text_length = torch.tensor([text_tensor.numel()], dtype=torch.long)
        # Dummy mel spectrogram and gate: In practice, load precomputed features.
        # `torch.tensor` copies, so callers cannot mutate the stored array.
        mel_tensor = torch.tensor(mel, dtype=torch.float32)  # (n_mel_channels, T)
        gate_tensor = torch.zeros(1, dtype=torch.float32)  # Dummy gate signal
        mel_length = torch.tensor([mel_tensor.shape[1]], dtype=torch.long)
        return text_tensor, text_length, mel_tensor, gate_tensor, mel_length

# Build the placeholder dataset and wrap it in a loader.
def _identity_collate(batch):
    """Hand the list of samples back untouched (no padding, no stacking)."""
    return batch


dataset = DummyLJDataset(dataset_dir="LJSpeech-1.1", max_samples=100)
dataloader = DataLoader(
    dataset,
    batch_size=4,
    shuffle=True,
    collate_fn=_identity_collate,
)

first_sample = dataset[0]
print("Dummy dataset sample:", first_sample)