# Tacotron 2 Training & Evaluation

**Goals:**
- Train Tacotron 2 on the LJ Speech dataset
- Evaluate model performance using the training loss and generated audio
- Replace the Docker workflow with native shell and Python calls

In [1]:
Install required packages
!pip install --user numpy scipy matplotlib librosa inflect pillow tqdm unidecode



## Download & Preprocess Dataset

In [2]:
# Run the project's data preparation shell script.
# presumably this is what provides the LJSpeech-1.1/ directory that the
# filelist cell further down reads — TODO confirm against scripts/prepare_dataset.sh
!bash scripts/prepare_dataset.sh

## Train Tacotron 2 Model

In [3]:
!pip install --user git+https://github.com/NVIDIA/dllogger.git

Collecting git+https://github.com/NVIDIA/dllogger.git
  Cloning https://github.com/NVIDIA/dllogger.git to /scratch/3690043.1.academic-gpu/pip-req-build-ht3lm1yu
  Running command git clone --filter=blob:none --quiet https://github.com/NVIDIA/dllogger.git /scratch/3690043.1.academic-gpu/pip-req-build-ht3lm1yu
  Resolved https://github.com/NVIDIA/dllogger.git to commit 0478734ff7be75adde8d160e04872664d1c62e5f
  Preparing metadata (setup.py) ... done

In [4]:
import os
import random

# Locations: the dataset root, its metadata file, and the folder for the filelists
data_dir = 'LJSpeech-1.1'
output_dir = 'filelists'
metadata_path = os.path.join(data_dir, 'metadata.csv')
os.makedirs(output_dir, exist_ok=True)

# Load every metadata row; line endings are kept
with open(metadata_path, mode='r', encoding='utf-8') as metadata_file:
    lines = list(metadata_file)

# Seeded in-place shuffle, then cut at 95% of the rows: the leading part is the
# training subset and the remainder is the validation subset
random.seed(42)
random.shuffle(lines)
split = int(len(lines) * 0.95)
train_lines, val_lines = lines[:split], lines[split:]

# Each filelist entry has the form: <data_dir>/wavs/LJ001-0001.wav|transcript
def format_line(line):
    """Convert one '|'-separated metadata row into a filelist entry.

    The row is stripped of surrounding whitespace and split on '|'. The first
    field is used as the clip id and the second field as the transcript; any
    further fields are ignored.

    Args:
        line: One raw row of the metadata file.

    Returns:
        "<data_dir>/wavs/<clip id>.wav|<transcript>" followed by a newline.

    Raises:
        IndexError: If the row has fewer than two '|'-separated fields.
    """
    fields = line.strip().split('|')
    audio_file = os.path.join(data_dir, 'wavs', fields[0] + '.wav')
    transcript = fields[1]
    return audio_file + '|' + transcript + '\n'

# Write both filelists explicitly as UTF-8, the same encoding used to read
# metadata.csv, so non-ASCII transcript characters do not depend on the
# platform's default text encoding.
for filelist_name, subset in (('train.txt', train_lines), ('val.txt', val_lines)):
    with open(os.path.join(output_dir, filelist_name), 'w', encoding='utf-8') as f:
        f.writelines(format_line(line) for line in subset)

print(" Generated filelists/train.txt and filelists/val.txt")


✅ Generated filelists/train.txt and filelists/val.txt


In [5]:
import os

# Ensure the "output" directory exists; an already existing directory is left untouched
os.makedirs(name="output", exist_ok=True)

In [None]:
# Launch Tacotron 2 training through train.py (adjust epochs, lr, etc. as needed).
# -m and -o appear in the recorded parameter dump as model_name and output;
# --log-file appears as log_file and --cudnn-enabled as cudnn_enabled.
# -lr / -bs are presumably learning rate and batch size — TODO confirm against train.py.
# The recorded dump also shows `seed : None` and `epochs_per_checkpoint : 50`;
# neither value is set by this command.
!python train.py \
  -m Tacotron2 \
  -o . \
  -lr 1e-3 \
  -bs 16 \
  --epochs 1500 \
  --training-files filelists/train.txt \
  --validation-files filelists/val.txt \
  --log-file output/log_tacotron2.txt \
  --cudnn-enabled

DLL 2025-04-10 16:46:40.661370 - PARAMETER output : . 
DLL 2025-04-10 16:46:40.661443 - PARAMETER dataset_path : ./ 
DLL 2025-04-10 16:46:40.661488 - PARAMETER model_name : Tacotron2 
DLL 2025-04-10 16:46:40.661513 - PARAMETER log_file : output/log_tacotron2.txt 
DLL 2025-04-10 16:46:40.661552 - PARAMETER anneal_steps : None 
DLL 2025-04-10 16:46:40.661576 - PARAMETER anneal_factor : 0.1 
DLL 2025-04-10 16:46:40.661599 - PARAMETER config_file : None 
DLL 2025-04-10 16:46:40.661629 - PARAMETER seed : None 
DLL 2025-04-10 16:46:40.661659 - PARAMETER epochs : 1500 
DLL 2025-04-10 16:46:40.661683 - PARAMETER epochs_per_checkpoint : 50 
DLL 2025-04-10 16:46:40.661704 - PARAMETER checkpoint_path :  
DLL 2025-04-10 16:46:40.661724 - PARAMETER resume_from_last : False 
DLL 2025-04-10 16:46:40.661745 - PARAMETER dynamic_loss_scaling : True 
DLL 2025-04-10 16:46:40.661764 - PARAMETER amp : False 
DLL 2025-04-10 16:46:40.661793 - PARAMETER cudnn_enabled : True 
DLL 2025-04-10 16:46:40.661812 - PA

## Train WaveGlow Vocoder

In [None]:
!python train.py -m WaveGlow -o output/ --lr 1e-4 --epochs 1000 --bs 10 --log-file output/log_waveglow.txt

## Evaluate Results

In [None]:
# Plot the training loss recorded in the DLLogger log of the Tacotron 2 run
import json
import matplotlib.pyplot as plt


def load_train_losses(log_path):
    """Collect every `train_loss` value from a DLLogger JSON-stream log.

    DLLogger's JSON stream backend writes one record per line, each prefixed
    with "DLLL " and carrying its metrics under the "data" key, so the file
    cannot be parsed with a single `json.load`. Records that keep
    `train_loss` at the top level are accepted as well.

    Args:
        log_path: Path of the log file given to train.py via --log-file.

    Returns:
        List of `train_loss` values in the order they appear in the file.
    """
    prefix = 'DLLL '
    losses = []
    with open(log_path, encoding='utf-8') as log_file:
        for raw_line in log_file:
            text = raw_line.strip()
            if text.startswith(prefix):
                text = text[len(prefix):]
            if not text:
                continue
            try:
                record = json.loads(text)
            except json.JSONDecodeError:
                # e.g. a final line cut short when training was interrupted
                continue
            if not isinstance(record, dict):
                continue
            metrics = record.get('data', record)
            if isinstance(metrics, dict) and 'train_loss' in metrics:
                losses.append(metrics['train_loss'])
    return losses


# Same file the Tacotron 2 training cell passes as --log-file
train_loss = load_train_losses('output/log_tacotron2.txt')

if train_loss:
    fig, ax = plt.subplots()
    ax.plot(train_loss)
    ax.set_title('Training Loss Curve')
    # NOTE(review): if train.py also logs train_loss per iteration, the x-axis
    # counts log records rather than epochs — confirm against train.py.
    ax.set_xlabel('Epoch')
    ax.set_ylabel('Loss')
    ax.grid(True)
    plt.show()
else:
    print('No train_loss records found in output/log_tacotron2.txt')

## Inference (Text → Audio)

In [None]:
# Synthesize one sentence and play it back in the notebook.
# The two helpers are imported from project modules that are not shown here.
from inference import synthesize_speech
from waveglow import infer_waveform
import IPython.display as ipd

text = "Hello, Please let this work."
# NOTE(review): `model` is not defined in any visible cell, so a fresh kernel
# raises NameError here — load the trained Tacotron 2 model before this line.
mel = synthesize_speech(model, text)
# presumably `mel` is a mel-spectrogram that infer_waveform converts to audio samples — TODO confirm
audio = infer_waveform(mel)
# Last expression of the cell, so the audio player is rendered as its output.
# NOTE(review): rate is the playback sample rate in Hz; 22050 presumably matches
# the training data — confirm.
ipd.Audio(audio, rate=22050)