testing

In [1]:
import torch
print(torch.__version__)


2.5.0+cu118


In [4]:
import torch
print(torch.cuda.is_available())
print(torch.cuda.current_device())
print(torch.cuda.get_device_name(0))



True
0
NVIDIA GeForce GTX 1660 Ti


In [6]:
import torch
import torch.optim as optim
from torch.utils.data import DataLoader
from tacotron2.hparams import create_hparams
from tacotron2.model import Tacotron2
from tacotron2.data_utils import TextMelLoader, TextMelCollate

import torch.nn.functional as F

# Create hyperparameters
hparams = create_hparams()

# Initialize Tacotron2 model
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
model = Tacotron2(hparams).to(device)

# Load pre-trained weights if available (optional)
# Uncomment and set the path to load weights
# checkpoint = torch.load('tacotron2_statedict.pt', map_location=device)
# model.load_state_dict(checkpoint)

# Define optimizer, loss function, and hyperparameters
optimizer = optim.Adam(model.parameters(), lr=0.0001)
criterion = torch.nn.L1Loss()  # L1 loss is typically better for TTS tasks

# Prepare the dataset and DataLoader
train_dataset = TextMelLoader('train_data.csv', hparams)  # Ensure the dataset is implemented properly
collate_fn = TextMelCollate(hparams.n_frames_per_step)  # Ensure this is implemented correctly
train_loader = DataLoader(train_dataset, batch_size=hparams.batch_size, shuffle=True, collate_fn=collate_fn)

# Training loop
epochs = hparams.epochs  # Use the epochs defined in hparams

for epoch in range(epochs):
    model.train()
    running_loss = 0.0
    
    for i,(phoneme_batch, mel_batch, gate_targets) in enumerate(train_loader):
        phoneme_batch = phoneme_batch.to(device)
        mel_batch = mel_batch.to(device)
        gate_targets = gate_targets.to(device)  # Ensure gate_targets are passed from the DataLoader
        
        # Forward pass through the Tacotron2 model
        outputs, mel_outputs_postnet, gate_outputs, alignments = model(phoneme_batch, mel_batch)
        
        # Loss calculation: use postnet outputs and L1 loss for mel-spectrogram prediction
        mel_loss = criterion(mel_outputs_postnet, mel_batch)
        gate_loss = F.binary_cross_entropy_with_logits(gate_outputs, gate_targets)
        
        loss = mel_loss + gate_loss
        
        # Backpropagation and optimization
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()

        running_loss += loss.item()

    avg_loss = running_loss / len(train_loader)
    print(f"Epoch {epoch + 1}/{epochs}, Loss: {avg_loss:.4f}")

# Save the model after training
torch.save(model.state_dict(), 'tacotron2_model_trained.pth')

# Testing (inference)
model.eval()
with torch.no_grad():
    # Example phoneme sequence for testing
    sample_phoneme_sequence = torch.tensor([1, 2, 3, 4, 5, 6]).unsqueeze(0).to(device)  # Adjust based on your phoneme encoding
    
    # Tacotron2 inference method for generating mel spectrograms
    generated_mel, alignments = model.inference(sample_phoneme_sequence)
    print("Generated mel spectrogram shape:", generated_mel.shape)


ValueError: File format b'\x93NUM' not understood. Only 'RIFF', 'RIFX', and 'RF64' supported.

In [3]:
import torch
print(torch.__version__)
print(torch.cuda.is_available())
print(torch.version.cuda)


2.5.0+cu118
True
11.8
