<a href="https://colab.research.google.com/github/ArkS0001/Tacotron2/blob/main/AudFAKE_v2.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
!pip install wget

import wget

# Download LJSpeech dataset
url = 'https://data.keithito.com/data/speech/LJSpeech-1.1.tar.bz2'
wget.download(url)

!tar xjf LJSpeech-1.1.tar.bz2


Collecting wget
  Downloading wget-3.2.zip (10 kB)
  Preparing metadata (setup.py) ... [?25l[?25hdone
Building wheels for collected packages: wget
  Building wheel for wget (setup.py) ... [?25l[?25hdone
  Created wheel for wget: filename=wget-3.2-py3-none-any.whl size=9656 sha256=d95d54a4a3832dba9c718d982bdd16280f3cbdb53deb29220a990d95aabba447
  Stored in directory: /root/.cache/pip/wheels/8b/f1/7f/5c94f0a7a505ca1c81cd1d9208ae2064675d97582078e6c769
Successfully built wget
Installing collected packages: wget
Successfully installed wget-3.2


In [2]:
import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
import librosa
import numpy as np

class Seq2Seq(nn.Module):
    def __init__(self, input_dim, hidden_dim, output_dim):
        super(Seq2Seq, self).__init__()
        self.encoder = nn.LSTM(input_dim, hidden_dim, batch_first=True)
        self.decoder = nn.LSTM(hidden_dim, hidden_dim, batch_first=True)
        self.fc = nn.Linear(hidden_dim, output_dim)

    def forward(self, x):
        _, (hidden, cell) = self.encoder(x)
        outputs, _ = self.decoder(x, (hidden, cell))
        outputs = self.fc(outputs)
        return outputs

# Hyperparameters
input_dim = 128  # Size of text embeddings
hidden_dim = 256
output_dim = 80  # Number of mel-spectrogram bins

# Instantiate the model
model = Seq2Seq(input_dim, hidden_dim, output_dim).cuda()
criterion = nn.MSELoss()
optimizer = optim.Adam(model.parameters(), lr=0.001)


In [3]:
import os
from torch.utils.data import Dataset, DataLoader
from scipy.io import wavfile

class LJSpeechDataset(Dataset):
    def __init__(self, data_dir, transform=None):
        self.data_dir = data_dir
        self.transform = transform
        self.metadata = self._load_metadata()

    def _load_metadata(self):
        metadata_path = os.path.join(self.data_dir, 'metadata.csv')
        with open(metadata_path, 'r') as f:
            metadata = [line.strip().split('|') for line in f]
        return metadata

    def __len__(self):
        return len(self.metadata)

    def __getitem__(self, idx):
        wav_path = os.path.join(self.data_dir, 'wavs', self.metadata[idx][0] + '.wav')
        _, wav = wavfile.read(wav_path)
        text = self.metadata[idx][1]
        mel_spectrogram = librosa.feature.melspectrogram(y=wav, sr=22050, n_mels=80)
        mel_spectrogram = librosa.power_to_db(mel_spectrogram, ref=np.max)

        if self.transform:
            mel_spectrogram = self.transform(mel_spectrogram)

        return torch.tensor(text_to_sequence(text, ['english_cleaners']), dtype=torch.float32), torch.tensor(mel_spectrogram, dtype=torch.float32)

# Create the dataset and data loader
dataset = LJSpeechDataset(data_dir='LJSpeech-1.1')
data_loader = DataLoader(dataset, batch_size=16, shuffle=True, collate_fn=lambda x: tuple(zip(*x)))


In [4]:
num_epochs = 50

for epoch in range(num_epochs):
    for texts, mels in data_loader:
        texts, mels = torch.nn.utils.rnn.pad_sequence(texts, batch_first=True).cuda(), torch.nn.utils.rnn.pad_sequence(mels, batch_first=True).cuda()
        optimizer.zero_grad()
        outputs = model(texts)
        loss = criterion(outputs, mels)
        loss.backward()
        optimizer.step()
    print(f'Epoch [{epoch+1}/{num_epochs}], Loss: {loss.item():.4f}')


ParameterError: Audio data must be floating-point

In [5]:
import torch
from waveglow.denoiser import Denoiser

# Load pre-trained WaveGlow model
waveglow = torch.hub.load('nvidia/DeepLearningExamples:torchhub', 'nvidia_waveglow')
waveglow = waveglow.remove_weightnorm(waveglow)
waveglow.eval().cuda()
denoiser = Denoiser(waveglow).cuda()

def mel_to_audio(mel):
    mel = torch.tensor(mel).unsqueeze(0).cuda()
    with torch.no_grad():
        audio = waveglow.infer(mel, sigma=0.666)
        audio = denoiser(audio, strength=0.01)[:, 0]
    return audio.cpu().numpy()

# Example usage
text = "Hello, how are you?"
text_seq = torch.tensor(text_to_sequence(text, ['english_cleaners']), dtype=torch.float32).unsqueeze(0).cuda()
with torch.no_grad():
    mel = model(text_seq).cpu().numpy()
audio = mel_to_audio(mel)

# Save the synthesized audio to a file
sf.write('synthesized_output.wav', audio, 22050)

# Playback the audio
ipd.Audio('synthesized_output.wav')


ModuleNotFoundError: No module named 'waveglow'