<a href="https://colab.research.google.com/github/BrunoReis136/tensorflow/blob/main/TensorFlowTTSadvanced.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

<h1> TTS -  DOWNLOAD DO DATASET LJSPEECH </h1>

In [None]:
!wget https://data.keithito.com/data/speech/LJSpeech-1.1.tar.bz2
!tar xvjf LJSpeech-1.1.tar.bz2
#download LJSpeech-1.1  dataset de TTS

<h2>CONVERTER ARQUIVOS DE ÁUDIO PARA PADRÕES RATE/CHANNEL/WIDTH</h2>

In [None]:
!pip install pydub

In [70]:
from pydub import AudioSegment
import os
from tqdm import tqdm

input_dir = "LJSpeech-1.1/wavs"
output_dir = "LJSpeech-1.1/wavs16k"

for filename in tqdm(os.listdir(input_dir)):
  if filename.endswith(".wav"):
    filepath = os.path.join(input_dir, filename)
    audio = AudioSegment.from_wav(filepath)

    audio = audio.set_frame_rate(16000).set_channels(1).set_sample_width(2)

    output_path = os.path.join(output_dir, filename)
    os.makedirs(os.path.dirname(output_path), exist_ok=True)
    audio.export(output_path, format="wav")

100%|██████████| 13100/13100 [01:45<00:00, 123.70it/s]


<h2>Dataframe dos metadatas e coluna com caminho dos arquivos</h2>

In [71]:
import pandas as pd

metadata = pd.read_csv('LJSpeech-1.1/metadata.csv',sep="|", header=None)
metadata.columns = ['file_id', 'text', 'normalized_text']
metadata['wav_path'] = metadata['file_id'].apply(lambda x:f'LJSpeech-1.1/wav16k/{x}.wav')

<h2>Extrair Mel com Librosa: FUNÇÃO</h2>

In [72]:
import os
import librosa
import numpy as np
from tqdm import tqdm

def convert_wav_to_mel(input_dir,
                       output_dir,
                       sr=16000,
                       n_fft=1024,
                       hop_length=256,
                       n_mels=80,
                       power=1.0,
                       to_db=True,
                       verbose=True):

    os.makedirs(output_dir, exist_ok=True)
    file_list = [f for f in os.listdir(input_dir) if f.endswith(".wav")]
    iterator = tqdm(file_list, desc="Convertendo WAV → Mel") if verbose else file_list

    for filename in iterator:
        filepath = os.path.join(input_dir, filename)
        output_path = os.path.join(output_dir, filename.replace(".wav", ".npy"))

        y, _ = librosa.load(filepath, sr=sr)

        mel = librosa.feature.melspectrogram(
            y=y,
            sr=sr,
            n_fft=n_fft,
            hop_length=hop_length,
            n_mels=n_mels,
            power=power
        )

        if to_db:
            mel = librosa.power_to_db(mel, ref=np.max)

        np.save(output_path, mel)

<h2>Extrair Mel com Librosa: EXECUÇÃO</h2>

In [73]:
convert_wav_to_mel(
    input_dir="LJSpeech-1.1/wavs16k",
    output_dir="LJSpeech-1.1/mels"
)

Convertendo WAV → Mel: 100%|██████████| 13100/13100 [03:38<00:00, 60.04it/s]


<h2>Visualização do spectrograma (somente por diversão)</h2>

In [74]:
'''import matplotlib.pyplot as plt
import librosa.display

mel = np.load("LJSpeech-1.1/mels/LJ001-0001.npy")
plt.figure(figsize=(10, 4))
librosa.display.specshow(mel, sr=16000, hop_length=256, x_axis="time", y_axis="mel")
plt.colorbar(format="%+2.0f dB")
plt.title("Mel Spectrogram")
plt.tight_layout()
plt.show()'''

'import matplotlib.pyplot as plt\nimport librosa.display\n\nmel = np.load("LJSpeech-1.1/mels/LJ001-0001.npy")\nplt.figure(figsize=(10, 4))\nlibrosa.display.specshow(mel, sr=16000, hop_length=256, x_axis="time", y_axis="mel")\nplt.colorbar(format="%+2.0f dB")\nplt.title("Mel Spectrogram")\nplt.tight_layout()\nplt.show()'

<h2>Dados tabulares para Dataframe e Pré Processamento</h2>

In [75]:
import pandas as pd

df = pd.read_csv('LJSpeech-1.1/metadata.csv',
                       sep="|",
                       header=None,
                       names=['file_id', 'text', 'normalized_text'])

df['normalized_text'].fillna(df['text'],inplace=True)
df['normalized_text'].dropna(inplace=True)

# Incluir coluna 'mel_path'
df["mel_path"] = df["file_id"].apply(lambda x: f"LJSpeech-1.1/mels/{x}.npy")

The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  df['normalized_text'].fillna(df['text'],inplace=True)


<h2>Tokenizar textos por BPE(byte pair encoding): Função</h2>

In [76]:
import sentencepiece as spm
import os

def train_sentencepiece(
    texts,                 # lista de strings ou pd.Series com frases normalizadas
    input_txt_path='temp_text.txt',  # arquivo temporário para salvar as frases
    model_prefix='spm_model',        # prefixo do modelo e vocab
    vocab_size=200                   # tamanho do vocabulário
):
    # Salva os textos em um arquivo temporário
    with open(input_txt_path, 'w', encoding='utf-8') as f:
        for line in texts:
            f.write(line.strip() + '\n')

    # Treina o modelo SentencePiece
    spm.SentencePieceTrainer.Train(
        input=input_txt_path,
        model_prefix=model_prefix,
        vocab_size=vocab_size
    )

    # (Opcional) Remove o arquivo temporário após treino
    if os.path.exists(input_txt_path):
        os.remove(input_txt_path)

    sp = spm.SentencePieceProcessor()
    sp.load(f'{model_prefix}.model' )

    # Adiciona ao DataFrame
    df["tokens_bpe"] = texts.apply(lambda t: sp.encode(t, out_type=int))

<h2>Tokenizar textos por BPE(byte pair encoding): Aplicação</h2>

In [77]:
train_sentencepiece(
    texts=df['normalized_text'],
    input_txt_path='temp_text.txt',
    model_prefix='spm_model',
    vocab_size=700
)

<h2>Salva o dataframe como arquivo picke(.pkl)</h2>

In [78]:
df.to_pickle('LJSpeech_preprocessed.pkl')

<h2>Cria Classe Dataset para converter tokens/mels em tensores e retornar</h2>

In [79]:
import torch
from torch.utils.data import Dataset
import numpy as np

class TTSDataset(Dataset):
  def __init__(self, dataframe, pad_token=0, max_input_length=None, max_mel_length=None):
    self.df = dataframe
    self.pad_token = pad_token
    self.max_input_length = max_input_length
    self.max_mel_length = max_mel_length

  def __len__(self):
    return len(self.df)

  def __getitem__(self, idx):
    row = self.df.iloc[idx]
    tokens = row['tokens_bpe']
    mel_path = row['mel_path']
    # Carregar mel spec(.npy)
    mel = np.load(mel_path)
    #Converte em tensor
    tokens = torch.LongTensor(tokens)
    mel = torch.tensor(mel, dtype = torch.float32)

    return tokens, mel

<h2>Função de padding dinâmico ao carregar os dados no DataLoader</h2>

In [106]:
def tts_collate_fn(batch):
    input_seqs, mel_specs = zip(*batch)

    # Pad dos tokens
    input_lengths = [len(seq) for seq in input_seqs]
    max_input_length = max(input_lengths)
    input_padded = torch.zeros(len(batch), max_input_length, dtype=torch.long)

    for i, seq in enumerate(input_seqs):
        input_padded[i, :len(seq)] = seq  # ✅ CORRETA

    # Pad dos mel specs (formato = [n_mel, T])
    mel_lengths = [mel.shape[1] for mel in mel_specs]
    max_mel_len = max(mel_lengths)
    n_mels = mel_specs[0].shape[0]
    mel_padded = torch.zeros(len(batch), n_mels, max_mel_len)

    for i, mel in enumerate(mel_specs):
        mel_padded[i, :, :mel.shape[1]] = mel

    mel_padded = mel_padded.transpose(1, 2)

    print("input_ids_padded:", input_padded.shape)
    print("mel_padded:", mel_padded.shape)

    return input_padded, torch.tensor(input_lengths), mel_padded, torch.tensor(mel_lengths)

<h2>Carregando dados com DataLoader</h2>


In [81]:
from torch.utils.data import DataLoader
import pandas as pd

# Carregando Dataframe com token_bpe e mel_path
# se necessário puxar o dataframe em df = pd.read_picke('LJSpeech_preprocessed')
df = pd.read_pickle('LJSpeech_preprocessed.pkl')

dataset = TTSDataset(df)

loader = DataLoader(dataset, batch_size=16, shuffle=True, collate_fn=tts_collate_fn)

for batch in loader:
    input_ids, input_lengths, mel_specs, mel_lengths = batch
    print(input_ids.shape)      # [B, T_text]
    print(mel_specs.shape)      # [B, 80, T_mel]
    break

torch.Size([16, 59])
torch.Size([16, 80, 608])


#  Começando o modelo TACOTRON 2

<h2> Dependências</h2>

In [82]:
import torch
import torch.nn as nn
import torch.nn.functional as F

<h2> Classe Embedding + Encoder</h2>

In [83]:
class Encoder(nn.Module):
    def __init__(self, vocab_size, embedding_dim=256, encoder_dim=512):
        super().__init__()
        self.embedding = nn.Embedding(vocab_size, embedding_dim)
        self.conv1 = nn.Sequential(
            nn.Conv1d(embedding_dim, encoder_dim, kernel_size=5, padding=2),
            nn.BatchNorm1d(encoder_dim),
            nn.ReLU()
        )
        self.lstm = nn.LSTM(encoder_dim, encoder_dim // 2, batch_first=True, bidirectional=True)

    def forward(self, x):
        # x: [B, T]
        x = self.embedding(x)  # [B, T, E]
        x = x.transpose(1, 2)  # [B, E, T]
        x = self.conv1(x)      # [B, C, T]
        x = x.transpose(1, 2)  # [B, T, C]
        output, _ = self.lstm(x)
        return output  # [B, T, C]


<h2>Classe Attention</h2>

In [84]:
class Attention(nn.Module):
    def __init__(self, encoder_dim, decoder_dim):
        super().__init__()
        self.query_proj = nn.Linear(decoder_dim, decoder_dim)
        self.key_proj = nn.Linear(encoder_dim, decoder_dim)
        self.energy_proj = nn.Linear(decoder_dim, 1)

    def forward(self, query, keys):
        # query: [B, decoder_dim]
        # keys:  [B, T_enc, encoder_dim]
        q = self.query_proj(query).unsqueeze(1)  # [B, 1, D]
        k = self.key_proj(keys)                  # [B, T_enc, D]
        energy = self.energy_proj(torch.tanh(q + k))  # [B, T_enc, 1]
        weights = F.softmax(energy.squeeze(-1), dim=-1)  # [B, T_enc]
        context = torch.bmm(weights.unsqueeze(1), keys).squeeze(1)  # [B, encoder_dim]
        return context, weights


<h2>Classe Decoder</h2>

In [98]:
class Decoder(nn.Module):
    def __init__(self, encoder_dim, mel_dim=80, decoder_dim=1024):
        super().__init__()
        self.prenet = nn.Sequential(
            nn.Linear(mel_dim, 256),
            nn.ReLU(),
            nn.Dropout(0.5),
            nn.Linear(256, 256),
            nn.ReLU(),
            nn.Dropout(0.5)
        )
        self.attention = Attention(encoder_dim, decoder_dim)
        self.lstm1 = nn.LSTMCell(256 + encoder_dim, decoder_dim)
        self.linear = nn.Linear(decoder_dim, mel_dim)

    def forward(self, encoder_out, mel_inputs, teacher_forcing=True):
        B, T, _ = mel_inputs.shape
        mel_outputs = []
        attention_weights = []

        h, c = [torch.zeros(B, 1024).to(encoder_out.device)] * 2

        prev_mel = mel_inputs[:, 0, :]  # [B, mel_dim]

        for t in range(1, T):
            prenet_out = self.prenet(prev_mel)
            context, attn = self.attention(h, encoder_out)
            rnn_input = torch.cat([prenet_out, context], dim=-1)
            h, c = self.lstm1(rnn_input, (h, c))
            mel_out = self.linear(h)
            mel_outputs.append(mel_out.unsqueeze(1))
            attention_weights.append(attn.unsqueeze(1))
            prev_mel = mel_inputs[:, t, :] if teacher_forcing else mel_out

        mel_outputs = torch.cat(mel_outputs, dim=1)
        attention_weights = torch.cat(attention_weights, dim=1)
        return mel_outputs, attention_weights


<h2>Postnet (Refinamento final)</h2>

In [97]:
class Postnet(nn.Module):
    def __init__(self, mel_dim=80):
        super().__init__()
        self.conv = nn.Sequential(
            nn.Conv1d(mel_dim, 512, kernel_size=5, padding=2),
            nn.Tanh(),
            nn.Conv1d(512, mel_dim, kernel_size=5, padding=2)
        )

    def forward(self, x):
        x = x.transpose(1, 2)  # [B, 80, T]
        x = self.conv(x)
        x = x.transpose(1, 2)
        return x


<h2>Modelo Tacotron 2</h2>

In [96]:
class Tacotron2(nn.Module):
    def __init__(self, vocab_size):
        super().__init__()
        self.encoder = Encoder(vocab_size)
        self.decoder = Decoder(encoder_dim=512)
        self.postnet = Postnet()

    def forward(self, input_ids, mel_inputs, teacher_forcing=True):
        enc_out = self.encoder(input_ids)  # [B, T, 512]
        mel_outputs, attn = self.decoder(enc_out, mel_inputs, teacher_forcing)
        mel_refined = mel_outputs + self.postnet(mel_outputs)
        return mel_outputs, mel_refined, attn


<h2>Função de perda</h2>

In [88]:
def tacotron_loss(mel_outputs, mel_refined, mel_targets):
    loss1 = F.l1_loss(mel_outputs, mel_targets)
    loss2 = F.l1_loss(mel_refined, mel_targets)
    return loss1 + loss2

<h2> Função de Treino </h2>

In [108]:
def train(model, dataloader, optimizer, device, num_epochs=10):
    model.train()
    for epoch in range(num_epochs):
        total_loss = 0.0
        for batch in tqdm(dataloader, desc=f"Epoch {epoch+1}"):
            input_ids, input_lengths, mel_specs, mel_lengths = batch

            input_ids = input_ids.to(device)
            mel_specs = mel_specs.to(device)

            # Forward
            mel_out, mel_postnet, _ = model(input_ids, mel_specs)

            # ✅ Ajuste de tamanhos
            min_len = min(mel_out.shape[1], mel_specs.shape[1], mel_postnet.shape[1])
            mel_out = mel_out[:, :min_len, :]
            mel_postnet = mel_postnet[:, :min_len, :]
            mel_specs = mel_specs[:, :min_len, :]

            # Loss
            loss = tacotron_loss(mel_out, mel_postnet, mel_specs)

            # Backprop
            optimizer.zero_grad()
            loss.backward()
            optimizer.step()

            total_loss += loss.item()

        avg_loss = total_loss / len(dataloader)
        print(f"Epoch {epoch+1} - Loss: {avg_loss:.4f}")


<h2> Treinamento do modelo </h2>

In [107]:
# Imports
import torch
import torch.nn.functional as F
from torch.utils.data import DataLoader
from torch import optim

sp = spm.SentencePieceProcessor()
sp.load("spm_model.model")

# Device
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# Modelo
model = Tacotron2(vocab_size=len(sp))  # sp = seu SentencePieceProcessor
model = model.to(device)

# Otimizador
optimizer = optim.Adam(model.parameters(), lr=1e-3)

# DataLoader
dataloader = DataLoader(
    dataset,
    batch_size=16,
    shuffle=True,
    collate_fn=tts_collate_fn  # já implementado
)

# Iniciar treino
train(model, dataloader, optimizer, device, num_epochs=10)


Epoch 1:   0%|          | 0/819 [00:00<?, ?it/s]

input_ids_padded: torch.Size([16, 62])
mel_padded: torch.Size([16, 622, 80])


  loss1 = F.l1_loss(mel_outputs, mel_targets)
Epoch 1:   0%|          | 0/819 [00:30<?, ?it/s]


RuntimeError: The size of tensor a (621) must match the size of tensor b (622) at non-singleton dimension 1