<a href="https://colab.research.google.com/github/BrunoReis136/tensorflow/blob/main/TensorFlowTTS.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

<h1> TTS -  DOWNLOAD DO DATASET LJSPEECH </h1>

In [None]:
!wget https://data.keithito.com/data/speech/LJSpeech-1.1.tar.bz2
!tar xvjf LJSpeech-1.1.tar.bz2
#download LJSpeech-1.1  dataset de TTS

<h1>TTS - Funções utilizadas</h1>

In [22]:
import tensorflow as tf
import librosa
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import librosa.display
from tensorflow.keras import models, layers
from tensorflow.keras.preprocessing.sequence import pad_sequences
import ast

#--------------------------Áudio para spectrograma
def load_mel(file_path):
  y, sr = librosa.load(file_path, sr=22050)
  mel = librosa.feature.melspectrogram(y=y, sr=sr, n_fft=1024, hop_length=256, n_mels=80)
  mel_db = librosa.power_to_db(mel, ref=np.max)
  return mel_db

#--------------------------Texto para token
def tokenize_text(text):
    return [symbol_to_id[c] for c in text if c in symbol_to_id]

#--------------------------Texto para spectrograma
def text_to_mel(text, model, max_len=200):
    # Tokenizar e fazer padding
    tokens = tokenize_text(text)
    token_input = pad_sequences([tokens], maxlen=max_len, padding='post')

    # Prever mel spectrogram
    pred_mel = model.predict(token_input)

    # Remover batch dimension
    mel = pred_mel[0].T  # Transpor para shape (n_mels, time)
    return mel

#--------------------------Spectrograma para audio
def mel_to_audio(mel_spec, sr=22050):
    # Desfaz o dB
    mel_spec = librosa.db_to_power(mel_spec)

    # Reconstrução com Griffin-Lim
    audio = librosa.feature.inverse.mel_to_audio(
        mel_spec, sr=sr, n_fft=1024, hop_length=256, n_iter=60
    )
    return audio

#---------------------------Formação do arquivo WAV
def generate_audio_from_text(text, model, output_path='output.wav'):
    mel = text_to_mel(text, model)
    audio = mel_to_audio(mel)
    sf.write(output_path, audio, samplerate=22050)
    print(f"Áudio salvo em: {output_path}")
    return audio

<h2>TTS - PREPARAÇÃO DOS DADOS </h2>



In [None]:
import tensorflow as tf
import librosa
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import librosa.display
from IPython.display import Audio
from tensorflow.keras import models, layers
from tensorflow.keras.preprocessing.sequence import pad_sequences
import ast

#==============Dataset LJSpeech em dataframe Pandas=================

df = pd.read_csv('LJSpeech-1.1/metadata.csv',
                       sep="|",
                       header=None,
                       names=['file_id', 'text', 'normalized_text'])

audios_path = []

for x in df['file_id']:
  audios_path.append(f'LJSpeech-1.1/wavs/{x}.wav')

df['audio_path'] = audios_path


#===============Função para texto em spectrogram==================
symbols = [
    '_',  # padding
    ' ',  # espaço
    '!', "'",'"', '(', ')', ',', '-', '.', ':', ';', '?',
    'A', 'B', 'C', 'D', 'E', 'F', 'G', 'H', 'I', 'J', 'K', 'L', 'M',
    'N', 'O', 'P', 'Q', 'R', 'S', 'T', 'U', 'V', 'W', 'X', 'Y', 'Z',
    'a', 'b', 'c', 'd', 'e', 'f', 'g', 'h', 'i', 'j', 'k', 'l', 'm',
    'n', 'o', 'p', 'q', 'r', 's', 't', 'u', 'v', 'w', 'x', 'y', 'z'
]
#Variáveis necessárias:
symbol_to_id = {s: i for i, s in enumerate(symbols)}
id_to_symbol = {i: s for i, s in enumerate(symbols)}
vocab_size = len(symbols)

#===================Normalizar dados do dataframe(texto>spectro / áudio>spectro)==============

df['mel_spec'] = df['audio_path'].apply(load_mel)
df['tokenized_text'] = df['normalized_text'].apply(lambda x:tokenize_text(str(x)))

#Padding dos dados em variáveis de lista
x_text = pad_sequences(df['tokenized_text'].tolist(), padding='post')

max_len = max([mel.shape[1] for mel in df['mel_spec']])
y_mel = np.array([
    np.pad(mel, ((0, 0), (0, max_len - mel.shape[1])), mode='constant', constant_values=-80.0).T
    for mel in df['mel_spec']
])

#=================Salvar os dados em disco (poupar RAM para o treinamento)======================
np.save('x_text.npy', x_text)
np.save('y_mel.npy', y_mel)
df.to_pickle('df.pkl')

<h1> TSS - CONSTRUÇÃO MODELO E TREINAMENTO

In [None]:
import tensorflow as tf
import librosa
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import librosa.display
from IPython.display import Audio
from tensorflow.keras import models, layers
from tensorflow.keras.preprocessing.sequence import pad_sequences
import ast
#====================Recuperar os dados de treinamento=====================
x_text = np.load('x_text.npy')
y_mel = np.load('y_mel.npy')
df = pd.read_pickle('df.pkl')

symbols = [
    '_',  # padding
    ' ',  # espaço
    '!', "'",'"', '(', ')', ',', '-', '.', ':', ';', '?',
    'A', 'B', 'C', 'D', 'E', 'F', 'G', 'H', 'I', 'J', 'K', 'L', 'M',
    'N', 'O', 'P', 'Q', 'R', 'S', 'T', 'U', 'V', 'W', 'X', 'Y', 'Z',
    'a', 'b', 'c', 'd', 'e', 'f', 'g', 'h', 'i', 'j', 'k', 'l', 'm',
    'n', 'o', 'p', 'q', 'r', 's', 't', 'u', 'v', 'w', 'x', 'y', 'z'
]

vocab_size = len(symbols)

#====================Construção do modelo==================================
embedding_dim = 256
encoder_units = 512
decoder_units = 1024
mel_dim = df['mel_spec'][0].shape[0]

def build_tts_encoder_decoder():
    #Entrada, embedding
    text_input = layers.Input(shape=(None,), dtype='int32', name='text_input')
    embed = layers.Embedding(input_dim = vocab_size, output_dim=embedding_dim)(text_input)

    #Encoder bidirecional com memória reforçada(LSTM)
    encoder_outputs = layers.Bidirectional(layers.LSTM(encoder_units, return_sequences=True))(embed)

    #Attention(reforço do encoder em si mesmo)
    attention = layers.Attention()([encoder_outputs, encoder_outputs])

    #Decoder para gerar spectrogramas memória reforçada (LSTM)
    decoder_lstm = layers.LSTM(decoder_units, return_sequences=True)(attention)
    mel_output = layers.TimeDistributed(layers.Dense(mel_dim))(decoder_lstm)

    #Montagem do modelo final para retornar
    model = models.Model(inputs=text_input, outputs=mel_output)
    return model

#===========================Aplicação e treinamento do modelo=================================
model = build_tts_encoder_decoder()
model.compile(optimizer='adam', loss='mse')
model.fit(x_text, y_mel, batch_size=32, epochs=10, validation_split=0.1)

#===========================Salvando o modelo=============================
model.save('tts_model.h5')

<h1> TTS - TESTE DE UTILIZAÇÃO DO MODELO </h1>

In [None]:
from IPython.display import Audio

text = input("Digite o texto: ")
generate_audio_from_text(text, model, output_path='audio_gerado.wav')
Audio("audio_gerado.wav")

<h3> Opcionais </h3>

In [23]:
#=============Dataframe para excel (opcional para inspeção)===============
#df.to_excel('LJSpeech.xlsx', index=False)

#==============Trecho opcional para executar áudio e exibir spectrogram============

mel_db = load_mel(audio_path)

# AudioDisplay and Plot the Mel spectrogram
n = 10
file_name = df.iloc[n]['file_id']
print(df.iloc[n]['text'])

audio_path = f'LJSpeech-1.1/wavs/{file_name}.wav'

display(Audio(filename=audio_path))

plt.figure(figsize=(10, 4))

librosa.display.specshow(mel_db.T, sr=22050, hop_length=256, x_axis='time', y_axis='mel')

plt.colorbar(format='%+2.0f dB')
plt.title('Mel Spectrogram (dB)')
plt.tight_layout()
plt.show()



NameError: name 'audio_path' is not defined

<h1>NOVA ABORDAGEM</h1>

<h2>CONVERTER ARQUIVOS DE ÁUDIO PARA PADRÕES RATE/CHANNEL/WIDTH</h2>

In [None]:
!pip install pydub

In [9]:
from pydub import AudioSegment
import os
from tqdm import tqdm

input_dir = "LJSpeech-1.1/wavs"
output_dir = "LJSpeech-1.1/wavs16k"

os.makedirs(os.path.dirname(output_path), exist_ok=True)

for filename in tqdm(os.listdir(input_dir)):
  if filename.endswith(".wav"):
    filepath = os.path.join(input_dir, filename)
    audio = AudioSegment.from_wav(filepath)

    audio = audio.set_frame_rate(16000).set_channels(1).set_sample_width(2)

    output_path = os.path.join(output_dir, filename)
    audio.export(output_path, format="wav")

100%|██████████| 13100/13100 [01:13<00:00, 177.34it/s]


<h2>Testando o formato dos áudios convertidos</h2>

In [None]:
import wave

with wave.open("LJSpeech-1.1/wavs16k/LJ001-0001.wav", "rb") as wav_file:
    print("Frequência:", wav_file.getframerate())
    print("Canais:", wav_file.getnchannels())
    print("Sample width:", wav_file.getsampwidth())  # Deve ser 2 (16-bit)

<h2>Dataframe dos metadatas e coluna com caminho dos arquivos</h2>

In [None]:
import pandas as pd

metadata = pd.read_csv('LJSpeech-1.1/metadata.csv',sep="|", header=None)
metadata.columns = ['file_id', 'text', 'normalized_text']
metadata['wav_path'] = metadata['file_id'].apply(lambda x:f'LJSpeech-1.1/wav16k/{x}.wav')

<h2>Extrair Mel com Librosa</h2>

In [None]:
import librosa
import numpy as np

def wav_to_mel(wav_path, sr=16000, n_mels=80, hot_lenght=256, win_lenght=1024):
  y, _=
