In [None]:
# Central configuration for the notebook: every tunable value read by the cells below.
config = {
    # --- Paths ---
    "dataset_path": "",
    "training_data_dir": "training_data",
    "logs_dir": "",
    "data_dir": "",

    # --- Train / validation split ---
    "train_size": 0.1,
    "train_times": 1,
    "valid_size": 0.2,
    "valid_times": 1,

    # --- Optimisation ---
    "batch_size": 32,
    "epochs": 10,
    "learning_rate": 1e-3,

    # --- Loss weighting (forwarded as-is to the training call) ---
    "pitch_scale": 1.5,
    "no_pitch_scale": 0.75,

    "optimizer": "adam",
    "loss": "custom",
    "layca": False,

    # --- Miscellaneous ---
    "shuffle": True,
    "debug": True,
    "max_audio_time": 10.0,
    "name": "Tacotron",
    "new_model": False,
}


# Dataset

In [None]:
import os
import pandas as pd

from text_utils import *

# Locate the preprocessed training data and load its index file.
folder = os.path.join(config['dataset_path'], config['training_data_dir'])
if not os.path.exists(folder):
    raise ValueError("The data folder does'nt exist, launch preprocessing to create data")
data_df = pd.read_csv(os.path.join(folder, "train.csv"))
mel_dir = "mels"
linear_dir = "linear"
audio_dir = "audio"

# Prefix every stored file name with its sub-directory inside `folder`.
# Done one column at a time with Series.map instead of a row-by-row
# iterrows()/.at loop, which builds a Series per row and writes cell by cell.
for column, sub_dir in (
    ('mel_path', mel_dir),
    ('linear_path', linear_dir),
    ('processed_audio_path', audio_dir),
):
    data_df[column] = data_df[column].map(
        lambda file_name, sub_dir=sub_dir: os.path.join(folder, sub_dir, file_name)
    )

vocab = get_vocab()
print("Dataset length : {}".format(len(data_df)))
print("Vocab size : {}".format(len(vocab)))

# Model

In [None]:
# Load saved weights into a freshly built trainable model, then reuse two of its
# layers (index 1 and the last one) as the encoder / decoder sub-models of `tacotron`.
# NOTE(review): this cell needs `tacotron`, which the notebook only constructs in a
# later cell -- run that cell first on a fresh kernel.
fake_model = tacotron._build_trainable_model()
fake_model.load_weights("modeles/Tacotron/best_weights.h5")
# Previously assigned to `harmonie.encoder_model`; `harmonie` is never defined in this
# notebook (NameError), while the neighbouring lines all target `tacotron`.
tacotron.encoder_model = fake_model.layers[1]
tacotron.decoder_model = fake_model.layers[-1]
# NOTE(review): presumably reset so the full model is rebuilt from the replaced
# sub-models on next use -- confirm against the Tacotron class.
tacotron.model = None

In [None]:
import os
import shutil
from hparams import hparams
from tacotron_2 import Tacotron

# Make sure the 'modeles' output directory exists. makedirs(exist_ok=True) does the
# check and the creation in one call, so there is no window between "exists?" and
# "mkdir" in which another process could create the directory and make mkdir fail.
os.makedirs('modeles', exist_ok=True)

# Instantiate the model wrapper from the hyper-parameters, the vocabulary loaded in
# the dataset cell and the notebook configuration.
tacotron = Tacotron(hparams = hparams,
                    vocab  = vocab,
                    max_audio_time = config['max_audio_time'],
                    max_len_phrase = None,
                    nom   = config['name'],
                    use_custom_linear = False,
                    create_new_model = config['new_model']
                   )

In [None]:
# `harmonie` is never defined in this notebook (NameError on a fresh kernel); the model
# instance built in the previous cell is `tacotron`.
# NOTE(review): assumes the Tacotron wrapper exposes summary() -- confirm.
tacotron.summary()

# Training

In [None]:
import os
import argparse
import numpy as np
import pandas as pd

from hparams import hparams

# Keep only the rows whose 'mel_frames' value is below 900.
is_short_enough = data_df['mel_frames'] < 900
filtered_data_df = data_df[is_short_enough]

history = tacotron.train(
    filtered_data_df,

    # Train / validation split
    train_size=config['train_size'],
    train_times=config['train_times'],
    valid_size=config['valid_size'],
    valid_times=config['valid_times'],

    # Training length and optimisation
    epochs=config['epochs'],
    steps=5000,
    batch_size=config['batch_size'],
    learning_rate=config['learning_rate'],

    # Reporting intervals
    summary_step=100,
    prediction_step=250,
    evaluation_step=500,

    # Loss weighting
    pitch_scale=config['pitch_scale'],
    no_pitch_scale=config['no_pitch_scale'],

    optimizer=config['optimizer'],
    loss=config['loss'],
    layca=config['layca'],

    save_config=True,
    shuffle=config['shuffle'],
    debug=config['debug'],

    # Callback switches and their settings
    with_early_stopping=False,
    patience_stop=3,
    with_checkpoint=True,
    with_tensorboard=True,
    with_reduce_lr=True,
    reduce_factor=0.5,
    patience_reduce=1,
    monitor='val_loss',

    # Data-loading settings
    workers=4,
    max_queue_size=8,

    training_type="classic_training",
)

In [None]:
import matplotlib.pyplot as plt
# Draw whatever history['loss'].get_data() returns on a 10x10 figure.
plt.figure(figsize=(10, 10))
loss_values = history['loss'].get_data()
plt.plot(loss_values)

In [None]:
# Build the trainable model and write it to "full_model.h5".
trainable_model = tacotron._build_trainable_model()
trainable_model.save("full_model.h5")

# Prediction

In [None]:
import numpy as np
from plot_utils import *

def _range(x):
  return np.min(x), np.max(x)

# `Audio` is used at the end of this cell, but the notebook first imports it in a later
# cell; import it here so the cell also runs on a fresh kernel.
from IPython.display import Audio

# Take the text of the first `nb_audio` dataset rows as input phrases.
nb_audio = 1
phrase = [data_df.at[idx, 'text'] for idx in range(nb_audio)]

# Only the first prediction is unpacked.
mel, linear, alignments = tacotron.predict(phrase, out_dir="outputs_test", max_iter=200)[0]

plot_alignment(alignments)
plot_spectrogram(mel)
plot_spectrogram(linear)

# Print the (min, max) of each output as a quick sanity check.
print("range mel :", _range(mel))
print("range linear :", _range(linear))
print("range align :", _range(alignments))

# Convert the predicted linear output to a waveform and play it.
lin_son = tacotron.wav_from_linear(linear)
Audio(data=lin_son, rate=22050)

In [None]:
nb_audio = 2
offset = 100

# Randomly draw `nb_audio` rows among those with fewer than 500 mel frames.
# (Fixed-window alternative: .iloc[offset : offset + nb_audio])
short_rows = data_df[data_df['mel_frames'] < 500]
phrases = short_rows.sample(nb_audio)

predictions = tacotron.predict_with_target(phrases, out_dir="outputs_test")

In [None]:
from IPython.display import Audio
import audio
from plot_utils import *

# Pick one prediction and take its element at index 1 as the mel output.
num = 1
selected_prediction = predictions[num]
mel = selected_prediction[1]

plot_spectrogram(mel)

# Turn it into a waveform and play it back.
son = tacotron.wav_from_mel(mel)
Audio(son, rate=22050)

In [None]:
from IPython.display import Audio
import audio
from plot_utils import *

# Pick one prediction and take its element at index 2 as the linear output.
num = 1
selected_prediction = predictions[num]
linear = selected_prediction[2]

plot_spectrogram(linear)

# Turn it into a waveform and play it back.
son_linear = tacotron.wav_from_linear(linear)
Audio(son_linear, rate=22050)

In [None]:
import matplotlib.pyplot as plt
from IPython.display import Audio
from son import *
from hparams import hparams
import audio

# Load the array stored at the 'linear_path' of dataset row 100 and invert it to a waveform.
son = np.load(data_df.at[100, 'linear_path'])
print(son.shape)
son = audio.inv_linear_spectrogram(np.transpose(son), hparams)
print(son.shape)

plt.figure(figsize=(15,4))
offset = 0.5
t = 0.005
step = 1

# Divide by the maximum, add uniform noise drawn from [0, 0.05), then rescale so the
# maximum maps to 10000 before casting to int16.
son = son / np.max(son)
noise = np.random.random(son.shape) / 20.
son = son + noise
son = (son * 10000 / np.max(son)).astype(np.int16)

# Plot a short window of the signal as both a line and individual points.
# NOTE(review): the window bounds are computed with 44100 while playback below uses
# 22050 -- confirm which sample rate `son` actually has.
window_start = int(44100 * offset)
window_stop = int(44100 * offset + 44100 * t)
plot_son = son[window_start:window_stop]
sample_positions = range(len(plot_son))
plt.plot(sample_positions, plot_son)
plt.scatter(sample_positions, plot_son)

Audio(data=son, rate=22050 // step)
#audio.save_wav(son, "test.wav", 44100)

In [None]:
from son import *
from plot_utils import *

# Load the original recording of the first row of `phrases`, set its rate to 22050 and
# keep every second sample.
# NOTE(review): presumably a 44100 -> 22050 Hz downsample -- confirm the source rate.
first_audio_path = phrases.reset_index().at[0, 'original_audio_path']
original_sound = Son.gen_from_wav(first_audio_path)
original_sound.rate = 22050
original_sound.array = original_sound.array[::2]

# NOTE(review): `son` is whatever an earlier cell assigned last; verify it is the
# signal intended for this comparison.
mel_sound = Son(son, 22050)

fourrier_o = original_sound.temporal_fft(2048)
fourrier_p = mel_sound.temporal_fft(2048)

# Show the first 256 values of row `i` for both signals, original first.
for i in range(1):
    for label, spectrum in (("Original", fourrier_o), ("Predicted", fourrier_p)):
        print(label)
        plot(spectrum[i,:256], type_graph='bar', linewidth=5, titre=label)

# `modified_p` is the same object as `fourrier_p` (no copy is made), so the zeroing
# below also changes `fourrier_p`.
modified_p = fourrier_p
modified_p[:,512:] = 0.
new_son = build_son_from_fft(modified_p)
new_son.normalize(37000)
Audio(data=new_son.array, rate=22050)

In [None]:
import tensorflow as tf
import keras.backend as K
from keras.models import *
from keras.layers import *
import numpy as np
import pandas as pd
import threading

# Scratch lookup: prints the built-in help text for np.roll. Nothing else in the
# notebook uses the result.
help(np.roll)