# TimbreNet2 Train

In [1]:
import os
import random
import numpy as np
import tensorflow as tf
from lib.model import TimbreNet_Model
from lib.import_audio_int import import_audio
from lib.specgrams_helper import SpecgramsHelper

# Expose only the CUDA device with index 1 to this process.
os.environ["CUDA_VISIBLE_DEVICES"] = "1"

# Last expression of the cell: displays whether TensorFlow can see a GPU.
# NOTE(review): newer TensorFlow releases deprecate this call in favour of
# tf.config.list_physical_devices('GPU') - confirm against the installed version.
tf.test.is_gpu_available()

True

In [2]:
# Run parameters
RUN_ID = 'ID_0001'
RUN_FOLDER = './run/{}'.format(RUN_ID)

# os.makedirs(..., exist_ok=True) creates the missing './run' parent on a
# fresh checkout and also restores a sub-folder that is absent from an
# already existing run folder; a bare os.mkdir fails in the first case and
# the original existence guard skipped the second.
for sub_folder in ('weights', 'dataset_list'):
    os.makedirs(os.path.join(RUN_FOLDER, sub_folder), exist_ok=True)

# 'build' saves a freshly constructed model into RUN_FOLDER; any other value
# (e.g. 'load') makes the architecture cell load weights from RUN_FOLDER.
mode = 'build'  # 'load'

# Data

In [3]:
# Data-pipeline settings.
SEED = 21           # seed passed to the shuffled file listing
TRAIN_SPLIT = 0.95  # NOTE(review): not referenced in the visible cells - confirm it is still needed
BATCH_SIZE = 10     # number of examples per batch of the training dataset

# Waveform -> mel-spectrogram converter.
# 64000 samples at 16000 Hz correspond to 4 seconds of audio per clip.
spec_helper = SpecgramsHelper(
    audio_length=64000,
    spec_shape=(128, 1024),
    overlap=0.75,
    sample_rate=16000,
    mel_downscale=1,
)

In [4]:
def pre_process(path, mag_scale=13.82, if_scale=1.00001):
    """Load one WAV file and turn it into a normalised 2-channel mel spectrogram.

    The file is decoded, reshaped to a single-item batch of 64000 samples and
    passed through ``spec_helper.waves_to_melspecgrams``. The two channels on
    the last axis of the result are divided by their respective scale factor,
    reshaped to ``[128, 1024]`` and stacked back together on the last axis.

    Args:
        path: Path of the WAV file to read (a string tensor when called
            through ``tf.data.Dataset.map``).
        mag_scale: Divisor applied to channel 0. Defaults to the original
            hard-coded 13.82 (presumably ~ -ln(1e-6), normalising the log
            magnitude - TODO confirm).
        if_scale: Divisor applied to channel 1. Defaults to the original
            hard-coded 1.00001.

    Returns:
        A ``(mel, mel)`` tuple holding the same ``[128, 1024, 2]`` tensor
        twice, so the dataset yields it as both input and target.
    """
    # decode_wav returns (audio, sample_rate); only the samples are needed.
    waveform = tf.audio.decode_wav(tf.io.read_file(path)).audio
    # The reshape only succeeds for files with exactly 64000 samples in total.
    waveform = tf.reshape(waveform, [1, 64000, 1])

    mel = spec_helper.waves_to_melspecgrams(waveform)

    # The original sliced the leading axis with 0:43200 (the dataset size),
    # but that axis is the batch axis of the single clip loaded above; a full
    # slice selects exactly the same elements without the stray magic number.
    channel_0 = tf.reshape(mel[:, :, :, 0] / mag_scale, [128, 1024])
    channel_1 = tf.reshape(mel[:, :, :, 1] / if_scale, [128, 1024])

    mel = tf.stack([channel_0, channel_1], axis=-1)
    return mel, mel

# Select the dataset folder: list every file in it, shuffled with a fixed seed.
list_ds = tf.data.Dataset.list_files(
    './datasets/pianoTriadDataset/audio_augmented_x10/*',
    shuffle=True,
    seed=SEED,
)

# Apply the pre-processing to every file, then group the results into batches.
audio_ds = list_ds.map(pre_process)
audio_ds = audio_ds.batch(BATCH_SIZE)

# Architecture

In [5]:
# Size of the latent space passed to the model constructor.
LATENT_DIM = 2
TN_VAE = TimbreNet_Model(LATENT_DIM)

if mode == 'build':
    # Fresh run: store the newly built model in the run folder.
    TN_VAE.save(RUN_FOLDER)
else:
    # Existing run: restore the weights saved in the run folder.
    weights_path = os.path.join(RUN_FOLDER, 'weights/weights.h5')
    TN_VAE.load_weights(weights_path)

# Print the layer-by-layer summaries of both halves of the model.
TN_VAE.encoder.summary()
TN_VAE.decoder.summary()

Model: "model_1"
__________________________________________________________________________________________________
Layer (type)                    Output Shape         Param #     Connected to                     
encoder_input (InputLayer)      [(None, 128, 1024, 2 0                                            
__________________________________________________________________________________________________
encoder_conv_in (Conv2D)        (None, 128, 1024, 32 96          encoder_input[0][0]              
__________________________________________________________________________________________________
encoder_conv_0_1 (Conv2D)       (None, 128, 1024, 32 9248        encoder_conv_in[0][0]            
__________________________________________________________________________________________________
batch_normalization (BatchNorma (None, 128, 1024, 32 128         encoder_conv_0_1[0][0]           
____________________________________________________________________________________________

# Training

In [6]:
# Optimisation settings handed to TN_VAE.compile.
LEARNING_RATE = 3e-5
R_LOSS_FACTOR = 10

# Training-loop settings handed to TN_VAE.train_with_generator.
EPOCHS = 2
INITIAL_EPOCH = 0
PRINT_EVERY_N_BATCHES = 10

# TODO: fix here - the number of training examples is hard-coded and has to
# be kept in sync with the contents of the dataset folder by hand.
NUM_IMAGES = 43200

In [7]:
# Configure the model for training with the learning rate and the
# R_LOSS_FACTOR weighting (presumably the reconstruction-loss weight of the
# VAE objective - confirm against lib.model).
TN_VAE.compile(LEARNING_RATE, R_LOSS_FACTOR)

In [None]:
# steps_per_epoch must be an integer: the original true division
# (NUM_IMAGES / BATCH_SIZE) produced the float 4320.0. Ceiling division
# matches the number of batches audio_ds yields per pass, because .batch()
# keeps the final partial batch.
TN_VAE.train_with_generator(
    audio_ds,
    epochs=EPOCHS,
    steps_per_epoch=(NUM_IMAGES + BATCH_SIZE - 1) // BATCH_SIZE,
    run_folder=RUN_FOLDER,
    print_every_n_batches=PRINT_EVERY_N_BATCHES,
    initial_epoch=INITIAL_EPOCH,
)

Epoch 1/2