In [1]:
    # -*- coding: utf-8 -*-
#/usr/bin/python2
'''
By kyubyong park. kbpark.linguist@gmail.com. 
https://www.github.com/kyubyong/dc_tts
'''

from __future__ import print_function, division
from hyperparams import Hyperparams as hp
from modules import *
import tensorflow as tf
import numpy as np
from tqdm import tqdm
from data_load import get_batch, load_vocab, load_data
from networks import TextEnc, AudioEnc, AudioDec, Attention, SSRN
from utils import *
import sys
import pickle
import time

In [2]:
def embed(inputs, vocab_size, num_units, zero_pad=True):
    '''Embeds a given tensor. 
    
    Args:
      inputs: A `Tensor` with type `int32` or `int64` containing the ids
         to be looked up in `lookup table`.
      vocab_size: An int. Vocabulary size.
      num_units: An int. Number of embedding hidden units.
      zero_pad: A boolean. If True, all the values of the fist row (id 0)
        should be constant zeros.
      scope: Optional scope for `variable_scope`.  
      reuse: Boolean, whether to reuse the weights of a previous layer
        by the same name.
        
    Returns:
      A `Tensor` with one more rank than inputs's. The last dimensionality
        should be `num_units`.
    '''
    initializer = tf.keras.initializers.TruncatedNormal(mean=0., stddev=1.)
    outputs = tf.keras.layers.Embedding(vocab_size, num_units, input_length=10,embeddings_initializer = initializer,mask_zero = True)(inputs)

    return outputs

In [3]:
def normalize(inputs):
    '''Applies layer normalization that normalizes along the last axis.

    Args:
      inputs: A tensor with 2 or more dimensions, where the first dimension has
        `batch_size`. The normalization is over the last dimension.
      scope: Optional scope for `variable_scope`.
      reuse: Boolean, whether to reuse the weights of a previous layer
        by the same name.

    Returns:
      A tensor with the same shape and data dtype as `inputs`.
    '''
    outputs = tf.keras.layers.BatchNormalization(axis=-1,momentum=0.99,epsilon=0.001,center=True,scale=True)(inputs)
    return outputs

In [4]:
def highwaynet(inputs, num_units=None, scope="highwaynet", reuse=None):
    '''Highway networks, see https://arxiv.org/abs/1505.00387

    Args:
      inputs: A 3D tensor of shape [N, T, W].
      num_units: An int or `None`. Specifies the number of units in the highway layer
             or uses the input size if `None`.
      scope: Optional scope for `variable_scope`.
      reuse: Boolean, whether to reuse the weights of a previous layer
        by the same name.

    Returns:
      A 3D tensor of shape [N, T, W].
    '''
    if not num_units:
        num_units = inputs.get_shape()[-1]

   
    H = tf.keras.layers.Dense(units=num_units, activation='relu', name="dense1")(inputs)
    T = tf.keras.layers.Dense(units=num_units, activation='sigmoid',
                        bias_initializer = tf.constant_initializer(-1.0), name="dense2")(inputs)
    outputs = H * T + inputs * (1 - T)
    return outputs

In [5]:
def conv1d(inputs,
           filters=None,
           size=1,
           rate=1,
           padding="SAME",
           dropout_rate=0,
           use_bias=True,
           activation_fn=None):
    '''
    Args:
      inputs: A 3-D tensor with shape of [batch, time, depth].
      filters: An int. Number of outputs (=activation maps)
      size: An int. Filter size.
      rate: An int. Dilation rate.
      padding: Either `same` or `valid` or `causal` (case-insensitive).
      dropout_rate: A float of [0, 1].
      use_bias: A boolean.
      activation_fn: A string.
      training: A boolean. If True, dropout is applied.
      scope: Optional scope for `variable_scope`.
      reuse: Boolean, whether to reuse the weights of a previous layer
        by the same name.

    Returns:
      A masked tensor of the same shape and dtypes as `inputs`.
    '''
    
    if padding.lower() == "causal":
        # pre-padding for causality
        pad_len = (size - 1) * rate  # padding size
        inputs = tf.pad(inputs, [[0, 0], [pad_len, 0], [0, 0]])
        padding = "valid"
    
    print(inputs)
    if filters is None:
        filters = inputs.get_shape().as_list()[-1]

    params = {"filters": filters, "kernel_size": size,
              "dilation_rate": rate, "padding": padding, "use_bias": use_bias,
              "kernel_initializer": 'glorot_uniform'}

    tensor = tf.keras.layers.Conv1D(**params)(inputs)
    tensor = normalize(tensor)
    if activation_fn is not None:
        tensor = activation_fn(tensor)
    

    tensor = tf.keras.layers.Dropout(rate=dropout_rate)(tensor)

    return tensor

In [6]:
def hc(inputs,
       filters=None,
       size=1,
       rate=1,
       padding="SAME",
       dropout_rate=0,
       use_bias=True,
       activation_fn=None):
    '''
    Args:
      inputs: A 3-D tensor with shape of [batch, time, depth].
      filters: An int. Number of outputs (=activation maps)
      size: An int. Filter size.
      rate: An int. Dilation rate.
      padding: Either `same` or `valid` or `causal` (case-insensitive).
      use_bias: A boolean.
      activation_fn: A string.
      training: A boolean. If True, dropout is applied.
      scope: Optional scope for `variable_scope`.
      reuse: Boolean, whether to reuse the weights of a previous layer
        by the same name.

    Returns:
      A masked tensor of the same shape and dtypes as `inputs`.
    '''
    _inputs = inputs
   
    if padding.lower() == "causal":
        # pre-padding for causality
        pad_len = (size - 1) * rate  # padding size
        inputs = tf.pad(tensor=inputs, paddings=[[0, 0], [pad_len, 0], [0, 0]])
        padding = "valid"

    if filters is None:
        filters = inputs.get_shape().as_list()[-1]


    params = {"filters": 2*filters, "kernel_size": size,
              "dilation_rate": rate, "padding": padding, "use_bias": use_bias,
              "kernel_initializer": 'glorot_uniform'}

    tensor =  tf.keras.layers.Conv1D(**params)(inputs)
    H1, H2 = tf.split(tensor, 2, axis=-1)
    H1 = normalize(H1)
    H2 = normalize(H2)
    H1 = tf.nn.sigmoid(H1, "gate")
    H2 = activation_fn(H2, "info") if activation_fn is not None else H2
    tensor = H1*H2 + (1.-H1)*_inputs

    tensor = tf.keras.layers.Dropout(rate=dropout_rate)(tensor)

    return tensor

In [7]:
def conv1d_transpose(inputs,
                     filters=None,
                     size=3,
                     stride=2,
                     padding='same',
                     dropout_rate=0,
                     use_bias=True,
                     activation=None):
    '''
        Args:
          inputs: A 3-D tensor with shape of [batch, time, depth].
          filters: An int. Number of outputs (=activation maps)
          size: An int. Filter size.
          rate: An int. Dilation rate.
          padding: Either `same` or `valid` or `causal` (case-insensitive).
          dropout_rate: A float of [0, 1].
          use_bias: A boolean.
          activation_fn: A string.
          training: A boolean. If True, dropout is applied.
          scope: Optional scope for `variable_scope`.
          reuse: Boolean, whether to reuse the weights of a previous layer
            by the same name.

        Returns:
          A tensor of the shape with [batch, time*2, depth].
        '''
    
    if filters is None:
        filters = inputs.get_shape().as_list()[-1]
    inputs = tf.expand_dims(inputs, 1)
    tensor = tf.keras.layers.Conv2DTranspose(filters=filters,
                               kernel_size=(1, size),
                               strides=(1, stride),
                               padding=padding,
                               use_bias=use_bias,
                               activation=None,
                               kernel_initializer='glorot_uniform')(inputs)
    tensor = tf.squeeze(tensor, 1)
    tensor = normalize(tensor)
    if activation is not None:
        tensor = activation(tensor)

    tensor = tf.keras.layers.Dropout(rate=dropout_rate)(tensor)

    return tensor

In [11]:
def TextEnc(L):
    '''
    Args:
      L: Text inputs. (B, N)

    Return:
        K: Keys. (B, N, d)
        V: Values. (B, N, d)
    '''
    i = 1
    tensor = embed(L,
                   vocab_size=len(hp.vocab),
                   num_units=hp.e
                   ); i += 1
    tensor = conv1d(tensor,
                    filters=2*hp.d,
                    size=1,
                    rate=1,
                    dropout_rate=hp.dropout_rate,
                    activation_fn=tf.nn.relu
                    ); i += 1
    tensor = conv1d(tensor,
                    size=1,
                    rate=1,
                    dropout_rate=hp.dropout_rate
                    ); i += 1

    for _ in range(2):
        for j in range(4):
            tensor = hc(tensor,
                            size=3,
                            rate=3**j,
                            dropout_rate=hp.dropout_rate,
                            activation_fn=None
                            ); i += 1
    for _ in range(2):
        tensor = hc(tensor,
                        size=3,
                        rate=1,
                        dropout_rate=hp.dropout_rate,
                        activation_fn=None
                               ); i += 1

    for _ in range(2):
        tensor = hc(tensor,
                        size=1,
                        rate=1,
                        dropout_rate=hp.dropout_rate,
                        activation_fn=None
                        ); i += 1

    K, V = tf.split(tensor, 2, -1)
    return K, V


### Testing

In [None]:
for i in sorted_with_len_2000.take(1):
    L,mels,mags,fnames,text_len,num_batch = i
TextEnc(L)

In [None]:
rank_2 = tf.constant([[1, 2],
             [3, 4],
             [5, 6]], dtype=tf.float32)

rank_3_tensor = tf.constant([
  [[0, 1, 2, 3, 4],
   [5, 6, 7, 8, 9]],
  [[10, 11, 12, 13, 14],
   [15, 16, 17, 18, 19]],
  [[20, 21, 22, 23, 24],
   [25, 26, 27, 28, 29]]],dtype = tf.float32)

rank_4_tensorf = tf.constant([[[[1],
         [2]],

        [[3],
         [4]]]], dtype = tf.float32)

X = np.asarray([[1, 2], [3, 4]])
X.reshape(1,2,2,1)

In [9]:
conv1d_transpose(rank_3_tensor,
                 size = 1,
                 stride = 1,
                 dropout_rate=hp.dropout_rate)

<tf.Tensor: shape=(3, 2, 5), dtype=float32, numpy=
array([[[ -2.6060991 ,   3.5536134 ,   2.3781004 ,  -5.6829653 ,
          -0.67413896],
        [ -8.718557  ,   8.101439  ,  12.403968  , -14.943922  ,
          -3.0228567 ]],

       [[-14.831017  ,  12.649266  ,  22.429834  , -24.204878  ,
          -5.371574  ],
        [-20.943472  ,  17.197092  ,  32.455704  , -33.465836  ,
          -7.7202916 ]],

       [[-27.055933  ,  21.74492   ,  42.48157   , -42.72679   ,
         -10.06901   ],
        [-33.16839   ,  26.292746  ,  52.50744   , -51.987747  ,
         -12.417727  ]]], dtype=float32)>

In [5]:
tf.keras.layers.Conv2DTranspose(
    filters = 1, kernel_size = (1,1), strides=(1, 1), padding='same', 
    activation=None, use_bias=True,
    kernel_initializer='glorot_uniform'
)(rank_4_tensorf)

<tf.Tensor: shape=(1, 2, 2, 1), dtype=float32, numpy=
array([[[[-0.71179974],
         [-1.4235995 ]],

        [[-2.1353993 ],
         [-2.847199  ]]]], dtype=float32)>

In [27]:
hc(rank_3_tensor,
       filters=None,
       size=1,
       rate=1,
       padding="SAME",
       dropout_rate=0,
       use_bias=True,
       activation_fn=None)

<tf.Tensor: shape=(3, 2, 5), dtype=float32, numpy=
array([[[  0.77673054,   2.0795434 ,   0.38490015,  -3.396175  ,
           1.7059772 ],
        [  4.3645577 ,   7.3039794 ,   0.8794489 ,  -8.212064  ,
           5.450384  ]],

       [[  9.80634   ,  12.350378  ,   1.5717201 , -12.696313  ,
           9.593467  ],
        [ 14.964674  ,  17.394682  ,   2.4607725 , -17.18036   ,
          13.7371235 ]],

       [[ 19.994486  ,  22.438965  ,   3.5453234 , -21.66441   ,
          17.880783  ],
        [ 24.999203  ,  27.483248  ,   4.82378   , -26.148455  ,
          22.02444   ]]], dtype=float32)>

In [21]:
conv1d(rank_3_tensor,
        filters=hp.d,
        size=1,
        rate=1,
        padding="CAUSAL",
        dropout_rate=hp.dropout_rate,
        activation_fn=tf.nn.relu)

tf.Tensor(
[[[ 0.  1.  2.  3.  4.]
  [ 5.  6.  7.  8.  9.]]

 [[10. 11. 12. 13. 14.]
  [15. 16. 17. 18. 19.]]

 [[20. 21. 22. 23. 24.]
  [25. 26. 27. 28. 29.]]], shape=(3, 2, 5), dtype=float32)


<tf.Tensor: shape=(3, 2, 256), dtype=float32, numpy=
array([[[0.        , 0.00751375, 0.6594122 , ..., 0.        ,
         0.        , 0.01869379],
        [0.        , 0.        , 0.5120433 , ..., 0.        ,
         0.        , 0.13660263]],

       [[0.        , 0.        , 0.36467442, ..., 0.        ,
         0.        , 0.25451145],
        [0.        , 0.        , 0.21730527, ..., 0.        ,
         0.        , 0.37242022]],

       [[0.        , 0.        , 0.0699366 , ..., 0.        ,
         0.        , 0.49032924],
        [0.        , 0.        , 0.        , ..., 0.        ,
         0.        , 0.6082379 ]]], dtype=float32)>

In [None]:
char2idx, idx2char = load_vocab()
fpaths, text_lengths, texts = load_data()

In [None]:
L, mels, mags, fnames, num_batch = get_batch()

In [None]:
num_batch = len(fnames)//hp.B

In [None]:
data_text = [[L[i],mels[i],mags[i], fnames[i],len(L[i]),num_batch] for i in range(len(L))]

In [None]:
data_text[0]

In [None]:
data_text.sort(key = lambda x : x[4])

In [None]:
sorted_with_len = [(sent[0],sent[1],sent[2],sent[3],sent[4],sent[5]) for sent in data_text if sent[4] > 10]

In [None]:
sorted_with_len[13099]

In [None]:
sorted_with_len_13099 = []
for i in range(12000,13100):
    sorted_with_len_13099.append(sorted_with_len[i])
with open('data/sorted_with_len_13099','wb') as f:
    pickle.dump(sorted_with_len_13099,f)

In [None]:
start_time = time.time()
with open('data/sorted_with_len_2000','rb') as f:
    sorted_with_len_2000 = pickle.load(f)
with open('data/sorted_with_len_4000','rb') as f:
    sorted_with_len_4000 = pickle.load(f)
with open('data/sorted_with_len_6000','rb') as f:
    sorted_with_len_6000 = pickle.load(f)
with open('data/sorted_with_len_8000','rb') as f:
    sorted_with_len_8000 = pickle.load(f)
with open('data/sorted_with_len_10000','rb') as f:
    sorted_with_len_10000 = pickle.load(f)
with open('data/sorted_with_len_12000','rb') as f:
    sorted_with_len_12000 = pickle.load(f)
with open('data/sorted_with_len_13099','rb') as f:
    sorted_with_len_13099 = pickle.load(f)
print("--- %s seconds ---" % (time.time() - start_time))

In [None]:
sorted_with_len = sorted_with_len_2000+sorted_with_len_4000+sorted_with_len_6000+sorted_with_len_8000+sorted_with_len_10000+sorted_with_len_12000+sorted_with_len_13099

In [None]:
start_time = time.time()
all_dataset = tf.data.Dataset.from_generator(lambda : sorted_with_len,
                                            output_types = (tf.int32, tf.float32,tf.float32,tf.string, tf.int32,tf.int32))
pad_batch = all_dataset.padded_batch(32, padded_shapes=((None,),(None,hp.n_mels),(None,hp.n_fft//2+1),(),(),()))
print("--- %s seconds ---" % (time.time() - start_time))