In [1]:
from keras.layers import Input, Lambda,RepeatVector,Dense,Reshape,Dropout
from keras.models import Model
from keras import backend as K
import keras

def repeat_vector(args):
        layer_to_repeat = args[0]
        sequence_layer = args[1]
        return RepeatVector(K.shape(sequence_layer)[1])(layer_to_repeat)
    
def concatenate_vectors(args):
    return K.concatenate([args[0],args[1]],axis=2)

Using TensorFlow backend.
  _np_qint8 = np.dtype([("qint8", np.int8, 1)])
  _np_quint8 = np.dtype([("quint8", np.uint8, 1)])
  _np_qint16 = np.dtype([("qint16", np.int16, 1)])
  _np_quint16 = np.dtype([("quint16", np.uint16, 1)])
  _np_qint32 = np.dtype([("qint32", np.int32, 1)])
  np_resource = np.dtype([("resource", np.ubyte, 1)])
  _np_qint8 = np.dtype([("qint8", np.int8, 1)])
  _np_quint8 = np.dtype([("quint8", np.uint8, 1)])
  _np_qint16 = np.dtype([("qint16", np.int16, 1)])
  _np_quint16 = np.dtype([("quint16", np.uint16, 1)])
  _np_qint32 = np.dtype([("qint32", np.int32, 1)])
  np_resource = np.dtype([("resource", np.ubyte, 1)])


In [2]:
from keras_transformer import *

import numpy as np
from keras_layer_normalization import LayerNormalization
from keras_multi_head import MultiHeadAttention
from keras_position_wise_feed_forward import FeedForward
from keras_pos_embd import TrigPosEmbedding
from keras_embed_sim import EmbeddingRet, EmbeddingSim


def get_m(token_num,
              embed_dim,
              encoder_num,
              decoder_num,
              head_num,
              hidden_dim,
              attention_activation=None,
              feed_forward_activation='relu',
              dropout_rate=0.0,
              embed_weights =None,
              embed_trainable=None,
              trainable=True,
              use_adapter=False,
              adapter_units=None,
              adapter_activation='relu'):
    """Get full model without compilation.
    :param token_num: Number of distinct tokens.
    :param embed_dim: Dimension of token embedding.
    :param encoder_num: Number of encoder components.
    :param decoder_num: Number of decoder components.
    :param head_num: Number of heads in multi-head self-attention.
    :param hidden_dim: Hidden dimension of feed forward layer.
    :param attention_activation: Activation for multi-head self-attention.
    :param feed_forward_activation: Activation for feed-forward layer.
    :param dropout_rate: Dropout rate.
    :param use_same_embed: Whether to use the same token embedding layer. `token_num`, `embed_weights` and
                           `embed_trainable` should be lists of two elements if it is False.
    :param embed_weights: Initial weights of token embedding.
    :param embed_trainable: Whether the token embedding is trainable. It will automatically set to False if the given
                            value is None when embedding weights has been provided.
    :param trainable: Whether the layers are trainable.
    :param use_adapter: Whether to use feed-forward adapters before each residual connections.
    :param adapter_units: The dimension of the first transformation in feed-forward adapter.
    :param adapter_activation: The activation after the first transformation in feed-forward adapter.
    :return: Keras model.
    """
    decoder_token_num = token_num

    decoder_embed_weights = embed_weights

    if decoder_embed_weights is not None:
        decoder_embed_weights = [decoder_embed_weights]

    decoder_embed_trainable = embed_trainable

    if decoder_embed_trainable is None:
        decoder_embed_trainable = decoder_embed_weights is None


    decoder_embed_layer = EmbeddingRet(
        input_dim=decoder_token_num,
        output_dim=embed_dim,
        mask_zero=True,
        weights=decoder_embed_weights,
        trainable=decoder_embed_trainable,
        name='Decoder-Token-Embedding',
    )
    
    vector = Input(shape=(512,), name='Vectors-Input',dtype='float32')
    positions = Input(shape=(None,10), name='Positions-Input',dtype='float32')
    vectors_repeated = Lambda(repeat_vector, output_shape=(None, 512), name='Vectors-Repeater') ([vector, positions])
    encoded_inputs_concatenated = Lambda(concatenate_vectors, output_shape=(None, 522), name='Encoded-Inputs-Concatenator') ([vectors_repeated, positions])
    encoder_input_divider1 = Dense(embed_dim*4, name='Encoder-Output-Divider-1',activation='selu')(encoded_inputs_concatenated)
    encoder_input_dropout1=Dropout(0.1, name='Encoder-Output-Dropout-1')(encoder_input_divider1)
    encoder_input_divider2 = Dense(embed_dim*2, name='Encoder-Output-Divider-2',activation='selu')(encoder_input_dropout1)
    encoder_input_dropout2=Dropout(0.1, name='Encoder-Output-Dropout-2')(encoder_input_divider2)
    encoder_input_divider3 = Dense(embed_dim, name='Encoder-Output-Divider-3',activation='selu')( encoder_input_dropout2)
    encoder_input_dropout3=Dropout(0.1, name='Encoder-Output-Dropout-3')(encoder_input_divider3)
    encoder_input_divider4 = Dense(embed_dim, name='Encoder-Output-Divider-4',activation='selu')(encoder_input_dropout3)
    
    encoded_layer = get_encoders(
        encoder_num=encoder_num,
        input_layer=encoder_input_divider4,
        head_num=head_num,
        hidden_dim=hidden_dim,
        attention_activation=attention_activation,
        feed_forward_activation=feed_forward_activation,
        dropout_rate=dropout_rate,
        trainable=trainable,
        use_adapter=use_adapter,
        adapter_units=adapter_units,
        adapter_activation=adapter_activation,
    )
    
    
    decoder_input = keras.layers.Input(shape=(None,), name='Decoder-Input') 
    decoder_embed, decoder_embed_weights = decoder_embed_layer(decoder_input)
    decoder_embed = TrigPosEmbedding(
        mode=TrigPosEmbedding.MODE_ADD,
        name='Decoder-Embedding',
    )(decoder_embed)
    decoded_layer = get_decoders(
        decoder_num=decoder_num,
        input_layer=decoder_embed,
        encoded_layer=encoded_layer,
        head_num=head_num,
        hidden_dim=hidden_dim,
        attention_activation=attention_activation,
        feed_forward_activation=feed_forward_activation,
        dropout_rate=dropout_rate,
        trainable=trainable,
        use_adapter=use_adapter,
        adapter_units=adapter_units,
        adapter_activation=adapter_activation,
    )
    dense_layer = EmbeddingSim(
        trainable=trainable,
        name='Output',
    )([decoded_layer, decoder_embed_weights])
    return keras.models.Model(inputs=[vector,positions,decoder_input], outputs=dense_layer)

In [3]:
def piecesextender(data):
    kol= np.zeros(shape=(data.shape[0]))
    for i in range(data.shape[0]):
        kol[i]=np.count_nonzero(data[i])
    kol = kol.astype(np.int)
    out = np.zeros(shape=(np.sum(kol),512))
    buf = 0
    for g in range(data.shape[0]):        
        for k in range(1,kol[g]):
            out[buf][0]=3000
            out[buf][1:513-k]+=data[g][k:]
            buf+=1
    return out, kol

def vectorsextender(data,kol):
    out = np.zeros(shape=(np.sum(kol),512))
    buf=0
    for g in range(data.shape[0]):
        for k in range(kol[g]-1):
            out[buf]+=data[g]
            buf+=1
    return out

def positionencoder(kol):
    out = np.zeros(shape=(np.sum(kol),512,10))
    buf=0
    for g in range(len(kol)):
        for k in range(kol[g]-1):
            #out[buf]+=binarycalculator(k+1)
            out[buf]+=binarycalculator(512)
            buf+=1
    return out
            
def binarycalculator(k):
    buf=np.zeros(shape=(10))
    out=np.zeros(shape=(512,10))
    for i in range(k):
        buf=numbertoarray(i+1)
        out[i]=out[i]+buf
    for i in range(512-k):
        out[i+k]=out[i+k]+buf
    return out
    
    
def numbertoarray(m):
    out = np.zeros(shape=(10))
    powers=[512,256,128,64,32,16,8,4,2,1]
    for i in range(10):
        a=m//powers[i]
        if a>=1 and a<2:
            out[i] = 1 
            m-=powers[i]
        if m == 0:
            break
    return out           

In [4]:
model = get_m(
    token_num=3003,
    embed_dim=64,
    encoder_num=3,
    decoder_num=8,
    head_num=4,
    hidden_dim=120,
    attention_activation='relu',
    feed_forward_activation='relu',
    dropout_rate=0.05,
    embed_weights=np.random.random((3003, 64)),
    use_adapter=True,
)
model.compile(
    optimizer='adagrad',
    loss='sparse_categorical_crossentropy',
)
model.summary()




Instructions for updating:
Use tf.where in 2.0, which has the same broadcast rule as np.where

Instructions for updating:
Please use `rate` instead of `keep_prob`. Rate should be set to `rate = 1 - keep_prob`.


Model: "model_1"
__________________________________________________________________________________________________
Layer (type)                    Output Shape         Param #     Connected to                     
Vectors-Input (InputLayer)      (None, 512)          0                                            
__________________________________________________________________________________________________
Positions-Input (InputLayer)    (None, None, 10)     0                                            
__________________________________________________________________________________________________
Vectors-Repeater (Lambda)       (None, None, 512)    0           Vectors-Input[0][0]              
                                                                 Positions-

In [4]:
from keras.models import load_model
model = load_model ('D:/decoder/models/m5.h5' , custom_objects = get_custom_objects ())
print(K.get_value(model.optimizer.lr))
K.set_value(model.optimizer.lr, .0002)
print(K.get_value(model.optimizer.lr))




Instructions for updating:
Use tf.where in 2.0, which has the same broadcast rule as np.where

Instructions for updating:
Please use `rate` instead of `keep_prob`. Rate should be set to `rate = 1 - keep_prob`.


0.001
0.0002


In [5]:
from scipy.ndimage.interpolation import shift
n=0
e=1
bo = True

for i in range(1000):
    vectors_train = np.load('D:/decoder/data/data3000/news_vectors'+str(i)+'.npy')
    pieces_train = np.load('D:/decoder/data/data3000/news_pieces'+str(i)+'.npy')
    shifting = np.ones(shape=(pieces_train.shape[0],512))
    pieces_train = pieces_train+shifting
    
    print('Readed')
    
    '''
    if i == 32:
        K.set_value(model.optimizer.lr, .005)
        print(K.get_value(model.optimizer.lr))
        model.save('/home/shared/decoder/models/m'+str(i)+'.h5')
        
    if i == 38:
        K.set_value(model.optimizer.lr, .002)
        print(K.get_value(model.optimizer.lr)) 
        model.save('/home/shared/decoder/models/m'+str(i)+'.h5')
        
    if i == 42:
        K.set_value(model.optimizer.lr, .001)
        print(K.get_value(model.optimizer.lr)) 
        model.save('/home/shared/decoder/models/m'+str(i)+'.h5')
        
    if i == 47:
        K.set_value(model.optimizer.lr, .0005)
        print(K.get_value(model.optimizer.lr))
        model.save('/home/shared/decoder/models/m'+str(i)+'.h5')
        bo =True
    '''
    
        
    for j in range(5):
        ptt, kol = piecesextender(pieces_train[0+j*100:(1+j)*100])
        pot = np.copy(ptt)
        pot = shift(pot, (0,-1))
        pot = pot.reshape(pot.shape[0],512,1)
        vtt = vectorsextender(vectors_train[0+j*100:(1+j)*100], kol)
        pos = positionencoder(kol)
        
        
        model.fit(
            x=[vtt,
               pos,
               ptt],
            y = pot,
            epochs=e, initial_epoch=n,
            validation_split=0.05,
            batch_size=6
        )
        n+=1
        e+=1

    if i%5==0 and bo:
        model.save('D:/decoder/models/m'+str(i)+'.h5')

Readed
Train on 5453 samples, validate on 287 samples
Epoch 1/1
Train on 4615 samples, validate on 243 samples
Epoch 2/2
Train on 4569 samples, validate on 241 samples
Epoch 3/3
Train on 5488 samples, validate on 289 samples
Epoch 4/4
Train on 4372 samples, validate on 231 samples
Epoch 5/5
Readed
Train on 5344 samples, validate on 282 samples
Epoch 6/6
Train on 5092 samples, validate on 268 samples
Epoch 7/7
Train on 5563 samples, validate on 293 samples
Epoch 8/8
Train on 4247 samples, validate on 224 samples
Epoch 9/9
Train on 5357 samples, validate on 282 samples
Epoch 10/10
Readed
Train on 5131 samples, validate on 271 samples
Epoch 11/11
Train on 5918 samples, validate on 312 samples
Epoch 12/12
Train on 4452 samples, validate on 235 samples
Epoch 13/13
Train on 4645 samples, validate on 245 samples
Epoch 14/14
Train on 2836 samples, validate on 150 samples
Epoch 15/15
Readed
Train on 2452 samples, validate on 130 samples
Epoch 16/16
Train on 2401 samples, validate on 127 samples

KeyboardInterrupt: 

In [7]:
model.save('D:/decoder/models/m4.h5')

In [None]:
'''
def get_position_encoding(
    length, hidden_size, min_timescale=1.0, max_timescale=1.0e4):
  """Return positional encoding.
  Calculates the position encoding as a mix of sine and cosine functions with
  geometrically increasing wavelengths.
  Defined and formulized in Attention is All You Need, section 3.5.
  Args:
    length: Sequence length.
    hidden_size: Size of the
    min_timescale: Minimum scale that will be applied at each position
    max_timescale: Maximum scale that will be applied at each position
  Returns:
    Tensor with shape [length, hidden_size]
  """
  # We compute the positional encoding in float32 even if the model uses
  # float16, as many of the ops used, like log and exp, are numerically unstable
  # in float16.
  position = tf.cast(tf.range(length), tf.float32)
  num_timescales = hidden_size // 2
  log_timescale_increment = (
      math.log(float(max_timescale) / float(min_timescale)) /
      (tf.cast(num_timescales, tf.float32) - 1))
  inv_timescales = min_timescale * tf.exp(
      tf.cast(tf.range(num_timescales), tf.float32) * -log_timescale_increment)
  scaled_time = tf.expand_dims(position, 1) * tf.expand_dims(inv_timescales, 0)
  signal = tf.concat([tf.sin(scaled_time), tf.cos(scaled_time)], axis=1)
  return signal
'''