In [3]:
import numpy as np
import mxnet as mx

|                     |Positional Embeddings          |     | Frame Embeddings                              |
|---------------------|-------------------------------|-----|-----------------------------------------------|
|Positioning          | Index of word                 |     |Timestamp assigned to word                     |
|Input Representation |[w_1, w_2]                     |     |[[w_1,T_1],[w_2,T_2],[<sep>,<sep>],[w_3,T_3]]  |
|Stepsize between pos |constant                       |     |variable                                       |
|Tokens per pos       | 1                             |     | 1-2                                           |



Already implemented class for positional embeddings (layers.py):

In [4]:
def get_positional_embeddings(length, depth) -> np.ndarray:
    """
    Build the fixed sinusoidal position table.

    :param length: Number of positions (rows) to encode.
    :param depth: Embedding size; depth // 2 frequency channels are generated.
    :return: Array of shape (length, 2 * (depth // 2)). All sine columns come first,
             followed by all cosine columns (the halves are concatenated, not interleaved).
    """
    # Row vector of channel indices, shape (1, depth // 2)
    channel_idx = np.arange(depth // 2).reshape((1, -1))
    # Column vector of positions, shape (length, 1)
    position_idx = np.arange(0, length).reshape((-1, 1))

    # Broadcasting yields one angle per (position, channel) pair
    angles = position_idx / np.power(10000, (2 * channel_idx) / depth)

    # Sine half and cosine half are placed side by side
    return np.concatenate((np.sin(angles), np.cos(angles)), axis=1)

class PositionalEmbeddings(mx.gluon.HybridBlock):
    """
    Takes an encoded sequence and adds sinusoidal or learned positional embeddings as in Vaswani et al, 2017 to it.

    No constants module `C` is imported in this notebook, so weight types are compared against the
    literal strings "C.FIXED_POSITIONAL_EMBEDDING", "C.FRAME_EMBEDDING_SOURCE" and
    "C.LEARNED_POSITIONAL_EMBEDDING".

    :param weight_type: type of embeddings, fixed, frame-based or learned.
    :param num_embed: Embedding size.
    :param max_seq_len: Maximum sequence length.
    :param prefix: Name prefix for symbols of this encoder.
    :param scale_up_input: If True, scales input data up by num_embed ** 0.5.
    :param scale_down_positions: If True, scales positional embeddings down by num_embed ** -0.5.
    :param weight_init: Optional initializer for learned embeddings.
    :raises ValueError: If weight_type is none of the supported strings.
    """

    def __init__(self,
                 weight_type: str = "C.FIXED_POSITIONAL_EMBEDDING",
                 num_embed: int = 100,
                 max_seq_len: int = 100,
                 prefix: str = "pos",
                 scale_up_input: bool = False,
                 scale_down_positions: bool = True,
                 weight_init=None) -> None:
        # The Block constructor must run first: it creates the name scope and the
        # parameter dictionary that are used below.
        super().__init__(prefix=prefix)
        self.weight_type = weight_type
        self.num_embed = num_embed
        self.max_seq_len = max_seq_len
        self.scale_up_input = scale_up_input
        self.scale_down_positions = scale_down_positions
        with self.name_scope():
            # Membership test instead of `== a or b`, which is always truthy
            if self.weight_type in ("C.FIXED_POSITIONAL_EMBEDDING", "C.FRAME_EMBEDDING_SOURCE"):
                pos_weight = get_positional_embeddings(length=self.max_seq_len, depth=self.num_embed)
                if self.scale_down_positions:
                    pos_weight *= self.num_embed ** -0.5
                self.weight = self.params.get_constant('weight', pos_weight)
            elif self.weight_type == "C.LEARNED_POSITIONAL_EMBEDDING":
                self.weight = self.params.get('weight', shape=(self.max_seq_len, self.num_embed), init=weight_init)
            else:
                raise ValueError("weight_type '%s' is not supported!" % self.weight_type)

    def hybrid_forward(self, F, data, steps, weight):  # pylint: disable=arguments-differ
        """
        Applies positional embeddings to input data.

        :param data: Input data. Shape: (batch, length or 1, num_embed)
        :param steps: Optional steps input. If given, shape is (batch_size or 1, seq_len,)
        :param weight: Positional embedding constant.
        :return: Data with positional embeddings added
        """
        # (length, num_embed)
        if steps is None:
            # (batch, length, num_embed)
            pos_embedding = F.slice_like(F.expand_dims(weight, axis=0), data, axes=(1,))
        else:
            # (batch_size or 1, seq_len, num_embed)
            pos_embedding = F.Embedding(steps, weight, self.max_seq_len, self.num_embed)

        # Compared against the same string used in __init__; `C` is not defined in this notebook
        if self.weight_type == "C.FIXED_POSITIONAL_EMBEDDING":
            pos_embedding = F.BlockGrad(pos_embedding)

        if self.scale_up_input:
            data = data * (self.num_embed ** 0.5)

        return F.broadcast_add(data, pos_embedding)

Class for frame embeddings:

In [39]:

class FrameEmbeddings(mx.gluon.HybridBlock):
    """
    Takes an encoded sequence with timestamps, and adds sinusoidal encoded embeddings similar to Vaswani et al. 2017,
    but uses the timestamp instead of the position.

    :param weight_type: type of embeddings; stored on the instance but not otherwise used by this class.
    :param num_embed: Embedding size.
    :param max_seq_len: Maximum sequence length.
    :param prefix: Name prefix for symbols of this encoder.
    :param scale_up_input: If True, scales input data up by num_embed ** 0.5.
    :param scale_down_positions: If True, scales frame embeddings down by num_embed ** -0.5.
    """

    def __init__(self,
                 weight_type: str = "C.FRAME_EMBEDDING_SOURCE",
                 num_embed: int = 100,
                 max_seq_len: int = 100,
                 prefix: str = "sth",
                 scale_up_input: bool = False,
                 scale_down_positions: bool = True) -> None:

        super().__init__(prefix=prefix)
        self.weight_type = weight_type
        self.num_embed = num_embed
        self.max_seq_len = max_seq_len
        self.scale_up_input = scale_up_input
        self.scale_down_positions = scale_down_positions

        with self.name_scope():
            # One sinusoidal row per frame index, registered as a constant parameter
            pos_weight = get_frame_embeddings(length=self.max_seq_len, depth=self.num_embed)
            if self.scale_down_positions:
                pos_weight *= self.num_embed ** -0.5
            self.weight = self.params.get_constant('weight', pos_weight)

    def hybrid_forward(self, F, data, steps, weight):  # pylint: disable=arguments-differ
        """
        Applies frame embeddings to input data.

        :param data: Input data. When steps is given, the code splits axis 2 into two parts and
                     squeezes two trailing singleton axes, i.e. it expects (batch, seq_len, 2, 1)
                     with the frame index in the second slot of axis 2.
        :param steps: Optional steps input. If given, shape is (batch_size or 1, seq_len,)
        :param weight: Frame embedding constant. Shape: (max_seq_len, num_embed)
        :return: Data with frame embeddings added
        """
        if steps is None:
            # (1, length, num_embed), cut to the length of data along axis 1
            frame_embedding = F.slice_like(F.expand_dims(weight, axis=0), data, axes=(1,))
        else:
            # Only the frame half of each (token, frame) pair is needed for the lookup
            _, frames = F.split(data, num_outputs=2, axis=2)

            # (batch, seq_len, 1, 1) -> (batch, seq_len)
            frames = frames.squeeze(axis=2)
            frames = frames.squeeze(axis=2)

            # (batch, seq_len, num_embed): one weight row per frame index
            new_weights = weight.take(frames)

            # (batch * seq_len, num_embed)
            new_weights = new_weights.reshape(shape=(-3, 0))

            # Append max_seq_len zero rows and cut back to exactly (max_seq_len, num_embed).
            # The table shape then depends only on integers known at construction time.
            padding = F.zeros((self.max_seq_len, self.num_embed))
            padded_weights = F.concat(new_weights, padding, dim=0)
            padded_weights = F.slice(padded_weights, begin=(0, 0), end=(self.max_seq_len, self.num_embed))

            # (batch_size or 1, seq_len, num_embed)
            frame_embedding = F.Embedding(steps, padded_weights, self.max_seq_len, self.num_embed)

        # The table is a constant; never propagate gradients into it
        frame_embedding = F.BlockGrad(frame_embedding)

        if self.scale_up_input:
            data = data * (self.num_embed ** 0.5)

        # NOTE(review): in the steps branch data still has the (batch, seq_len, 2, 1) layout while
        # frame_embedding is 3-D - confirm that this is the intended operand for the addition.
        return F.broadcast_add(data, frame_embedding)

In [6]:
# Embedding size and number of positions used for the demo tables below
depth, length = 512, 96

# Toy input sentence
Data = ["Hello", "World"]

`data` is the input batch. `F` is the operator namespace that Gluon passes to `hybrid_forward` (`mx.nd` when run imperatively, `mx.sym` once the block is hybridized); it can be more or less ignored.

The get_embeddings function calculates weights for each position. For time frames, the same frame should have the same weight. So I'll initialize one array with continuous weights, and then double it:

In [7]:
def get_frame_embeddings(length, depth) -> np.ndarray:
    """
    Compute one sinusoidal weight row per frame index.

    :param length: Number of frame indices (rows) to encode.
    :param depth: Embedding size; depth // 2 frequency channels are generated.
    :return: Array of shape (length, 2 * (depth // 2)): sine columns followed by cosine columns.
    """
    half_depth = depth // 2

    # (1, depth // 2): one divisor per frequency channel
    channels = np.arange(half_depth).reshape((1, -1))
    divisors = np.power(10000, (2 * channels) / depth)

    # (length, 1): frame indices 0 .. length - 1
    frame_ids = np.arange(0, length).reshape((-1, 1))

    # (length, depth // 2) via broadcasting
    phase = frame_ids / divisors

    # Sine block and cosine block are stacked horizontally (concatenated, not interleaved)
    return np.hstack((np.sin(phase), np.cos(phase)))

In [8]:
# Build both tables with the same (length, depth) so they can be compared directly
encodings_frames = get_frame_embeddings(length=length, depth=depth)
encodings_pos = get_positional_embeddings(length=length, depth=depth)

In [9]:
# Sanity check: number of dimensions of the positional table
print(encodings_pos.ndim)

2


In [10]:
# Sanity check: number of dimensions of the frame table
print(encodings_frames.ndim)

2


In [11]:
# Human-readable form of one sample: [word, frame] pairs, with a separator pair in between
Data_frames = [
    ["Hello", 0],
    ["World", 2],
    ["<sep>", "<sep>"],
    ["Buongiorno", 0],
    ["Hi", 1],
]

# During preprocessing the words are replaced by integer ids, so the batch
# (two samples of five [token id, frame] pairs each) looks like this:
Data = mx.nd.array([
    [[5, 0], [6, 2], [4, 4], [10, 0], [11, 1]],
    [[7, 0], [7, 1], [6, 3], [4, 4], [11, 0]],
])



In [12]:
# Shape of the batch before the extra trailing axis is added
print(Data.shape)

(2, 5, 2)


The array with the source sentences needs to be reshaped such that it is still as long as the array with the target sentences, and that the last dimension has length 1. This way, the array can be read by the parallel sample iter.

This means that the source array has one more dimension than the target array (4 vs. 3). That only requires minor changes, whereas the shape (x,y,2) seems to cause major problems within mxnet itself (I got errors like "can't broadcast array").

In [13]:
# Append a trailing axis of length 1: (2, 5, 2) -> (2, 5, 2, 1)
Data = Data.reshape(2,5,2,1)
print(Data)


[[[[ 5.]
   [ 0.]]

  [[ 6.]
   [ 2.]]

  [[ 4.]
   [ 4.]]

  [[10.]
   [ 0.]]

  [[11.]
   [ 1.]]]


 [[[ 7.]
   [ 0.]]

  [[ 7.]
   [ 1.]]

  [[ 6.]
   [ 3.]]

  [[ 4.]
   [ 4.]]

  [[11.]
   [ 0.]]]]
<NDArray 2x5x2x1 @cpu(0)>


In [14]:
# The source batch now has four dimensions
print(Data.ndim)

4


Code snippet for data_io.py: create_batch_from_sample. Makes sure that the slicing corresponds to the increased number of dimensions.

In [15]:
# Keep both entries of axis 2 (token id and frame), then drop the trailing singleton axis
source_words = mx.nd.slice(Data, begin=(None, None, 0), end=(None, None, 2))
source_words = source_words.squeeze(axis=3, inplace=True)
print(source_words)


[[[ 5.  0.]
  [ 6.  2.]
  [ 4.  4.]
  [10.  0.]
  [11.  1.]]

 [[ 7.  0.]
  [ 7.  1.]
  [ 6.  3.]
  [ 4.  4.]
  [11.  0.]]]
<NDArray 2x5x2 @cpu(0)>


In [16]:
# Keep only the first entry of axis 2 (the token id), then drop the trailing singleton axis
source_tokens = mx.nd.slice(Data, begin=(None, None, 0), end=(None, None, 1))
source_tokens = source_tokens.squeeze(axis=3, inplace=True)
print(source_tokens)


[[[ 5.]
  [ 6.]
  [ 4.]
  [10.]
  [11.]]

 [[ 7.]
  [ 7.]
  [ 6.]
  [ 4.]
  [11.]]]
<NDArray 2x5x1 @cpu(0)>


In [17]:
# Count the non-zero token ids per sample along the sequence axis
# (0 is presumably the padding id - TODO confirm), then drop the leftover singleton axis
source_length = mx.nd.sum(source_tokens != 0, axis=1)
source_length = source_length.squeeze(axis=1, inplace=True)
print(source_length)


[5. 5.]
<NDArray 2 @cpu(0)>


In [18]:
# The first two axes of the batch are (number of samples, tokens per sample)
source_shape = Data.shape
samples, tokens = source_shape[0], source_shape[1]

print(samples, tokens)

2 5


The embedding process is actually done by mxnet.symbol.embedding. The code behind works as shown below (source: https://mxnet.apache.org/versions/1.7.0/api/python/docs/api/symbol/symbol.html):


In [19]:
# Toy walk-through of the Embedding lookup: row i of the weight matrix is the vector of word i
input_dim = 4   # number of rows (words) in the weight matrix
output_dim = 5  # length of each embedding vector

#Each row in weight matrix y represents a word. So, y = (w0,w1,w2,w3)
y = [[float(output_dim * row + col) for col in range(output_dim)]
     for row in range(input_dim)]

#Input array x represents n-grams(2-gram). So, x = [(w1,w3), (w0,w2)]
x = [[1., 3.],
     [0., 2.]]

#Mapped input x to its vector representation y.
#Embedding(x, y, 4, 5)
# Every word index in x is replaced by (a copy of) the matching row of y
Embedding = [[list(y[int(word)]) for word in ngram] for ngram in x]

In [20]:
# Small frame table (10 frame indices, 2 channels) that is easy to read in the output
weights = get_frame_embeddings(length=10, depth=2)
print(weights)

[[ 0.          1.        ]
 [ 0.84147098  0.54030231]
 [ 0.90929743 -0.41614684]
 [ 0.14112001 -0.9899925 ]
 [-0.7568025  -0.65364362]
 [-0.95892427  0.28366219]
 [-0.2794155   0.96017029]
 [ 0.6569866   0.75390225]
 [ 0.98935825 -0.14550003]
 [ 0.41211849 -0.91113026]]


The goal is to deliver only the weights for which a token is available. This should work, because the position is already encoded with either sine or cosine function. This way, the algorithm will also be able to see which words usually occur close together.

Another goal is to separate mouthings and signs from each other, so the two arrays can be summed or concatenated.

In [21]:
maxlen = 10

# Convert the frame table to an NDArray so rows can be gathered with take()
weights = mx.nd.array(weights)

# Axis 2 holds the (token id, frame) pair; split it into two arrays
tokens, frames = Data.split(num_outputs=2, axis=2)

# Drop the two trailing singleton axes of each half -> (samples, tokens per sample)
tokens = tokens.squeeze(axis=2).squeeze(axis=2)
frames = frames.squeeze(axis=2).squeeze(axis=2)

# Gather one weight row per frame index
new_weights = weights.take(frames)
print(new_weights)



[[[ 0.          1.        ]
  [ 0.9092974  -0.41614684]
  [-0.7568025  -0.6536436 ]
  [ 0.          1.        ]
  [ 0.84147096  0.5403023 ]]

 [[ 0.          1.        ]
  [ 0.84147096  0.5403023 ]
  [ 0.14112    -0.9899925 ]
  [-0.7568025  -0.6536436 ]
  [ 0.          1.        ]]]
<NDArray 2x5x2 @cpu(0)>


In [22]:
# The gathered weights carry one extra (sample) axis compared to the table
print(new_weights.ndim)

3


In [23]:
# The weight table itself is two-dimensional
print(weights.ndim)

2


In [24]:
# Shape of the token ids after both squeezes
print(tokens.shape)

(2, 5)


In [25]:
# Shape of the full batch, for comparison with tokens.shape
print(Data.shape)

(2, 5, 2, 1)


For the embedding layer (which I can't really test here), the shape of new_weights must be (input_dim, output_dim). The output dim is easy to find, since it corresponds to the number of positions / embeddings calculated. The input dim in sockeye is currently the size of the vocab, but that doesn't work in this case. Doubling the weight matrix does only work in this example, but it is very unlikely that this is a reliable solution for sockeye. Another problem is that sockeye works with symbolic programming at this point, which means that it is nearly impossible to print out the dimensions of any matrices.

The goal is to find a number for the input dim that can be constructed in a logical, reliable way either from tokens, frames, vocabulary or embed_weights, and to which new_weights can be reshaped.

Explanation for Embedding Layer: https://mxnet.apache.org/versions/1.6/api/r/docs/api/mx.symbol.Embedding.html


In sockeye, embed_weight is initialized with shape (vocab, output_dim), which is why it normally works with F.Embedding. In this case, the shape of embed_weight is going to change constantly.

I cannot read out the shape of new_weights directly in MXNet's symbolic mode, but I seem to be able to call a method named shape_array, which returns the shape of new_weights as an array. I can then multiply the first and second entries to get a reliable input_dim and a 2D layout that F.Embedding can read. One of the big questions is whether it makes sense to pad the new_weights array until it is as long as the vocabulary.

mx.symbol.reshape provides an easy solution for the reshaping problem: new_weights.reshape(-3,output_dim) should have the desired effect, since -3 says that "the product of two consecutive dimensions of the input shape" should be used as output shape (source: https://mxnet.apache.org/versions/1.6/api/r/docs/api/mx.symbol.Reshape.html).

It makes sense to pad the weight array up to (vocab_size, num_embeds), since that might be a format sockeye expects to work with later.

Let's assume the length of the vocab is 70. Therefore, the shape of the new weight array should be (70,2), but we currently have the shape (10,2)

In [26]:
# Merge the sample and token axes into one: (samples, tokens, depth) -> (samples * tokens, depth).
# -3 takes the product of two consecutive input dimensions and 0 copies the next one, so the
# result no longer depends on len(weights), which equalled samples * tokens only by coincidence.
# Recomputing from weights and frames keeps the cell safe to re-run.
new_weights = weights.take(frames).reshape(-3, 0)
print(new_weights)


[[ 0.          1.        ]
 [ 0.9092974  -0.41614684]
 [-0.7568025  -0.6536436 ]
 [ 0.          1.        ]
 [ 0.84147096  0.5403023 ]
 [ 0.          1.        ]
 [ 0.84147096  0.5403023 ]
 [ 0.14112    -0.9899925 ]
 [-0.7568025  -0.6536436 ]
 [ 0.          1.        ]]
<NDArray 10x2 @cpu(0)>


In [27]:
# shape_array is also available in mx.symbol
shapes = new_weights.shape_array()

# First entry of the shape (number of rows), kept as a 1-element array
actual_length = shapes.slice(begin=0, end=1)
expected_length = mx.nd.array([70], dtype='int64')

# Shape and dtype of actual_length and expected_length need to be the same,
# or subtracting one array from the other doesn't work
print(actual_length.dtype)
print(expected_length.dtype)

<class 'numpy.int64'>
<class 'numpy.int64'>


In [28]:

# Number of rows that are missing to reach the expected length
needed_length = expected_length - actual_length

print(needed_length)


[60]
<NDArray 1 @cpu(0)>


In [29]:
# Assemble the padding shape as an array: [missing rows, number of columns (2)]
pad_shape = mx.ndarray.concat(needed_length, mx.nd.array([2], dtype='int64'), dim=0)

print(pad_shape)




[60  2]
<NDArray 2 @cpu(0)>


In [30]:
# Theoretically, this works with the needed_length array in symbolic programming.
# It doesn't work with needed_length and ndarrays in imperative programming,
# that's why the tuple is inserted here.
padding = mx.ndarray.zeros((60,2))
print(padding.shape)

(60, 2)


In [31]:
# Append the zero rows below the gathered weights
new_weights_2 = mx.ndarray.concat(new_weights, padding, dim = 0)
print(new_weights_2.shape)

(70, 2)


The last proposed solution doesn't work, because symbol.zeros also needs a tuple as input, and there is no easy way to get an int value from needed_length into that tuple.

Next idea: create an array with zeros with shape = (vocab_size, num_embeds) (a.k.a the target shape), concat it to the weights array, and then slice the weight array such that the shape of the weight array is (vocab_size, num_embeds). That should work, because vocab_size and num_embeds are both integers that can be called through config, not symbols.

In [32]:
# Starting point for the second approach: the unpadded weights
print(new_weights.shape)

(10, 2)


In [33]:
# Zero block that already has the full target shape
padding = mx.ndarray.zeros((70,2))

print(padding.shape)

(70, 2)


In [37]:
# Append the full-size zero block below the gathered weights ...
new_weights_3 = mx.ndarray.concat(new_weights, padding, dim = 0)

# ... and cut the result back to the target shape (70, 2)
new_weights_3 = mx.ndarray.slice(new_weights_3, begin = (0,0), end=(70,2))
print(new_weights_3.shape)

(70, 2)


In [38]:
# Inspect the padded table: gathered weights first, zero rows afterwards
print(new_weights_3)


[[ 0.          1.        ]
 [ 0.9092974  -0.41614684]
 [-0.7568025  -0.6536436 ]
 [ 0.          1.        ]
 [ 0.84147096  0.5403023 ]
 [ 0.          1.        ]
 [ 0.84147096  0.5403023 ]
 [ 0.14112    -0.9899925 ]
 [-0.7568025  -0.6536436 ]
 [ 0.          1.        ]
 [ 0.          0.        ]
 [ 0.          0.        ]
 [ 0.          0.        ]
 [ 0.          0.        ]
 [ 0.          0.        ]
 [ 0.          0.        ]
 [ 0.          0.        ]
 [ 0.          0.        ]
 [ 0.          0.        ]
 [ 0.          0.        ]
 [ 0.          0.        ]
 [ 0.          0.        ]
 [ 0.          0.        ]
 [ 0.          0.        ]
 [ 0.          0.        ]
 [ 0.          0.        ]
 [ 0.          0.        ]
 [ 0.          0.        ]
 [ 0.          0.        ]
 [ 0.          0.        ]
 [ 0.          0.        ]
 [ 0.          0.        ]
 [ 0.          0.        ]
 [ 0.          0.        ]
 [ 0.          0.        ]
 [ 0.          0.        ]
 [ 0.          0.        ]


And that finally works with mxnet's symbolic programming API.