In [None]:
import  os
import  numpy               as np
import  tensorflow          as tf
from    tensorflow          import keras

# Embedding

### Data

Pre-trained GloVe word vectors: https://nlp.stanford.edu/projects/glove

In [None]:
glove_file  = "glove.6B.50d.txt"

!wget -O {glove_file} https://www.dropbox.com/scl/fi/y328s9lbz8c9glp7al02p/glove.6B.50d.txt?rlkey=m81pwe06f8tpb947fl3y78nlo&dl=0

In [None]:
def read_glove( glove_file ):
    """
    Read the embedding file.
    Return a dict with words as keys and embedding vectors as value.

    Each line is expected to hold a word followed by its space-separated
    vector components. Lines that do not contain both a word and a vector
    (e.g. a trailing blank line) are skipped instead of raising.

    params:
        glove_file  [str] path of the text file to read
    return:
        [dict]      key are [str] and values are [np.array]
    """
    embedding   = {}

    # the file is always decoded as UTF-8, independently of the platform default encoding,
    # so non-ASCII words are read the same way everywhere
    with open( glove_file, 'r', encoding='utf-8' ) as f:
        for cnt, line in enumerate( f, start=1 ):
            # split only once: first token is the word, the rest is the vector text
            parts   = line.split( maxsplit=1 )
            if len( parts ) == 2:
                word, vector        = parts
                embedding[ word ]   = np.fromstring( vector, sep=' ' )

            # progress report every 10,000 lines
            # (the total in the message is hard-coded; presumably the glove.6B vocabulary size)
            if not cnt % 10000:
                print( f"read {cnt:,} of 400,000 words" )

    print( "Done!" )
    return embedding

In [None]:
# load the whole embedding file into memory as a dict { word: vector }
# takes about 2m 30s with the short file (the long file takes about 15m)
embedding   = read_glove( glove_file )

### Functions

In [None]:
def embed( word ):
    """
    Look up the embedding vector of a word.

    params:
        word    [str or anything else]

    return:
        the vector stored in the global dict `embedding` if `word` is a known string,
        False if `word` is a string that is not in the dict,
        `word` itself, untouched, if it is not a string (e.g. already a vector)
    """
    # anything that is not a string is passed through as-is
    if not isinstance( word, str ):
        return word

    # unknown words are signalled with False
    return embedding.get( word, False )

In [None]:
def sim( word1, word2 ):
    """
    Compute similarity between two words.

    The score is the NEGATED cosine similarity, as returned by
    keras.losses.cosine_similarity (which is designed as a loss to be minimized).
    Return real number in range [-1, 1]:
        -1 = max similarity (same direction)
         0 = no similarity (orthogonal vectors)
         1 = max dissimilarity (opposite direction)

    params:
        word1   [str or np.array] word, or an already-computed vector
        word2   [str or np.array] word, or an already-computed vector

    raises:
        KeyError if a word given as string is not in the embedding
    """
    vec1    = embed( word1 )
    vec2    = embed( word2 )

    # embed() signals an unknown word by returning False: fail here with a clear message
    # instead of passing a boolean down to Keras
    for original, vec in ( ( word1, vec1 ), ( word2, vec2 ) ):
        if vec is False:
            raise KeyError( f"word not in embedding: {original!r}" )

    s       = keras.losses.cosine_similarity( vec1, vec2 )
    return s.numpy()

In [None]:
def plus( word1, word2 ):
    """
    Compute the sum of the embedding vectors of two words

    params:
        word1   [str or np.array] word, or an already-computed vector
        word2   [str or np.array] word, or an already-computed vector

    return:
        element-wise sum of the two vectors

    raises:
        KeyError if a word given as string is not in the embedding
    """
    vec1    = embed( word1 )
    vec2    = embed( word2 )

    # embed() returns False for unknown words; since False behaves like 0 in arithmetic,
    # the sum would silently degrade to the other vector, so the case is rejected explicitly
    for original, vec in ( ( word1, vec1 ), ( word2, vec2 ) ):
        if vec is False:
            raise KeyError( f"word not in embedding: {original!r}" )

    return vec1 + vec2

In [None]:
def minus( word1, word2 ):
    """
    Compute the difference of the embedding vectors of two words

    params:
        word1   [str or np.array] word, or an already-computed vector (minuend)
        word2   [str or np.array] word, or an already-computed vector (subtrahend)

    return:
        element-wise difference  word1 - word2

    raises:
        KeyError if a word given as string is not in the embedding
    """
    vec1    = embed( word1 )
    vec2    = embed( word2 )

    # embed() returns False for unknown words; since False behaves like 0 in arithmetic,
    # the difference would silently be computed against zero, so the case is rejected explicitly
    for original, vec in ( ( word1, vec1 ), ( word2, vec2 ) ):
        if vec is False:
            raise KeyError( f"word not in embedding: {original!r}" )

    return vec1 - vec2

In [None]:
def closest( word, n_words=5, limit=50000 ):
    """
    Given a word, find the N words that are "closest" to it.

    The words are scanned in the iteration order of the global dict `embedding`
    and scored with sim() (lower score = more similar).
    Words that are almost identical to the input (score within 0.05 of -1), such as
    the input word itself, are ignored and do not count towards `limit`.

    params:
        word                [str or np.array] input word
        n_words             [int] closest words to find
        limit               [int] max number of words to scan

    return:
        [list of tuples]    ( word [str], similarity score [float] ), best match first;
                            may hold fewer than n_words entries

    raises:
        KeyError if the input word is a string that is not in the embedding
    """
    target    = embed( word )
    # embed() signals an unknown word by returning False
    if target is False:
        raise KeyError( f"word not in embedding: {word!r}" )

    cnt       = 0
    # list of ( word, score ) kept sorted by increasing score
    # the initial entry is a sentinel: it lets the insertion loop below work on an empty
    # ranking, and it keeps out candidates whose score is not below 1.0
    best      = [ ( None, 1.0 ) ]
    for w in embedding.keys():
        # stop once exactly `limit` words have been scored
        if cnt >= limit:
            print()
            break

        score       = sim( embedding[ w ], target )
        # skip near-duplicates of the input (score ~ -1)
        if ( score + 1 ) < 0.05:
            continue

        # insert before the first entry with a worse score, then trim to n_words
        for i, ( _, s ) in enumerate( best ):
            if score < s:
                best.insert( i, ( w, score ) )
                del best[ n_words: ]
                break

        cnt += 1
        if not cnt % 1000:
            print( f"checked {cnt:,} of {limit:,} words" )

    # the sentinel is still in the list when fewer than n_words candidates were found
    return [ ( w, s ) for w, s in best if w is not None ]

### Test

In [None]:
# show the raw embedding vector of a word (False would mean the word is not in the dict)
embed( 'unicorn' )

In [None]:
# similarity score of two words: the closer to -1, the more similar
sim( 'dog', 'wolf' )

In [None]:
# same score for a second pair of words, to compare with the value of the previous cell
sim( 'dog', 'galaxy' )

In [None]:
# the 10 best-scoring words for 'queen', scanning about the first 20,000 entries of the embedding
closest( 'queen', n_words=10, limit=20000 )

In [None]:
# vector arithmetic: woman + ( king - man ), then compare the result with 'queen'
w = plus( 'woman', minus( 'king', 'man' ) )
sim( w, 'queen' )

In [None]:
# vector arithmetic: go + ( did - do ), then list the words closest to the resulting vector
w = plus( 'go', minus( 'did', 'do' ) )
closest( w, n_words=10, limit=20000 )

In [None]:
# vector arithmetic: gun + ( person - goodness ), then list the words closest to the resulting vector
w = plus( 'gun', minus( 'person', 'goodness' ) )
closest( w, n_words=10, limit=20000 )

# Attention

### Classes

Single-head scaled dot-product attention

$A(Q,K,V)=\text{softmax}\left(\frac{QK^T}{\sqrt{d}}\right)V$

In [None]:
class Attention( keras.layers.Layer ):
    """
    Single-head attention layer using the scaled-dot product mechanism with normalization.
    Given the three matrices Query, Key, Value, compute the scores and the attention tensor.

    The layer has no trainable weights: it only combines the tensors it receives.
    """

    def __init__( self, dim, **kwargs ):
        """
        Initialization of the class

        params:
            dim         [int] dimension used for the scaling factor sqrt( dim );
                        meant to be the size of the last axis of q and k
        """
        super().__init__( **kwargs )
        self.dim        = dim


    def get_config( self ):
        """
        Return the constructor arguments, so that the layer can be serialized
        and rebuilt by Keras.
        """
        config          = super().get_config()
        config.update( { "dim": self.dim } )
        return config


    def call( self, q, k, v ):
        """
        Compute the scaled dot-product attention.
        Default method for class keras.layers.Layer specifying what the layer does when applied to the input.

        params:
            q           [tensor] queries
            k           [tensor] keys, with the same last axis size as q
            v           [tensor] values, one per key

        return:
            [tensor] attention
            [tensor] scores after softmax
        """
        # dot product of every query with every key (k is transposed on its last two axes)
        scores          = tf.matmul( q, k, transpose_b=True )

        # normalization to give variance=1 to scores and att
        # the factor is cast to the dtype of the scores, so that the division also works
        # when the tensors are not float32 (e.g. float64 inputs or mixed precision)
        norm            = tf.math.sqrt( tf.cast( self.dim, scores.dtype ) )
        scores         /= norm

        # softmax over the last axis (the keys): each row of weights sums to 1
        scores          = tf.nn.softmax( scores )   # also called "attention matrix"

        # weighted sum of the values
        att             = tf.matmul( scores, v )
        return att, scores

In [None]:
class MultiHeadAttention( keras.layers.Layer ):
    """
    Multi-head attention layer.
    Implement a simple attention mechanism several times in parallel.
    The independent attention outputs are concatenated and linearly transformed into the expected dimension.
    """

    def __init__( self, dim, n_heads, **kwargs ):
        """
        Initialization of the class.

        params:
            dim         [int] dimension (both internal and embedding)
            n_heads     [int] number of heads, must divide dim exactly

        raises:
            ValueError if n_heads is not positive or does not divide dim
        """
        super().__init__( **kwargs )

        # every head works on a slice of size dim / n_heads: without this check the problem
        # would only surface later, as a reshape error inside call()
        if n_heads <= 0 or dim % n_heads != 0:
            raise ValueError( f"dim ({dim}) must be a positive multiple of n_heads ({n_heads})" )

        self.dim        = dim
        self.n_heads    = n_heads
        self.head_dim   = dim // n_heads                    # size of the vectors seen by each head

        # internal attention layer
        # the scores are dot products of vectors of size head_dim (see _split_heads), so the
        # scaling factor must be sqrt( head_dim ), not sqrt( dim ), to keep their variance at 1
        self.attention  = Attention( self.head_dim )

        # a Dense layer without activation function is treated as a simple matrix multiplication
        self.Q          = keras.layers.Dense( self.dim, activation=None )
        self.K          = keras.layers.Dense( self.dim, activation=None )
        self.V          = keras.layers.Dense( self.dim, activation=None )
        self.O          = keras.layers.Dense( self.dim, activation=None )


    def get_config( self ):
        """
        Return the constructor arguments, so that the layer can be serialized
        and rebuilt by Keras.
        """
        config          = super().get_config()
        config.update( { "dim": self.dim, "n_heads": self.n_heads } )
        return config


    def _split_heads( self, x ):
        """
        Split a tensor into heads.
        Add a dimension and rearrange the shape.

        NOTE!
        Using x.shape does not work! The actual shape is given only by tf.shape( x )
        """

        # original shape is [ batch, seq_len, dim ]
        # note that 'seq_len' is not passed to the class! it is extracted automatically from the tensors
        original_shape  = tf.shape( x )

        # new shape is [ batch, seq_len, n_heads, new_dim ]
        # note that you don't need to specify the new size thanks to argument '-1' of tf.reshape()
        headed_shape    = ( original_shape[ 0 ], original_shape[ 1 ], self.n_heads, -1 )
        x               = tf.reshape( x, shape=headed_shape )

        # final shape is [ batch, n_heads, seq_len, new_dim ]
        x               = tf.transpose( x, perm=( 0, 2, 1, 3 ) )
        return x


    def _join_heads( self, x ):
        """
        Rejoin a tensor collapsing its heads.
        Basically reverts the effect of _split_heads().
        """
        # from [ batch, n_heads, seq_len, new_dim ] back to [ batch, seq_len, n_heads, new_dim ]
        x               = tf.transpose( x, perm=( 0, 2, 1, 3 ) )

        # merge the last two axes into a single one of size dim
        headed_shape    = tf.shape( x )
        original_shape  = ( headed_shape[ 0 ], headed_shape[ 1 ], self.dim )
        x               = tf.reshape( x, shape=original_shape )
        return x


    def call( self, q, k, v, return_matrix=False ):
        """
        Compute the multi-head attention.
        Default method for class keras.layers.Layer specifying what the layer does when applied to the input.

        params:
            q               [tensor]
            k               [tensor]
            v               [tensor]
            return_matrix   [bool] whether to return the attention matrix

        return:
            [tensor] attention output
            [tensor] OPTIONAL attention matrix (scores)
        """
        # linear projections of the three inputs
        q               = self.Q( q )
        k               = self.K( k )
        v               = self.V( v )

        # split attention heads
        q               = self._split_heads( q )
        k               = self._split_heads( k )
        v               = self._split_heads( v )

        # compute attentions in parallel
        # (the heads sit on a leading axis, so a single call handles all of them)
        a, matrix       = self.attention( q, k, v )

        # join heads and compute result
        a               = self._join_heads( a )
        o               = self.O( a )

        if return_matrix:
            return o, matrix
        return o

### Usage

In [None]:
# attention with 2 heads and internal dimension 4
# (arguments are dim=4 and n_heads=2, so each head works on vectors of size 2)
mha     = MultiHeadAttention( 4, 2 )

In [None]:
# random input sequence of 5 vectors of dimension 4
# [ batch=1, seq_len=5, dim=4 ]
# NOTE: no random seed is set, so the values change at every run
x       = np.random.random( ( 1, 5, 4 ) )

print( x )

In [None]:
# the input is passed three times, to compute q, k and v separately
# return_matrix=True makes the layer return the attention matrix together with the output
out, mtx    = mha( x, x, x, return_matrix=True )

In [None]:
# the scores organized into the "attention matrix"
# representing the correlations among the 5 elements of the input sequence
# (the scores come out of a softmax, so each row sums to 1)

# [ batch, n_head, seq_len, seq_len ]
print( mtx )

In [None]:
# the output of the attention layer has the same shape as the input
# this is an intermediate output and it will be processed further in the Transformer
print( out )