In [1]:
import  os
import  numpy               as np
import  tensorflow          as tf
from    tensorflow          import keras

# Embedding

### Functions

In [2]:
# https://nlp.stanford.edu/projects/glove/
glove_file  = "glove.6B.50d.txt"

!wget -O {glove_file} https://www.dropbox.com/scl/fi/y328s9lbz8c9glp7al02p/glove.6B.50d.txt?rlkey=m81pwe06f8tpb947fl3y78nlo&dl=0

--2024-02-29 06:59:33--  https://www.dropbox.com/scl/fi/y328s9lbz8c9glp7al02p/glove.6B.50d.txt?rlkey=m81pwe06f8tpb947fl3y78nlo
Resolving www.dropbox.com (www.dropbox.com)... 162.125.3.18, 2620:100:601b:18::a27d:812
Connecting to www.dropbox.com (www.dropbox.com)|162.125.3.18|:443... connected.
HTTP request sent, awaiting response... 302 Found
Location: https://uc9f475ff404db2f988fd30ff52c.dl.dropboxusercontent.com/cd/0/inline/COKsc3v9WDFSUUZcy6rqfHmAthbVdqWn2SAlCIu2TeXbNnJO4hZw-Jqdx8A9uveI3UV4sNOa-n9W_IXtoCQeznWsS17LmNSw2Ias_tEsh3EUl_ohWi9Z81inxSnVgSdwDwQ/file# [following]
--2024-02-29 06:59:34--  https://uc9f475ff404db2f988fd30ff52c.dl.dropboxusercontent.com/cd/0/inline/COKsc3v9WDFSUUZcy6rqfHmAthbVdqWn2SAlCIu2TeXbNnJO4hZw-Jqdx8A9uveI3UV4sNOa-n9W_IXtoCQeznWsS17LmNSw2Ias_tEsh3EUl_ohWi9Z81inxSnVgSdwDwQ/file
Resolving uc9f475ff404db2f988fd30ff52c.dl.dropboxusercontent.com (uc9f475ff404db2f988fd30ff52c.dl.dropboxusercontent.com)... 162.125.3.15, 2620:100:6019:15::a27d:40f
Connecting to uc9

In [3]:
# read the embedding file
# return a dict with words (strings) as keys and embedding vectors (np.array) as value
def read_glove( glove_file ):
    """
    parse a GloVe text file into a dictionary
    every line is expected to be "<word> <float> <float> ...", separated by whitespace
    lines that are blank or carry no numbers are skipped
    inputs:
        glove_file  [str] path of the GloVe text file
    returns:
        [dict] word (str) -> embedding vector (1-D np.array of float64)
    """
    embedding   = {}
    # explicit encoding, so that non-ASCII words do not depend on the platform default
    with open( glove_file, 'r', encoding='utf-8' ) as f:
        cnt = 0
        for l in f:
            fields              = l.split()
            if len( fields ) < 2:
                continue    # blank line or word without vector
            # direct string -> float conversion, much cheaper than parsing through np.matrix
            embedding[ fields[ 0 ] ]    = np.array( fields[ 1: ], dtype=np.float64 )

            cnt += 1
            if not cnt % 10000:
                print( f"read {cnt:,} of 400,000 words" )

    print( "Done!")
    return embedding

In [4]:
# load the vectors into the global dict `embedding`, which the helper functions below read
# takes about 2m 30s with the short file
# (the long file takes about 15m)
embedding   = read_glove( glove_file )

read 10,000 of 400,000 words
read 20,000 of 400,000 words
read 30,000 of 400,000 words
read 40,000 of 400,000 words
read 50,000 of 400,000 words
read 60,000 of 400,000 words
read 70,000 of 400,000 words
read 80,000 of 400,000 words
read 90,000 of 400,000 words
read 100,000 of 400,000 words
read 110,000 of 400,000 words
read 120,000 of 400,000 words
read 130,000 of 400,000 words
read 140,000 of 400,000 words
read 150,000 of 400,000 words
read 160,000 of 400,000 words
read 170,000 of 400,000 words
read 180,000 of 400,000 words
read 190,000 of 400,000 words
read 200,000 of 400,000 words
read 210,000 of 400,000 words
read 220,000 of 400,000 words
read 230,000 of 400,000 words
read 240,000 of 400,000 words
read 250,000 of 400,000 words
read 260,000 of 400,000 words
read 270,000 of 400,000 words
read 280,000 of 400,000 words
read 290,000 of 400,000 words
read 300,000 of 400,000 words
read 310,000 of 400,000 words
read 320,000 of 400,000 words
read 330,000 of 400,000 words
read 340,000 of 400

In [5]:
# functions to play with embeddings

# check if word exists and return its embedding vector
def embed( word ):
    """
    resolve a word to its embedding vector
    inputs:
        word        [str or anything else] a word to look up, or an already computed vector
    returns:
        the vector stored in the global `embedding` for a known word,
        False for an unknown word,
        the input itself, untouched, when it is not a string
    """
    if not isinstance( word, str ):
        return word     # not a word: pass it through as it is
    return embedding.get( word, False )


# compute similarity between two words [-1, 1]
# NOTE keras loss convention: the value is the NEGATIVE cosine similarity
# -1 = max similarity, 0 = orthogonal (no similarity), 1 = opposite direction
def sim( word1, word2 ):
    """
    negative cosine similarity between two words / vectors
    inputs:
        word1       [str or vector] word to look up, or an already computed vector
        word2       [str or vector] word to look up, or an already computed vector
    returns:
        the score as returned by keras.losses.cosine_similarity, converted with .numpy()
    raises:
        KeyError    if a word is not in the embedding
    """
    vec1    = embed( word1 )
    vec2    = embed( word2 )
    # embed() signals an unknown word with False: stop here instead of feeding it to keras
    for name, vec in ( ( word1, vec1 ), ( word2, vec2 ) ):
        if vec is False:
            raise KeyError( f"word not found in the embedding: {name!r}" )
    s       = keras.losses.cosine_similarity( vec1, vec2 )
    return s.numpy()


def plus( word1, word2 ):
    """
    sum of two words / vectors
    inputs:
        word1       [str or vector] word to look up, or an already computed vector
        word2       [str or vector] word to look up, or an already computed vector
    returns:
        the element-wise sum of the two vectors
    raises:
        KeyError    if a word is not in the embedding
    """
    vec1    = embed( word1 )
    vec2    = embed( word2 )
    # embed() signals an unknown word with False, and False + vector would silently
    # give back the vector unchanged: fail loudly instead
    for name, vec in ( ( word1, vec1 ), ( word2, vec2 ) ):
        if vec is False:
            raise KeyError( f"word not found in the embedding: {name!r}" )
    return vec1 + vec2


def minus( word1, word2 ):
    """
    difference of two words / vectors
    inputs:
        word1       [str or vector] word to look up, or an already computed vector
        word2       [str or vector] word to look up, or an already computed vector
    returns:
        the element-wise difference word1 - word2
    raises:
        KeyError    if a word is not in the embedding
    """
    vec1    = embed( word1 )
    vec2    = embed( word2 )
    # embed() signals an unknown word with False, and vector - False would silently
    # give back the vector unchanged: fail loudly instead
    for name, vec in ( ( word1, vec1 ), ( word2, vec2 ) ):
        if vec is False:
            raise KeyError( f"word not found in the embedding: {name!r}" )
    return vec1 - vec2


# get the "closest" words to a given word
def closest( word, n_words=5, limit=50000 ):
    """
    scan the embedding and keep the words most similar to the given one
    inputs:
        word        [str or vector] word to look up, or an already computed vector
        n_words     [int] how many words to return at most
        limit       [int] how many candidate words to check at most, in the order of the embedding
    returns:
        [list] of tuples ( word, score ), best (lowest score) first, at most n_words long
    raises:
        KeyError    if the word is not in the embedding
    """
    target      = embed( word )
    if target is False:     # embed() signals an unknown word with False
        raise KeyError( f"word not found in the embedding: {word!r}" )

    cnt         = 0
    best        = []    # list of ( word, score ), kept sorted by ascending score
    for w, vector in embedding.items():
        if cnt >= limit:    # exactly `limit` candidates have been checked
            print()
            break

        score       = sim( vector, target )
        if ( score + 1 ) < 0.05:
            continue    # (almost) the same direction as the target: skip, typically the word itself

        # position of the first stored entry that is strictly worse than the new score
        pos         = next( ( i for i, ( _, s ) in enumerate( best ) if score < s ), len( best ) )
        if pos < n_words:
            best.insert( pos, ( w, score ) )
            del best[ n_words: ]

        cnt += 1
        if not cnt % 1000:
            print( f"checked {cnt:,} of {limit:,} words" )

    return best

### Test

In [6]:
# raw embedding vector of a single word
embed( 'unicorn' )

array([ 0.9628   , -0.7262   , -1.0204   , -0.12449  ,  0.5857   ,
        0.24912  ,  0.1899   , -0.53152  ,  0.27576  , -0.2448   ,
        0.56746  ,  0.56175  , -0.066513 ,  0.42018  , -0.14651  ,
       -0.30215  , -0.045488 ,  0.79528  , -1.1222   ,  0.068902 ,
       -0.08587  , -0.16382  , -0.3923   ,  0.13409  , -0.43265  ,
        0.0018246, -1.2473   ,  0.32282  ,  0.074383 , -0.70834  ,
       -0.06777  ,  0.10383  , -0.35492  ,  0.66333  , -0.22843  ,
        0.57841  ,  0.0057448, -1.2199   , -0.31636  , -0.47167  ,
        0.38949  , -0.26445  , -0.32037  , -0.70928  ,  0.39919  ,
       -0.14546  ,  0.67401  , -0.67832  , -0.016107 , -0.54813  ])

In [7]:
# two related words: the score should be clearly negative
sim( 'dog', 'wolf' )

-0.6943803101202853

In [8]:
# two unrelated words: the score should be much closer to 0
sim( 'dog', 'galaxy' )

-0.13304946376525956

In [9]:
# the 10 best matches among the first ~20,000 words of the embedding
closest( 'queen', n_words=10, limit=20000 )

checked 1,000 of 20,000 words
checked 2,000 of 20,000 words
checked 3,000 of 20,000 words
checked 4,000 of 20,000 words
checked 5,000 of 20,000 words
checked 6,000 of 20,000 words
checked 7,000 of 20,000 words
checked 8,000 of 20,000 words
checked 9,000 of 20,000 words
checked 10,000 of 20,000 words
checked 11,000 of 20,000 words
checked 12,000 of 20,000 words
checked 13,000 of 20,000 words
checked 14,000 of 20,000 words
checked 15,000 of 20,000 words
checked 16,000 of 20,000 words
checked 17,000 of 20,000 words
checked 18,000 of 20,000 words
checked 19,000 of 20,000 words
checked 20,000 of 20,000 words



[('princess', -0.851516638750669),
 ('lady', -0.8050609250765663),
 ('elizabeth', -0.7873042176943493),
 ('king', -0.7839043010964117),
 ('prince', -0.7821860976090151),
 ('coronation', -0.7692777928548158),
 ('consort', -0.7626097498967264),
 ('royal', -0.744286480167597),
 ('crown', -0.7382649680186562),
 ('victoria', -0.7285771630710364)]

In [18]:
# vector arithmetic: king - man + woman, then compare the result with queen
w = minus( 'king', 'man' )
w = plus( w, 'woman' )
sim( w, 'queen' )

-0.8609581258578944

# Attention

### Classes

In [12]:
class Attention( keras.layers.Layer ):
    """
    this class implements a dot-product attention layer:
        A( Q, K, V ) = softmax( ( Q K^T ) / sqrt( dim ) ) V
    """

    def __init__( self, dim, **kwargs ):
        """
        initialization of the class
        inputs:
            dim         [int] dimension used to scale the scores, divided by sqrt( dim )
        """
        super().__init__( **kwargs )
        self.dim        = dim


    def call( self, q, k, v ):
        """
        compute the scaled dot-product attention
        inputs:
            q           [tensor] query
            k           [tensor] key
            v           [tensor] value
        returns:
            att         [tensor] attention output, softmax weights applied to v
            softm       [tensor] weight matrix, softmax over the last axis of the scaled scores
        """
        scores          = tf.matmul( q, k, transpose_b=True )
        # NOTE the scale takes the dtype of the scores: a fixed float32 would make the
        #      division fail with float64 or mixed-precision tensors
        norm            = tf.math.sqrt( tf.cast( self.dim, scores.dtype ) ) # to normalize
        scores          = scores / norm
        softm           = tf.nn.softmax( scores )
        att             = tf.matmul( softm, v )
        return att, softm   # output, weight matrix


In [13]:
class MultiHeadAttention( keras.layers.Layer ):
    """
    this class implements a multi-head attention layer:
    query, key and value are projected linearly, split into n_heads slices, attended
    independently by a scaled dot-product attention, rejoined and projected once more
    """

    def __init__( self, dim, n_heads, **kwargs ):
        """
        initialization of the class, with all the linear algebra ingredients for query, key, value, output
        inputs:
            dim         [int] dimension, must be a multiple of n_heads
            n_heads     [int] number of heads
        raises:
            ValueError  if dim cannot be split evenly among the heads
        """
        super().__init__( **kwargs )
        if n_heads <= 0 or dim % n_heads:
            # otherwise the reshape in _split_heads() / _join_heads() fails later, far from the cause
            raise ValueError( f"dim ({dim}) must be a positive multiple of n_heads ({n_heads})" )
        self.dim        = dim
        self.n_heads    = n_heads
        # NOTE each head works on vectors of size dim / n_heads, and that is the size
        #      the dot products have to be scaled with, not the full dim
        self.attention  = Attention( self.dim // self.n_heads )
        self.Q          = keras.layers.Dense( self.dim )    # without activation function, just a matrix
        self.K          = keras.layers.Dense( self.dim )    # just a matrix
        self.V          = keras.layers.Dense( self.dim )    # just a matrix
        self.O          = keras.layers.Dense( self.dim )    # just a matrix


    def _split_heads( self, x ):    # seq_length is never passed in by any caller: it is recovered from the tensor itself
        """
        split a tensor into heads just by adding a dimension and rearranging the shape
        NOTE the special use of -1 in tf.reshape that compute automatically the size of a dimension
        NOTE that it does not work using x.shape for retrieving the original shape, tf.shape() should be used
        """
        original_shape  = tf.shape( x )                                                     # batch X seq_length X dim
        headed_shape    = ( original_shape[ 0 ], original_shape[ 1 ], self.n_heads, -1 )    # batch X seq_length X n_head X new_dim
        x               = tf.reshape( x, shape=headed_shape )
        x               = tf.transpose( x, perm=( 0, 2, 1, 3 ) )                            # batch X n_head X seq_length X new_dim
        return x


    def _join_heads( self, x ):
        """
        rejoin a tensor collapsing its heads
        essentially reverts the operations of _split_heads()
        NOTE see _split_heads()
        """
        x               = tf.transpose( x, perm=(0, 2, 1, 3) )                              # batch X seq_length X n_head X new_dim
        headed_shape    = tf.shape( x )
        original_shape  = ( headed_shape[ 0 ], headed_shape[ 1 ], self.dim )                # batch X seq_length X dim
        x               = tf.reshape( x, shape=original_shape )
        return x


    def call( self, q, k, v, return_matrix=False ):
        """
        compute the multi-head attention
        inputs:
            q               [tensor] query
            k               [tensor] key
            v               [tensor] value
            return_matrix   [bool]  if True return also the attention matrix
        returns:
            o               [tensor] output, batch X seq_length X dim
            matrix          [tensor] attention weights, batch X n_head X seq_length X seq_length
                            (only if return_matrix is True)
        """
        # linear projections
        q               = self.Q( q )
        k               = self.K( k )
        v               = self.V( v )

        # one slice of the projected vectors for each head
        q               = self._split_heads( q )
        k               = self._split_heads( k )
        v               = self._split_heads( v )

        # attention on all the heads at once, then back to a single vector per position
        a, matrix       = self.attention( q, k, v )
        a               = self._join_heads( a )
        o               = self.O( a )
        if return_matrix:
            return o, matrix
        return o


### Test

In [14]:
# small layer for inspection: dim=4 split over 2 heads; input is batch=1, seq_length=5, dim=4
# NOTE(review): np.random is not seeded here, so the values change at every run
mha     = MultiHeadAttention( 4, 2 )
x       = 0.1 * np.random.random( ( 1, 5, 4 ) )
x

array([[[0.03903032, 0.01083386, 0.02229455, 0.06199888],
        [0.05615571, 0.00544831, 0.00417441, 0.00984983],
        [0.01555901, 0.06852272, 0.00036552, 0.05989993],
        [0.0623731 , 0.02540096, 0.06532834, 0.08206647],
        [0.052377  , 0.0748466 , 0.02227033, 0.04393784]]])

In [15]:
# self-attention: the same tensor is used as query, key and value
o, m    = mha( x, x, x, return_matrix=True )
m       # batch X n_head X seq_length X seq_length

<tf.Tensor: shape=(1, 2, 5, 5), dtype=float32, numpy=
array([[[[0.20000912, 0.20003867, 0.20000707, 0.19995843, 0.19998673],
         [0.20003115, 0.20000991, 0.1999941 , 0.20000437, 0.19996051],
         [0.20001464, 0.2001047 , 0.2000218 , 0.19988224, 0.19997664],
         [0.20008124, 0.20000291, 0.19997898, 0.20003876, 0.19989805],
         [0.20008594, 0.20005971, 0.19999167, 0.19997318, 0.19988948]],

        [[0.200186  , 0.1998892 , 0.19997014, 0.20018448, 0.19977021],
         [0.20003492, 0.19999199, 0.20007287, 0.19993548, 0.19996472],
         [0.2002897 , 0.19981346, 0.19986844, 0.20039484, 0.19963354],
         [0.2002908 , 0.19983216, 0.19998638, 0.2002465 , 0.19964418],
         [0.20027527, 0.19983636, 0.1999581 , 0.20027001, 0.19966021]]]],
      dtype=float32)>

In [16]:
# overwrite positions 0 and 3 of the sequence with large constant vectors
# NOTE this modifies x in place: re-run the cell that creates x to start again from small values
x[ 0 ][ 0 ] = 8.4 * np.ones( ( 4, ) )
x[ 0 ][ 3 ] = 7.8 * np.ones( ( 4, ) )
x

array([[[8.40000000e+00, 8.40000000e+00, 8.40000000e+00, 8.40000000e+00],
        [5.61557130e-02, 5.44831296e-03, 4.17441441e-03, 9.84982796e-03],
        [1.55590064e-02, 6.85227169e-02, 3.65516845e-04, 5.98999318e-02],
        [7.80000000e+00, 7.80000000e+00, 7.80000000e+00, 7.80000000e+00],
        [5.23770017e-02, 7.48466040e-02, 2.22703255e-02, 4.39378449e-02]]])

In [17]:
# same layer object as before, applied to the modified input
o, m    = mha( x, x, x, return_matrix=True )
m

<tf.Tensor: shape=(1, 2, 5, 5), dtype=float32, numpy=
array([[[[2.5880080e-08, 3.4884879e-01, 3.3941177e-01, 8.3795584e-08,
          3.1173933e-01],
         [1.9494672e-01, 2.0319791e-01, 2.0318189e-01, 1.9552571e-01,
          2.0314774e-01],
         [1.7985354e-01, 2.1278106e-01, 2.1269292e-01, 1.8202762e-01,
          2.1264489e-01],
         [8.3339138e-08, 3.4774289e-01, 3.3899918e-01, 2.4811752e-07,
          3.1325760e-01],
         [1.8050894e-01, 2.1237469e-01, 2.1230248e-01, 1.8261985e-01,
          2.1219397e-01]],

        [[4.5973822e-11, 3.3471370e-01, 3.7386197e-01, 2.3580152e-10,
          2.9142430e-01],
         [1.8830389e-01, 2.0734446e-01, 2.0742832e-01, 1.8960713e-01,
          2.0731619e-01],
         [1.9555975e-01, 2.0282724e-01, 2.0288306e-01, 1.9608536e-01,
          2.0264462e-01],
         [2.3279323e-10, 3.3472875e-01, 3.7093645e-01, 1.0624041e-09,
          2.9433474e-01],
         [1.8540958e-01, 2.0920554e-01, 2.0933300e-01, 1.8703076e-01,
          