In [1]:
import numpy as np

class TinyBERT:
    def __init__(self):
        self.word_to_id = {'the': 0, 'cat': 1, 'sits': 2, 'sleeps': 3}

        self.embeddings = np.array([
            [0.2, -0.5, 0.1],   # the
            [-0.3, 0.4, 0.2],   # cat
            [0.1, 0.3, -0.4],   # sits
            [-0.2, -0.1, 0.5]   # sleeps
        ])

        self.max_sequence_length = 10
        self.position_dim = 3
        self.position_embeddings = np.random.randn(self.max_sequence_length, self.position_dim) * 0.1

        self.n_heads = 2
        self.head_weights = np.random.randn(self.n_heads, 3, 3) * 0.1
        self.head_importance = np.array([0.5, 0.5])

        self.q_vectors = np.random.randn(self.n_heads, 3, 3) * 0.1
        self.k_vectors = np.random.randn(self.n_heads, 3, 3) * 0.1
        self.v_vectors = np.random.randn(self.n_heads, 3, 3) * 0.1

    def attention(self, sentence):
        print('TinyBert --- attention()')
        print(sentence)
        # convert to embeddings
        word_ids = [self.word_to_id[word] for word in sentence]
        print(word_ids)
        vectors = self.embeddings[word_ids]
        print(vectors)

        positional_vectors = self.position_embeddings[:len(sentence)]
        vectors = vectors + positional_vectors
        print(vectors)

        # calculate attention scores by seeing how aligned they are in terms of drections
        # more similar words/embeddings pointing in same direction should have similar attentions
        scores = np.dot(vectors, vectors.T)
        print(scores)
        scores = scores / np.sqrt(3) # scale scores
        print(scores)

        # convert to probabilities
        probs = np.exp(scores)
        print(probs)
        attention_probs = probs / probs.sum(axis=1, keepdims=True)
        print(attention_probs)

        # work out new embeddings for words based on weighted sum of other words embeddings
        newEmbeddings = np.dot(attention_probs, vectors)
        print(newEmbeddings)
        return attention_probs, newEmbeddings

    def two_head_attention(self, sentence):
        # Get word vectors like before
        word_ids = [self.word_to_id[word] for word in sentence]
        vectors = self.embeddings[word_ids] + self.position_embeddings[:len(sentence)]

        # Create weights for two heads
        head1_weights = np.random.randn(3, 3) * 0.1
        head2_weights = np.random.randn(3, 3) * 0.1

        # each head emphasizes a different aspect about the embedding - for example is an animal, is a verb, etc.
        head1_vectors = np.dot(vectors, head1_weights)
        head2_vectors = np.dot(vectors, head2_weights)

        # the emphasized vectors then are multiplied itself for attention scores. if head1 emphasizes animals, then the animals in the vector will have similar attention
        # the first step emphasizes attributes of embeddings for this step to detect them and allocate attention along similar words
        head1_scores = np.dot(head1_vectors, head1_vectors.T)
        head2_scores = np.dot(head2_vectors, head2_vectors.T)

        # scale vectors
        head1_vectors = head1_scores / np.sqrt(3)
        head2_vectors = head2_scores / np.sqrt(3)

        # convert to probabilities
        head1_probs = np.exp(head1_vectors)
        head2_probs = np.exp(head2_vectors)

        # attention_probs
        head1_probs = head1_probs / head1_probs.sum(axis=1, keepdims=True)
        head2_probs = head2_probs / head2_probs.sum(axis=1, keepdims=True)

        # new embeddings
        head1_new_vectors = np.dot(head1_probs, vectors)
        head2_new_vectors = np.dot(head2_probs, vectors)

        # average between what attention embeddings say
        newEmbeddings = (head1_new_vectors + head2_new_vectors)/2

        return head1_probs, head2_probs, newEmbeddings

    def n_head_attention(self, sentence):
        # Get word vectors like before
        word_ids = [self.word_to_id[word] for word in sentence]
        vectors = self.embeddings[word_ids] + self.position_embeddings[:len(sentence)]

        head_outputs = []
        for i in range(self.n_heads):
            # each head emphasizes a different aspect about the embedding - for example is an animal, is a verb, etc.
            head_vectors = np.dot(vectors, self.head_weights[i])

            # the emphasized vectors then are multiplied itself for attention scores. if head1 emphasizes animals, then the animals in the vector will have similar attention
            # the first step emphasizes attributes of embeddings for this step to detect them and allocate attention along similar words
            head_scores = np.dot(head_vectors, head_vectors.T)

            # scale vectors
            head_vectors = head_scores / np.sqrt(3)

            # convert to probabilities
            head_probs = np.exp(head_vectors)

            # attention_probs
            head_probs = head_probs / head_probs.sum(axis=1, keepdims=True)

            # new embeddings
            head_new_vectors = np.dot(head_probs, vectors)

            head_outputs.append(head_new_vectors)

        # stack head outputs
        head_outputs = np.stack(head_outputs)

        # weight head outputs
        weighted_heads = head_outputs * self.head_importance[:, np.newaxis, np.newaxis]
        # for each dimension, sum up head elements
        newEmbeddings = weighted_heads.sum(axis=0)

        return head_outputs, newEmbeddings

    def QKV_attention(self, sentence):
        # Get word vectors like before
        word_ids = [self.word_to_id[word] for word in sentence]
        vectors = self.embeddings[word_ids] + self.position_embeddings[:len(sentence)]

        head_outputs = []
        for i in range(self.n_heads):
            # have to calculate qkv of vectors for head
            q = np.dot(vectors, self.q_weights[i])  # What each word is looking for
            k = np.dot(vectors, self.k_weights[i])  # What each word is advertising
            v = np.dot(vectors, self.v_weights[i])  # What each word has to offer

            # check out how similar query and key vectors are for this head
            # this is attention score - words that have info for each other are related
            queryKeySimilarity = np.dot(q, k.T)

           # Scale scores to prevent exploding gradients
            attention_scores = queryKeySimilarity / np.sqrt(3)

            # Convert to probabilities with softmax
            attention_probs = np.exp(attention_scores)
            attention_probs = attention_probs / attention_probs.sum(axis=1, keepdims=True)

            # new embedding by scaling value vectors by attention probabilities
            head_output = np.dot(attention_probs, v)

            head_outputs.append(head_output)

        # stack head outputs
        head_outputs = np.stack(head_outputs)

        # weight head outputs
        weighted_heads = head_outputs * self.head_importance[:, np.newaxis, np.newaxis]
        # for each dimension, sum up head elements
        newEmbeddings = weighted_heads.sum(axis=0)

        return head_outputs, newEmbeddings



    def calculate_head_loss(self, sentence, expected_relationships):
        """
        expected_relationships: list of tuples (word1_idx, word2_idx, relationship_strength)
        For example: [(0,1,1.0)] means word at index 0 should be strongly related to word at index 1
        """

        # Get current attention patterns
        head_attentions, final_embeddings = self.n_head_attention(sentence)

        loss = 0
        # For each expected relationship
        for word1_idx, word2_idx, expected_strength in expected_relationships:
            # Get similarity/attention between word embeddings
            actual_strength = np.dot(final_embeddings[word1_idx], final_embeddings[word2_idx])
            # Calculate difference from expected
            loss += (expected_strength - actual_strength) ** 2

        return loss

    def train_step(self, sentence, expected_relationships, learning_rate=0.01):
        # Store original weights
        original_weights = self.head_weights.copy()

        # Calculate original loss
        original_loss = self.calculate_head_loss(sentence, expected_relationships)

        # Initialize gradients array
        gradients = np.zeros_like(self.head_weights)

        # For each head, for each word, for each dimension
        for head in range(self.n_heads):
            for i in range(3):  # dimension size
                for j in range(3):
                    # Slightly modify weight to compute gradient
                    self.head_weights[head, i, j] += 0.0001
                    new_loss = self.calculate_head_loss(sentence, expected_relationships)

                    # Calculate and store gradient
                    gradients[head, i, j] = (new_loss - original_loss) / 0.0001

                    # Reset weight for next gradient calculation
                    self.head_weights[head, i, j] = original_weights[head, i, j]

        # Update all weights at once using calculated gradients
        self.head_weights = original_weights - learning_rate * gradients

In [11]:
bert = TinyBERT()
sentence = ['the', 'cat', 'sleeps']
head1_result, head2_result = bert.animal_attention_example()

Through animal-focused attention head:
horse: [1.2 0.2 0.2]
cat: [1. 0. 0.]
bike: [0.1 0.1 0.1]

Through ride-focused attention head:
horse: [0.2 1.2 0.2]
cat: [0.1 0.1 0.1]
bike: [0. 1. 0.]


TypeError: cannot unpack non-iterable NoneType object

In [10]:
bert = TinyBERT()
sentence = ['cat', 'sits', 'the', ]
weights, new_reps = bert.attention(sentence)

TinyBert --- attention()
['cat', 'sits', 'the']
[1, 2, 0]
[[-0.3  0.4  0.2]
 [ 0.1  0.3 -0.4]
 [ 0.2 -0.5  0.1]]
[[-0.25497954  0.4399019   0.11021631]
 [ 0.07694333  0.38871628 -0.50786617]
 [ 0.18630067 -0.41576185  0.31090431]]
[[ 0.27067589  0.09540292 -0.19613056]
 [ 0.09540292  0.41494866 -0.30517659]
 [-0.19613056 -0.30517659  0.30422735]]
[[ 0.1562748   0.0550809  -0.11323603]
 [ 0.0550809   0.23957072 -0.17619378]
 [-0.11323603 -0.17619378  0.17564574]]
[[1.16914744 1.05662609 0.89293987]
 [1.05662609 1.27070355 0.83845549]
 [0.89293987 0.83845549 1.1920157 ]]
[[0.37488133 0.33880192 0.28631675]
 [0.33376431 0.40138654 0.26484915]
 [0.30544451 0.28680725 0.40774824]]
[[-0.01617752  0.17756925 -0.04173089]
 [-0.00487748  0.19273487 -0.08472163]
 [ 0.02014957  0.07632611  0.01477595]]


In [11]:
print("\nAttention weights:")
for i, word in enumerate(sentence):
    print(f"\n{word} pays attention to each word:")
    for j, other_word in enumerate(sentence):
        print(f"{other_word}: {weights[i][j]:.3f}")


Attention weights:

cat pays attention to each word:
cat: 0.375
sits: 0.339
the: 0.286

sits pays attention to each word:
cat: 0.334
sits: 0.401
the: 0.265

the pays attention to each word:
cat: 0.305
sits: 0.287
the: 0.408
