# Lecture 4 Self-Attention

#### 2 Self-Attention Using PyTorch

Leveraging PyTorch, this example implements self-attention as a module, showcasing how it can be integrated into neural networks.

In [1]:
import torch
import torch.nn as nn
import torch.nn.functional as F

from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.models import Sequential

In [2]:
class SelfAttention(nn.Module):
    """Multi-head scaled dot-product self-attention.

    The input is projected to queries, keys and values with three linear
    layers, split into ``heads`` equally sized heads, attended per head,
    re-joined and passed through a final linear layer.

    Args:
        embed_dim: Size of the last dimension of the input (and of the output).
        heads: Number of attention heads. ``embed_dim`` must be divisible by it.
    """

    def __init__(self, embed_dim, heads = 1):
        super().__init__()
        assert embed_dim % heads == 0, "Embedding dimension must be divisible by number of heads."
        self.embed_dim = embed_dim
        self.heads = heads
        self.head_dim = embed_dim // heads

        # Linear projections for queries, keys and values.
        # (Creation order is kept as query, key, value, out so that seeded
        # initialisation yields the same weights as before.)
        self.query = nn.Linear(embed_dim, embed_dim)
        self.key = nn.Linear(embed_dim, embed_dim)
        self.value = nn.Linear(embed_dim, embed_dim)

        # Output projection applied after the heads are concatenated
        self.out = nn.Linear(embed_dim, embed_dim)

    def _split_heads(self, t, batch_size, seq_length):
        """Reshape (batch, seq, embed_dim) -> (batch, heads, seq, head_dim)."""
        return t.view(batch_size, seq_length, self.heads, self.head_dim).transpose(1, 2)

    def forward(self, x, mask=None):
        """Apply self-attention to ``x``.

        Args:
            x: Tensor of shape (batch, seq_length, embed_dim).
            mask: Optional tensor marking which key positions may be attended
                to (True / non-zero = keep, False / zero = ignore, e.g.
                padding). Accepted shapes:
                  * (batch, seq_length)             - per-sequence key mask
                  * (batch, seq_length, seq_length) - per-query/key mask
                  * anything broadcastable to
                    (batch, heads, seq_length, seq_length)
                ``None`` (the default) attends to every position.

        Returns:
            Tensor of shape (batch, seq_length, embed_dim).
        """
        batch_size, seq_length, embed_dim = x.size()

        # Project, then split into heads: (batch, heads, seq_length, head_dim)
        Q = self._split_heads(self.query(x), batch_size, seq_length)
        K = self._split_heads(self.key(x), batch_size, seq_length)
        V = self._split_heads(self.value(x), batch_size, seq_length)

        # Scaled dot-product scores: (batch, heads, seq_length, seq_length).
        # A plain Python float is used for the scale so no tensor has to be
        # allocated on every call.
        scores = torch.matmul(Q, K.transpose(-2, -1)) / (self.head_dim ** 0.5)

        if mask is not None:
            keep = mask.to(device=scores.device, dtype=torch.bool)
            if keep.dim() == 2:
                keep = keep[:, None, None, :]   # broadcast over heads and queries
            elif keep.dim() == 3:
                keep = keep[:, None, :, :]      # broadcast over heads
            # Use the most negative finite value instead of -inf so that a row
            # whose keys are all masked yields uniform weights rather than NaN.
            scores = scores.masked_fill(~keep, torch.finfo(scores.dtype).min)

        attention = F.softmax(scores, dim=-1)

        # Weighted sum of values: (batch, heads, seq_length, head_dim)
        out = torch.matmul(attention, V)

        # Concatenate heads back to (batch, seq_length, embed_dim)
        out = out.transpose(1, 2).contiguous().view(batch_size, seq_length, embed_dim)

        # Final linear layer
        return self.out(out)

#### Example 1

In [3]:
import torch
import torch.nn.functional as F

# Step 1 - the toy corpus
sentences = [
    "Cash is the king",
    "Do not time the market",
]

# Step 2 - tokenize: lowercase every sentence, then split on whitespace
tokenized_sentences = []
for text in sentences:
    tokenized_sentences.append(text.lower().split())
print("Tokenized Sentences:", tokenized_sentences)

# Step 3 - vocabulary: each new word gets the next free index.
# Index 0 is reserved for padding, so numbering starts at 1.
vocab = {}
for words in tokenized_sentences:
    for word in words:
        vocab.setdefault(word, len(vocab) + 1)
current_index = len(vocab) + 1  # next unused index

print("Vocabulary:", vocab)

# (An '<UNK>' entry for unknown words could be added to the vocabulary here
# at `current_index`; this example does not use one.)

# Step 4 - encode: map every word to its index (0 when the word is missing)
encoded_sentences = [
    [vocab.get(word, 0) for word in words]
    for words in tokenized_sentences
]

print("Encoded Sentences:", encoded_sentences)

# Step 5 - pad: right-pad every sequence with 0 up to the longest length
max_length = max(map(len, encoded_sentences))

padded_sentences = []
for ids in encoded_sentences:
    n_pad = max_length - len(ids)
    padded_sentences.append(ids + [0] * n_pad)

print("Padded Sentences:", padded_sentences)

# For the two sentences above the longest one has 5 tokens
print("Max Length:", max_length)

# Step 6 - turn the padded index lists into an integer tensor
encoded_tensor = torch.tensor(padded_sentences, dtype=torch.long)
print("Encoded Tensor:")
print(encoded_tensor)
print("Tensor Shape:", encoded_tensor.shape)  # (2, 5) for this corpus

# Step 7 - one-hot encode.
# One class per vocabulary word plus one for the padding index 0.
num_classes = len(vocab) + 1

# F.one_hot appends a new last dimension of size `num_classes`;
# the integer result is cast to float.
one_hot_tensor = F.one_hot(encoded_tensor, num_classes).float()

print("One-Hot Encoded Tensor:")
print(one_hot_tensor)
print("One-Hot Tensor Shape:", one_hot_tensor.shape)  # (2, 5, 9) for this corpus

Tokenized Sentences: [['cash', 'is', 'the', 'king'], ['do', 'not', 'time', 'the', 'market']]
Vocabulary: {'cash': 1, 'is': 2, 'the': 3, 'king': 4, 'do': 5, 'not': 6, 'time': 7, 'market': 8}
Encoded Sentences: [[1, 2, 3, 4], [5, 6, 7, 3, 8]]
Padded Sentences: [[1, 2, 3, 4, 0], [5, 6, 7, 3, 8]]
Max Length: 5
Encoded Tensor:
tensor([[1, 2, 3, 4, 0],
        [5, 6, 7, 3, 8]])
Tensor Shape: torch.Size([2, 5])
One-Hot Encoded Tensor:
tensor([[[0., 1., 0., 0., 0., 0., 0., 0., 0.],
         [0., 0., 1., 0., 0., 0., 0., 0., 0.],
         [0., 0., 0., 1., 0., 0., 0., 0., 0.],
         [0., 0., 0., 0., 1., 0., 0., 0., 0.],
         [1., 0., 0., 0., 0., 0., 0., 0., 0.]],

        [[0., 0., 0., 0., 0., 1., 0., 0., 0.],
         [0., 0., 0., 0., 0., 0., 1., 0., 0.],
         [0., 0., 0., 0., 0., 0., 0., 1., 0.],
         [0., 0., 0., 1., 0., 0., 0., 0., 0.],
         [0., 0., 0., 0., 0., 0., 0., 0., 1.]]])
One-Hot Tensor Shape: torch.Size([2, 5, 9])


**Explanation of Each Step:**
- Tokenization:
    - Converts each sentence to lowercase for consistency.
    - Splits sentences into lists of tokens (words).
- Vocabulary Building:
    - Iterates through all tokens and assigns a unique integer to each unique token.
    - Starts indexing from 1 to reserve 0 for padding purposes.
- Encoding:
    - Transforms each token in the sentences to its corresponding integer index based on the vocabulary.
    - Tokens not found in the vocabulary are replaced with 0, which is also the padding index, so unknown words become indistinguishable from padding; reserve a dedicated `<UNK>` index if you need to tell them apart.
- Padding:
    - Determines the length of the longest sentence to ensure all sequences are of equal length.
    - Pads shorter sentences with 0 (the padding index).
- One-Hot Encoding:
    - Converts the indexed tokens into one-hot vectors.
    - The torch.nn.functional.one_hot function is used for this purpose.
    - The one-hot encoding includes the padding class (index 0), so each token becomes a vector of length 9 (8 vocabulary words + 1 padding class) and the resulting tensor has shape (2, 5, 9).

In [4]:
# Seed the global RNG so the randomly initialised projection weights - and
# therefore the printed output - are identical on every Restart & Run All.
torch.manual_seed(0)

# The embedding size is the width of the one-hot vectors (last dimension of
# the input), read from the tensor instead of being hard-coded.
self_attention = SelfAttention(embed_dim = one_hot_tensor.size(-1), heads = 1)
output = self_attention(one_hot_tensor)
print("Self-Attention Output:\n", output)

Self-Attention Output:
 tensor([[[ 0.2029,  0.1996, -0.2246, -0.0150, -0.4431, -0.0509, -0.0603,
           0.3259, -0.1087],
         [ 0.2048,  0.1972, -0.2192, -0.0119, -0.4437, -0.0512, -0.0606,
           0.3270, -0.1078],
         [ 0.2015,  0.1982, -0.2274, -0.0164, -0.4435, -0.0497, -0.0574,
           0.3256, -0.1110],
         [ 0.2043,  0.1970, -0.2212, -0.0132, -0.4424, -0.0496, -0.0596,
           0.3257, -0.1095],
         [ 0.2038,  0.1969, -0.2227, -0.0141, -0.4417, -0.0486, -0.0587,
           0.3251, -0.1106]],

        [[ 0.1540,  0.1029, -0.2362, -0.0197, -0.4905, -0.1299, -0.1160,
           0.2796, -0.1280],
         [ 0.1545,  0.1044, -0.2347, -0.0203, -0.4920, -0.1325, -0.1166,
           0.2800, -0.1266],
         [ 0.1523,  0.1078, -0.2344, -0.0217, -0.4923, -0.1314, -0.1166,
           0.2803, -0.1244],
         [ 0.1548,  0.1040, -0.2393, -0.0226, -0.4937, -0.1304, -0.1140,
           0.2813, -0.1292],
         [ 0.1540,  0.1053, -0.2335, -0.0207, -0.4929, -