In [3]:
import numpy as np
import torch 
import torch.nn as nn

from transformer import Positional_Encoding
import math

torch.set_printoptions(precision=3) # Sets the precision of torch tensors to the thousands place

In [7]:
d_model = 8 # the length of the embedding dimension
max_length = 3000 # the total indexes we are producing for out positional encodings

data = torch.rand(20, 6).long() # Creating our data (20 sentences with 6 words in each sentence)
# [[w_11, w_12, ... w_1n]]
# [[w_21, ..., ...  ... ]]
# [[..., ..., ...   ... ]]
# [[..., ..., ...   ... ]]
# [[..., ..., ...   ... ]]
# [[w_m1, ..., ...  w_mn]]

In [8]:
# Assuming our vocab size = 40
# We create embedding dimensions of d_model, which in this case = 8
embedding_layer = nn.Embedding(40, embedding_dim=d_model)
embeddings = embedding_layer(data)

# create positional encodings = to the embedding dimensions (which is 8)
positional_layer = Positional_Encoding(d_model=d_model, max_length=max_length)

# Add upon our word embeddings to our positional_encodings
positional_encodings = positional_layer(embeddings)

In [9]:
# Scalar-Dot-Product Attention

# Create FeedForward Layer for Query, Key, Value weights
# The weights' dimensions all need to be the same dimensions (8x8)

query_weights = nn.Linear(d_model, d_model, bias=False)
key_weights = nn.Linear(d_model, d_model, bias=False)
value_weights = nn.Linear(d_model, d_model, bias=False)

# Implement Broadcasting Matrix Multiplication
# Should return the same dimensions for Q, K, V

Q = query_weights(positional_encodings)
K = key_weights(positional_encodings)
V = value_weights(positional_encodings)

In [10]:
# The query, value, key matrix should all be the same size
batch_size = Q.size(0)

num_heads = 2
d_keys = d_model // num_heads

# view is essentially reshape for pytorch
# heads in Multi-Head Attention essentially act like workers as they divide up the embeddings into smaller groups 
# this allows faster performace 
# - 1 means length of the dimension

# Original Dimensions: [Batch_Size, Sentence_Length, Embedding_Dimensions (d_model)] 
print(f"Original Dimension: {Q.size()}")
Q = Q.view(batch_size, -1, num_heads, d_keys)
K = K.view(batch_size, -1, num_heads, d_keys)
V = V.view(batch_size, -1, num_heads, d_keys)

# Reshaped Dimensions: [Batch_Size, Sentence_Length, Num_Of_Heads, Embedding_Dimensions (d_model) / Num_Of_Heads]
print(f"Reshaped Dimension: {Q.size()}")

Original Dimension: torch.Size([20, 6, 8])
Reshaped Dimension: torch.Size([20, 6, 2, 4])


In [11]:
class Multi_Head_Attention(nn.Module):
    def __init__(self, d_model:int, heads:int:4, dropout_value:int=0.1, mask=None) -> None:
        super().__init__()

        """
        d_model (int): the dimension of the word embeddings
        heads (int): the number of heads 
        """

        # Dimension of Embedding
        self.d_model = d_model

        # Number of Heads in our Multi-Head-Attention
        self.heads = heads

        # Query, Key, Value Weights
        self.Qw = nn.Linear(d_model, d_model, bias=False)
        self.Kw = nn.Linear(d_model, d_model, bias=False)
        self.Vw = nn.Linear(d_model, d_model, bias=False)
        self.Ow = nn.Linear(d_model, d_model, bias=False)

        # 
        self.drop_out = nn.Dropout(p=dropout_value)

    def forward(self, X:torch.Tensor()) -> torch.Tensor():
        """
        X (torch.Tensor): a Tensor that contains the sum between the Word Embeddings and Positional Encodings

        Returns (torch.Tensor): Returns the attention score of the input X
        """

        # Query, Key, Value Matrix
        Q = X @ self.Qw
        K = X @ self.Kw
        V = X @ self.Vw

        # Batch_Size should be the same for Query, Key, and Value Matrix
        batch_size = Q.size(0)

        # Creates the key dimensions
        d_keys = d_model // self.heads

        Q = Q.view(batch_size, -1, self.heads, d_keys).permute(0,2,1,3)
        K = K.view(batch_size, -1, self.heads, d_keys).permute(0,2,1,3)
        V = V.view(batch_size, -1, self.heads, d_keys).permute(0,2,1,3)

        scaled_dot_prod = (Q @ K.permute(0,2,1)) / math.sqrt(d_keys)
        attention_prob = scaled_dot_prod.softmax(dim=-1)

        
        A = self.drop_out(attention_prob @ V)
        A = A.permute(0,2,1,3).view(batch_size, -1, self.heads * d_keys)
        
        attention_scores = self.Ow(A)
        
        return attention_scores

SyntaxError: invalid syntax (4041656035.py, line 2)