In [13]:
import pandas as pd
import numpy as np
import torch
import torch.functional as F
import torch.nn as nn

import math

from transformer import Positional_Encoding

torch.set_printoptions(precision=3) # Print torch tensors with 3 digits after the decimal point

In [2]:
d_model = 8 # the length of the embedding dimension
max_length = 3000 # the total number of positions we produce positional encodings for
vocab_size = 40 # number of distinct token ids; valid ids are 0 .. vocab_size - 1

# Creating our data (20 sentences with 6 words in each sentence).
# torch.rand() samples floats in [0, 1), so casting them with .long() would
# truncate every entry to 0. torch.randint() draws integer token ids
# uniformly from [0, vocab_size) and already returns an int64 tensor.
data = torch.randint(low=0, high=vocab_size, size=(20, 6))
# [[w_11, w_12, ... w_1n]]
# [[w_21, ..., ...  ... ]]
# [[..., ..., ...   ... ]]
# [[..., ..., ...   ... ]]
# [[..., ..., ...   ... ]]
# [[w_m1, ..., ...  w_mn]]

In [3]:
# Embedding table: a vocabulary of 40 token ids, each mapped to a
# d_model-sized vector (d_model = 8 in this notebook).
embedding_layer = nn.Embedding(num_embeddings=40, embedding_dim=d_model)
embeddings = embedding_layer(data)

# Positional encoder built with the same d_model as the embeddings.
positional_layer = Positional_Encoding(max_length=max_length, d_model=d_model)

# Pass the word embeddings through the positional layer.
# NOTE(review): presumably this adds the positional encodings onto the
# embeddings -- confirm against transformer.Positional_Encoding.
positional_encodings = positional_layer(embeddings)

In [4]:
# Scaled dot-product attention -- projection step

# Query, key and value each get their own bias-free linear layer.
# Every layer maps d_model -> d_model, so each holds an (8 x 8) weight matrix.

query_weights = nn.Linear(in_features=d_model, out_features=d_model, bias=False)
key_weights = nn.Linear(in_features=d_model, out_features=d_model, bias=False)
value_weights = nn.Linear(in_features=d_model, out_features=d_model, bias=False)

# nn.Linear is applied over the last dimension and broadcast over the leading
# ones, so Q, K and V all come out with the same shape as the input.

Q, K, V = (
    projection(positional_encodings)
    for projection in (query_weights, key_weights, value_weights)
)

In [5]:
# Q, K and V share one shape, so the batch size can be read off any of them
batch_size = Q.size(0)

num_heads = 2
d_keys = d_model // num_heads

# .view() is PyTorch's reshape-without-copy.
# Splitting into heads cuts the last (embedding) dimension into num_heads
# slices of d_keys features each.
# A -1 entry tells .view() to infer that dimension from the remaining ones.

# Before the split: [Batch_Size, Sentence_Length, Embedding_Dimensions (d_model)]
print(f"Original Dimension: {Q.size()}")
Q, K, V = (
    tensor.view(batch_size, -1, num_heads, d_keys)
    for tensor in (Q, K, V)
)

# After the split: [Batch_Size, Sentence_Length, Num_Of_Heads, d_model / Num_Of_Heads]
print(f"Reshaped Dimension: {Q.size()}")

Original Dimension: torch.Size([20, 6, 8])
Reshaped Dimension: torch.Size([20, 6, 2, 4])


torch.Size([20, 6, 2, 2])

In [8]:
# Swap the sentence-length and head axes so the layout becomes
# [Batch_Size, Num_Of_Heads, Sentence_Length, Embedding_Dimensions (d_model) / Num_Of_Heads]
print(f"Reshaped Dimension: {Q.size()}")

# transpose(1, 2) on a 4-D tensor is the same reordering as permute(0, 2, 1, 3)
Q, K, V = (tensor.transpose(1, 2) for tensor in (Q, K, V))

print(f"Permuted Dimension: {Q.size()}")

Reshaped Dimension: torch.Size([20, 6, 2, 4])
Permuted Dimension: torch.Size([20, 2, 6, 4])


In [28]:
# Swap the last two axes of K so that Q @ K_T contracts over the d_keys axis
K_T = K.transpose(-2, -1)

# Raw similarity scores, then scaled by sqrt(d_keys)
scaled_dot_prod = Q @ K_T
scaled_dot_prod = scaled_dot_prod / math.sqrt(d_keys)

# Softmax over the last axis turns each row of scores into weights summing to 1
attention_probs = scaled_dot_prod.softmax(dim=-1)

# Weighted sum of the value vectors
attention_scores = attention_probs @ V

attention_scores.size()

torch.Size([20, 2, 6, 4])

In [29]:
class Scalar_Dot_Product_Attention(nn.Module):
    """Single-head scaled dot-product self-attention.

    Projects the input with three bias-free linear layers (query, key, value)
    and returns softmax(Q @ K^T / sqrt(d_model)) @ V.
    """

    def __init__(self, d_model:int, mask=None) -> None:
        """
        d_model (int): the dimension of the word embeddings
        mask: accepted for backward compatibility only; it is neither stored
            nor applied by this module.
        """
        super().__init__()

        self.d_model = d_model

        # Query, Key, Value Weights
        self.Qw = nn.Linear(d_model, d_model, bias=False)
        self.Kw = nn.Linear(d_model, d_model, bias=False)
        self.Vw = nn.Linear(d_model, d_model, bias=False)

    def forward(self, X: torch.Tensor) -> torch.Tensor:
        """
        X (torch.Tensor): a Tensor that contains the sum between the Word Embeddings
            and Positional Encodings, shaped (..., sentence_length, d_model)

        returns (torch.Tensor): Returns the attention score of the input X,
            with the same shape as X
        """
        # nn.Linear layers are modules, not weight tensors: they must be
        # called on the input rather than used as a matmul operand.
        Q = self.Qw(X)
        K = self.Kw(X)
        V = self.Vw(X)

        # Swap the last two axes of K so this works for both batched and
        # unbatched input, and scale by the module's own d_model instead of
        # relying on a notebook-level global.
        scaled_dot_prod = (Q @ K.transpose(-2, -1)) / math.sqrt(self.d_model)
        attention_prob = scaled_dot_prod.softmax(dim=-1)

        attention_scores = attention_prob @ V

        return attention_scores

In [43]:
class Scalar_Product_Attention(nn.Module):
    """Dot-product self-attention with explicit weight matrices.

    The query, key and value projections are plain ``nn.Parameter`` matrices
    initialised uniformly in [0, 1) via ``torch.rand``.
    """

    def __init__(self, shape:tuple, normalized: bool=True):
        """
        shape (tuple): shape of each projection matrix, (input_dim, projected_dim)
        normalized (bool): when True, divide the query-key scores by
            sqrt(projected_dim) before the softmax
        """
        super().__init__()

        self.query_weights = nn.Parameter(torch.rand(shape))
        self.value_weights = nn.Parameter(torch.rand(shape))
        self.key_weights = nn.Parameter(torch.rand(shape))

        self.normalized = normalized

    def forward(self, x: torch.Tensor):
        """
        x (torch.Tensor): input of shape (..., sequence_length, input_dim)

        returns (torch.Tensor): attention output of shape
            (..., sequence_length, projected_dim)
        """
        query_matrix = x @ self.query_weights
        value_matrix = x @ self.value_weights
        key_matrix = x @ self.key_weights

        # Transpose only the last two axes; `.T` reverses every axis and is
        # only correct for 2-D input.
        dot_product = query_matrix @ key_matrix.transpose(-2, -1)

        if self.normalized:
            # Scale by sqrt of the key feature size. `key_matrix.size` is a
            # method, so it has to be called; math.sqrt handles the plain int.
            dot_product = dot_product / math.sqrt(key_matrix.size(-1))

        # Normalise over the key axis (the last one); identical to dim=1 for
        # 2-D input but also correct when leading batch dimensions are present.
        softmax_dot_product = torch.nn.functional.softmax(dot_product, dim=-1)

        attention_score = softmax_dot_product @ value_matrix

        return attention_score

In [40]:
# 2x2 projection matrices; normalized=False skips the scaling branch in forward()
scalar_product_attention = Scalar_Product_Attention(shape=(2, 2), normalized=False)

In [17]:
# Inspect the three projection parameters. A notebook cell only renders its
# last bare expression, so the output shown for this cell is key_weights alone.
scalar_product_attention.query_weights
scalar_product_attention.value_weights
scalar_product_attention.key_weights

Parameter containing:
tensor([[0.9998, 0.6929],
        [0.8881, 0.5199]], requires_grad=True)

In [18]:
# Check the shapes of the three projection parameters. Only the last bare
# expression is rendered, so the single torch.Size shown belongs to key_weights.
scalar_product_attention.query_weights.size()
scalar_product_attention.value_weights.size()
scalar_product_attention.key_weights.size()

torch.Size([2, 2])

In [46]:
# Call the module itself instead of .forward(): nn.Module.__call__ runs any
# registered hooks and then dispatches to forward().
scalar_product_attention(torch.rand(2,2))

tensor([[0.7761, 0.6186],
        [0.7742, 0.6172]], grad_fn=<MmBackward0>)