In [1]:
%pip install transformers

Note: you may need to restart the kernel to use updated packages.


In [2]:
from transformers import CLIPTokenizer

tokenizer = CLIPTokenizer.from_pretrained('openai/clip-vit-base-patch32')
text = 'HEY THERE BABE MY NAME JEFF'

encoded = tokenizer.encode(text)

In [3]:
len(encoded)



8

In [4]:
import torch
import torch.nn as nn
from torch.nn import MultiheadAttention, TransformerEncoder, TransformerEncoderLayer, TransformerDecoder, TransformerDecoderLayer

class TextEncoder(nn.Module):
    def __init__(self):
        super(TextEncoder, self).__init__() 

        self.encoder = TransformerEncoder(
            encoder_layer=TransformerEncoderLayer(d_model=512, nhead=8),
            num_layers=6
        )

        self.linear = nn.Linear(512, 512)
    
    def forward(self):    
        return self.linear(self.encoder(encoded))
    

model = TextEncoder

torch.save(model.state_dict, './transformerenc.pth')

In [5]:
import numpy as np
import pandas as pd

L, d_k, d_v = 4,8,8
q = np.random.randn(L, d_k) # random number from the normal distribution
v = np.random.randn(L, d_v)
k = np.random.randn(L, d_k)

q
v
k

array([[-0.40814084, -0.37332957,  0.41736496,  0.56456025, -0.26873265,
        -0.01499254,  1.53621261, -1.10453972],
       [-0.03733441, -1.71067316, -0.06243443,  0.19111577,  0.2256944 ,
        -0.75196169, -0.22457096, -0.96500822],
       [-0.35982965,  0.12960284,  1.54211511, -1.08846447,  1.0357191 ,
        -0.77953158,  0.27429107,  0.16560335],
       [ 1.31532135,  0.05855913, -0.66361969, -0.47063576, -0.84205372,
         0.0708315 , -0.93978473,  0.93058742]])

In [6]:
# self-attention = ( scaled + masked ) softmax

# scaled 
scaled = np.matmul(q, k.T) / np.sqrt(d_k)
scaled

array([[ 0.02062012, -0.05819827,  0.91341086, -0.30045746],
       [ 1.34369878, -0.49830369, -0.95924016, -0.48568279],
       [ 1.08953297,  1.2021132 , -1.08492425, -0.17009201],
       [-0.58208582,  0.47509466,  0.06232977,  0.15541014]])

In [7]:
# masked, its for the decoder

# np.tril creates a triangular vector space
mask = np.tril(np.ones( (L,L) ))
mask[mask == 0] = -np.inf
mask[mask == 1] = 0
mask

array([[  0., -inf, -inf, -inf],
       [  0.,   0., -inf, -inf],
       [  0.,   0.,   0., -inf],
       [  0.,   0.,   0.,   0.]])

In [8]:
scaled + mask  # -inf such that no context needs to be extracted.

array([[ 0.02062012,        -inf,        -inf,        -inf],
       [ 1.34369878, -0.49830369,        -inf,        -inf],
       [ 1.08953297,  1.2021132 , -1.08492425,        -inf],
       [-0.58208582,  0.47509466,  0.06232977,  0.15541014]])

In [9]:
# softmax function
# converts a vector into a probability dist, such that all row values add up to 1.

def softmax(x):
    return (np.exp(x).T)/np.sum(np.exp(x), axis=-1).T
softmax(scaled+mask)


array([[1.        , 0.86318536, 0.44786176, 0.12700329],
       [0.        , 0.13681464, 0.50122987, 0.3655465 ],
       [0.        , 0.        , 0.05090837, 0.24192521],
       [0.        , 0.        , 0.        , 0.265525  ]])

In [10]:
new_vector = np.matmul(softmax(scaled+mask), v)

print(new_vector, " new vector")
print("  ")
print(v, " original vector")

[[ 0.41170212 -0.73139916  0.39767492  1.15536245  0.80348307  0.67167837
   1.57912703 -3.20815522]
 [ 0.41225352 -0.21620246  0.89882512  0.55613937  0.32930705  0.86711631
   0.13231114 -0.48185702]
 [-0.10033232 -0.25774937  0.54043901 -0.06270601  0.00987469  0.57924564
  -0.11411772 -0.11104061]
 [-0.1679168  -0.30765231  0.57179873 -0.13739818  0.00828832  0.60921472
  -0.14761985 -0.11644557]]  new vector
  
[[-0.7595624  -0.68953117  0.45757243 -0.14960064 -1.08227542  1.49019217
   0.61954042 -1.38742086]
 [ 0.91325518 -0.10794262 -0.58458619  0.95114877  2.15638205 -1.53224216
   0.98575391 -1.99439988]
 [ 1.03440921  0.44312717  0.38228674  1.22730806  0.04563167  0.47492699
   0.4003609  -0.09712925]
 [-0.63239547 -1.15865668  2.15346481 -0.51745856  0.03121482  2.29437804
  -0.55595463 -0.43854842]]  original vector


In [None]:
import sentencepiece as sp 




In [22]:
##########################################################################################################################
#######################################################################################################################
# Multihead attention

import torch
import torch.nn as nn

sequence_length = 5
batch_size = 1
input_dim = 512
d_model = 512
x = torch.randn((batch_size, sequence_length, input_dim)) # random data
print(x.shape)

x = x.view(x.size(-1))
print(x.shape)

qkv_layer = nn.Linear(input_dim, 3)
q,k,v = qkv_layer(x)


torch.Size([1, 5, 512])


RuntimeError: shape '[512]' is invalid for input of size 2560

In [12]:
num_heads = 8
head_dim = d_model // num_heads
qkv = qkv.reshape(batch_size, sequence_length, num_heads, 3*head_dim)
qkv.shape
q,k,v = qkv
q
k
v

ValueError: not enough values to unpack (expected 3, got 1)