### Part 1. Dot-product similarity

In [None]:
import torch

In [None]:
# Size of every toy word vector.
dimension = 6

# Toy embedding table: the k-th word (1-based) maps to a constant vector
# filled with k, which makes the similarities below easy to check by hand.
word_dictionary = {
    word: torch.ones(dimension) * scale
    for scale, word in enumerate(('I', 'am', 'a', 'cat', 'dog'), start=1)
}

# Two example sentences that differ only in their last word.
data_1, data_2 = 'I am a cat', 'I am a dog'

# Show the whole table, one "word : vector" line per entry.
for word, vector in word_dictionary.items():
    print(f'{word} : {vector}')

I : tensor([1., 1., 1., 1., 1., 1.])
am : tensor([2., 2., 2., 2., 2., 2.])
a : tensor([3., 3., 3., 3., 3., 3.])
cat : tensor([4., 4., 4., 4., 4., 4.])
dog : tensor([5., 5., 5., 5., 5., 5.])


Similarity between two vectors

In [None]:
# Goal of this part: 'dot-product' similarity.
# Warm-up: similarity between a single pair of vectors.

# Pick the vectors of two words from the toy embedding table.
token_1, token_2 = word_dictionary['I'], word_dictionary['cat']

# One vector per line.
print(token_1, token_2, sep='\n')

tensor([1., 1., 1., 1., 1., 1.])
tensor([4., 4., 4., 4., 4., 4.])


In [None]:
# A single word vector is 1-D: its only axis has `dimension` entries.
token_2.shape

torch.Size([6])

In [None]:
# How to calculate similarity between token_1 & token_2?
# Answer should be a single scalar.

######### Your Answer #########
# Both tokens are 1-D, so their dot product needs no transpose.
# (`.T` on a tensor that is not 2-D is deprecated in PyTorch and emits a
# UserWarning, so it is deliberately avoided here.)
sim = torch.dot(token_1, token_2)
###############################

# A 0-d tensor: shape is torch.Size([]).
print(sim.shape)
print(sim)

torch.Size([])
tensor(24.)


  sim = token_1 @ token_2.T


Similarity among vectors

In [None]:
# Next: similarities among all the words of one sentence,
# 'I am a cat' (data_1).

# Look up each word's vector and stack them row-wise, giving one row per
# token: [number_of_tokens, dimension].
tokens = torch.stack([word_dictionary[word] for word in data_1.split()], dim=0)

print(tokens.shape)
print(tokens)

torch.Size([4, 6])
tensor([[1., 1., 1., 1., 1., 1.],
        [2., 2., 2., 2., 2., 2.],
        [3., 3., 3., 3., 3., 3.],
        [4., 4., 4., 4., 4., 4.]])


In [None]:
# Expected result: a [number_of_tokens x number_of_tokens] matrix,
# where number_of_tokens = 4.

######### Your Answer #########
# Entry (i, j) is the dot product of row i and row j of `tokens`.
sim = torch.matmul(tokens, tokens.transpose(0, 1))
###############################

print(sim.shape)
print(sim)

torch.Size([4, 4])
tensor([[ 6., 12., 18., 24.],
        [12., 24., 36., 48.],
        [18., 36., 54., 72.],
        [24., 48., 72., 96.]])


Similarity among vectors in batch form

In [None]:
# Now calculate similarities among all the words in two sentences,
# 'I am a cat' & 'I am a dog' (data_1 & data_2), in batch form.

# Build one [number_of_tokens, dimension] matrix per sentence ...
sentences = []
for data in (data_1, data_2):
    tokens = torch.stack([word_dictionary[word] for word in data.split()], dim=0)
    sentences.append(tokens)

# ... then stack the sentences along a new leading batch axis.
batch = torch.stack(sentences, dim=0)

print(batch.shape)
print(batch)

torch.Size([2, 4, 6])
tensor([[[1., 1., 1., 1., 1., 1.],
         [2., 2., 2., 2., 2., 2.],
         [3., 3., 3., 3., 3., 3.],
         [4., 4., 4., 4., 4., 4.]],

        [[1., 1., 1., 1., 1., 1.],
         [2., 2., 2., 2., 2., 2.],
         [3., 3., 3., 3., 3., 3.],
         [5., 5., 5., 5., 5., 5.]]])


In [None]:
# Expected result: a [batch x number_of_tokens x number_of_tokens] tensor,
# with batch = 2 and number_of_tokens = 4.

######### Your Answer #########
# batch shape : [batch, number_of_tokens, dimension]
# Swap only the last two axes so each sentence is multiplied by its own
# transpose; the leading batch axis is left alone.
# (transpose(1, 2) is equivalent for this 3-D tensor.)
sim = torch.matmul(batch, batch.transpose(-2, -1))
###############################

print(sim.shape)
print(sim)

torch.Size([2, 4, 4])
tensor([[[  6.,  12.,  18.,  24.],
         [ 12.,  24.,  36.,  48.],
         [ 18.,  36.,  54.,  72.],
         [ 24.,  48.,  72.,  96.]],

        [[  6.,  12.,  18.,  30.],
         [ 12.,  24.,  36.,  60.],
         [ 18.,  36.,  54.,  90.],
         [ 30.,  60.,  90., 150.]]])


### Part 2. Attention mechanism

In [None]:
import torch
import torch.nn as nn

In [None]:
# Size of every toy word vector.
dimension = 6

# Same toy embedding table as in Part 1: the k-th word (1-based) maps to a
# constant vector filled with k.
word_dictionary = {
    word: torch.ones(dimension) * scale
    for scale, word in enumerate(('I', 'am', 'a', 'cat', 'dog'), start=1)
}

# Two example sentences that differ only in their last word.
data_1, data_2 = 'I am a cat', 'I am a dog'

# Show the whole table, one "word : vector" line per entry.
for word, vector in word_dictionary.items():
    print(f'{word} : {vector}')

I : tensor([1., 1., 1., 1., 1., 1.])
am : tensor([2., 2., 2., 2., 2., 2.])
a : tensor([3., 3., 3., 3., 3., 3.])
cat : tensor([4., 4., 4., 4., 4., 4.])
dog : tensor([5., 5., 5., 5., 5., 5.])


In [None]:
# Build one [number_of_tokens, dimension] matrix per sentence ...
sentences = []
for data in (data_1, data_2):
    tokens = torch.stack([word_dictionary[word] for word in data.split()], dim=0)
    sentences.append(tokens)

# ... then stack the sentences along a new leading batch axis.
batch = torch.stack(sentences, dim=0)

print(batch.shape)
print(batch)

torch.Size([2, 4, 6])
tensor([[[1., 1., 1., 1., 1., 1.],
         [2., 2., 2., 2., 2., 2.],
         [3., 3., 3., 3., 3., 3.],
         [4., 4., 4., 4., 4., 4.]],

        [[1., 1., 1., 1., 1., 1.],
         [2., 2., 2., 2., 2., 2.],
         [3., 3., 3., 3., 3., 3.],
         [5., 5., 5., 5., 5., 5.]]])


In [None]:
# batch shape : [batch, number_of_tokens, dimension]
# Query, Key & Value are produced by three separate nn.Linear layers.
# Each layer maps dimension -> dimension, so the tensor shape is unchanged.

######### Your Answer #########
# Three independent projections, created in query/key/value order.
to_query, to_key, to_value = (nn.Linear(dimension, dimension) for _ in range(3))

# nn.Linear acts on the last axis, so every token is projected independently.
query, key, value = to_query(batch), to_key(batch), to_value(batch)
###############################

print(f'Query : {query.shape}')
print(f'Key : {key.shape}')
print(f'Value : {value.shape}')

Query : torch.Size([2, 4, 6])
Key : torch.Size([2, 4, 6])
Value : torch.Size([2, 4, 6])


In [None]:
# Implement the attention mechanism.
# Following part 1:
# attention_score should be [batch x number_of_tokens x number_of_tokens]
# contextualized_tokens should be [batch, number_of_tokens, dimension]

######### Your Answer #########
# Query-key dot products, normalised with a softmax over the last dimension
# so that each query token's weights over the key tokens sum to 1.
# NOTE: the scores are not scaled before the softmax (plain dot-product
# attention).
attention_score = (query @ key.transpose(-2, -1)).softmax(dim=-1)

# Each output token is the attention-weighted sum of the value vectors.
contextualized_tokens = torch.matmul(attention_score, value)
###############################

print(f'attention_score : {attention_score.shape}')
print(f'contextualized_tokens : {contextualized_tokens.shape}')

attention_score : torch.Size([2, 4, 4])
contextualized_tokens : torch.Size([2, 4, 6])
