In [1]:
import numpy as np
from scipy.special import softmax

#### Step1 : 입력 표현

In [2]:
# Toy input for the walkthrough: three input vectors (one per row),
# each with four features (d_model = 4). Stored as floats.
print("Step 1: Input : 3 inputs, d_model=4")
x = np.array(
    [
        [1, 0, 1, 0],
        [0, 2, 0, 2],
        [1, 1, 1, 1],
    ],
    dtype=float,
)
print(x)

Step 1: Input : 3 inputs, d_model=4
[[1. 0. 1. 0.]
 [0. 2. 0. 2.]
 [1. 1. 1. 1.]]


#### Step2: 가중치 행렬 초기화

In [3]:
# Hand-picked (untrained) projection matrices for the walkthrough:
#   w_query -> projects inputs to queries
#   w_key   -> projects inputs to keys
#   w_value -> projects inputs to values
# Each matrix has 4 rows and 3 columns, so multiplying a 4-feature input
# row by it yields a 3-feature vector.
print("Step 2: weights 3 dimensions x d_mode=4")

w_query = np.array([[1, 0, 1], [1, 0, 0], [0, 0, 1], [0, 1, 1]])
w_key = np.array([[0, 0, 1], [1, 1, 0], [0, 1, 0], [1, 1, 0]])
w_value = np.array([[0, 2, 0], [0, 3, 0], [1, 0, 3], [1, 1, 0]])

# Print each label followed by its matrix on the next line.
print("w_query", w_query, sep="\n")
print("w_key", w_key, sep="\n")
print("w_value", w_value, sep="\n")

Step 2: weights 3 dimensions x d_mode=4
w_query
[[1 0 1]
 [1 0 0]
 [0 0 1]
 [0 1 1]]
w_key
[[0 0 1]
 [1 1 0]
 [0 1 0]
 [1 1 0]]
w_value
[[0 2 0]
 [0 3 0]
 [1 0 3]
 [1 1 0]]


#### Step3: Q, K, V를 얻기 위한 행렬곱

In [5]:
print("Step 3: Matrix multiplication to obtain Q, K, V")

# Project the inputs through each weight matrix: (3 x 4) @ (4 x 3) -> (3 x 3).
Q = x @ w_query
K = x @ w_key
V = x @ w_value

# Each print shows a label line followed by the matrix; the leading "\n"
# on the second and third labels inserts a blank separator line.
print("Query: x * w_query", Q, sep="\n")
print("\nKey: x * w_key", K, sep="\n")
print("\nValue: x * w_value", V, sep="\n")

Step 3: Matrix multiplication to obtain Q, K, V
Query: x * w_query
[[1. 0. 2.]
 [2. 2. 2.]
 [2. 1. 3.]]

Key: x * w_key
[[0. 1. 1.]
 [4. 4. 0.]
 [2. 3. 1.]]

Value: x * w_value
[[1. 2. 3.]
 [2. 8. 0.]
 [2. 6. 3.]]


In [6]:
print("Step 4: Scaled Attention Scores")
# The scaling divisor would be sqrt(3) ~= 1.73; this walkthrough floors it
# to 1 to keep the numbers simple, so `d_k` here holds the divisor actually
# applied and the division leaves the raw dot products unchanged.
d_k = 1
# Dot product of every query row with every key row -> 3 x 3 score matrix.
attention_scores = np.matmul(Q, K.T) / d_k
print(attention_scores)

Step 4: Scaled Attention Scores
[[ 2.  4.  4.]
 [ 4. 16. 12.]
 [ 4. 12. 10.]]


In [8]:
print("Step 5: Scaled softmax attention_scores for each vector")
# Normalise every row on its own and write the result back over the raw
# scores, so `attention_scores` holds softmax weights from here on.
# NOTE: this overwrites in place -- re-running this cell without first
# re-running Step 4 applies softmax to already-normalised values.
attention_scores[:] = [softmax(score_row) for score_row in attention_scores]
# One normalised row per output line.
print(*attention_scores, sep="\n")

Step 5: Scaled softmax attention_scores for each vector
[0.25010005 0.37494998 0.37494998]
[0.2133134  0.56950313 0.21718348]
[0.22037557 0.53143172 0.24819272]


In [15]:
print("Step 6: attention value obtained by score1/d_k * V") 
# Show the three value vectors that are about to be weighted.
print(V[0])
print(V[1])
print(V[2])

# All three products below use row 0 of `attention_scores`: each value
# vector V[j] is scaled by the first input's softmax weight for key j.
# (The earlier `attentionN = attention_scores[..].reshape(-1, 1)` lines were
# dead stores, overwritten on the very next line, and have been removed.)
print("Attention 1")
attention1 = attention_scores[0][0] * V[0]
print(attention1)

print("Attention 2")
attention2 = attention_scores[0][1] * V[1]
print(attention2)

print("Attention 3")
attention3 = attention_scores[0][2] * V[2]
print(attention3)

Step 6: attention value obtained by score1/d_k * V
[1. 2. 3.]
[2. 8. 0.]
[2. 6. 3.]
Attention 1
[0.25010005 0.5002001  0.75030014]
Attention 2
[0.74989995 2.99959981 0.        ]
Attention 3
[0.74989995 2.24969986 1.12484993]


In [11]:
print("Step 7: Summed the results to create the first line of the output matrix")
# Accumulate the three weighted value vectors left to right.
attention_input1 = attention1 + attention2
attention_input1 += attention3
print(attention_input1)

Step 7: Summed the results to create the first line of the output matrix
[1.74989995 5.74949976 1.87515007]


In [16]:
print("Step 8: Step 1 to 7 for inputs 1 to 3")
# Placeholder for one attention head's output: rather than repeating steps
# 1-7 with trained weights, draw uniform random values in [0, 1).
# Shape is 3 rows (one per input) by 64 columns.
attention_head1 = np.random.random(size=(3, 64))
print(attention_head1)
     

Step 8: Step 1 to 7 for inputs 1 to 3
[[0.5970748  0.36367763 0.43239902 0.50020547 0.04042945 0.70879529
  0.92513627 0.76318703 0.82708775 0.73484174 0.90595519 0.11480428
  0.20385901 0.74245524 0.05028901 0.12783404 0.15433177 0.05920594
  0.01936851 0.53322801 0.74184568 0.7669367  0.64714052 0.62261756
  0.50542523 0.0400218  0.59374091 0.49236715 0.41725364 0.3636184
  0.21779469 0.1904669  0.22841884 0.75088335 0.59793212 0.06398795
  0.40317017 0.81376082 0.01989627 0.65705826 0.84048557 0.84560705
  0.69090466 0.74157831 0.7843926  0.81812891 0.11928947 0.17621668
  0.84839761 0.26591421 0.31259511 0.65462363 0.79157665 0.09330133
  0.75513028 0.65724665 0.62089579 0.90309095 0.69273112 0.06349903
  0.05366196 0.57612369 0.48392781 0.20870632]
 [0.2821444  0.89579449 0.44870095 0.05965957 0.69945599 0.60883746
  0.21746735 0.03466307 0.0032486  0.80601401 0.28667807 0.05168512
  0.55995755 0.71482176 0.89994773 0.84352259 0.70612986 0.52474358
  0.91015639 0.51168316 0.419265

In [17]:
print("Step 9: We assume we have trained the 8 heads of the attention sub-layer")
# Stand-ins for the eight head outputs: each is an independent 3 x 64 array
# of uniform random values, drawn in order from head 1 to head 8.
z0h1, z1h2, z2h3, z3h4, z4h5, z5h6, z6h7, z7h8 = (
    np.random.random((3, 64)) for _ in range(8)
)
print("shape of one head", z0h1.shape, "dimension of 8 heads", 64 * 8)

Step 9: We assume we have trained the 8 heads of the attention sub-layer
shape of one head (3, 64) dimension of 8 heads 512


In [18]:
print("Step 10: Concantenation of heads 1 to 8 to obtain the original 8x64=512 output dimension of the model")
# Join the eight head outputs side by side (along the column axis), in
# head order, giving one row per input.
all_heads = (z0h1, z1h2, z2h3, z3h4, z4h5, z5h6, z6h7, z7h8)
output_attention = np.concatenate(all_heads, axis=1)
print(output_attention)

Step 10: Concantenation of heads 1 to 8 to obtain the original 8x64=512 output dimension of the model
[[0.66966339 0.21716048 0.6784728  ... 0.85023941 0.46794025 0.02574062]
 [0.94651686 0.3888484  0.03854622 ... 0.32152209 0.48739146 0.95762426]
 [0.25993755 0.12366173 0.87285598 ... 0.26282705 0.58397431 0.26263141]]


In [21]:
#@title Retrieve pipeline of modules and choose English to French translation
from transformers import pipeline

# Pin the checkpoint and revision explicitly instead of relying on the task
# default. These are the model and revision the library reported falling
# back to ("defaulted to t5-base and revision 686f1db"), so the translation
# is unchanged while the "no model was supplied" warning goes away and the
# cell no longer depends on whatever default a future release picks.
translator = pipeline("translation_en_to_fr", model="t5-base", revision="686f1db")
# max_length=40 is passed through to the pipeline call as in the original cell.
print(translator("It is easy to translate languages with transformers", max_length=40))

No model was supplied, defaulted to t5-base and revision 686f1db (https://huggingface.co/t5-base).
Using a pipeline without specifying a model name and revision in production is not recommended.


[{'translation_text': "Il est facile de traduire des langues à l'aide de transformateurs"}]
