In [19]:
import torch
import torch.nn as nn
from torch.autograd import Variable
import torch.optim as optim
import torch.nn.functional as F

In [21]:
# Notebook-wide configuration for the single-example walkthrough.
# Seed the global torch RNG so the random tensors created in later cells
# come out the same on every Restart & Run All.
SEED = 0  # arbitrary fixed value
torch.manual_seed(SEED)

B = 1   # batch size
E = 30  # word-embedding size
T = 5   # input sentence length (chosen arbitrarily)
H = 50  # hidden-state size

In [22]:
# All-zero initial hidden state handed to the GRU, shape (1, B, H).
hidden = Variable(torch.zeros(1, B, H))
# Random stand-in for an embedded input sentence, shape (B, T, E).
inputs = Variable(torch.randn(B, T, E))

In [23]:
# GRU mapping E-dimensional inputs to H-dimensional hidden states;
# batch_first=True means inputs are laid out batch-first, matching `inputs`.
gru = nn.GRU(input_size=E, hidden_size=H, batch_first=True)

In [26]:
# Run the GRU over the whole input sequence. The first result is kept as the
# per-time-step encoder states; the second overwrites `hidden`, so re-running
# this cell starts from the previous final state rather than from zeros.
encoder_hiddens, hidden = gru(inputs, hx=hidden)

In [27]:
# Display the shape of the GRU outputs.
encoder_hiddens.size()

torch.Size([1, 5, 50])

In [28]:
# Random stand-in for a decoder hidden state, shape (1, B, H).
decoder_hidden = Variable(torch.randn(1,B,H))

In [29]:
# Display the shape of the decoder hidden state.
decoder_hidden.size()

torch.Size([1, 1, 50])

## Attention

일단 배치를 생략하고 구해본다

In [30]:
# Drop the batch axis from both tensors so the steps below work on 2-D matrices.
encoder_hiddens = torch.squeeze(encoder_hiddens, 0)  # remove B (dim 0)
decoder_hidden = torch.squeeze(decoder_hidden, 1)  # remove B (dim 1)

print(encoder_hiddens.size(), decoder_hidden.size())

torch.Size([5, 50]) torch.Size([1, 50])


### 1. dot product 

$$e_{ti} = s_t^Th_i$$

In [31]:
# Dot-product attention scores: one value e_ti = s_t^T h_i per encoder time step.
scores = []
for i in range(encoder_hiddens.size(0)):  # one iteration per encoder time step
    score = encoder_hiddens[i].dot(decoder_hidden[0])
    # On current PyTorch dot() returns a 0-dim tensor, which torch.cat rejects;
    # view(1) gives each score exactly one dimension (a no-op where dot()
    # already returns a 1-element result).
    scores.append(score.view(1))

scores = torch.cat(scores)  # 1-D tensor holding one score per time step

In [32]:
# Display the attention scores computed by the loop.
scores # attention scores

Variable containing:
 0.4662
-0.7955
-0.9034
-1.2781
-1.5351
[torch.FloatTensor of size 5]

In [33]:
# The same scores can be computed in one matrix product: the encoder states
# times the transposed decoder state gives a column with one score per time step.
scores = torch.matmul(encoder_hiddens, decoder_hidden.t())
scores

Variable containing:
 0.4662
-0.7955
-0.9034
-1.2781
-1.5351
[torch.FloatTensor of size 5x1]

$$\alpha_{ti}^e=\frac{\exp(e_{ti})}{\sum_{j=1}^n \exp(e_{tj})}$$

In [35]:
# Normalise the scores along dim 0 (the time-step axis) into attention weights.
attn_dist = F.softmax(scores, dim=0)
print(attn_dist.sum())  # sums to 1

Variable containing:
 1.0000
[torch.FloatTensor of size 1]



$$c_t^e = \sum_{i=1}^n \alpha_{ti}^e h_i^e$$

In [38]:
# Weighted sum of the encoder states, done as a matrix product: the transposed
# attention weights (a row vector) times the encoder-state matrix.
context_vector = attn_dist.t().matmul(encoder_hiddens)
print(context_vector.size())

torch.Size([1, 50])


In [39]:
# Display the resulting context vector.
context_vector

Variable containing:

Columns 0 to 9 
 0.1947 -0.0650  0.0412 -0.1787 -0.0420 -0.1517  0.2607  0.1308 -0.0690  0.0469

Columns 10 to 19 
 0.1727 -0.1003 -0.2290  0.2873  0.0191 -0.0743  0.0031  0.1621 -0.1171  0.2695

Columns 20 to 29 
-0.1095 -0.0033  0.2724  0.3487 -0.0757  0.2967  0.2220  0.2021 -0.0911 -0.2859

Columns 30 to 39 
-0.0298  0.1936  0.1735  0.0958  0.0696 -0.2303 -0.0988  0.1283 -0.1965  0.1781

Columns 40 to 49 
-0.2700  0.0346 -0.4003  0.2329  0.2719 -0.1430  0.0664  0.1250 -0.1281  0.2943
[torch.FloatTensor of size 1x50]

## TODO : General format의 Attention 짜보기 

1. $e_{ti} = s_t^T W_{attn}^e h_i$ (attention score)
2. $\alpha_{ti}^e=\frac{\exp(e_{ti})}{\sum_{j=1}^n \exp(e_{tj})}$ (attention distribution)
3. $c_t^e = \sum_{i=1}^n \alpha_{ti}^e h_i^e$ (context vector)

In [5]:
# Fresh random stand-ins for the general-attention walkthrough, sized from the
# notebook constants T and H instead of repeating the literals 5 and 50.
encoder_hiddens = Variable(torch.randn(T, H))  # one row per encoder time step
decoder_hidden = Variable(torch.randn(1, H))  # a single decoder state

In [3]:
# Attention weight matrix W for the general score s_t^T W h_i; square in the
# hidden size H rather than a hard-coded 50, randomly initialised.
weight = nn.Parameter(torch.randn(H, H))

In [40]:
# General attention score, computed in two matrix products.
temp = decoder_hidden.mm(weight)  # s_t^T W
score = temp.mm(encoder_hiddens.t())  # (s_t^T W) h_i for every time step i

In [42]:
# Normalise along dim 1, the axis that indexes encoder time steps in `score`.
attn_dist = F.softmax(score, dim=1)

In [43]:
# Context vector: attention-weighted sum of the encoder states.
context = attn_dist.mm(encoder_hiddens)

In [44]:
# Display the context vector from the general attention score.
context

Variable containing:

Columns 0 to 9 
 0.2467  0.0076  0.1370 -0.1971 -0.0970 -0.1184  0.3326  0.2212 -0.0090  0.0905

Columns 10 to 19 
 0.1392 -0.0486 -0.2314  0.2763 -0.0699  0.0065  0.0566  0.2061 -0.1480  0.2958

Columns 20 to 29 
-0.1526 -0.1308  0.3247  0.3574 -0.1501  0.2121  0.1747  0.1161 -0.1306 -0.1848

Columns 30 to 39 
-0.0611  0.1190  0.0670  0.1577  0.0643 -0.2923 -0.1487  0.0171 -0.2116  0.1260

Columns 40 to 49 
-0.3304  0.1175 -0.4394  0.1447  0.2981 -0.2352  0.2627  0.1207 -0.1804  0.2333
[torch.FloatTensor of size 1x50]

## Attention module 

In [45]:
from attention import Attention

In [46]:
# Sizes for the batched Attention-module demo:
# batch size, sequence length, hidden size.
B, T, H = 32, 10, 50

In [47]:
# Build the attention module with the notebook's hidden size H (previously a
# hard-coded 50 that would silently diverge if H were changed).
attn = Attention(H, method='general')  # hidden size

In [48]:
# Random batched stand-ins for the module demo.
# Encoder states: shape (B, T, H).
encoder_hiddens = Variable(torch.randn(B,T,H))
# Decoder state: shape (B, 1, H).
decoder_hidden = Variable(torch.randn(B,1,H))

In [49]:
# Call the imported Attention module with the decoder state first and the
# encoder states second.
# NOTE(review): the module's implementation is not visible here; the result is
# presumably the per-example context vector — confirm against attention.py.
context_vector = attn(decoder_hidden,encoder_hiddens)

In [50]:
# Display the shape of the module's output.
context_vector.size()

torch.Size([32, 1, 50])