In [None]:
# Pin torchtext to 0.6.0: later cells import Field and BucketIterator from torchtext.data.
!pip install torchtext==0.6.0

Collecting torchtext==0.6.0
  Downloading torchtext-0.6.0-py3-none-any.whl (64 kB)
[?25l[K     |█████                           | 10 kB 23.6 MB/s eta 0:00:01[K     |██████████▏                     | 20 kB 27.9 MB/s eta 0:00:01[K     |███████████████▎                | 30 kB 13.0 MB/s eta 0:00:01[K     |████████████████████▍           | 40 kB 10.0 MB/s eta 0:00:01[K     |█████████████████████████▌      | 51 kB 5.4 MB/s eta 0:00:01[K     |██████████████████████████████▋ | 61 kB 5.5 MB/s eta 0:00:01[K     |████████████████████████████████| 64 kB 1.2 MB/s 
Collecting sentencepiece
  Downloading sentencepiece-0.1.96-cp37-cp37m-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (1.2 MB)
[K     |████████████████████████████████| 1.2 MB 10.9 MB/s 
Installing collected packages: sentencepiece, torchtext
  Attempting uninstall: torchtext
    Found existing installation: torchtext 0.10.0
    Uninstalling torchtext-0.10.0:
      Successfully uninstalled torchtext-0.10.0
Successfully ins

In [None]:
%%capture
# Download the spaCy models behind the 'en' and 'de' shortcut names; %%capture hides the installer output.
!python -m spacy download en
!python -m spacy download de

In [None]:
import spacy

spacy_en = spacy.load('en') # English tokenization
spacy_de = spacy.load('de') # German tokenization

In [None]:
# Quick check of the English tokenizer: print each token together with its position.
tokenized = spacy_en.tokenizer("I am a graduate student.")

for i, token in enumerate(tokenized):
  print(i, token)

0 I
1 am
2 a
3 graduate
4 student
5 .


In [None]:
# German tokenization function
def tokenize_de(text:str)->list:
  """Return the token strings produced by the German spaCy tokenizer for `text`."""
  doc = spacy_de.tokenizer(text)
  return [tok.text for tok in doc]

# English tokenization function
def tokenize_en(text:str)->list:
  """Return the token strings produced by the English spaCy tokenizer for `text`."""
  doc = spacy_en.tokenizer(text)
  return [tok.text for tok in doc]

In [None]:
from torchtext.data import Field, BucketIterator

# Source field: German. Sentences are lower-cased and wrapped in <sos>/<eos>;
# batch_first=True puts the batch dimension first in the resulting tensors.
SRC = Field(
    tokenize=tokenize_de,
    init_token="<sos>",
    eos_token="<eos>",
    lower=True,
    batch_first=True,
)
# Target field: English, configured the same way as the source field.
TRG = Field(
    tokenize=tokenize_en,
    init_token="<sos>",
    eos_token="<eos>",
    lower=True,
    batch_first=True,
)

In [None]:
# Multi30k English-German translation dataset
from torchtext.datasets import Multi30k

# exts=(".de", ".en") lines up with fields=(SRC, TRG): German is the source, English the target.
train_dataset, valid_dataset, test_dataset= Multi30k.splits(exts=(".de",".en"), fields=(SRC, TRG))

downloading training.tar.gz


training.tar.gz: 100%|██████████| 1.21M/1.21M [00:00<00:00, 1.61MB/s]


downloading validation.tar.gz


validation.tar.gz: 100%|██████████| 46.3k/46.3k [00:00<00:00, 249kB/s]


downloading mmt_task1_test2016.tar.gz


mmt_task1_test2016.tar.gz: 100%|██████████| 66.2k/66.2k [00:00<00:00, 239kB/s]


In [None]:
# Report how many examples each split (training / validation / test) contains.
print(f"학습 데이터셋(training dataset) 크기: {len(train_dataset.examples)}개")
print(f"평가 데이터셋(validation dataset) 크기: {len(valid_dataset.examples)}개")
print(f"테스트 데이터셋(testing dataset) 크기: {len(test_dataset.examples)}개")

학습 데이터셋(training dataset) 크기: 29000개
평가 데이터셋(validation dataset) 크기: 1014개
테스트 데이터셋(testing dataset) 크기: 1000개


In [None]:
# Pick one training example and print its tokenized source and target sentences.
example_fields = vars(train_dataset.examples[30])
print(example_fields['src'])
print(example_fields['trg'])

['ein', 'mann', ',', 'der', 'mit', 'einer', 'tasse', 'kaffee', 'an', 'einem', 'urinal', 'steht', '.']
['a', 'man', 'standing', 'at', 'a', 'urinal', 'with', 'a', 'coffee', 'cup', '.']


In [None]:
# Build the German and English vocabularies (word dicts) on the Field objects from the
# training set, keeping only words that appear at least twice (min_freq=2).
SRC.build_vocab(train_dataset, min_freq=2)
TRG.build_vocab(train_dataset, min_freq=2)

# Vocabulary sizes after the frequency cut-off.
print(f"len(SRC): {len(SRC.vocab)}")
print(f"len(TRG): {len(TRG.vocab)}")

{'freqs': Counter({'.': 28821, 'ein': 18850, 'einem': 13711, 'in': 11893, 'eine': 9908, ',': 8938, 'und': 8925, 'mit': 8843, 'auf': 8745, 'mann': 7805, 'einer': 6765, 'der': 4989, 'frau': 4186, 'die': 3948, 'zwei': 3873, 'einen': 3479, 'im': 3107, 'an': 3062, 'von': 2363, 'sich': 2273, 'dem': 2134, 'mädchen': 2121, 'junge': 2068, 'vor': 1936, 'zu': 1909, 'steht': 1778, 'männer': 1662, 'sitzt': 1624, 'hund': 1606, 'den': 1575, 'straße': 1412, 'während': 1397, 'gruppe': 1331, 'hält': 1310, 'spielt': 1307, 'das': 1261, 'hemd': 1202, 'personen': 1152, 'über': 1127, 'drei': 1078, 'eines': 1052, 'frauen': 993, 'blauen': 992, 'neben': 966, 'ist': 965, 'kind': 953, 'roten': 950, 'weißen': 950, 'stehen': 939, 'sitzen': 925, 'menschen': 924, 'am': 911, 'aus': 910, 'spielen': 910, 'durch': 895, 'bei': 885, 'geht': 857, 'trägt': 850, 'fährt': 835, 'wasser': 818, 'um': 795, 'kinder': 792, 'kleines': 772, 'person': 759, 'macht': 747, 'springt': 731, 'kleiner': 702, 'schwarzen': 702, 'entlang': 699, 

In [None]:
# stoi maps a token string to its integer index in the target vocabulary.
print(TRG.vocab.stoi["abcabc"]) # word not in the vocabulary: 0
print(TRG.vocab.stoi[TRG.pad_token]) # padding: 1
print(TRG.vocab.stoi["<sos>"]) # <sos>: 2
print(TRG.vocab.stoi["<eos>"]) # <eos>: 3
print(TRG.vocab.stoi["hello"]) # in-vocabulary words get their own index
print(TRG.vocab.stoi["world"])

0
1
2
3
4112
1752


In [None]:
import torch

# Use the GPU when CUDA is available, otherwise fall back to the CPU.
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

# Number of sentence pairs per batch, passed to the iterators created later.
BATCH_SIZE = 128

In [36]:
# Can be used much like the iterator of an ordinary data loader.

# One iterator per split; every batch holds BATCH_SIZE examples and is placed on `device`.
train_iterator, valid_iterator, test_iterator = BucketIterator.splits(
    (train_dataset, valid_dataset, test_dataset),
    batch_size=BATCH_SIZE,
    device=device)
  

In [46]:
# Inspect the first training batch only.
for batch in train_iterator:
  src = batch.src
  trg = batch.trg

  print(src.shape)

  # Print the token indices of the first sentence in the current batch.
  # src is indexed as [sentence, position] because the Fields use batch_first=True.
  # The position variable has its own name so it does not shadow any outer loop index.
  for pos in range(src.shape[1]):
    print(pos, src[0][pos].item())

  break # look at the first batch only

torch.Size([128, 27])
0 2
1 5
2 13
3 11
4 50
5 606
6 9
7 202
8 40
9 10
10 203
11 198
12 9
13 15
14 217
15 4
16 3
17 1
18 1
19 1
20 1
21 1
22 1
23 1
24 1
25 1
26 1


In [47]:
import torch.nn as nn

class MultiHeadAttentionLayer(nn.Module):
  """Multi-head scaled dot-product attention.

  The query, key and value inputs are each projected by their own linear
  layer, split into `n_heads` heads of size `hidden_dim // n_heads`,
  attended independently, then merged and passed through an output
  projection.

  Args:
    hidden_dim: size of the last dimension of the inputs and the output.
    n_heads: number of attention heads; must divide `hidden_dim` evenly.
    dropout_ratio: dropout probability applied to the attention weights.
    device: device on which the scaling constant is stored.
  """

  def __init__(self, hidden_dim, n_heads, dropout_ratio, device):
    super().__init__()

    # Every head receives an equal slice of the hidden dimension.
    assert hidden_dim % n_heads == 0, "hidden_dim must be divisible by n_heads"

    self.hidden_dim = hidden_dim  # embedding vector dimension
    self.n_heads = n_heads  # number of heads
    self.head_dim = hidden_dim // n_heads  # dimension handled by each head

    self.fc_q = nn.Linear(hidden_dim, hidden_dim)  # FC layer applied to the query
    self.fc_k = nn.Linear(hidden_dim, hidden_dim)  # FC layer applied to the key
    self.fc_v = nn.Linear(hidden_dim, hidden_dim)  # FC layer applied to the value

    self.fc_o = nn.Linear(hidden_dim, hidden_dim)  # output projection applied after the heads are merged

    self.dropout = nn.Dropout(dropout_ratio)

    # sqrt(head_dim), used to scale the dot products; kept as a tensor on `device`.
    self.scale = torch.sqrt(torch.FloatTensor([self.head_dim])).to(device)

  def forward(self, query, key, value, mask=None):
    """Apply multi-head attention.

    Args:
      query: tensor of shape [batch, query_len, hidden_dim].
      key: tensor of shape [batch, key_len, hidden_dim].
      value: tensor of shape [batch, key_len, hidden_dim].
      mask: optional tensor broadcastable to
        [batch, n_heads, query_len, key_len]; positions where it equals 0
        are excluded from attention.

    Returns:
      A tuple `(x, attention)` where `x` has shape
      [batch, query_len, hidden_dim] and `attention` has shape
      [batch, n_heads, query_len, key_len].
    """
    batch_size = query.shape[0]

    # Each input goes through its own projection.
    Q = self.fc_q(query)
    K = self.fc_k(key)
    V = self.fc_v(value)

    # [batch, len, hidden_dim] -> [batch, n_heads, len, head_dim]
    Q = Q.view(batch_size, -1, self.n_heads, self.head_dim).permute(0, 2, 1, 3)
    K = K.view(batch_size, -1, self.n_heads, self.head_dim).permute(0, 2, 1, 3)
    V = V.view(batch_size, -1, self.n_heads, self.head_dim).permute(0, 2, 1, 3)

    # Scaled dot products: [batch, n_heads, query_len, key_len]
    energy = torch.matmul(Q, K.permute(0, 1, 3, 2)) / self.scale

    if mask is not None:
      # A very negative score becomes ~0 after the softmax.
      energy = energy.masked_fill(mask == 0, -1e10)

    attention = torch.softmax(energy, dim=-1)

    # Weighted sum of the values: [batch, n_heads, query_len, head_dim]
    x = torch.matmul(self.dropout(attention), V)

    # Merge the heads back: [batch, query_len, n_heads, head_dim] -> [batch, query_len, hidden_dim]
    x = x.permute(0, 2, 1, 3).contiguous()
    x = x.view(batch_size, -1, self.hidden_dim)

    x = self.fc_o(x)

    return x, attention