### 데이터 전처리

In [2]:
from datasets import load_dataset

# Load the dataset from HuggingFace
dataset = load_dataset("bentrevett/multi30k")

  from .autonotebook import tqdm as notebook_tqdm


In [3]:
# Pull the three splits out of the loaded dataset
train_dataset = dataset["train"]
validation_dataset = dataset["validation"]
test_dataset = dataset["test"]

# Peek at the first training example
print(train_dataset[0])

{'en': 'Two young, White males are outside near many bushes.', 'de': 'Zwei junge weiße Männer sind im Freien in der Nähe vieler Büsche.'}


- **Tokenizer** 및 **Vocab** 생성

In [5]:
from tokenizers import Tokenizer
from tokenizers.models import WordLevel
from tokenizers.trainers import WordLevelTrainer
from tokenizers.pre_tokenizers import Whitespace

In [6]:
# Word-level tokenizer setup: special tokens
unknown_token = "<unk>"
pad_token, sos_token, eos_token = "<pad>", "<sos>", "<eos>"
# Order is significant: later cells unpack the ids positionally as (unk, pad, sos, eos)
special_tokens = [unknown_token, pad_token, sos_token, eos_token]

def generate_tokenizer() -> Tokenizer:
    """Build a fresh, untrained word-level tokenizer.

    The underlying ``WordLevel`` model is created with
    ``unk_token=unknown_token`` and the pre-tokenizer is ``Whitespace``.

    Returns:
        A new ``Tokenizer`` instance.
    """
    word_level_model = WordLevel(unk_token=unknown_token)
    fresh_tokenizer = Tokenizer(word_level_model)
    fresh_tokenizer.pre_tokenizer = Whitespace()
    return fresh_tokenizer

# One independent tokenizer per language (English first, then German)
en_tokenizer = generate_tokenizer()
de_tokenizer = generate_tokenizer()

In [7]:
# Trainer configuration (used to build the vocab)
# Word-level trainer that is asked to learn only words appearing at least 2 times
trainer = WordLevelTrainer(special_tokens = special_tokens, min_frequency = 2)

In [8]:
# Train the tokenizers
train_en, train_de = train_dataset['en'], train_dataset['de']

# Each tokenizer is fit on the training-split sentences of its own language
en_tokenizer.train_from_iterator(train_en, trainer=trainer)
de_tokenizer.train_from_iterator(train_de, trainer=trainer)

In [9]:
# Check the vocab sizes
print("[EN] vocab size: {}".format(en_tokenizer.get_vocab_size()))
print("[DE] vocab size: {}".format(de_tokenizer.get_vocab_size()))

# Print a few sample vocab tokens (first 10 keys of the vocab mapping)
print("[EN] Sample EN vocab tokens: {}".format(list(en_tokenizer.get_vocab().keys())[:10]))
print("[DE] Sample DE vocab tokens: {}".format(list(de_tokenizer.get_vocab().keys())[:10]))

[EN] vocab size: 6203
[DE] vocab size: 8060
[EN] Sample EN vocab tokens: ['Walkman', 'leafy', 'victim', 'site', 'cheers', 'crayola', 'specific', 'blow', 'leaving', 'company']
[DE] Sample DE vocab tokens: ['Cowboykleidung', 'Lebens', 'Skateboard', 'Alter', 'Musikgruppe', 'Wintermantel', 'Perle', 'Verkehr', 'Steinbank', 'Wanderausrüstung']


In [10]:
# Check the index assigned to each special token in both vocabularies
for special_token in special_tokens:
    print("[EN] special token: {}, index: {}".format(special_token, en_tokenizer.get_vocab()[special_token]))
    print("[DE] special token: {}, index: {}".format(special_token, de_tokenizer.get_vocab()[special_token]))
    print("---")

[EN] special token: <unk>, index: 0
[DE] special token: <unk>, index: 0
---
[EN] special token: <pad>, index: 1
[DE] special token: <pad>, index: 1
---
[EN] special token: <sos>, index: 2
[DE] special token: <sos>, index: 2
---
[EN] special token: <eos>, index: 3
[DE] special token: <eos>, index: 3
---


- **하이퍼 파라미터** 정의

In [12]:
class ModelConfiguration:
    """Plain container for the model and data hyper-parameters.

    Every constructor argument is stored unchanged on an attribute of the
    same name.
    """

    def __init__(
        self,
        max_len: int = 768,
        batch_size: int = 32,
        hidden_size: int = 512,
        ffn_size: int = 2048,
        num_heads: int = 8,
        num_layers: int = 6,
        dropout_pb: float = 0.1,
        src_vocab_size: int = 0,
        trg_vocab_size: int = 0,
    ):
        # Sequence length and batching
        self.max_len, self.batch_size = max_len, batch_size
        # Layer widths
        self.hidden_size, self.ffn_size = hidden_size, ffn_size
        # Attention heads and depth
        self.num_heads, self.num_layers = num_heads, num_layers
        # Dropout probability
        self.dropout_pb = dropout_pb
        # Vocabulary sizes for the source and target sides
        self.src_vocab_size, self.trg_vocab_size = src_vocab_size, trg_vocab_size

# Source vocab size comes from the German tokenizer, target vocab size from the English one
model_config = ModelConfiguration(src_vocab_size=de_tokenizer.get_vocab_size(), trg_vocab_size=en_tokenizer.get_vocab_size())

- **데이터 전처리**

In [14]:
# Look up the special-token ids per language; the first entry (<unk>) is discarded
_, en_pad_id, en_sos_id, en_eos_id = [en_tokenizer.token_to_id(token) for token in special_tokens]
_, de_pad_id, de_sos_id, de_eos_id = [de_tokenizer.token_to_id(token) for token in special_tokens]

In [15]:
# input: {"en" : "example_en", "de" : "example_de"}
def preprocess(dataset: dict) -> dict:
    """Turn one German/English sentence pair into fixed-length model inputs.

    Args:
        dataset: A single example holding the raw sentences under the keys
            ``"de"`` (fed to the encoder) and ``"en"`` (fed to the decoder).

    Returns:
        A dict of lists, each exactly ``model_config.max_len`` long:
        ``encoder_input_ids``, ``encoder_attention_mask``,
        ``decoder_input_ids``, ``decoder_attention_mask`` and ``labels``.
        Mask entries are 1 where the id differs from the pad id, else 0.
    """
    max_len = model_config.max_len

    def _fit(ids: list, pad_id: int) -> list:
        # Cut to max_len, then right-pad with pad_id up to max_len
        return ids[:max_len] + [pad_id] * max(0, max_len - len(ids))

    # Convert the sentences to token ids
    src_input_ids = de_tokenizer.encode(dataset['de']).ids
    trg_input_ids = en_tokenizer.encode(dataset['en']).ids

    # Teacher forcing layout: the decoder input is shifted right by <sos> and
    # the labels end with <eos>, so for "I am a student" position 0 maps
    # <sos> -> I, position 1 maps I -> am, and so on.
    decoder_input = [en_sos_id] + trg_input_ids
    labels = trg_input_ids + [en_eos_id]

    # Truncate / pad every sequence to the same fixed length
    encoder_input = _fit(src_input_ids, de_pad_id)
    decoder_input = _fit(decoder_input, en_pad_id)
    labels = _fit(labels, en_pad_id)

    # Attention mask (1 if real token else 0)
    encoder_attention_mask = [1 if token != de_pad_id else 0 for token in encoder_input]
    decoder_attention_mask = [1 if token != en_pad_id else 0 for token in decoder_input]

    return {
        "encoder_input_ids" : encoder_input,
        "encoder_attention_mask" : encoder_attention_mask,
        "decoder_input_ids" : decoder_input,
        "decoder_attention_mask" : decoder_attention_mask,
        "labels" : labels
    }

In [16]:
# Apply the preprocessing to every split
# NOTE(review): the split variables are overwritten and the 'en'/'de' columns are removed,
# so re-running this cell without reloading the dataset would hand preprocess rows lacking those keys.
train_dataset = train_dataset.map(preprocess, remove_columns=['en', 'de'])
validation_dataset = validation_dataset.map(preprocess, remove_columns=['en', 'de'])
test_dataset = test_dataset.map(preprocess, remove_columns=['en', 'de'])

In [17]:
# Inspect one preprocessed training example
print(train_dataset[10])

{'encoder_input_ids': [14, 5654, 10, 810, 28, 8, 19, 4270, 276, 4, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 

In [18]:
import torch

# collate function: merges the per-example values of a batch into one tensor per key
def collate_function(batch):
    """Stack a list of example dicts into a dict of ``torch.long`` tensors.

    The keys are taken from the first example of ``batch``; for each key the
    values of all examples are gathered in batch order and converted into a
    single tensor.
    """
    collated = {}
    for field in batch[0]:
        rows = [example[field] for example in batch]
        collated[field] = torch.tensor(rows, dtype=torch.long)
    return collated

In [19]:
from torch.utils.data import DataLoader

# DataLoader setup
batch_size = model_config.batch_size

# Only the training loader shuffles; validation and test keep dataset order
train_loader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True, collate_fn=collate_function)
validation_loader = DataLoader(validation_dataset, batch_size=batch_size, shuffle=False, collate_fn=collate_function)
test_loader = DataLoader(test_dataset, batch_size=batch_size, shuffle=False, collate_fn=collate_function)

In [20]:
# Inspect one sample batch
batch = next(iter(train_loader))

# Print the tensor shape of every field in the batch
for key, value in batch.items():
    print("{}: shape={}".format(key, value.shape))

encoder_input_ids: shape=torch.Size([32, 768])
encoder_attention_mask: shape=torch.Size([32, 768])
decoder_input_ids: shape=torch.Size([32, 768])
decoder_attention_mask: shape=torch.Size([32, 768])
labels: shape=torch.Size([32, 768])


## 토큰 임베딩
- Attention is all you need 에서 소개한 토큰 임베딩은 크게 token 자체 임베딩과 위치 임베딩, 둘을 합쳐서 구현합니다
- 이때 토큰 임베딩 로직은 encoder, decoder 둘이 공유해야 합니다

In [39]:
# Define the training device: 'mps' when the MPS backend is available, otherwise 'cpu'
import torch

device = torch.device('mps' if torch.backends.mps.is_available() else 'cpu')

In [54]:
import torch.nn as nn

class Embeddings(nn.Module):
    """Token embedding plus learned positional embedding, then LayerNorm and dropout.

    Args:
        vocab_size: Number of rows in the token embedding table.
        hidden_size: Width of every embedding vector.
        max_len: Number of rows in the positional embedding table, i.e. the
            longest sequence length that can be embedded.
        dropout_prob: Dropout probability applied to the final embeddings.
    """

    def __init__(self, vocab_size: int, hidden_size: int, max_len: int, dropout_prob: float):
        super().__init__()
        self.token_embedding = nn.Embedding(vocab_size, hidden_size)
        self.positional_embedding = nn.Embedding(max_len, hidden_size)
        self.layer_norm = nn.LayerNorm(hidden_size, eps=1e-12)
        self.dropout = nn.Dropout(dropout_prob)

    def forward(self, input_ids: torch.Tensor) -> torch.Tensor:
        """Embed a batch of token ids.

        Args:
            input_ids: Integer tensor of shape (batch_size, sequence_len).

        Returns:
            Tensor of shape (batch_size, sequence_len, hidden_size).
        """
        # Position ids 0 .. sequence_len - 1, created on the same device as the
        # input so the module does not rely on any notebook-global `device`.
        # (sequence_len) -> (1, sequence_len) -> (batch_size, sequence_len)
        sequence_len = input_ids.size(1)
        positional_ids = torch.arange(sequence_len, device=input_ids.device)
        positional_ids = positional_ids.unsqueeze(0).expand_as(input_ids)

        # Embedding lookup: (batch_size, sequence_len) -> (batch_size, sequence_len, hidden_size)
        token_embeddings = self.token_embedding(input_ids)
        positional_embeddings = self.positional_embedding(positional_ids)

        # Sum, normalize, then apply dropout
        embeddings = token_embeddings + positional_embeddings
        embeddings = self.layer_norm(embeddings)
        embeddings = self.dropout(embeddings)

        return embeddings

In [56]:
# Sanity-check the embedding layer on one training batch
embedding_layer = Embeddings(
    vocab_size=model_config.src_vocab_size,
    hidden_size=model_config.hidden_size,
    max_len=model_config.max_len,
    dropout_prob=model_config.dropout_pb
).to(device)

# Encoder-side ids are used, matching the source vocab size given above
batch = next(iter(train_loader))
input_ids = batch['encoder_input_ids'].to(device)

embeddings = embedding_layer(input_ids)

# Check the resulting shapes
print("Input shape: {}".format(input_ids.shape))
print("Embedding shape: {}".format(embeddings.shape))

Input shape: torch.Size([32, 768])
Embedding shape: torch.Size([32, 768, 512])


## Multi Head Attention 구현
- Transformer 아키텍처의 핵심인 멀티 헤드 어텐션을 구현합니다.

In [60]:
from typing import Optional, Tuple

import torch.nn.functional as F

# Scaled dot-product attention
# query / key / value are each expected to be (batch_size, max_len, hidden_size)
def scaled_dot_product_attention(query: torch.Tensor,
                                 key: torch.Tensor,
                                 value: torch.Tensor,
                                 mask: Optional[torch.Tensor] = None
                                ) -> Tuple[torch.Tensor, torch.Tensor]:
    """Compute softmax(Q K^T / sqrt(d_k)) V for batched 3-D inputs.

    Args:
        query: Tensor of shape (batch, query_len, dim).
        key: Tensor of shape (batch, key_len, dim).
        value: Tensor of shape (batch, key_len, value_dim).
        mask: Optional tensor that must be broadcastable to
            (batch, query_len, key_len), as required by ``masked_fill``.
            Positions where ``mask == 0`` are excluded from attention.

    Returns:
        A tuple ``(output, attention_weights)`` where ``output`` has shape
        (batch, query_len, value_dim) and ``attention_weights`` has shape
        (batch, query_len, key_len).
    """
    # Size of the last dimension of the query, used as the scaling factor
    dim_k = query.size(-1)

    # Attention scores: (batch, query_len, key_len)
    scores = torch.bmm(query, key.transpose(1, 2)) / (dim_k ** 0.5)

    # Push masked positions to the most negative finite value of the dtype.
    # Unlike -inf this keeps softmax finite when a whole row is masked
    # (the row becomes uniform instead of NaN), and masked positions still
    # get exactly zero weight whenever the row has an unmasked entry.
    if mask is not None:
        scores = scores.masked_fill(mask == 0, torch.finfo(scores.dtype).min)

    # Normalize over the key dimension
    attention_weights = F.softmax(scores, dim=-1)

    # Weighted sum of the values:
    # (query_len, key_len) x (key_len, value_dim) -> (query_len, value_dim) per batch item
    output = torch.bmm(attention_weights, value)

    return output, attention_weights

- **Attentio**