# 자연어처리 과제 3 (6주차)
* 과제는 해당 .ipynb 파일에 코드 작성
    * 코드는 google colab의 gpu를 사용하는 런타임 환경에서 모두실행을 통해 한번에 실행 되어야함
    * 생성형 AI (ChatGPT, Copilot, Claude, ...) 등 사용 가능
        * 단, 사용시 사용한 방법, 입력, 출력을 캡처해 보고서에 기입
* Word를 통해 자유형식으로 보고서를 작성
    * 보고서의 양식은 자유
    * 보고서의 제출은 .pdf 형식으로 제출해야하며, 파일명은 "학번_이름_HW_??.pdf"로 제출 할 것
    * 보고서에 코드를 그대로 복붙 하지 말 것 (캡처 도구를 활용, 환경 설치 자료 참고)
* .ipynb와 .pdf 파일을 el을 통해 제출
    * 예시 : "2232036006_임상훈_HW_01.ipynb"와 "2232036006_임상훈_HW_01.pdf"를 제출

## 관련 코드

In [1]:
import os 
import re
import shutil
import zipfile

import numpy as np
import unicodedata
import urllib3
import random
import math

In [2]:
import requests

headers = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36'
}

def download_zip(url, output_path):
    response = requests.get(url, headers=headers, stream=True)
    if response.status_code == 200:
        with open(output_path, 'wb') as f:
            for chunk in response.iter_content(chunk_size=8192):
                f.write(chunk)
        print(f"ZIP file downloaded to {output_path}")
    else:
        print(f"Failed to download. HTTP Response Code: {response.status_code}")

url = "http://www.manythings.org/anki/fra-eng.zip"
output_path = "fra-eng.zip"
download_zip(url, output_path)

path = os.getcwd()
zipfilename = os.path.join(path, output_path)

with zipfile.ZipFile(zipfilename, 'r') as zip_ref:
    zip_ref.extractall(path)

ZIP file downloaded to fra-eng.zip


In [3]:
def to_ascii(s):
  # 프랑스어 악센트(accent) 삭제
  # 예시 : 'déjà diné' -> deja dine
  return ''.join(c for c in unicodedata.normalize('NFD', s)
                   if unicodedata.category(c) != 'Mn')

def preprocess_sentence(sent):
  # 악센트 제거 함수 호출
  sent = to_ascii(sent.lower())

  # 단어와 구두점 사이에 공백 추가.
  # ex) "I am a student." => "I am a student ."
  sent = re.sub(r"([?.!,¿])", r" \1", sent)

  # (a-z, A-Z, ".", "?", "!", ",") 이들을 제외하고는 전부 공백으로 변환.
  sent = re.sub(r"[^a-zA-Z!.?]+", r" ", sent)

  # 다수 개의 공백을 하나의 공백으로 치환
  sent = re.sub(r"\s+", " ", sent)
  return sent

In [4]:
def load_preprocessed_data():
  encoder_input, decoder_input = [], []

  with open("fra.txt", "r") as lines:
    for i, line in enumerate(lines):
      # source 데이터와 target 데이터 분리
      src_line, tar_line, _ = line.strip().split('\t')

      # source 데이터 전처리
      src_line = [w for w in preprocess_sentence(src_line).split()]

      # target 데이터 전처리
      tar_line = preprocess_sentence(tar_line)
      tar_line_in = [w for w in ("[SOS] " + tar_line + " [EOS]").split()]

      encoder_input.append(src_line)
      decoder_input.append(tar_line_in)

  return encoder_input, decoder_input

sents_en_in, sents_fra_in = load_preprocessed_data()

In [116]:
#split data -> train-validation-test로 구분
def split_data(data, train_ratio=0.7, shuffle=True):
    data = list(data)
    if shuffle:
        random.shuffle(data)
    n_train = int(len(data) * train_ratio)
    train_data = data[:n_train]
    test_data = data[n_train:]
    return train_data, test_data

train_test_ratio = 0.9
train, test = split_data(zip(sents_en_in, sents_fra_in), train_test_ratio)
train, vali = split_data(train, train_test_ratio)

In [6]:
# make vocabulary
from collections import Counter

# 영어(인코더 입력)에 대한 vocab, 프랑스어(디코더 입력)에 대한 vocab
en_token_cnt = Counter()
fr_token_cnt = Counter()

for tokens, _ in train:
    en_token_cnt.update(tokens)

min_count = 2
en_vocab = {"[PAD]": 0, "[UNK]": 1, "[SOS]": 2, "[EOS]": 3}
for token, count in en_token_cnt.items():
    if count > min_count and token not in en_vocab:
        en_vocab[token] = len(en_vocab)


for _, tokens in train:
    fr_token_cnt.update(tokens)

fr_vocab = {"[PAD]": 0, "[UNK]": 1, "[SOS]": 2, "[EOS]": 3}
for token, count in fr_token_cnt.items():
    if count > min_count and token not in fr_vocab:
        fr_vocab[token] = len(fr_vocab)

In [7]:
import torch
import torch.nn as nn
from torch.utils.data import Dataset, DataLoader

In [8]:
class EnToFrDataset(Dataset):
    def __init__(self, data, en_vocab, fr_vocab):
        self.enc_input = []     # 인코더의 입력 영어 데이터
        self.dec_input = []     # 디코더의 입력 프랑스어 데이터
        self.dec_target = []
        for en_sent, fr_sent_in in data:
            self.enc_input.append(en_sent)
            self.dec_input.append(fr_sent_in)
        self.en_vocab = en_vocab
        self.fr_vocab = fr_vocab
        self.max_len = 30       # 데이터가 가질 수 있는 최대 길이

    def __len__(self):
        return len(self.enc_input)

    def __getitem__(self, idx):
        # 인코더와 디코더에 들어갈 샘플
        # source, target에 대한 index sequence
        src_sample = [self.en_vocab.get(w, self.en_vocab.get("[UNK]")) for w in self.enc_input[idx]]
        # print(src_sample)
        trg_sample = [self.fr_vocab.get(w, self.fr_vocab.get("[UNK]")) for w in self.dec_input[idx]]
        # print(trg_sample)
        # truncate and padding
        src_sample = src_sample[:self.max_len]
        trg_sample = trg_sample[:self.max_len]
        src_sample += [self.en_vocab.get("[PAD]")] * (self.max_len - len(src_sample))
        trg_sample += [self.fr_vocab.get("[PAD]")] * (self.max_len - len(trg_sample))

        # dictonary 형태로, 같은 key를 가진 것끼리 batch가 만들어진다.
        return {"src": torch.LongTensor(src_sample), "trg": torch.LongTensor(trg_sample)}

In [119]:
train_dataset = EnToFrDataset(train, en_vocab, fr_vocab)
vali_dataset = EnToFrDataset(vali, en_vocab, fr_vocab)
test_dataset = EnToFrDataset(test, en_vocab, fr_vocab)

train_loader = DataLoader(train_dataset, batch_size=64,drop_last=True, shuffle=True, num_workers=8)
vali_loader = DataLoader(vali_dataset, batch_size=64,drop_last=True, shuffle=False, num_workers=8)
test_loader = DataLoader(test_dataset, batch_size=64,drop_last=True, shuffle=False, num_workers=8)

## 0. 실습 때 사용한 Seq2Seq2

In [10]:
class Encoder(nn.Module):
  def __init__(self, input_dim, emb_dim, hid_dim, n_layers, dropout):
    super().__init__()
    self.embedding = nn.Embedding(input_dim, emb_dim)
    self.rnn = nn.LSTM(emb_dim, hid_dim, n_layers, dropout = dropout)
    self.dropout = nn.Dropout(dropout)

  def forward(self, src):
    embedded = self.dropout(self.embedding(src))
    outputs, (hidden, cell) = self.rnn(embedded)
    # print(outputs.size())
    # print(hidden.size())
    # print(cell.size())
    return outputs, hidden, cell

In [11]:
class Decoder(nn.Module):
    def __init__(self, output_dim, emb_dim, hid_dim, n_layers, dropout):
        super().__init__()
        self.output_dim = output_dim

        self.embedding = nn.Embedding(output_dim, emb_dim)

        self.rnn = nn.LSTM(emb_dim, hid_dim, n_layers, dropout = dropout)

        self.fc_out = nn.Linear(hid_dim, output_dim)

        self.dropout = nn.Dropout(dropout)

    def forward(self, input, hidden, cell):
        input = input.unsqueeze(0)
        embedded = self.dropout(self.embedding(input))
        # 인코더의 hidden, cell state를 디코더의 LSTM 입력으로
        output, (hidden, cell) = self.rnn(embedded, (hidden, cell))
        prediction = self.fc_out(output.squeeze(0))
        return prediction, hidden, cell

In [12]:
# decoder with simple dot product attention
class AttentionDecoder(nn.Module):
    def __init__(self, output_dim, emb_dim, hid_dim, n_layers, dropout):
        super().__init__()
        self.output_dim = output_dim

        self.embedding = nn.Embedding(output_dim, emb_dim)

        self.rnn = nn.LSTM(emb_dim, hid_dim, n_layers, dropout = dropout)

        self.fc_out = nn.Linear(hid_dim*2, output_dim)

        self.dropout = nn.Dropout(dropout)

    def forward(self, input, hidden, cell, encoder_outputs):
        # unsqueeze : 토큰이 하나씩 들어가기 때문에 차원을 맞추기 위한 연산
        input = input.unsqueeze(0)
        embedded = self.dropout(self.embedding(input))
        output, (hidden, cell) = self.rnn(embedded, (hidden, cell))
        # print(output.size())
        # print(encoder_outputs.size())

        attention_score = torch.bmm(output.squeeze(0).unsqueeze(1), encoder_outputs.permute(1, 2, 0)).squeeze(1)
        attention_distribution = torch.softmax(attention_score, dim=1)
        context = torch.bmm(attention_distribution.unsqueeze(1), encoder_outputs.permute(1, 0, 2)).squeeze(1)
        prediction = self.fc_out(torch.cat((output.squeeze(0), context), dim=1))
        # print(attention_score.size())
        # print(attention_distribution.size())
        # print(context.size())
        # print(prediction.size())

        return prediction, hidden, cell

In [13]:
from typing import Any
import lightning as pl

class Seq2Seq(pl.LightningModule):
    def __init__(self, encoder, decoder):
        super().__init__()
        self.encoder = encoder
        self.decoder = decoder
        self.criterian = nn.CrossEntropyLoss(ignore_index=0)
        self.automatic_optimization = False
        # 인코더, 디코더 따로 있기 때문에 auto_optimizer를 사용할 수 없다.
        self.save_hyperparameters()

    def forward(self, src, trg, teacher_forcing_ratio = 0.5):
        batch_size = trg.shape[1]
        trg_len = trg.shape[0]
        trg_vocab_size = self.decoder.output_dim

        # tar_len, batch_size, trg_vocab_size 만큼의 공간
        # 디코더의 출력을 저장하는 공간
        outputs = torch.zeros(trg_len, batch_size, trg_vocab_size).to(trg.device)

        enc_output, hidden, cell = self.encoder(src)
        # print(enc_output.size())
        # print(hidden.size())
        # print(cell.size())
        
        # 하나의 token씩 입력을 넣어준다.
        input = trg[0,:]
        # print(input)
        
        for t in range(1, trg_len):
            if isinstance(self.decoder, AttentionDecoder):
                output, hidden, cell = self.decoder(input, hidden, cell, enc_output)
            else:
                output, hidden, cell = self.decoder(input, hidden, cell)
                # print(output.size())
                # print(hidden.size())
                # print(cell.size())
            outputs[t] = output
            teacher_force = random.random() < teacher_forcing_ratio
            top1 = output.argmax(1)
            input = trg[t] if teacher_force else top1

        return outputs

    def training_step(self, batch, batch_idx):
        enc_opt, dec_opt = self.optimizers()

        enc_opt.zero_grad()
        dec_opt.zero_grad()

        src = batch["src"].permute(1, 0)    # LSTM에는 batch가 두번째로!
        trg = batch["trg"].permute(1, 0)

        outputs = self(src, trg)

        outputs_dim = outputs.shape[-1]
        outputs = outputs[1:].view(-1, outputs_dim)
        trg = trg[1:].reshape(-1)
        loss = self.criterian(outputs, trg)

        self.manual_backward(loss)
        enc_opt.step()
        dec_opt.step()

        self.log("train_loss", loss)
        self.log("train_PPL", math.exp(loss))   # PPL : Launguae Generation 할 때의 성능지표!
        return loss

    def validation_step(self, batch, batch_idx):
        src = batch["src"].permute(1, 0)
        trg = batch["trg"].permute(1, 0)

        outputs = self(src, trg, teacher_forcing_ratio=0)

        outputs_dim = outputs.shape[-1]
        outputs = outputs[1:].view(-1, outputs_dim)
        trg = trg[1:].reshape(-1)
        loss = self.criterian(outputs, trg)

        self.log("val_loss", loss)
        self.log("val_PPL", math.exp(loss))
        return loss

    def test_step(self, batch, batch_idx):
        src = batch["src"].permute(1, 0)
        trg = batch["trg"].permute(1, 0)

        outputs = self(src, trg, teacher_forcing_ratio=0)

        outputs_dim = outputs.shape[-1]
        outputs = outputs[1:].view(-1, outputs_dim)
        trg = trg[1:].reshape(-1)
        loss = self.criterian(outputs, trg)

        self.log("test_loss", loss)
        self.log("test_PPL", math.exp(loss))
        return loss

    # 디코더의 실제 출력을 확인
    def decode(self, src):
        enc_output, hidden, cell = self.encoder(src.unsqueeze(1))
        trg_len = 30
        trg_vocab_size = self.decoder.output_dim
        outputs = [2]
        input = torch.LongTensor([2]).to(src.device)
        for t in range(1, trg_len):
            if isinstance(self.decoder, AttentionDecoder):
                output, hidden, cell = self.decoder(input, hidden, cell, enc_output)
            else:
                output, hidden, cell = self.decoder(input, hidden, cell)
            top1 = output.argmax(1)
            outputs.append(top1.item())
            if top1.item() == 3:
                break
            input = top1
        return outputs

    def configure_optimizers(self):
        enc_optimizer = torch.optim.Adam(self.encoder.parameters(), lr=1e-4)
        dec_optimizer = torch.optim.Adam(self.decoder.parameters(), lr=1e-4)
        return enc_optimizer, dec_optimizer

  from .autonotebook import tqdm as notebook_tqdm


In [14]:
emb_dim = 256
hid_dim = 512
n_layers = 2

In [15]:
encoder = Encoder(input_dim=len(en_vocab),
                  emb_dim=emb_dim,
                  hid_dim=hid_dim,
                  n_layers=n_layers,
                  dropout=0.5)

decoder = Decoder(output_dim=len(fr_vocab),
                    emb_dim=emb_dim,
                    hid_dim=hid_dim,
                    n_layers=n_layers,
                    dropout=0.5)

att_decoder = AttentionDecoder(output_dim=len(fr_vocab),
                            emb_dim=emb_dim,
                            hid_dim=hid_dim,
                            n_layers=n_layers,
                            dropout=0.5)

model = Seq2Seq(encoder, decoder)
att_model = Seq2Seq(encoder, att_decoder)

/home/dev/anaconda3/envs/nlp/lib/python3.12/site-packages/lightning/pytorch/utilities/parsing.py:199: Attribute 'encoder' is an instance of `nn.Module` and is already saved during checkpointing. It is recommended to ignore them using `self.save_hyperparameters(ignore=['encoder'])`.
/home/dev/anaconda3/envs/nlp/lib/python3.12/site-packages/lightning/pytorch/utilities/parsing.py:199: Attribute 'decoder' is an instance of `nn.Module` and is already saved during checkpointing. It is recommended to ignore them using `self.save_hyperparameters(ignore=['decoder'])`.


In [16]:
import wandb
from lightning.pytorch.loggers import WandbLogger

wandb.login()

Failed to detect the name of this notebook, you can set it manually with the WANDB_NOTEBOOK_NAME environment variable to enable code saving.
[34m[1mwandb[0m: Currently logged in as: [33mnoeyhesx[0m. Use [1m`wandb login --relogin`[0m to force relogin


True

In [17]:
wandb_logger = WandbLogger(project="NLP", name="basic_Seq2Seq", group="HW03")

trainer = pl.Trainer(
    max_epochs=1,
    accelerator="gpu",
    logger=wandb_logger
)

GPU available: True (cuda), used: True
TPU available: False, using: 0 TPU cores
IPU available: False, using: 0 IPUs
HPU available: False, using: 0 HPUs


In [30]:
# trainer.fit(model, train_loader, vali_loader)

# wandb.finish()

LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]

  | Name      | Type             | Params
-----------------------------------------------
0 | encoder   | Encoder          | 5.8 M 
1 | decoder   | Decoder          | 12.8 M
2 | criterian | CrossEntropyLoss | 0     
-----------------------------------------------
18.6 M    Trainable params
0         Non-trainable params
18.6 M    Total params
74.448    Total estimated model params size (MB)


Epoch 9: 100%|██████████| 2945/2945 [03:40<00:00, 13.34it/s, v_num=lj1h]   

`Trainer.fit` stopped: `max_epochs=10` reached.


Epoch 9: 100%|██████████| 2945/2945 [03:41<00:00, 13.32it/s, v_num=lj1h]


0,1
epoch,▁▁▁▁▂▂▂▂▃▃▃▃▃▃▃▃▄▄▄▄▅▅▅▅▆▆▆▆▆▆▆▆▇▇▇▇████
train_PPL,█▇▅▄▄▃▃▃▂▂▂▂▂▂▂▂▁▂▂▁▁▂▁▂▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁
train_loss,██▇▆▆▅▅▅▄▅▄▄▄▅▄▃▃▄▃▂▃▄▃▃▂▂▂▂▂▃▂▁▂▂▂▂▂▁▃▂
trainer/global_step,▁▁▁▁▂▂▂▂▂▃▃▃▃▃▃▄▄▄▄▄▅▅▅▅▅▅▆▆▆▆▆▇▇▇▇▇▇███
val_PPL,█▅▃▃▂▂▂▁▁▁
val_loss,█▆▅▄▃▃▂▂▁▁

0,1
epoch,9.0
train_PPL,11.32534
train_loss,2.42704
trainer/global_step,29449.0
val_PPL,20.69061
val_loss,3.01041


## 1. ENCODER 개선 (30점)

* 실습수업에 사용한 Seq-to-Seq 모델의 Encoder를 개선하시오.
    * 합리적 이유에 기반해 개선 방법을 찾고 구현 및 실험 하시오
        * 여러 제약사항(컴퓨팅, 메모리 등)이 있으므로 꼭 성능이 높아져야 하는 것은 아님
    * 왜 그런 모델 구성을 생각했는지, 그 결과가 어떻게 나타났는지 기술하시오
        * 성능이 높아졌다면 왜 그렇다고 생각하는지, 낮아졌다면 무엇이 문제인 것 같은지

    * Hint
        * 꼭 Encoder의 구조가 RNN 계열의 모델이어야 하는가?
        * Bi-directional RNN을 사용한다면?
        * ...

**GRADING**
* 적용한 방법 1개당 (+15)

### (1) Encoder 개선 - BiLSTM Encoder

In [67]:
class Encoder(nn.Module):
  def __init__(self, input_dim, emb_dim, hid_dim, n_layers, dropout):
    super().__init__()
    self.embedding = nn.Embedding(input_dim, emb_dim)
    self.rnn = nn.LSTM(emb_dim, hid_dim, n_layers, dropout = dropout, bidirectional=True)
    self.dropout = nn.Dropout(dropout)

  def forward(self, src):
    embedded = self.dropout(self.embedding(src))
    outputs, (hidden, cell) = self.rnn(embedded)
    # print(outputs.size())
    # print(hidden.size())
    # print(cell.size())
    return outputs, hidden, cell

In [66]:
encoder = Encoder(input_dim=len(en_vocab),
                emb_dim=emb_dim,
                hid_dim=hid_dim,
                n_layers=n_layers,
                dropout=0.5)

for batch in train_loader:
    src = batch["src"].permute(1, 0)
    encoder.forward(src=src)
    break

torch.Size([30, 64, 1024])
torch.Size([4, 64, 512])
torch.Size([4, 64, 512])


- Encoder의 LSTM 모델을 Bi-directional LSTM으로 변경
- 양방향 문맥을 파악하면 더 효과적일 것이라 생각!

In [71]:
class Decoder(nn.Module):
    def __init__(self, output_dim, emb_dim, hid_dim, n_layers, dropout):
        super().__init__()
        self.output_dim = output_dim

        self.embedding = nn.Embedding(output_dim, emb_dim)

        # biLSTM으로 인해 hidden_state output이 2배가 되어 num_layers도 2배로 설정
        self.rnn = nn.LSTM(emb_dim, hid_dim, n_layers*2, dropout = dropout)

        self.fc_out = nn.Linear(hid_dim, output_dim)

        self.dropout = nn.Dropout(dropout)

    def forward(self, input, hidden, cell):
        # print(input.size())
        input = input.unsqueeze(0)
        # print(input.size())
        embedded = self.dropout(self.embedding(input))
        # 인코더의 hidden, cell state를 디코더의 LSTM 입력으로
        output, (hidden, cell) = self.rnn(embedded, (hidden, cell))
        prediction = self.fc_out(output.squeeze(0))
        # print(prediction.size())
        return prediction, hidden, cell

In [72]:
encoder = Encoder(input_dim=len(en_vocab),
                  emb_dim=emb_dim,
                  hid_dim=hid_dim,
                  n_layers=n_layers,
                  dropout=0.5)

decoder = Decoder(output_dim=len(fr_vocab),
                  emb_dim=emb_dim,
                  hid_dim=hid_dim,
                  n_layers=n_layers,
                  dropout=0.5)

for batch in train_loader:
    src = batch["src"].permute(1, 0)
    trg = batch["trg"].permute(1, 0)
    enc_output, hidden, cell = encoder(src)
    decoder.forward(input=trg[0], hidden=hidden, cell=cell)
    break

### (1) Encoder 개선 Test - BiLSTM Encoder

In [50]:
encoder = Encoder(input_dim=len(en_vocab),
                  emb_dim=emb_dim,
                  hid_dim=hid_dim,
                  n_layers=n_layers,
                  dropout=0.5)

decoder = Decoder(output_dim=len(fr_vocab),
                    emb_dim=emb_dim,
                    hid_dim=hid_dim,
                    n_layers=n_layers,
                    dropout=0.5)

model = Seq2Seq(encoder, decoder)

/home/dev/anaconda3/envs/nlp/lib/python3.12/site-packages/lightning/pytorch/utilities/parsing.py:199: Attribute 'encoder' is an instance of `nn.Module` and is already saved during checkpointing. It is recommended to ignore them using `self.save_hyperparameters(ignore=['encoder'])`.
/home/dev/anaconda3/envs/nlp/lib/python3.12/site-packages/lightning/pytorch/utilities/parsing.py:199: Attribute 'decoder' is an instance of `nn.Module` and is already saved during checkpointing. It is recommended to ignore them using `self.save_hyperparameters(ignore=['decoder'])`.


In [51]:
wandb_logger = WandbLogger(project="NLP", name="Seq2Seq_biLSTM_enc", group="HW03")

trainer = pl.Trainer(
    max_epochs=1,
    accelerator="gpu",
    logger=wandb_logger
)

GPU available: True (cuda), used: True
TPU available: False, using: 0 TPU cores
IPU available: False, using: 0 IPUs
HPU available: False, using: 0 HPUs


In [52]:
trainer.fit(model, train_loader, vali_loader)

wandb.finish()

LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]

  | Name      | Type             | Params
-----------------------------------------------
0 | encoder   | Encoder          | 11.6 M
1 | decoder   | Decoder          | 17.0 M
2 | criterian | CrossEntropyLoss | 0     
-----------------------------------------------
28.6 M    Trainable params
0         Non-trainable params
28.6 M    Total params
114.333   Total estimated model params size (MB)


Epoch 0: 100%|██████████| 2945/2945 [05:20<00:00,  9.19it/s, v_num=0z0r]   

`Trainer.fit` stopped: `max_epochs=1` reached.


Epoch 0: 100%|██████████| 2945/2945 [05:21<00:00,  9.17it/s, v_num=0z0r]


0,1
epoch,▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁
train_PPL,█▅▅▄▃▄▃▃▃▃▂▃▂▃▂▂▃▂▂▂▂▂▂▂▂▂▁▂▂▁▁▂▁▂▁▁▁▁▁▁
train_loss,█▆▆▆▅▅▅▅▅▄▄▄▃▄▄▄▄▃▃▄▃▃▂▃▃▃▂▃▂▂▁▂▂▃▁▂▂▂▁▂
trainer/global_step,▁▁▁▁▂▂▂▂▂▃▃▃▃▃▃▄▄▄▄▄▅▅▅▅▅▅▆▆▆▆▆▇▇▇▇▇▇███
val_PPL,▁
val_loss,▁

0,1
epoch,0.0
train_PPL,59.07238
train_loss,4.07876
trainer/global_step,2944.0
val_PPL,59.90458
val_loss,4.08314


### (2) Encoder 개선 - Encoder의 입력을 반전시키자!

In [120]:
class Encoder(nn.Module):
  def __init__(self, input_dim, emb_dim, hid_dim, n_layers, dropout):
    super().__init__()
    self.embedding = nn.Embedding(input_dim, emb_dim)
    self.rnn = nn.LSTM(emb_dim, hid_dim, n_layers, dropout = dropout)
    self.dropout = nn.Dropout(dropout)

  def forward(self, src):
    # print(src)
    src = torch.stack([seq.flip(0) for seq in src])
    # print(src)
    embedded = self.dropout(self.embedding(src))
    outputs, (hidden, cell) = self.rnn(embedded)
    return outputs, hidden, cell

In [121]:
encoder = Encoder(input_dim=len(en_vocab),
                emb_dim=emb_dim,
                hid_dim=hid_dim,
                n_layers=n_layers,
                dropout=0.5)

for batch in train_loader:
    src = batch["src"].permute(1, 0)
    encoder.forward(src=src)
    break

tensor([[ 165,  447,  178,  ...,   31,  742,   33],
        [ 287,  278,   33,  ...,   44,  151,   44],
        [  14,   12, 1959,  ...,   14,   12,  139],
        ...,
        [   0,    0,    0,  ...,    0,    0,    0],
        [   0,    0,    0,  ...,    0,    0,    0],
        [   0,    0,    0,  ...,    0,    0,    0]])
tensor([[  33,  742,   31,  ...,  178,  447,  165],
        [  44,  151,   44,  ...,   33,  278,  287],
        [ 139,   12,   14,  ..., 1959,   12,   14],
        ...,
        [   0,    0,    0,  ...,    0,    0,    0],
        [   0,    0,    0,  ...,    0,    0,    0],
        [   0,    0,    0,  ...,    0,    0,    0]])


### (2) Encoder 개선 Test - Encoder의 입력을 반전시키자!

In [61]:
encoder = Encoder(input_dim=len(en_vocab),
                  emb_dim=emb_dim,
                  hid_dim=hid_dim,
                  n_layers=n_layers,
                  dropout=0.5)

decoder = Decoder(output_dim=len(fr_vocab),
                    emb_dim=emb_dim,
                    hid_dim=hid_dim,
                    n_layers=n_layers,
                    dropout=0.5)

model = Seq2Seq(encoder, decoder)

/home/dev/anaconda3/envs/nlp/lib/python3.12/site-packages/lightning/pytorch/utilities/parsing.py:199: Attribute 'encoder' is an instance of `nn.Module` and is already saved during checkpointing. It is recommended to ignore them using `self.save_hyperparameters(ignore=['encoder'])`.
/home/dev/anaconda3/envs/nlp/lib/python3.12/site-packages/lightning/pytorch/utilities/parsing.py:199: Attribute 'decoder' is an instance of `nn.Module` and is already saved during checkpointing. It is recommended to ignore them using `self.save_hyperparameters(ignore=['decoder'])`.


In [62]:
wandb_logger = WandbLogger(project="NLP", name="Seq2Seq_reverse_input", group="HW03")

trainer = pl.Trainer(
    max_epochs=1,
    accelerator="gpu",
    logger=wandb_logger
)

GPU available: True (cuda), used: True
TPU available: False, using: 0 TPU cores
IPU available: False, using: 0 IPUs
HPU available: False, using: 0 HPUs


In [63]:
trainer.fit(model, train_loader, vali_loader)

wandb.finish()

LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]

  | Name      | Type             | Params
-----------------------------------------------
0 | encoder   | Encoder          | 5.8 M 
1 | decoder   | Decoder          | 12.8 M
2 | criterian | CrossEntropyLoss | 0     
-----------------------------------------------
18.6 M    Trainable params
0         Non-trainable params
18.6 M    Total params
74.479    Total estimated model params size (MB)


Epoch 0: 100%|██████████| 2945/2945 [03:43<00:00, 13.20it/s, v_num=n13o]   

`Trainer.fit` stopped: `max_epochs=1` reached.


Epoch 0: 100%|██████████| 2945/2945 [03:43<00:00, 13.18it/s, v_num=n13o]


0,1
epoch,▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁
train_PPL,█▄▅▄▃▃▃▂▂▃▂▂▃▂▃▁▂▂▂▂▃▂▂▂▂▁▂▃▁▂▂▁▁▁▁▃▂▂▂▁
train_loss,█▅▆▅▄▄▄▃▂▃▃▃▃▃▄▂▃▂▃▃▃▃▃▂▃▁▃▄▁▂▂▁▂▁▁▄▃▂▂▂
trainer/global_step,▁▁▁▁▂▂▂▂▂▃▃▃▃▃▃▄▄▄▄▄▅▅▅▅▅▅▆▆▆▆▆▇▇▇▇▇▇███
val_PPL,▁
val_loss,▁

0,1
epoch,0.0
train_PPL,130.14435
train_loss,4.86864
trainer/global_step,2944.0
val_PPL,178.80872
val_loss,5.18042


## 2. DECODER 개선 (30점)

* 실습수업에 사용한 Seq-to-Seq 모델의 Decoder를 개선하시오.
    * 합리적 이유에 기반해 개선 방법을 찾고 구현 및 실험 하시오
        * 여러 제약사항(컴퓨팅, 메모리 등)이 있으므로 꼭 성능이 높아져야 하는 것은 아님
    * 왜 그런 모델 구성을 생각했는지, 그 결과가 어떻게 나타났는지 기술하시오
        * 성능이 높아졌다면 왜 그렇다고 생각하는지, 낮아졌다면 무엇이 문제인 것 같은지

    * Hint
        * 최종 output을 만들 떄 마지막 layer의 hidden vector만 사용하는게 최선인가? 이전 layer의 hidden vector도 같이 사용한다면?
        * 왜 encoder와 decoder의 크기 차이가 많이 발생하는가? 이를 해결할 수 없는가?
        * 현재 Encoder의 마지막 hidden vector를 사용하는데 대부분 [PAD] 토큰이다. [PAD] 토큰의 hidden vector를 사용하는게 맞는가?
        * ...

**GRADING**
* 적용한 방법 1개당 (+15)

### (1) Decoder 개선 - Decoder의 크기를 줄여보자!

- Encoder와 Decoder의 모델 크기를 확인해보면 각각 `5.8M` `12.8M` 으로 꽤 차이나 난다.
- 두개의 구조 상 결정적인 차이는 출력층에서 Fully Conntected Layer를 사용한다는 것이다.
- Decoder의 Embedding, Linear가 같은 파라미터를 공유하면 모델 크기를 줄일 수 있지 않을까?


In [18]:
class Encoder(nn.Module):
  def __init__(self, input_dim, emb_dim, hid_dim, n_layers, dropout):
    super().__init__()
    self.embedding = nn.Embedding(input_dim, emb_dim)
    self.rnn = nn.LSTM(emb_dim, hid_dim, n_layers, dropout = dropout)
    self.dropout = nn.Dropout(dropout)

  def forward(self, src):
    embedded = self.dropout(self.embedding(src))
    outputs, (hidden, cell) = self.rnn(embedded)
    return outputs, hidden, cell

In [19]:
class Decoder(nn.Module):
    def __init__(self, output_dim, emb_dim, hid_dim, n_layers, dropout):
        super().__init__()
        self.output_dim = output_dim

        self.embedding = nn.Embedding(output_dim, emb_dim)

        self.rnn = nn.LSTM(emb_dim, hid_dim, n_layers, dropout = dropout)

        # 차원을 맞추기 위한 Projection Layer
        self.projection = nn.Linear(hid_dim, emb_dim)

        self.fc_out = nn.Linear(emb_dim, output_dim, bias=False)
        self.fc_out.weight = self.embedding.weight

        self.dropout = nn.Dropout(dropout)

    def forward(self, input, hidden, cell):
        input = input.unsqueeze(0)
        embedded = self.dropout(self.embedding(input))
        
        output, (hidden, cell) = self.rnn(embedded, (hidden, cell))
        # print(output.size())
        output = self.projection(output.squeeze(0))
        # print(output.size())
        prediction = self.fc_out(output)
        # print(prediction.size())
        return prediction, hidden, cell

In [20]:
encoder = Encoder(input_dim=len(en_vocab),
                  emb_dim=emb_dim,
                  hid_dim=hid_dim,
                  n_layers=n_layers,
                  dropout=0.5)

decoder = Decoder(output_dim=len(fr_vocab),
                 emb_dim=emb_dim,
                 hid_dim=hid_dim,
                 n_layers=n_layers,
                 dropout=0.5)

for batch in train_loader:
    src = batch["src"].permute(1, 0)
    trg = batch["trg"].permute(1, 0)
    enc_output, hidden, cell = encoder(src)
    decoder.forward(input=trg[0], hidden=hidden, cell=cell)
    break

### (1) Decoder 개선 Test - Decoder의 크기를 줄여보자!

In [21]:
encoder = Encoder(input_dim=len(en_vocab),
                  emb_dim=emb_dim,
                  hid_dim=hid_dim,
                  n_layers=n_layers,
                  dropout=0.5)

decoder = Decoder(output_dim=len(fr_vocab),
                    emb_dim=emb_dim,
                    hid_dim=hid_dim,
                    n_layers=n_layers,
                    dropout=0.5)

model = Seq2Seq(encoder, decoder)

/home/dev/anaconda3/envs/nlp/lib/python3.12/site-packages/lightning/pytorch/utilities/parsing.py:199: Attribute 'encoder' is an instance of `nn.Module` and is already saved during checkpointing. It is recommended to ignore them using `self.save_hyperparameters(ignore=['encoder'])`.
/home/dev/anaconda3/envs/nlp/lib/python3.12/site-packages/lightning/pytorch/utilities/parsing.py:199: Attribute 'decoder' is an instance of `nn.Module` and is already saved during checkpointing. It is recommended to ignore them using `self.save_hyperparameters(ignore=['decoder'])`.


In [22]:
wandb_logger = WandbLogger(project="NLP", name="Seq2Seq_decoder_weight_tying", group="HW03")

trainer = pl.Trainer(
    max_epochs=1,
    accelerator="gpu",
    logger=wandb_logger
)

GPU available: True (cuda), used: True
TPU available: False, using: 0 TPU cores
IPU available: False, using: 0 IPUs
HPU available: False, using: 0 HPUs


In [23]:
trainer.fit(model, train_loader, vali_loader)

wandb.finish()

LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]

  | Name      | Type             | Params
-----------------------------------------------
0 | encoder   | Encoder          | 5.8 M 
1 | decoder   | Decoder          | 6.8 M 
2 | criterian | CrossEntropyLoss | 0     
-----------------------------------------------
12.7 M    Trainable params
0         Non-trainable params
12.7 M    Total params
50.635    Total estimated model params size (MB)


Epoch 9: 100%|██████████| 2945/2945 [03:25<00:00, 14.35it/s, v_num=p2el]   

`Trainer.fit` stopped: `max_epochs=10` reached.


Epoch 9: 100%|██████████| 2945/2945 [03:25<00:00, 14.34it/s, v_num=p2el]


0,1
epoch,▁▁▁▁▂▂▂▂▃▃▃▃▃▃▃▃▄▄▄▄▅▅▅▅▆▆▆▆▆▆▆▆▇▇▇▇████
train_PPL,█▅▃▃▂▂▂▂▂▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁
train_loss,█▇▆▆▅▄▄▄▄▃▃▃▃▃▃▃▂▂▃▃▃▃▂▂▂▂▂▂▂▂▂▁▂▁▁▁▂▁▁▁
trainer/global_step,▁▁▁▁▂▂▂▂▂▃▃▃▃▃▃▄▄▄▄▄▅▅▅▅▅▅▆▆▆▆▆▇▇▇▇▇▇███
val_PPL,█▄▃▂▂▂▁▁▁▁
val_loss,█▅▄▄▃▂▂▂▁▁

0,1
epoch,9.0
train_PPL,14.53211
train_loss,2.67636
trainer/global_step,29449.0
val_PPL,27.54312
val_loss,3.29282


- 가중치 공유를 통해 `5.8M` `6.9M` 으로 Encoder Decoder의 모델 크기 차이를 크게 감소시켰다.
- 학습 과정에서 Loss와 PPL은 조금 더 높게 나오긴 했지만 비슷한 수치이며 모델을 경량화하면서 비슷한 성능을 보인다는 것은 유의미한 결과라고 생각한다.

### (2) Decoder 개선 - [PAD] 토큰을 무시하자!
- PAD 토큰의 역할은 가변 길이 시퀀스를 고정 길이 matrix로 만들어 병렬 처리가 가능하도록 해준다.
- 하지만, PAD 토큰으로 인해 Encoder의 마지막 hidden vector는 대부분 [PAD] 토큰이다.
- 따라서, Encoder에서 [PAD] 토큰에 마스킹을 해서 의미없는 토큰인 PAD 토큰을 무시하도록 한다.

In [50]:
from torch.nn.utils.rnn import pack_padded_sequence, pad_packed_sequence

class Encoder(nn.Module):
  def __init__(self, input_dim, emb_dim, hid_dim, n_layers, dropout):
    super().__init__()
    self.embedding = nn.Embedding(input_dim, emb_dim)
    self.rnn = nn.LSTM(emb_dim, hid_dim, n_layers, dropout = dropout)
    self.dropout = nn.Dropout(dropout)

  def forward(self, src):
    embedded = self.dropout(self.embedding(src))
    mask = (src == 0)
    lengths = mask.logical_not().sum(dim=0)

    masked = pack_padded_sequence(embedded, lengths.cpu(), batch_first=False, enforce_sorted=False)
    outputs, (hidden, cell) = self.rnn(masked)

    outputs, _ = pad_packed_sequence(outputs, batch_first=False)
    
    return outputs, hidden, cell

In [51]:
class Decoder(nn.Module):
    def __init__(self, output_dim, emb_dim, hid_dim, n_layers, dropout):
        super().__init__()
        self.output_dim = output_dim

        self.embedding = nn.Embedding(output_dim, emb_dim)

        self.rnn = nn.LSTM(emb_dim, hid_dim, n_layers, dropout = dropout)

        self.fc_out = nn.Linear(hid_dim, output_dim)

        self.dropout = nn.Dropout(dropout)

    def forward(self, input, hidden, cell):
        input = input.unsqueeze(0)
        embedded = self.dropout(self.embedding(input))
        
        output, (hidden, cell) = self.rnn(embedded, (hidden, cell))
        prediction = self.fc_out(output.squeeze(0))
        return prediction, hidden, cell

In [52]:
encoder = Encoder(input_dim=len(en_vocab),
                  emb_dim=emb_dim,
                  hid_dim=hid_dim,
                  n_layers=n_layers,
                  dropout=0.5)

decoder = Decoder(output_dim=len(fr_vocab),
                    emb_dim=emb_dim,
                    hid_dim=hid_dim,
                    n_layers=n_layers,
                    dropout=0.5)

for batch in train_loader:
    src = batch["src"].permute(1, 0)
    trg = batch["trg"].permute(1, 0)
    output, hidden, cell = encoder.forward(src=src)
    decoder.forward(trg[0,:], hidden, cell)
    break


### (2) Decoder 개선 Test - [PAD] 토큰을 무시하자!

In [53]:
encoder = Encoder(input_dim=len(en_vocab),
                  emb_dim=emb_dim,
                  hid_dim=hid_dim,
                  n_layers=n_layers,
                  dropout=0.5)

decoder = Decoder(output_dim=len(fr_vocab),
                    emb_dim=emb_dim,
                    hid_dim=hid_dim,
                    n_layers=n_layers,
                    dropout=0.5)

model = Seq2Seq(encoder, decoder)

/home/dev/anaconda3/envs/nlp/lib/python3.12/site-packages/lightning/pytorch/utilities/parsing.py:199: Attribute 'encoder' is an instance of `nn.Module` and is already saved during checkpointing. It is recommended to ignore them using `self.save_hyperparameters(ignore=['encoder'])`.
/home/dev/anaconda3/envs/nlp/lib/python3.12/site-packages/lightning/pytorch/utilities/parsing.py:199: Attribute 'decoder' is an instance of `nn.Module` and is already saved during checkpointing. It is recommended to ignore them using `self.save_hyperparameters(ignore=['decoder'])`.


In [54]:
wandb_logger = WandbLogger(project="NLP", name="Seq2Seq_masked_pad_token", group="HW03")

trainer = pl.Trainer(
    max_epochs=1,
    accelerator="gpu",
    logger=wandb_logger
)

GPU available: True (cuda), used: True
TPU available: False, using: 0 TPU cores
IPU available: False, using: 0 IPUs
HPU available: False, using: 0 HPUs


In [55]:
trainer.fit(model, train_loader, vali_loader)

wandb.finish()

/home/dev/anaconda3/envs/nlp/lib/python3.12/site-packages/lightning/pytorch/loggers/wandb.py:391: There is a wandb run already in progress and newly created instances of `WandbLogger` will reuse this run. If this is not desired, call `wandb.finish()` before instantiating `WandbLogger`.
LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]



  | Name      | Type             | Params
-----------------------------------------------
0 | encoder   | Encoder          | 5.8 M 
1 | decoder   | Decoder          | 12.8 M
2 | criterian | CrossEntropyLoss | 0     
-----------------------------------------------
18.6 M    Trainable params
0         Non-trainable params
18.6 M    Total params
74.394    Total estimated model params size (MB)


Epoch 0: 100%|██████████| 2945/2945 [03:29<00:00, 14.05it/s, v_num=dyxd]   

`Trainer.fit` stopped: `max_epochs=1` reached.


Epoch 0: 100%|██████████| 2945/2945 [03:29<00:00, 14.03it/s, v_num=dyxd]


0,1
epoch,▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁
train_PPL,█▃▃▃▃▂▂▂▂▂▂▂▂▂▂▁▂▂▂▂▁▁▂▁▁▁▁▂▁▁▁▁▁▁▁▁▁▁▁▁
train_loss,█▅▅▅▅▄▄▄▄▃▄▃▃▃▄▂▃▃▃▃▂▂▃▂▂▂▂▃▂▂▂▂▂▂▁▂▁▁▂▂
trainer/global_step,▁▁▁▁▂▂▂▂▂▃▃▃▃▃▃▄▄▄▄▄▅▅▅▅▅▅▆▆▆▆▆▇▇▇▇▇▇███
val_PPL,▁
val_loss,▁

0,1
epoch,0.0
train_PPL,60.19929
train_loss,4.09766
trainer/global_step,2944.0
val_PPL,67.88832
val_loss,4.20872


## 3. ENC-DEC 개선 (40점)

* 실습수업에 사용한 Seq-to-Seq 모델의 Encoder-Decoder의 연결부분을 개선하시오.
    * 합리적 이유에 기반해 개선 방법을 찾고 구현 및 실험 하시오
        * 여러 제약사항(컴퓨팅, 메모리 등)이 있으므로 꼭 성능이 높아져야 하는 것은 아님
    * 왜 그런 모델 구성을 생각했는지, 그 결과가 어떻게 나타났는지 기술하시오
        * 성능이 높아졌다면 왜 그렇다고 생각하는지, 낮아졌다면 무엇이 문제인 것 같은지

    * Hint
        * Attention을 개선할 수 없을까? (Dot attention을 QKV attention으로 개선, weighted attention 등)
        * Enc-DEC의 layer 수가 다른 경우는 어떻게 처리할 것인가?
        * ...


**GRADING**
* 적용한 방법 1개당 (+15) (최대 40점)

### (1) ENC-DEC 개선 - QKV attention
- 기존 단순한 Dot attention을 QKV attention으로 개선해본다.
- query는 Decoder의 output, key&value는 Encoder outputs
- 각각 Projection 연산 후에 attention score를 계산한다.

In [20]:
class Encoder(nn.Module):
  def __init__(self, input_dim, emb_dim, hid_dim, n_layers, dropout):
    super().__init__()
    self.embedding = nn.Embedding(input_dim, emb_dim)
    self.rnn = nn.LSTM(emb_dim, hid_dim, n_layers, dropout = dropout)
    self.dropout = nn.Dropout(dropout)

  def forward(self, src):
    embedded = self.dropout(self.embedding(src))
    outputs, (hidden, cell) = self.rnn(embedded)
    return outputs, hidden, cell

In [21]:
class AttentionDecoder(nn.Module):
    def __init__(self, output_dim, emb_dim, hid_dim, n_layers, dropout):
        super().__init__()
        self.output_dim = output_dim

        self.embedding = nn.Embedding(output_dim, emb_dim)

        self.Wq = nn.Linear(hid_dim, hid_dim)
        self.Wk = nn.Linear(hid_dim, hid_dim)
        self.Wv = nn.Linear(hid_dim, hid_dim)

        self.rnn = nn.LSTM(emb_dim, hid_dim, n_layers, dropout = dropout)

        self.fc_out = nn.Linear(hid_dim*2, output_dim)

        self.dropout = nn.Dropout(dropout)

    def forward(self, input, hidden, cell, encoder_outputs):
        input = input.unsqueeze(0)
        embedded = self.dropout(self.embedding(input))
        output, (hidden, cell) = self.rnn(embedded, (hidden, cell))

        query = self.Wq(output)
        key = self.Wk(encoder_outputs)
        value = self.Wv(encoder_outputs)

        attention_score = torch.bmm(query.squeeze(0).unsqueeze(1), key.permute(1, 2, 0)).squeeze(1)
        attention_distribution = torch.softmax(attention_score, dim=1)
        context = torch.bmm(attention_distribution.unsqueeze(1), value.permute(1, 0, 2)).squeeze(1)
        
        prediction = self.fc_out(torch.cat((output.squeeze(0), context), dim=1))

        return prediction, hidden, cell

### (1) ENC-DEC 개선 Test - QKV attention

In [22]:
encoder = Encoder(input_dim=len(en_vocab),
                  emb_dim=emb_dim,
                  hid_dim=hid_dim,
                  n_layers=n_layers,
                  dropout=0.5)

att_decoder = AttentionDecoder(output_dim=len(fr_vocab),
                            emb_dim=emb_dim,
                            hid_dim=hid_dim,
                            n_layers=n_layers,
                            dropout=0.5)

att_model = Seq2Seq(encoder, att_decoder)

/home/dev/anaconda3/envs/nlp/lib/python3.12/site-packages/lightning/pytorch/utilities/parsing.py:199: Attribute 'encoder' is an instance of `nn.Module` and is already saved during checkpointing. It is recommended to ignore them using `self.save_hyperparameters(ignore=['encoder'])`.
/home/dev/anaconda3/envs/nlp/lib/python3.12/site-packages/lightning/pytorch/utilities/parsing.py:199: Attribute 'decoder' is an instance of `nn.Module` and is already saved during checkpointing. It is recommended to ignore them using `self.save_hyperparameters(ignore=['decoder'])`.


In [23]:
wandb_logger = WandbLogger(project="NLP", name="Seq2Seq_QKV_att", group="HW03")

trainer = pl.Trainer(
    max_epochs=1,
    accelerator="gpu",
    logger=wandb_logger
)

GPU available: True (cuda), used: True
TPU available: False, using: 0 TPU cores
IPU available: False, using: 0 IPUs
HPU available: False, using: 0 HPUs


In [24]:
trainer.fit(att_model, train_loader, vali_loader)

wandb.finish()

LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]

  | Name      | Type             | Params
-----------------------------------------------
0 | encoder   | Encoder          | 5.8 M 
1 | decoder   | AttentionDecoder | 19.6 M
2 | criterian | CrossEntropyLoss | 0     
-----------------------------------------------
25.5 M    Trainable params
0         Non-trainable params
25.5 M    Total params
101.815   Total estimated model params size (MB)


Epoch 0: 100%|██████████| 2945/2945 [06:04<00:00,  8.07it/s, v_num=yk8q]   

`Trainer.fit` stopped: `max_epochs=1` reached.


Epoch 0: 100%|██████████| 2945/2945 [06:05<00:00,  8.06it/s, v_num=yk8q]


0,1
epoch,▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁
train_PPL,█▅▆▅▄▄▅▅▄▅▄▄▅▄▄▄▃▂▃▂▂▂▃▃▂▂▂▃▂▂▂▂▂▂▂▂▁▁▁▂
train_loss,█▆▇▆▆▆▇▆▆▆▆▆▇▅▅▅▅▄▄▄▃▄▄▅▃▃▄▄▃▃▄▃▃▃▃▃▂▁▁▃
trainer/global_step,▁▁▁▁▂▂▂▂▂▃▃▃▃▃▃▄▄▄▄▄▅▅▅▅▅▅▆▆▆▆▆▇▇▇▇▇▇███
val_PPL,▁
val_loss,▁

0,1
epoch,0.0
train_PPL,78.2801
train_loss,4.36029
trainer/global_step,2944.0
val_PPL,76.56445
val_loss,4.32845


### (2) Enc-Dec 개선 - Multi-Head Attention
- Multi-Head attention의 이점은 다음과 같다.
    - 입력 시퀀스의 다양한 표현을 병렬로 학습할 수 있다.
    - 그로 인해 표현력이 높아질 수 있다.
- `nn.MultiheadAttention` 모듈을 활용해 이를 적용해본다.

In [25]:
class MultiHeadAttention(nn.Module):
    def __init__(self, d_model, num_heads):
        super(MultiHeadAttention, self).__init__()
        self.num_heads = num_heads
        self.d_model = d_model
        assert d_model % self.num_heads == 0

        self.depth = d_model // self.num_heads

        self.Wq = nn.Linear(d_model, d_model)
        self.Wk = nn.Linear(d_model, d_model)
        self.Wv = nn.Linear(d_model, d_model)

        self.dense = nn.Linear(d_model, d_model)

    def split_heads(self, x, batch_size):
        x = x.view(batch_size, -1, self.num_heads, self.depth)
        return x.permute(0, 2, 1, 3)

    def forward(self, query, key, value):
        batch_size = query.size(0)

        query = self.Wq(query)
        key = self.Wk(key)
        value = self.Wv(value)
        
        query = self.split_heads(query, batch_size)
        key = self.split_heads(key, batch_size)
        value = self.split_heads(value, batch_size)

        attn = torch.matmul(query, key.permute(0, 1, 3, 2)) / math.sqrt(self.depth)
        attn = torch.nn.functional.softmax(attn, dim=-1)
        out = torch.matmul(attn, value)
        
        out = out.permute(0, 2, 1, 3).contiguous()
        out = out.view(batch_size, -1, self.d_model)

        return self.dense(out)

In [34]:
class AttentionDecoder(nn.Module):
    def __init__(self, output_dim, emb_dim, hid_dim, n_layers, dropout):
        super().__init__()
        self.output_dim = output_dim

        self.embedding = nn.Embedding(output_dim, emb_dim)

        self.rnn = nn.LSTM(emb_dim, hid_dim, n_layers, dropout = dropout)

        self.mha = nn.MultiheadAttention(embed_dim=hid_dim, num_heads=8, dropout=dropout)

        self.fc_out = nn.Linear(hid_dim*2, output_dim)

        self.dropout = nn.Dropout(dropout)

    def forward(self, input, hidden, cell, encoder_outputs):
        input = input.unsqueeze(0)
        embedded = self.dropout(self.embedding(input))
        output, (hidden, cell) = self.rnn(embedded, (hidden, cell))

        att_output, _ = self.mha(output, encoder_outputs, encoder_outputs)
        outputs = torch.cat((output.squeeze(1), att_output.squeeze(1)), dim=2)

        prediction = self.fc_out(self.dropout(outputs)).squeeze(0)

        return prediction, hidden, cell

### (2) Enc-Dec 개선 Test - Multi-Head Attention

In [35]:
encoder = Encoder(input_dim=len(en_vocab),
                  emb_dim=emb_dim,
                  hid_dim=hid_dim,
                  n_layers=n_layers,
                  dropout=0.5)

att_decoder = AttentionDecoder(output_dim=len(fr_vocab),
                               emb_dim=emb_dim,
                               hid_dim=hid_dim,
                               n_layers=n_layers,
                               dropout=0.5)

att_model = Seq2Seq(encoder, att_decoder)

/home/dev/anaconda3/envs/nlp/lib/python3.12/site-packages/lightning/pytorch/utilities/parsing.py:199: Attribute 'encoder' is an instance of `nn.Module` and is already saved during checkpointing. It is recommended to ignore them using `self.save_hyperparameters(ignore=['encoder'])`.
/home/dev/anaconda3/envs/nlp/lib/python3.12/site-packages/lightning/pytorch/utilities/parsing.py:199: Attribute 'decoder' is an instance of `nn.Module` and is already saved during checkpointing. It is recommended to ignore them using `self.save_hyperparameters(ignore=['decoder'])`.


In [36]:
wandb_logger = WandbLogger(project="NLP", name="Seq2Seq_multi_head_att", group="HW03")

trainer = pl.Trainer(
    max_epochs=1,
    accelerator="gpu",
    logger=wandb_logger
)

GPU available: True (cuda), used: True
TPU available: False, using: 0 TPU cores
IPU available: False, using: 0 IPUs
HPU available: False, using: 0 HPUs


In [37]:
trainer.fit(att_model, train_loader, vali_loader)

wandb.finish()

LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]

  | Name      | Type             | Params
-----------------------------------------------
0 | encoder   | Encoder          | 5.8 M 
1 | decoder   | AttentionDecoder | 19.9 M
2 | criterian | CrossEntropyLoss | 0     
-----------------------------------------------
25.7 M    Trainable params
0         Non-trainable params
25.7 M    Total params
102.865   Total estimated model params size (MB)


Epoch 0: 100%|██████████| 2945/2945 [06:18<00:00,  7.77it/s, v_num=jnps]   

`Trainer.fit` stopped: `max_epochs=1` reached.


Epoch 0: 100%|██████████| 2945/2945 [06:19<00:00,  7.77it/s, v_num=jnps]


0,1
epoch,▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁
train_PPL,█▆▆▅▄▅▄▄▄▄▃▃▃▃▃▃▂▂▂▃▂▂▂▂▂▃▂▂▂▁▂▂▂▁▂▂▁▁▁▁
train_loss,█▇▇▆▆▇▆▅▅▅▅▅▅▄▄▄▄▃▄▄▃▃▄▄▃▄▃▃▃▁▂▃▃▂▃▃▂▁▂▁
trainer/global_step,▁▁▁▁▂▂▂▂▂▃▃▃▃▃▃▄▄▄▄▄▅▅▅▅▅▅▆▆▆▆▆▇▇▇▇▇▇███
val_PPL,▁
val_loss,▁

0,1
epoch,0.0
train_PPL,43.44275
train_loss,3.77144
trainer/global_step,2944.0
val_PPL,62.35681
val_loss,4.12248


### (3) Enc-Dec 개선 - Encoder와 Decoder의 Layer 수가 다를땐?
- 보통 Encoder와 Decoder의 레이어 수를 같게 하지만 다른 경우 어떻게 처리할 것인지에 대해 고민해본다.

In [18]:
class Encoder(nn.Module):
  def __init__(self, input_dim, emb_dim, hid_dim, n_layers, dropout):
    super().__init__()
    self.embedding = nn.Embedding(input_dim, emb_dim)
    self.rnn = nn.LSTM(emb_dim, hid_dim, n_layers, dropout = dropout)
    self.dropout = nn.Dropout(dropout)

  def forward(self, src):
    embedded = self.dropout(self.embedding(src))
    outputs, (hidden, cell) = self.rnn(embedded)
    return outputs, hidden, cell

In [20]:
class Decoder(nn.Module):
    def __init__(self, output_dim, emb_dim, hid_dim, n_layers, dropout):
        super().__init__()
        self.output_dim = output_dim

        self.embedding = nn.Embedding(output_dim, emb_dim)

        self.rnn = nn.LSTM(emb_dim, hid_dim, n_layers, dropout = dropout)

        self.fc_out = nn.Linear(hid_dim, output_dim)

        self.dropout = nn.Dropout(dropout)

    def forward(self, input, hidden, cell):
        input = input.unsqueeze(0)
        embedded = self.dropout(self.embedding(input))
        output, (hidden, cell) = self.rnn(embedded, (hidden, cell))
        prediction = self.fc_out(output.squeeze(0))
        return prediction, hidden, cell

In [23]:
from typing import Any
import lightning as pl

class Seq2Seq(pl.LightningModule):
    def __init__(self, encoder, decoder):
        super().__init__()
        self.encoder = encoder
        self.decoder = decoder
        self.criterian = nn.CrossEntropyLoss(ignore_index=0)
        self.automatic_optimization = False
        self.save_hyperparameters()

    def forward(self, src, trg, teacher_forcing_ratio = 0.5):
        batch_size = trg.shape[1]
        trg_len = trg.shape[0]
        trg_vocab_size = self.decoder.output_dim

        outputs = torch.zeros(trg_len, batch_size, trg_vocab_size).to(trg.device)

        enc_output, hidden, cell = self.encoder(src)
        # print(hidden.size())
        # print(cell.size())
        
        if self.encoder.rnn.num_layers > self.decoder.rnn.num_layers:
            hidden = hidden[-self.decoder.rnn.num_layers:]
            cell = cell[-self.decoder.rnn.num_layers:]   
        else:
            diff = (self.decoder.rnn.num_layers // self.encoder.rnn.num_layers)
            hidden = torch.cat([hidden] * diff, dim=0)
            cell = torch.cat([cell] * diff, dim=0)
        # print(hidden.size())
        # print(cell.size())

        input = trg[0,:]
        
        for t in range(1, trg_len):
            if isinstance(self.decoder, AttentionDecoder):
                output, hidden, cell = self.decoder(input, hidden, cell, enc_output)
            else:
                output, hidden, cell = self.decoder(input, hidden, cell)

            outputs[t] = output
            teacher_force = random.random() < teacher_forcing_ratio
            top1 = output.argmax(1)
            input = trg[t] if teacher_force else top1

        return outputs

    def training_step(self, batch, batch_idx):
        enc_opt, dec_opt = self.optimizers()

        enc_opt.zero_grad()
        dec_opt.zero_grad()

        src = batch["src"].permute(1, 0)
        trg = batch["trg"].permute(1, 0)

        outputs = self(src, trg)

        outputs_dim = outputs.shape[-1]
        outputs = outputs[1:].view(-1, outputs_dim)
        trg = trg[1:].reshape(-1)
        loss = self.criterian(outputs, trg)

        self.manual_backward(loss)
        enc_opt.step()
        dec_opt.step()

        self.log("train_loss", loss)
        self.log("train_PPL", math.exp(loss))
        return loss

    def validation_step(self, batch, batch_idx):
        src = batch["src"].permute(1, 0)
        trg = batch["trg"].permute(1, 0)

        outputs = self(src, trg, teacher_forcing_ratio=0)

        outputs_dim = outputs.shape[-1]
        outputs = outputs[1:].view(-1, outputs_dim)
        trg = trg[1:].reshape(-1)
        loss = self.criterian(outputs, trg)

        self.log("val_loss", loss)
        self.log("val_PPL", math.exp(loss))
        return loss

    def test_step(self, batch, batch_idx):
        src = batch["src"].permute(1, 0)
        trg = batch["trg"].permute(1, 0)

        outputs = self(src, trg, teacher_forcing_ratio=0)

        outputs_dim = outputs.shape[-1]
        outputs = outputs[1:].view(-1, outputs_dim)
        trg = trg[1:].reshape(-1)
        loss = self.criterian(outputs, trg)

        self.log("test_loss", loss)
        self.log("test_PPL", math.exp(loss))
        return loss
    
    def decode(self, src):
        enc_output, hidden, cell = self.encoder(src.unsqueeze(1))
        trg_len = 30
        trg_vocab_size = self.decoder.output_dim
        outputs = [2]
        input = torch.LongTensor([2]).to(src.device)
        for t in range(1, trg_len):
            if isinstance(self.decoder, AttentionDecoder):
                output, hidden, cell = self.decoder(input, hidden, cell, enc_output)
            else:
                output, hidden, cell = self.decoder(input, hidden, cell)
            top1 = output.argmax(1)
            outputs.append(top1.item())
            if top1.item() == 3:
                break
            input = top1
        return outputs

    def configure_optimizers(self):
        enc_optimizer = torch.optim.Adam(self.encoder.parameters(), lr=1e-4)
        dec_optimizer = torch.optim.Adam(self.decoder.parameters(), lr=1e-4)
        return enc_optimizer, dec_optimizer

In [27]:
emb_dim = 256
hid_dim = 512
enc_layers = 2
dec_layers = 4

encoder = Encoder(input_dim=len(en_vocab),
                  emb_dim=emb_dim,
                  hid_dim=hid_dim,
                  n_layers=enc_layers,
                  dropout=0.5)

decoder = Decoder(output_dim=len(fr_vocab),
                  emb_dim=emb_dim,
                  hid_dim=hid_dim,
                  n_layers=dec_layers,
                  dropout=0.5)

model = Seq2Seq(encoder, decoder)

/home/dev/anaconda3/envs/nlp/lib/python3.12/site-packages/lightning/pytorch/utilities/parsing.py:199: Attribute 'encoder' is an instance of `nn.Module` and is already saved during checkpointing. It is recommended to ignore them using `self.save_hyperparameters(ignore=['encoder'])`.
/home/dev/anaconda3/envs/nlp/lib/python3.12/site-packages/lightning/pytorch/utilities/parsing.py:199: Attribute 'decoder' is an instance of `nn.Module` and is already saved during checkpointing. It is recommended to ignore them using `self.save_hyperparameters(ignore=['decoder'])`.


In [66]:
for batch in train_loader:
    src = batch["src"].permute(1,0)
    trg = batch["trg"].permute(1,0)
    model.forward(src=src, trg=trg)
    break

torch.Size([2, 64, 512])
torch.Size([2, 64, 512])
torch.Size([4, 64, 512])
torch.Size([4, 64, 512])


In [29]:
wandb_logger = WandbLogger(project="NLP", name="Seq2Seq_enc2_dec4", group="HW03")

trainer = pl.Trainer(
    max_epochs=1,
    accelerator="gpu",
    logger=wandb_logger
)

GPU available: True (cuda), used: True
TPU available: False, using: 0 TPU cores
IPU available: False, using: 0 IPUs
HPU available: False, using: 0 HPUs


In [30]:
trainer.fit(model, train_loader, vali_loader)

wandb.finish()

LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]

  | Name      | Type             | Params
-----------------------------------------------
0 | encoder   | Encoder          | 5.8 M 
1 | decoder   | Decoder          | 17.0 M
2 | criterian | CrossEntropyLoss | 0     
-----------------------------------------------
22.8 M    Trainable params
0         Non-trainable params
22.8 M    Total params
91.063    Total estimated model params size (MB)


Epoch 0: 100%|██████████| 2945/2945 [04:40<00:00, 10.50it/s, v_num=511f]   

`Trainer.fit` stopped: `max_epochs=1` reached.


Epoch 0: 100%|██████████| 2945/2945 [04:40<00:00, 10.49it/s, v_num=511f]


0,1
epoch,▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁
train_PPL,█▄▄▃▅▃▄▄▄▃▃▃▃▂▃▃▂▂▂▂▂▂▂▂▂▂▂▂▁▂▁▁▂▂▁▁▂▁▂▁
train_loss,█▆▅▅▆▅▅▅▅▄▅▅▄▃▄▅▄▃▃▃▂▃▃▃▄▂▂▃▂▃▁▂▂▂▂▁▃▁▂▁
trainer/global_step,▁▁▁▁▂▂▂▂▂▃▃▃▃▃▃▄▄▄▄▄▅▅▅▅▅▅▆▆▆▆▆▇▇▇▇▇▇███
val_PPL,▁
val_loss,▁

0,1
epoch,0.0
train_PPL,82.6241
train_loss,4.4143
trainer/global_step,2944.0
val_PPL,94.55687
val_loss,4.54029
