In [None]:
# !pip install lightning wandb

In [1]:
import os
import re
import shutil
import zipfile
import numpy as np
import unicodedata
import urllib3
import random
import math

In [2]:
import requests

headers = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36'
}

def download_zip(url, output_path):
    response = requests.get(url, headers=headers, stream=True)
    if response.status_code == 200:
        with open(output_path, 'wb') as f:
            for chunk in response.iter_content(chunk_size=8192):
                f.write(chunk)
        print(f"ZIP file downloaded to {output_path}")
    else:
        print(f"Failed to download. HTTP Response Code: {response.status_code}")

url = "http://www.manythings.org/anki/fra-eng.zip"
output_path = "fra-eng.zip"

if not os.path.exists(output_path):
    download_zip(url, output_path)

    path = os.getcwd()
    zipfilename = os.path.join(path, output_path)

    with zipfile.ZipFile(zipfilename, 'r') as zip_ref:
        zip_ref.extractall(path)

In [3]:
def to_ascii(s):
  # 프랑스어 악센트(accent) 삭제
  # 예시 : 'déjà diné' -> deja dine
  return ''.join(c for c in unicodedata.normalize('NFD', s)
                   if unicodedata.category(c) != 'Mn')

def preprocess_sentence(sent):
  # 악센트 제거 함수 호출
  sent = to_ascii(sent.lower())

  # 단어와 구두점 사이에 공백 추가.
  # ex) "I am a student." => "I am a student ."
  sent = re.sub(r"([?.!,¿])", r" \1", sent)

  # (a-z, A-Z, ".", "?", "!", ",") 이들을 제외하고는 전부 공백으로 변환.
  sent = re.sub(r"[^a-zA-Z!.?]+", r" ", sent)

  # 다수 개의 공백을 하나의 공백으로 치환
  sent = re.sub(r"\s+", " ", sent)
  return sent

In [4]:
# 전처리 테스트
en_sent = u"Have you had dinner?"
fr_sent = u"Avez-vous déjà diné?"

print('전처리 전 영어 문장 :', en_sent)
print('전처리 후 영어 문장 :',preprocess_sentence(en_sent))
print('전처리 전 프랑스어 문장 :', fr_sent)
print('전처리 후 프랑스어 문장 :', preprocess_sentence(fr_sent))

전처리 전 영어 문장 : Have you had dinner?
전처리 후 영어 문장 : have you had dinner ?
전처리 전 프랑스어 문장 : Avez-vous déjà diné?
전처리 후 프랑스어 문장 : avez vous deja dine ?


In [5]:
def load_preprocessed_data():
  encoder_input, decoder_input = [], []

  with open("fra.txt", "r") as lines:
    for i, line in enumerate(lines):
      # source 데이터와 target 데이터 분리
      src_line, tar_line, _ = line.strip().split('\t')

      # source 데이터 전처리
      src_line = [w for w in preprocess_sentence(src_line).split()]

      # target 데이터 전처리
      tar_line = preprocess_sentence(tar_line)
      tar_line_in = [w for w in ("[SOS] " + tar_line + " [EOS]").split()]

      encoder_input.append(src_line)
      decoder_input.append(tar_line_in)

  return encoder_input, decoder_input

In [6]:
sents_en_in, sents_fra_in = load_preprocessed_data()
print('인코더의 입력 :',sents_en_in[-1:])
print('디코더의 입력 :',sents_fra_in[-1:])

인코더의 입력 : [['i', 'went', 'drinking', 'with', 'one', 'of', 'my', 'boyfriend', 's', 'friends', 'and', 'now', 'he', 's', 'furious', 'at', 'me', '.', 'was', 'this', 'friend', 'a', 'guy', 'or', 'a', 'girl', '?', 'a', 'guy', 'obviously', '.', 'why', 'would', 'i', 'go', 'drinking', 'with', 'his', 'female', 'friends', '?', 'yeah', 'you', 're', 'right', '.', 'his', 'name', 'is', 'tom', '.', 'he', 's', 'really', 'hot', 'and', 'i', 'really', 'want', 'to', 'go', 'drinking', 'with', 'him', 'again', '.']]
디코더의 입력 : [['[SOS]', 'je', 'suis', 'allee', 'boire', 'avec', 'un', 'ami', 'de', 'mon', 'compagnon', 'et', 'voila', 'qu', 'il', 'est', 'furieux', 'contre', 'moi', '.', 'etait', 'ce', 'un', 'gars', 'ou', 'une', 'fille', '?', 'un', 'gars', 'bien', 'evidemment', '.', 'pourquoi', 'irais', 'je', 'boire', 'avec', 'ses', 'amies', '?', 'ouais', 'ca', 'se', 'comprend', '.', 'il', 's', 'appelle', 'tom', '.', 'il', 'est', 'trop', 'canon', 'et', 'j', 'ai', 'tellement', 'envie', 'd', 'aller', 'prendre', 'un', 'v

In [7]:
#split data
def split_data(data, train_ratio=0.7, shuffle=True):
    data = list(data)
    if shuffle:
        random.shuffle(data)
    n_train = int(len(data) * train_ratio)
    train_data = data[:n_train]
    test_data = data[n_train:]
    return train_data, test_data

train_test_ratio = 0.9
train, test = split_data(zip(sents_en_in, sents_fra_in), train_test_ratio)
train, vali = split_data(train, train_test_ratio)

In [8]:
len(train), len(vali), len(test)

(188515, 20947, 23274)

In [9]:
from collections import Counter

en_token_cnt = Counter()
fr_token_cnt = Counter()

for tokens, _ in train:
    en_token_cnt.update(tokens)

min_count = 2
en_vocab = {"[PAD]": 0, "[UNK]": 1, "[SOS]": 2, "[EOS]": 3}
for token, count in en_token_cnt.items():
    if count > min_count and token not in en_vocab:
        en_vocab[token] = len(en_vocab)


for _, tokens in train:
    fr_token_cnt.update(tokens)

fr_vocab = {"[PAD]": 0, "[UNK]": 1, "[SOS]": 2, "[EOS]": 3}
for token, count in fr_token_cnt.items():
    if count > min_count and token not in fr_vocab:
        fr_vocab[token] = len(fr_vocab)

In [10]:
len(en_vocab), len(fr_vocab)

(8295, 11878)

In [11]:
import torch
import torch.nn as nn
from torch.utils.data import Dataset, DataLoader

In [12]:
class EnToFrDataset(Dataset):
    def __init__(self, data, en_vocab, fr_vocab):
        self.enc_input = []
        self.dec_input = []
        for en_sent, fr_sent_in in data:
            self.enc_input.append(en_sent)
            self.dec_input.append(fr_sent_in)
        self.en_vocab = en_vocab
        self.fr_vocab = fr_vocab
        self.max_len = 30

    def __len__(self):
        return len(self.enc_input)

    def __getitem__(self, idx):
        src_sample = [self.en_vocab.get(w, self.en_vocab.get("[UNK]")) for w in self.enc_input[idx]]
        trg_sample = [self.fr_vocab.get(w, self.fr_vocab.get("[UNK]")) for w in self.dec_input[idx]]
        src_sample = src_sample[:self.max_len]
        trg_sample = trg_sample[:self.max_len]
        src_sample += [self.en_vocab.get("[PAD]")] * (self.max_len - len(src_sample))
        trg_sample += [self.fr_vocab.get("[PAD]")] * (self.max_len - len(trg_sample))

        return {"src": torch.LongTensor(src_sample), "trg": torch.LongTensor(trg_sample)}

In [13]:
train_dataset = EnToFrDataset(train, en_vocab, fr_vocab)
vali_dataset = EnToFrDataset(vali, en_vocab, fr_vocab)
test_dataset = EnToFrDataset(test, en_vocab, fr_vocab)

train_loader = DataLoader(train_dataset, batch_size=64,drop_last=True, shuffle=True, num_workers=8)
vali_loader = DataLoader(vali_dataset, batch_size=64,drop_last=True, shuffle=False, num_workers=8)
test_loader = DataLoader(test_dataset, batch_size=64,drop_last=True, shuffle=False, num_workers=8)

In [14]:
train_dataset.__getitem__(10120)

{'src': tensor([  20,  187,   33, 4260,   51,   68,   24,    1,   13,    0,    0,    0,
            0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
            0,    0,    0,    0,    0,    0]),
 'trg': tensor([   2,   21,  451,  175,   27, 5705,  143,   27, 3456,   15,    3,    0,
            0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
            0,    0,    0,    0,    0,    0])}

In [15]:
class MultiHeadAttention(nn.Module):
    # d_model : hidden size의 크기, num_heads : multi-head attention의 head 수
    def __init__(self, d_model, num_heads):
        super(MultiHeadAttention, self).__init__()
        self.num_heads = num_heads
        self.d_model = d_model
        # hidden dimension이 num_heads와 나누어 떨어지는지 check!
        assert d_model % self.num_heads == 0
        # head 별 hidden size 계산
        self.depth = d_model // self.num_heads

        # projection linear layer
        self.Wq = nn.Linear(d_model, d_model)
        self.Wk = nn.Linear(d_model, d_model)
        self.Wv = nn.Linear(d_model, d_model)

        # 최종 output을 위한 dense layer
        self.dense = nn.Linear(d_model, d_model)

    def split_heads(self, x, batch_size):
        x = x.view(batch_size, -1, self.num_heads, self.depth)
        return x.permute(0, 2, 1, 3)

    # mask : decoder는 masking 기법을 사용하기 때문에!
    # mask : attention에서 PAD 토큰에 대한 masking!
    def forward(self, q, k, v, mask):
        batch_size = q.size(0)

        q = self.Wq(q)
        k = self.Wk(k)
        v = self.Wv(v)
        # [2, 5, 128] : 2개의 batch, 5의 sequence 길이, 128개의 hidden sie
        # print(q.size())
        # print(k.size())
        # print(v.size())
        # print('#' * 50)
        
        q = self.split_heads(q, batch_size)
        k = self.split_heads(k, batch_size)
        v = self.split_heads(v, batch_size)
        # [2, 8, 5, 16] : 2개의 batch, 8개의 head, 5의 sequence 길이, 16개의 hidden size
        # print(q.size())
        # print(k.size())
        # print(v.size())

        # attention score
        attn = torch.matmul(q, k.permute(0, 1, 3, 2)) / math.sqrt(self.depth)
        # mask를 통해 PAD 토큰 연산 X
        # mask 되어야 하는 부분은 0, 아닌 것은 1에 -1e9 를 채워넣음!
        attn = attn.masked_fill(mask.unsqueeze(1) == 0, -1e9)
        # print(attn[0][0])
        attn = torch.nn.functional.softmax(attn, dim=-1)
        # 과제는 이것을 밖으로 내보내서 Headmap!!!!!!!
        
        # mask 된 부분은 softmax를 취하면 0이 되길 기대한다!
        # print(attn[0][0])
        out = torch.matmul(attn, v)
        # [2, 4, 6, 32] : 2개의 batch, 4개의 head, 6의 sequence 길이, 32의 hidden size (128//4)
        # print(out.size())
        
        out = out.permute(0, 2, 1, 3).contiguous()
        out = out.view(batch_size, -1, self.d_model)

        return self.dense(out)

In [16]:
class TransformerEncodeLayer(nn.Module):
    def __init__(self, d_model, num_heads, dff, dropout_rate):
        super(TransformerEncodeLayer, self).__init__()
        self.mha = MultiHeadAttention(d_model, num_heads)
        self.ffn = nn.Sequential(
            nn.Linear(d_model, dff),
            nn.ReLU(),
            nn.Linear(dff, d_model)
        )
        self.layernorm1 = nn.LayerNorm(d_model)
        self.layernorm2 = nn.LayerNorm(d_model)
        self.dropout1 = nn.Dropout(dropout_rate)
        self.dropout2 = nn.Dropout(dropout_rate)

    def forward(self, x, padding_mask):
        attn_output = self.mha(x, x, x, padding_mask)
        attn_output = self.dropout1(attn_output)
        # x + attn_output : residual connection!
        out1 = self.layernorm1(x + attn_output)

        ffn_output = self.ffn(out1)
        ffn_output = self.dropout2(ffn_output)
        out2 = self.layernorm2(out1 + ffn_output)

        return out2

In [17]:
class TrasnformerDecoderLayer(nn.Module):
    def __init__(self, d_model, num_heads, dff, dropout_rate):
        super(TrasnformerDecoderLayer, self).__init__()
        self.mha1 = MultiHeadAttention(d_model, num_heads)
        self.mha2 = MultiHeadAttention(d_model, num_heads)
        self.ffn = nn.Sequential(
            nn.Linear(d_model, dff),
            nn.ReLU(),
            nn.Linear(dff, d_model)
        )
        self.layernorm1 = nn.LayerNorm(d_model)
        self.layernorm2 = nn.LayerNorm(d_model)
        self.layernorm3 = nn.LayerNorm(d_model)
        self.dropout1 = nn.Dropout(dropout_rate)
        self.dropout2 = nn.Dropout(dropout_rate)
        self.dropout3 = nn.Dropout(dropout_rate)

    def forward(self, x, enc_output, look_ahead_mask, padding_mask):
        # self-attention, look_ahead_mask (뒷부분의 정보를 지우는 mask!)
        attn1 = self.mha1(x, x, x, look_ahead_mask)
        attn1 = self.dropout1(attn1)
        out1 = self.layernorm1(x + attn1)

        attn2 = self.mha2(out1, enc_output, enc_output, padding_mask)
        attn2 = self.dropout2(attn2)
        out2 = self.layernorm2(out1 + attn2)

        ffn_output = self.ffn(out2)
        ffn_output = self.dropout3(ffn_output)
        out3 = self.layernorm3(out2 + ffn_output)

        return out3

In [18]:
class TransformerEncoder(nn.Module):
    def __init__(self, num_layers, d_model, num_heads, dff, dropout_rate):
        super(TransformerEncoder, self).__init__()
        self.enc_layers = nn.ModuleList([TransformerEncodeLayer(d_model, num_heads, dff, dropout_rate) for _ in range(num_layers)])
        self.dropout = nn.Dropout(dropout_rate)

    def forward(self, x, padding_mask):
        for enc_layer in self.enc_layers:
            x = enc_layer(x, padding_mask)
        return x

class TransformerDecoder(nn.Module):
    def __init__(self, num_layers, d_model, num_heads, dff, dropout_rate):
        super(TransformerDecoder, self).__init__()
        self.dec_layers = nn.ModuleList([TrasnformerDecoderLayer(d_model, num_heads, dff, dropout_rate) for _ in range(num_layers)])
        self.dropout = nn.Dropout(dropout_rate)

    def forward(self, x, enc_output, look_ahead_mask, padding_mask):
        for dec_layer in self.dec_layers:
            x = dec_layer(x, enc_output, look_ahead_mask, padding_mask)
        return x

In [19]:
class Transformer(nn.Module):
    def __init__(self, num_layers, d_model, num_heads, dff, dropout_rate, en_vocab_size, fr_vocab_size):
        super(Transformer, self).__init__()
        self.output_dim = fr_vocab_size
        self.en_Embedding = nn.Embedding(en_vocab_size, d_model)
        self.fr_Embedding = nn.Embedding(fr_vocab_size, d_model)

        self.encoder = TransformerEncoder(num_layers, d_model, num_heads, dff, dropout_rate)
        self.decoder = TransformerDecoder(num_layers, d_model, num_heads, dff, dropout_rate)
        self.final_layer = nn.Linear(d_model, fr_vocab_size)

    def encode(self, enc_input, enc_padding_mask):
        return self.encoder(self.en_Embedding(enc_input), enc_padding_mask)

    def decode(self, dec_input, enc_output, look_ahead_mask, dec_padding_mask):
        return self.decoder(self.fr_Embedding(dec_input), enc_output, look_ahead_mask, dec_padding_mask)

    def forward(self, enc_input, dec_input, enc_padding_mask, look_ahead_mask, dec_padding_mask):
        enc_output = self.encode(enc_input, enc_padding_mask)
        dec_output = self.decode(dec_input, enc_output, look_ahead_mask, dec_padding_mask)
        final_output = self.final_layer(dec_output)
        return final_output

In [20]:
def make_pad_mask(query, key, pad_idx=0):
    # mask는 batch x head size x sequence length x hidden size 만큼의 mask!
    query_seq_len, key_seq_len = query.size(1), key.size(1)

    key_mask = key.ne(pad_idx).unsqueeze(1)
    # print(key_mask)
    key_mask = key_mask.repeat(1, query_seq_len, 1)
    # print(key_mask)

    query_mask = query.ne(pad_idx).unsqueeze(2)
    # print(query_mask)
    query_mask = query_mask.repeat(1,  1, key_seq_len)
    # print(query_mask)

    # 가로축에 대한 mask (query mask), 세로축에 대한 mask (key mask) 를 합치는 연산!
    mask = key_mask & query_mask
    # print(mask)
    mask.requires_grad = False
    return mask

In [21]:
num_layers = 2
d_model = 128
num_heads = 1
dff = 512
dropout_rate = 0.1
en_vocab_size = len(en_vocab)
fr_vocab_size = len(fr_vocab)

model = Transformer(num_layers, d_model, num_heads, dff, dropout_rate, en_vocab_size, fr_vocab_size)

enc_input = torch.LongTensor([[1, 2, 3, 0, 0], [6, 7, 8, 9, 10]])
dec_input = torch.LongTensor([[1, 2, 3, 4, 5, 6], [6, 7, 8, 9, 10,11]])

enc_padding_mask = make_pad_mask(enc_input, enc_input)
dec_padding_mask = make_pad_mask(dec_input, enc_input)

# decoder에 들어가는 미래 정보를 지워주는 mask!
# triu : upper triangle을 만드는 pytorch 연산!
# causal_mask : 자주 사용하는 mask! 이러한 마스크를 이용하는 것을 causal LM
look_ahead_mask = ~torch.triu(torch.ones(dec_input.size(1), dec_input.size(1), dtype=torch.bool), diagonal=1).unsqueeze(0)
# print(look_ahead_mask)

model(enc_input, dec_input, enc_padding_mask, look_ahead_mask, dec_padding_mask).size()


torch.Size([2, 6, 11878])

In [22]:
from typing import Any
import lightning as pl

class TransformerPL(pl.LightningModule):
    def __init__(self, model):
        super().__init__()
        self.model = model
        self.criterian = nn.CrossEntropyLoss(ignore_index=0)
        self.save_hyperparameters()

    def make_pad_mask(self, query, key, pad_idx=0):
        query_seq_len, key_seq_len = query.size(1), key.size(1)

        key_mask = key.ne(pad_idx).unsqueeze(1)
        key_mask = key_mask.repeat(1, query_seq_len, 1)

        query_mask = query.ne(pad_idx).unsqueeze(2)
        query_mask = query_mask.repeat(1,  1, key_seq_len)

        mask = key_mask & query_mask
        mask.requires_grad = False
        return mask.to(query.device)

    def make_causal_mask(self, query, pad_idx=0):
        seq_len = query.size(1)
        causal_mask = ~torch.triu(torch.ones(seq_len, seq_len, dtype=torch.bool), diagonal=1).unsqueeze(0)
        return causal_mask.to(query.device)

    def forward(self, src, trg):
        enc_padding_mask = self.make_pad_mask(src, src)
        dec_padding_mask = self.make_pad_mask(trg, src)
        look_ahead_mask = self.make_causal_mask(trg)

        outputs = self.model(src, trg, enc_padding_mask, look_ahead_mask, dec_padding_mask)

        return outputs

    def training_step(self, batch, batch_idx):
        src = batch["src"]
        trg = batch["trg"]

        outputs = self(src, trg)

        outputs_dim = outputs.shape[-1]
        # loss 계산의 편의를 위해 펼쳐주는 연산
        # outputs[:, :-1] : 한개씩 밀린 것과 비교, 자기 자신과 비교 X
        outputs = outputs[:,:-1].reshape(-1, outputs_dim)
        trg = trg[:, 1:].reshape(-1)
        loss = self.criterian(outputs, trg)

        self.log("train_loss", loss, prog_bar=True)
        self.log("train_PPL", math.exp(loss))
        return loss

    def validation_step(self, batch, batch_idx):
        src = batch["src"]
        trg = batch["trg"]

        outputs = self(src, trg)

        outputs_dim = outputs.shape[-1]
        outputs = outputs[:,:-1].reshape(-1, outputs_dim)
        trg = trg[:, 1:].reshape(-1)
        loss = self.criterian(outputs, trg)

        self.log("val_loss", loss)
        self.log("val_PPL", math.exp(loss))
        return loss

    def test_step(self, batch, batch_idx):
        src = batch["src"]
        trg = batch["trg"]

        outputs = self(src, trg)

        outputs_dim = outputs.shape[-1]
        outputs = outputs[:,:-1].reshape(-1, outputs_dim)
        trg = trg[:, 1:].reshape(-1)
        loss = self.criterian(outputs, trg)

        self.log("test_loss", loss)
        self.log("test_PPL", math.exp(loss))
        return loss

    def decode(self, src):
        enc_output = self.model.encode(src, self.make_pad_mask(src, src))
        trg_len = 30
        outputs = [2]
        input = torch.LongTensor([[2] for _ in range(src.size(0))]).to(src.device)
        for t in range(1, trg_len):
            look_ahead_mask = self.make_causal_mask(input)
            dec_padding_mask = self.make_pad_mask(input, src)
            output = self.model.decode(input, enc_output, look_ahead_mask, dec_padding_mask)
            output = self.model.final_layer(output)
            output = output[:,-1,:]
            top1 = output.argmax(1)
            outputs.append(top1.item())
            if top1.item() == 3:
                break
            input = torch.cat([input, top1.unsqueeze(1)], dim=1)
        return outputs

    def configure_optimizers(self):
        optimizer = torch.optim.Adam(self.model.parameters(), lr=1e-4)
        return optimizer

  from .autonotebook import tqdm as notebook_tqdm


In [23]:
num_layers = 2
d_model = 128
num_heads = 8
dff = 512
dropout_rate = 0.1
en_vocab_size = len(en_vocab)
fr_vocab_size = len(fr_vocab)

transformer_model = Transformer(num_layers, d_model, num_heads, dff, dropout_rate, en_vocab_size, fr_vocab_size)

In [24]:
model_pl = TransformerPL(transformer_model)

/home/dev/anaconda3/envs/nlp/lib/python3.12/site-packages/lightning/pytorch/utilities/parsing.py:199: Attribute 'model' is an instance of `nn.Module` and is already saved during checkpointing. It is recommended to ignore them using `self.save_hyperparameters(ignore=['model'])`.


In [25]:
# trainingstep test
batch = next(iter(train_loader))
model_pl.training_step(batch, 0)

/home/dev/anaconda3/envs/nlp/lib/python3.12/site-packages/lightning/pytorch/core/module.py:436: You are trying to `self.log()` but the `self.trainer` reference is not registered on the model yet. This is most likely because the model hasn't been passed to the `Trainer`


tensor(9.5751, grad_fn=<NllLossBackward0>)

In [26]:
import wandb
from lightning.pytorch.loggers import WandbLogger

wandb.login()

wandb_logger = WandbLogger(project="NLP", name="Transformer", group="Lec06")

Failed to detect the name of this notebook, you can set it manually with the WANDB_NOTEBOOK_NAME environment variable to enable code saving.
[34m[1mwandb[0m: Currently logged in as: [33mnoeyhesx[0m. Use [1m`wandb login --relogin`[0m to force relogin


In [27]:
trainer = pl.Trainer(
    max_epochs=1,
    accelerator="gpu",
    logger=wandb_logger
)

GPU available: True (cuda), used: True


TPU available: False, using: 0 TPU cores
IPU available: False, using: 0 IPUs
HPU available: False, using: 0 HPUs


In [28]:
trainer.fit(model_pl, train_loader, vali_loader)

LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]

  | Name      | Type             | Params
-----------------------------------------------
0 | model     | Transformer      | 5.0 M 
1 | criterian | CrossEntropyLoss | 0     
-----------------------------------------------
5.0 M     Trainable params
0         Non-trainable params
5.0 M     Total params
20.160    Total estimated model params size (MB)


Epoch 0: 100%|██████████| 2945/2945 [00:43<00:00, 67.58it/s, v_num=28vh, train_loss=3.160]

`Trainer.fit` stopped: `max_epochs=1` reached.


Epoch 0: 100%|██████████| 2945/2945 [00:43<00:00, 67.43it/s, v_num=28vh, train_loss=3.160]


In [63]:
test_data = test_dataset.__getitem__(1122)
generate_output = model_pl.decode(test_data["src"].unsqueeze(0))

In [64]:
input = " ".join([list(en_vocab.keys())[list(en_vocab.values()).index(i)] for i in test_data["src"]])
target = " ".join([list(fr_vocab.keys())[list(fr_vocab.values()).index(i)] for i in test_data["trg"]])
model_output = " ".join([list(fr_vocab.keys())[list(fr_vocab.values()).index(i)] for i in generate_output])

In [65]:
print(input)
print(target)
print(model_output)

we re sorry for what happened to you . [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD]
[SOS] nous sommes desoles pour ce qui vous est arrive . [EOS] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD]
[SOS] nous avons ce que nous sommes a ce que tu es pour ce que vous avez . [EOS]


In [66]:
trainer.save_checkpoint("transformer.pth")