In [None]:
# !pip install lightning wandb

In [1]:
import os 
import re
import shutil
import zipfile

import numpy as np
import unicodedata
import urllib3
import random
import math

In [2]:
import requests

headers = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36'
}

def download_zip(url, output_path):
    response = requests.get(url, headers=headers, stream=True)
    if response.status_code == 200:
        with open(output_path, 'wb') as f:
            for chunk in response.iter_content(chunk_size=8192):
                f.write(chunk)
        print(f"ZIP file downloaded to {output_path}")
    else:
        print(f"Failed to download. HTTP Response Code: {response.status_code}")

url = "http://www.manythings.org/anki/fra-eng.zip"
output_path = "fra-eng.zip"
download_zip(url, output_path)

path = os.getcwd()
zipfilename = os.path.join(path, output_path)

with zipfile.ZipFile(zipfilename, 'r') as zip_ref:
    zip_ref.extractall(path)

ZIP file downloaded to fra-eng.zip


In [3]:
def to_ascii(s):
  # 프랑스어 악센트(accent) 삭제
  # 예시 : 'déjà diné' -> deja dine
  return ''.join(c for c in unicodedata.normalize('NFD', s)
                   if unicodedata.category(c) != 'Mn')

def preprocess_sentence(sent):
  # 악센트 제거 함수 호출
  sent = to_ascii(sent.lower())

  # 단어와 구두점 사이에 공백 추가.
  # ex) "I am a student." => "I am a student ."
  sent = re.sub(r"([?.!,¿])", r" \1", sent)

  # (a-z, A-Z, ".", "?", "!", ",") 이들을 제외하고는 전부 공백으로 변환.
  sent = re.sub(r"[^a-zA-Z!.?]+", r" ", sent)

  # 다수 개의 공백을 하나의 공백으로 치환
  sent = re.sub(r"\s+", " ", sent)
  return sent

In [4]:
# 전처리 테스트
en_sent = u"Have you had dinner?"
fr_sent = u"Avez-vous déjà diné?"

print('전처리 전 영어 문장 :', en_sent)
print('전처리 후 영어 문장 :',preprocess_sentence(en_sent))
print('전처리 전 프랑스어 문장 :', fr_sent)
print('전처리 후 프랑스어 문장 :', preprocess_sentence(fr_sent))

전처리 전 영어 문장 : Have you had dinner?
전처리 후 영어 문장 : have you had dinner ?
전처리 전 프랑스어 문장 : Avez-vous déjà diné?
전처리 후 프랑스어 문장 : avez vous deja dine ?


In [5]:
def load_preprocessed_data():
  encoder_input, decoder_input = [], []

  with open("fra.txt", "r") as lines:
    for i, line in enumerate(lines):
      # source 데이터와 target 데이터 분리
      src_line, tar_line, _ = line.strip().split('\t')

      # source 데이터 전처리
      src_line = [w for w in preprocess_sentence(src_line).split()]

      # target 데이터 전처리
      tar_line = preprocess_sentence(tar_line)
      tar_line_in = [w for w in ("[SOS] " + tar_line + " [EOS]").split()]

      encoder_input.append(src_line)
      decoder_input.append(tar_line_in)

  return encoder_input, decoder_input

In [6]:
sents_en_in, sents_fra_in = load_preprocessed_data()
print('인코더의 입력 :',sents_en_in[-1:])
print('디코더의 입력 :',sents_fra_in[-1:])

인코더의 입력 : [['i', 'went', 'drinking', 'with', 'one', 'of', 'my', 'boyfriend', 's', 'friends', 'and', 'now', 'he', 's', 'furious', 'at', 'me', '.', 'was', 'this', 'friend', 'a', 'guy', 'or', 'a', 'girl', '?', 'a', 'guy', 'obviously', '.', 'why', 'would', 'i', 'go', 'drinking', 'with', 'his', 'female', 'friends', '?', 'yeah', 'you', 're', 'right', '.', 'his', 'name', 'is', 'tom', '.', 'he', 's', 'really', 'hot', 'and', 'i', 'really', 'want', 'to', 'go', 'drinking', 'with', 'him', 'again', '.']]
디코더의 입력 : [['[SOS]', 'je', 'suis', 'allee', 'boire', 'avec', 'un', 'ami', 'de', 'mon', 'compagnon', 'et', 'voila', 'qu', 'il', 'est', 'furieux', 'contre', 'moi', '.', 'etait', 'ce', 'un', 'gars', 'ou', 'une', 'fille', '?', 'un', 'gars', 'bien', 'evidemment', '.', 'pourquoi', 'irais', 'je', 'boire', 'avec', 'ses', 'amies', '?', 'ouais', 'ca', 'se', 'comprend', '.', 'il', 's', 'appelle', 'tom', '.', 'il', 'est', 'trop', 'canon', 'et', 'j', 'ai', 'tellement', 'envie', 'd', 'aller', 'prendre', 'un', 'v

In [27]:
len(sents_en_in), len(sents_fra_in)

(232736, 232736)

In [7]:
#split data -> train-validation-test로 구분
def split_data(data, train_ratio=0.7, shuffle=True):
    data = list(data)
    if shuffle:
        random.shuffle(data)
    n_train = int(len(data) * train_ratio)
    train_data = data[:n_train]
    test_data = data[n_train:]
    return train_data, test_data

# 데이터에서 적당한 크기로 분할
# 데이터가 적으면 보통 8:2
# 데이터가 많으면 보통 9:1
# validation data는 항상 train data에서!
train_test_ratio = 0.9
train, test = split_data(zip(sents_en_in, sents_fra_in), train_test_ratio)
train, vali = split_data(train, train_test_ratio)

In [8]:
len(train), len(vali), len(test)

(188515, 20947, 23274)

In [8]:
# make vocabulary
from collections import Counter

# 영어(인코더 입력)에 대한 vocab, 프랑스어(디코더 입력)에 대한 vocab
en_token_cnt = Counter()
fr_token_cnt = Counter()

for tokens, _ in train:
    en_token_cnt.update(tokens)

min_count = 2
en_vocab = {"[PAD]": 0, "[UNK]": 1, "[SOS]": 2, "[EOS]": 3}
for token, count in en_token_cnt.items():
    if count > min_count and token not in en_vocab:
        en_vocab[token] = len(en_vocab)


for _, tokens in train:
    fr_token_cnt.update(tokens)

fr_vocab = {"[PAD]": 0, "[UNK]": 1, "[SOS]": 2, "[EOS]": 3}
for token, count in fr_token_cnt.items():
    if count > min_count and token not in fr_vocab:
        fr_vocab[token] = len(fr_vocab)

In [10]:
len(en_vocab), len(fr_vocab)

(8320, 11847)

In [9]:
import torch
import torch.nn as nn
from torch.utils.data import Dataset, DataLoader

In [10]:
class EnToFrDataset(Dataset):
    def __init__(self, data, en_vocab, fr_vocab):
        self.enc_input = []     # 인코더의 입력 영어 데이터
        self.dec_input = []     # 디코더의 입력 프랑스어 데이터
        self.dec_target = []
        for en_sent, fr_sent_in in data:
            self.enc_input.append(en_sent)
            self.dec_input.append(fr_sent_in)
        self.en_vocab = en_vocab
        self.fr_vocab = fr_vocab
        self.max_len = 30       # 데이터가 가질 수 있는 최대 길이

    def __len__(self):
        return len(self.enc_input)

    def __getitem__(self, idx):
        # 인코더와 디코더에 들어갈 샘플
        # source, target에 대한 index sequence
        src_sample = [self.en_vocab.get(w, self.en_vocab.get("[UNK]")) for w in self.enc_input[idx]]
        # print(src_sample)
        trg_sample = [self.fr_vocab.get(w, self.fr_vocab.get("[UNK]")) for w in self.dec_input[idx]]
        # print(trg_sample)
        # truncate and padding
        src_sample = src_sample[:self.max_len]
        trg_sample = trg_sample[:self.max_len]
        src_sample += [self.en_vocab.get("[PAD]")] * (self.max_len - len(src_sample))
        trg_sample += [self.fr_vocab.get("[PAD]")] * (self.max_len - len(trg_sample))

        # dictonary 형태로, 같은 key를 가진 것끼리 batch가 만들어진다.
        return {"src": torch.LongTensor(src_sample), "trg": torch.LongTensor(trg_sample)}

In [11]:
train_dataset = EnToFrDataset(train, en_vocab, fr_vocab)
vali_dataset = EnToFrDataset(vali, en_vocab, fr_vocab)
test_dataset = EnToFrDataset(test, en_vocab, fr_vocab)

train_loader = DataLoader(train_dataset, batch_size=64,drop_last=True, shuffle=True, num_workers=8)
vali_loader = DataLoader(vali_dataset, batch_size=64,drop_last=True, shuffle=False, num_workers=8)
test_loader = DataLoader(test_dataset, batch_size=64,drop_last=True, shuffle=False, num_workers=8)

In [32]:
train_dataset.__getitem__(10120)

[4, 109, 53, 56, 575, 228, 4, 104, 524, 10]
[2, 4, 16, 679, 351, 118, 201, 239, 4, 20, 586, 10, 3]


{'src': tensor([  4, 109,  53,  56, 575, 228,   4, 104, 524,  10,   0,   0,   0,   0,
           0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,
           0,   0]),
 'trg': tensor([  2,   4,  16, 679, 351, 118, 201, 239,   4,  20, 586,  10,   3,   0,
           0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,
           0,   0])}

In [36]:
# DataLoader 동작 Test
a = DataLoader(train_dataset, batch_size=2,drop_last=True, shuffle=True, num_workers=8)
for batch in a:
    print(batch)
    break

{'src': tensor([[  63,  212,   48, 2058,  180,   15,  517,   57,    0,    0,    0,    0,
            0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
            0,    0,    0,    0,    0,    0],
        [  84,   39,  149,   42, 1671, 1783,  461,   22,  281,   10,    0,    0,
            0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
            0,    0,    0,    0,    0,    0]]), 'trg': tensor([[   2,   44,  598,  220,   29, 3294, 4503,   45,   63,    3,    0,    0,
            0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
            0,    0,    0,    0,    0,    0],
        [   2,   95,   44,   44,  257, 1294,   16,  201,  894,   48,   24,   67,
          299,   10,    3,    0,    0,    0,    0,    0,    0,    0,    0,    0,
            0,    0,    0,    0,    0,    0]])}


In [12]:
class Encoder(nn.Module):
  def __init__(self, input_dim, emb_dim, hid_dim, n_layers, dropout):
    super().__init__()
    self.embedding = nn.Embedding(input_dim, emb_dim)
    self.rnn = nn.LSTM(emb_dim, hid_dim, n_layers, dropout = dropout)
    self.dropout = nn.Dropout(dropout)

  def forward(self, src):
    embedded = self.dropout(self.embedding(src))
    outputs, (hidden, cell) = self.rnn(embedded)
    return outputs, hidden, cell

In [13]:
class Decoder(nn.Module):
    def __init__(self, output_dim, emb_dim, hid_dim, n_layers, dropout):
        super().__init__()
        self.output_dim = output_dim

        self.embedding = nn.Embedding(output_dim, emb_dim)

        self.rnn = nn.LSTM(emb_dim, hid_dim, n_layers, dropout = dropout)

        self.fc_out = nn.Linear(hid_dim, output_dim)    # output_dim, output_dim 같은데 굳이 embedding과 linear 둘다 사용, 개선 고민해볼것!

        self.dropout = nn.Dropout(dropout)

    def forward(self, input, hidden, cell):
        input = input.unsqueeze(0)
        embedded = self.dropout(self.embedding(input))
        # 인코더의 hidden, cell state를 디코더의 LSTM 입력으로
        output, (hidden, cell) = self.rnn(embedded, (hidden, cell))
        prediction = self.fc_out(output.squeeze(0))
        return prediction, hidden, cell

In [14]:
# decoder with simple dot product attention
class AttentionDecoder(nn.Module):
    def __init__(self, output_dim, emb_dim, hid_dim, n_layers, dropout):
        super().__init__()
        self.output_dim = output_dim

        self.embedding = nn.Embedding(output_dim, emb_dim)

        self.rnn = nn.LSTM(emb_dim, hid_dim, n_layers, dropout = dropout)

        self.fc_out = nn.Linear(hid_dim*2, output_dim)

        self.dropout = nn.Dropout(dropout)

    def forward(self, input, hidden, cell, encoder_outputs):
        # unsqueeze : 토큰이 하나씩 들어가기 때문에 차원을 맞추기 위한 연산
        input = input.unsqueeze(0)
        embedded = self.dropout(self.embedding(input))
        output, (hidden, cell) = self.rnn(embedded, (hidden, cell))
        # print(output.size())
        # print(encoder_outputs.size())

        attention_score = torch.bmm(output.squeeze(0).unsqueeze(1), encoder_outputs.permute(1, 2, 0)).squeeze(1)
        attention_distribution = torch.softmax(attention_score, dim=1)
        context = torch.bmm(attention_distribution.unsqueeze(1), encoder_outputs.permute(1, 0, 2)).squeeze(1)
        prediction = self.fc_out(torch.cat((output.squeeze(0), context), dim=1))
        # print(attention_score.size())
        # print(attention_distribution.size())
        # print(context.size())
        # print(prediction.size())

        return prediction, hidden, cell

In [15]:
from typing import Any
import lightning as pl

class Seq2Seq(pl.LightningModule):
    def __init__(self, encoder, decoder):
        super().__init__()
        self.encoder = encoder
        self.decoder = decoder
        self.criterian = nn.CrossEntropyLoss(ignore_index=0)
        self.automatic_optimization = False
        # 인코더, 디코더 따로 있기 때문에 auto_optimizer를 사용할 수 없다.
        self.save_hyperparameters()

    def forward(self, src, trg, teacher_forcing_ratio = 0.5):
        batch_size = trg.shape[1]
        trg_len = trg.shape[0]
        trg_vocab_size = self.decoder.output_dim

        # tar_len, batch_size, trg_vocab_size 만큼의 공간
        # 디코더의 출력을 저장하는 공간
        outputs = torch.zeros(trg_len, batch_size, trg_vocab_size).to(trg.device)

        enc_output, hidden, cell = self.encoder(src)
        # print(enc_output.size())
        # print(hidden.size())
        # print(cell.size())
        
        # 하나의 token씩 입력을 넣어준다.
        input = trg[0,:]
        # print(input)
        
        for t in range(1, trg_len):
            if isinstance(self.decoder, AttentionDecoder):
                output, hidden, cell = self.decoder(input, hidden, cell, enc_output)
            else:
                output, hidden, cell = self.decoder(input, hidden, cell)
                # print(output.size())
                # print(hidden.size())
                # print(cell.size())
            outputs[t] = output
            teacher_force = random.random() < teacher_forcing_ratio
            top1 = output.argmax(1)
            input = trg[t] if teacher_force else top1

        return outputs

    def training_step(self, batch, batch_idx):
        enc_opt, dec_opt = self.optimizers()

        enc_opt.zero_grad()
        dec_opt.zero_grad()

        src = batch["src"].permute(1, 0)    # LSTM에는 batch가 두번째로!
        trg = batch["trg"].permute(1, 0)

        outputs = self(src, trg)

        outputs_dim = outputs.shape[-1]
        outputs = outputs[1:].view(-1, outputs_dim)
        trg = trg[1:].reshape(-1)
        loss = self.criterian(outputs, trg)

        self.manual_backward(loss)
        enc_opt.step()
        dec_opt.step()

        self.log("train_loss", loss)
        self.log("train_PPL", math.exp(loss))   # PPL : Launguae Generation 할 때의 성능지표!
        return loss

    def validation_step(self, batch, batch_idx):
        src = batch["src"].permute(1, 0)
        trg = batch["trg"].permute(1, 0)

        outputs = self(src, trg, teacher_forcing_ratio=0)

        outputs_dim = outputs.shape[-1]
        outputs = outputs[1:].view(-1, outputs_dim)
        trg = trg[1:].reshape(-1)
        loss = self.criterian(outputs, trg)

        self.log("val_loss", loss)
        self.log("val_PPL", math.exp(loss))
        return loss

    def test_step(self, batch, batch_idx):
        src = batch["src"].permute(1, 0)
        trg = batch["trg"].permute(1, 0)

        outputs = self(src, trg, teacher_forcing_ratio=0)

        outputs_dim = outputs.shape[-1]
        outputs = outputs[1:].view(-1, outputs_dim)
        trg = trg[1:].reshape(-1)
        loss = self.criterian(outputs, trg)

        self.log("test_loss", loss)
        self.log("test_PPL", math.exp(loss))
        return loss

    # 디코더의 실제 출력을 확인
    def decode(self, src):
        enc_output, hidden, cell = self.encoder(src.unsqueeze(1))
        trg_len = 30
        trg_vocab_size = self.decoder.output_dim
        outputs = [2]
        input = torch.LongTensor([2]).to(src.device)
        for t in range(1, trg_len):
            if isinstance(self.decoder, AttentionDecoder):
                output, hidden, cell = self.decoder(input, hidden, cell, enc_output)
            else:
                output, hidden, cell = self.decoder(input, hidden, cell)
            top1 = output.argmax(1)
            outputs.append(top1.item())
            if top1.item() == 3:
                break
            input = top1
        return outputs

    def configure_optimizers(self):
        enc_optimizer = torch.optim.Adam(self.encoder.parameters(), lr=1e-4)
        dec_optimizer = torch.optim.Adam(self.decoder.parameters(), lr=1e-4)
        return enc_optimizer, dec_optimizer

  from .autonotebook import tqdm as notebook_tqdm


In [16]:
emb_dim = 256
hid_dim = 512
n_layers = 2

In [17]:
encoder = Encoder(input_dim=len(en_vocab),
                  emb_dim=emb_dim,
                  hid_dim=hid_dim,
                  n_layers=n_layers,
                  dropout=0.5)

decoder = Decoder(output_dim=len(fr_vocab),
                    emb_dim=emb_dim,
                    hid_dim=hid_dim,
                    n_layers=n_layers,
                    dropout=0.5)

att_decoder = AttentionDecoder(output_dim=len(fr_vocab),
                            emb_dim=emb_dim,
                            hid_dim=hid_dim,
                            n_layers=n_layers,
                            dropout=0.5)

model = Seq2Seq(encoder, decoder)
att_model = Seq2Seq(encoder, att_decoder)

/home/dev/anaconda3/envs/nlp/lib/python3.12/site-packages/lightning/pytorch/utilities/parsing.py:199: Attribute 'encoder' is an instance of `nn.Module` and is already saved during checkpointing. It is recommended to ignore them using `self.save_hyperparameters(ignore=['encoder'])`.
/home/dev/anaconda3/envs/nlp/lib/python3.12/site-packages/lightning/pytorch/utilities/parsing.py:199: Attribute 'decoder' is an instance of `nn.Module` and is already saved during checkpointing. It is recommended to ignore them using `self.save_hyperparameters(ignore=['decoder'])`.


In [54]:
for batch in train_loader:
    src = batch["src"].permute(1,0)
    trg = batch["trg"].permute(1,0)
    model.forward(src=src, trg=trg)
    break

# 인코더의 출력들 예시
# sequence의 길이, batch_size, hidden_size
# [30, 64, 512] : 인코더의 output_size
# [2, 64, 512] : 인코더의 hidden_state_size
# [2, 64, 512] : 인코더의 cell_state_size

# [2, 2, ...] : token 하나씩 입력, 첫 시작 토큰은 [SOS]

# 디코더의 입력
# [64, 11843] : 디코더의 output_size, [batch, 프랑스어 토큰 사이즈]
# [2, 64, 512] : 디코더의 hidden_state_size
# [2, 64, 512] : 디코더의 cell_state_size


torch.Size([30, 64, 512])
torch.Size([2, 64, 512])
torch.Size([2, 64, 512])
tensor([2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
        2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
        2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2])
torch.Size([64, 11847])
torch.Size([2, 64, 512])
torch.Size([2, 64, 512])
torch.Size([64, 11847])
torch.Size([2, 64, 512])
torch.Size([2, 64, 512])
torch.Size([64, 11847])
torch.Size([2, 64, 512])
torch.Size([2, 64, 512])
torch.Size([64, 11847])
torch.Size([2, 64, 512])
torch.Size([2, 64, 512])
torch.Size([64, 11847])
torch.Size([2, 64, 512])
torch.Size([2, 64, 512])
torch.Size([64, 11847])
torch.Size([2, 64, 512])
torch.Size([2, 64, 512])
torch.Size([64, 11847])
torch.Size([2, 64, 512])
torch.Size([2, 64, 512])
torch.Size([64, 11847])
torch.Size([2, 64, 512])
torch.Size([2, 64, 512])
torch.Size([64, 11847])
torch.Size([2, 64, 512])
torch.Size([2, 64, 512])
torch.Size([64, 11847])
torch.Size([2, 64

In [99]:
for batch in train_loader:
    src = batch["src"].permute(1,0)
    trg = batch["trg"].permute(1,0)
    att_model.forward(src=src, trg=trg)
    break

torch.Size([1, 64, 512])
torch.Size([30, 64, 512])
torch.Size([64, 30])
torch.Size([64, 30])
torch.Size([64, 512])
torch.Size([64, 11847])
torch.Size([1, 64, 512])
torch.Size([30, 64, 512])
torch.Size([64, 30])
torch.Size([64, 30])
torch.Size([64, 512])
torch.Size([64, 11847])
torch.Size([1, 64, 512])
torch.Size([30, 64, 512])
torch.Size([64, 30])
torch.Size([64, 30])
torch.Size([64, 512])
torch.Size([64, 11847])
torch.Size([1, 64, 512])
torch.Size([30, 64, 512])
torch.Size([64, 30])
torch.Size([64, 30])
torch.Size([64, 512])
torch.Size([64, 11847])
torch.Size([1, 64, 512])
torch.Size([30, 64, 512])
torch.Size([64, 30])
torch.Size([64, 30])
torch.Size([64, 512])
torch.Size([64, 11847])
torch.Size([1, 64, 512])
torch.Size([30, 64, 512])
torch.Size([64, 30])
torch.Size([64, 30])
torch.Size([64, 512])
torch.Size([64, 11847])
torch.Size([1, 64, 512])
torch.Size([30, 64, 512])
torch.Size([64, 30])
torch.Size([64, 30])
torch.Size([64, 512])
torch.Size([64, 11847])
torch.Size([1, 64, 512])
to

In [105]:
import wandb
from lightning.pytorch.loggers import WandbLogger

wandb.login()

wandb_logger = WandbLogger(project="NLP", name="Seq2Seq_att", group="Lec05")



In [106]:
trainer = pl.Trainer(
    max_epochs=1,
    accelerator="gpu",
    logger=wandb_logger
)

GPU available: True (cuda), used: True
TPU available: False, using: 0 TPU cores
IPU available: False, using: 0 IPUs
HPU available: False, using: 0 HPUs


In [61]:
trainer.fit(model, train_loader, vali_loader)

/home/dev/anaconda3/envs/nlp/lib/python3.12/site-packages/lightning/pytorch/loggers/wandb.py:391: There is a wandb run already in progress and newly created instances of `WandbLogger` will reuse this run. If this is not desired, call `wandb.finish()` before instantiating `WandbLogger`.
/home/dev/anaconda3/envs/nlp/lib/python3.12/site-packages/lightning/pytorch/callbacks/model_checkpoint.py:653: Checkpoint directory ./NLP/yhvt2ljv/checkpoints exists and is not empty.
LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]

  | Name      | Type             | Params
-----------------------------------------------
0 | encoder   | Encoder          | 5.8 M 
1 | decoder   | Decoder          | 12.8 M
2 | criterian | CrossEntropyLoss | 0     
-----------------------------------------------
18.6 M    Trainable params
0         Non-trainable params
18.6 M    Total params
74.387    Total estimated model params size (MB)


Epoch 0: 100%|██████████| 2945/2945 [03:45<00:00, 13.06it/s, v_num=2ljv]   

`Trainer.fit` stopped: `max_epochs=1` reached.


Epoch 0: 100%|██████████| 2945/2945 [03:45<00:00, 13.04it/s, v_num=2ljv]


In [107]:
trainer.fit(att_model, train_loader, vali_loader)

LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]

  | Name      | Type             | Params
-----------------------------------------------
0 | encoder   | Encoder          | 5.8 M 
1 | decoder   | AttentionDecoder | 18.9 M
2 | criterian | CrossEntropyLoss | 0     
-----------------------------------------------
24.7 M    Trainable params
0         Non-trainable params
24.7 M    Total params
98.649    Total estimated model params size (MB)


Epoch 0: 100%|██████████| 2945/2945 [05:00<00:00,  9.81it/s, v_num=kicz]   

`Trainer.fit` stopped: `max_epochs=1` reached.


Epoch 0: 100%|██████████| 2945/2945 [05:00<00:00,  9.79it/s, v_num=kicz]


In [62]:
trainer.test(model, test_loader)

LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]


Testing DataLoader 0: 100%|██████████| 363/363 [00:09<00:00, 37.58it/s]
────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────
       Test metric             DataLoader 0
────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────
        test_PPL             95.06916809082031
        test_loss            4.547243118286133
────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────


[{'test_loss': 4.547243118286133, 'test_PPL': 95.06916809082031}]

In [111]:
test_data = test_dataset.__getitem__(1000)
a = att_model.decode(test_data["src"])

In [112]:
input = " ".join([list(en_vocab.keys())[list(en_vocab.values()).index(i)] for i in test_data["src"]])
target = " ".join([list(fr_vocab.keys())[list(fr_vocab.values()).index(i)] for i in test_data["trg"]])
model_output = " ".join([list(fr_vocab.keys())[list(fr_vocab.values()).index(i)] for i in a])

In [113]:
print(input)
print(target)
print(model_output)

i m not that kind of guy . [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD]
[SOS] je ne suis pas ce genre de gars . [EOS] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD]
[SOS] je ne suis pas de de . . [EOS]


In [114]:
wandb.finish()

0,1
epoch,▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁
train_PPL,▁▁▁█▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁
train_loss,▃▃▃█▃▃▃▃▃▃▂▂▂▂▂▂▂▂▂▂▂▂▂▂▂▂▂▂▁▂▂▁▁▂▁▁▁▁▁▁
trainer/global_step,▁▁▁▁▁▁▁▂▂▂▂▂▃▃▃▃▃▄▄▄▄▄▅▅▅▅▅▆▆▆▆▆▇▇▇▇▇███
val_PPL,▁
val_loss,▁

0,1
epoch,0.0
train_PPL,50.06936
train_loss,3.91341
trainer/global_step,2944.0
val_PPL,79.08675
val_loss,4.36085
