Gossiping-QA-Dataset : https://www.kaggle.com/zake7749/pttgossipingcorpus

In [1]:
import pandas as pd 

In [2]:
df = pd.read_csv('Gossiping-QA-Dataset-2_0.csv')

In [3]:
df.head()

Unnamed: 0,question,answer
0,為什麼 聖結石 會被酸而 這群人 不會？,質感 劇本 成員 都差很多好嗎 不要拿腎結石來污辱這群人
1,為什麼慶祝228會被罵可是慶端午不會？,因為屈原不是台灣人，是楚國人。
2,有沒有戰神阿瑞斯的八卦?,爵士就是阿瑞斯 男主角最後死了
3,理論與實務最脫節的系,哪個系不脫節...你問最不脫節的簡單多了...
4,為什麼PTT這麼多人看棒球,肥宅才看棒球　系壘一堆胖子


In [4]:
df['question'][0]

'為什麼 聖結石 會被酸而 這群人 不會？'

In [5]:
df['answer'][0]

'質感 劇本 成員 都差很多好嗎 不要拿腎結石來污辱這群人'

In [6]:
from torch.utils.data.dataset import Dataset
from torch.utils.data import DataLoader
from torch.nn.utils.rnn import pad_sequence

import torch
import torch.nn as nn
import torch.nn.functional as F

from transformers import BertTokenizer, BertModel

import numpy as np
import random
import math

In [7]:
g_device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

In [8]:
g_tokenizer = BertTokenizer.from_pretrained("bert-base-chinese") #'bert-base-uncased'
g_vocab_size = g_tokenizer.vocab_size
g_max_input_length = g_tokenizer.max_model_input_sizes['bert-base-chinese']  # 512
g_bert = BertModel.from_pretrained('bert-base-chinese')
g_bert_emb_dim = g_bert.config.to_dict()['hidden_size']

In [9]:
def tokenize_and_cut(sentence):
    tokens = g_tokenizer.tokenize(sentence)
    tokens = tokens[:g_max_input_length-2]
    return tokens

In [10]:
g_vocab_size

21128

In [11]:
class QADataset(Dataset):
    def __init__(self, train_Q, train_A):
        
        self.train_Q = train_Q
        self.train_A = train_A
        self.length = len(train_Q)
        
    def __getitem__(self, index):
           
        Q_token = tokenize_and_cut(self.train_Q[index])
        A_token = tokenize_and_cut(self.train_A[index])
        
        Q_index = g_tokenizer.convert_tokens_to_ids(Q_token)
        A_index = g_tokenizer.convert_tokens_to_ids(A_token)
        
        Q_tensor = torch.tensor([Q_index]).transpose(0, 1).to(g_device)
        A_tensor = torch.tensor([A_index]).transpose(0, 1).to(g_device)
        
        return Q_tensor, A_tensor

    def __len__(self):

        return self.length

In [12]:
train_set = QADataset(df['question'], df['answer'])

In [13]:
train_set.__getitem__(0)

(tensor([[4158],
         [ 784],
         [7938],
         [5469],
         [5178],
         [4767],
         [3298],
         [6158],
         [7000],
         [5445],
         [6857],
         [5408],
         [ 782],
         [ 679],
         [3298],
         [8043]], device='cuda:0'),
 tensor([[6549],
         [2697],
         [1206],
         [3315],
         [2768],
         [1519],
         [6963],
         [2345],
         [2523],
         [1914],
         [1962],
         [1621],
         [ 679],
         [6206],
         [2897],
         [5575],
         [5178],
         [4767],
         [ 889],
         [3738],
         [6802],
         [6857],
         [5408],
         [ 782]], device='cuda:0'))

In [14]:
def create_mini_batch(samples):

    Q_tensor = [s[0] for s in samples]
    A_tensor = [s[1] for s in samples]
    
    Q_tensor = pad_sequence(Q_tensor)
    A_tensor = pad_sequence(A_tensor)
    
    masks_Q_tensors = torch.zeros(Q_tensor.size(), dtype=torch.long).to(g_device)
    masks_Q_tensors = masks_Q_tensors.masked_fill(Q_tensor != 0, 1)
    
    masks_A_tensors = torch.zeros(A_tensor.size(), dtype=torch.long).to(g_device)
    masks_A_tensors = masks_A_tensors.masked_fill(A_tensor != 0, 1)
    
    return Q_tensor, A_tensor, masks_Q_tensors, masks_A_tensors

In [15]:
len(df)

774114

In [16]:
temp = 5000
t_set, v_set = torch.utils.data.random_split(train_set, [temp, len(df)-temp])

In [35]:
train_data = DataLoader(t_set, shuffle=True, batch_size=10, collate_fn=create_mini_batch)

model: https://github.com/demi6od/ChatBot/blob/master/image/ChatBotBertTransformer.jpg

In [18]:
class PositionalEncoding(nn.Module):
    def __init__(self, d_model, dropout=0.1, max_len=5000):
        super(PositionalEncoding, self).__init__()
        self.dropout = nn.Dropout(p=dropout)

        pe = torch.zeros(max_len, d_model)
        position = torch.arange(0, max_len, dtype=torch.float).unsqueeze(1)
        div_term = torch.exp(torch.arange(0, d_model, 2).float() * (-math.log(10000.0) / d_model))
        pe[:, 0::2] = torch.sin(position * div_term)
        pe[:, 1::2] = torch.cos(position * div_term)
        pe = pe.unsqueeze(0).transpose(0, 1)
        self.register_buffer('pe', pe)

    def forward(self, x):
        x = x + self.pe[:x.size(0), :]
        return self.dropout(x)

In [19]:
class TransBertEncoder(nn.Module):
    def __init__(self, nhead=8, nlayers=6, dropout=0.5):
        super().__init__()

        # bert encoder
        self.bert = g_bert

        # transformer encoder, as bert last layer fine-tune
        self.pos_encoder = PositionalEncoding(g_bert_emb_dim, dropout)
        encoder_layers = nn.TransformerEncoderLayer(d_model=g_bert_emb_dim, nhead=nhead)
        self.transformer_encoder = nn.TransformerEncoder(encoder_layers, nlayers)

    def forward(self, src, mask):
        # src = [src len, batch size]

        with torch.no_grad():
            # embedded = [src len, batch size, emb dim]
            embedded = self.bert(src.transpose(0, 1), attention_mask=mask.transpose(0, 1))[0].transpose(0, 1)

        # embedded = self.pos_encoder(embedded)

        # src_mask = nn.Transformer().generate_square_subsequent_mask(len(embedded)).to(g_device)

        # outputs = [src len, batch size, hid dim * n directions]
        outputs = self.transformer_encoder(embedded)

        return outputs

In [20]:
class TransBertDecoder(nn.Module):
    def __init__(self, nhead=8, nlayers=6, dropout=0.5):
        super().__init__()

        # bert encoder
        self.bert = g_bert

        self.pos_decoder = PositionalEncoding(g_bert_emb_dim, dropout)
        decoder_layer = nn.TransformerDecoderLayer(d_model=g_bert_emb_dim, nhead=nhead)
        self.transformer_decoder = nn.TransformerDecoder(decoder_layer, num_layers=nlayers)

        self.fc_out = nn.Linear(g_bert_emb_dim, g_vocab_size)

    def forward(self, tgt, mask, meaning, teacher_forcing_ratio = 0.01):
        # tgt = [output_len, batch size]

        output_len = tgt.size(0)
        batch_size = tgt.size(1)
        # decide if we are going to use teacher forcing or not
        teacher_force = random.random() < teacher_forcing_ratio

        if teacher_force and self.training:
            tgt_emb_total = torch.zeros(output_len, batch_size, g_bert_emb_dim).to(g_device)

            for t in range(0, output_len):
                with torch.no_grad():
                    tgt_emb = self.bert(tgt[:t+1].transpose(0, 1), attention_mask=mask.transpose(0, 1))[0].transpose(0, 1)
                tgt_emb_total[t] = tgt_emb[-1]

            tgt_mask = nn.Transformer().generate_square_subsequent_mask(len(tgt_emb_total)).to(g_device)
            decoder_output = self.transformer_decoder(tgt=tgt_emb_total,
                                                      memory=meaning,
                                                      tgt_mask=tgt_mask)
            predictions = self.fc_out(decoder_output)
        else:
            # initialized the input of the decoder with sos_idx (start of sentence token idx)
            output = torch.full((output_len+1, batch_size), g_tokenizer.cls_token_id, dtype=torch.long, device=g_device)
            predictions = torch.zeros(output_len, batch_size, g_vocab_size).to(g_device)

            for t in range(0, output_len):
                with torch.no_grad():
                    tgt_emb = self.bert(output[:t+1].transpose(0, 1), attention_mask=mask[:t+1].transpose(0, 1))[0].transpose(0, 1)

                # tgt_emb = [t, batch size, emb dim]
                # tgt_emb = self.pos_encoder(tgt_emb)

                tgt_mask = nn.Transformer().generate_square_subsequent_mask(len(tgt_emb)).to(g_device)

                # decoder_output = [t, batch size, emb dim]
                decoder_output = self.transformer_decoder(tgt=tgt_emb,
                                                          memory=meaning,
                                                          tgt_mask=tgt_mask)

                # prediction = [batch size, vocab size]
                prediction = self.fc_out(decoder_output[-1])

                # predictions = [output_len, batch size, vocab size]
                predictions[t] = prediction

                one_hot_idx = prediction.argmax(1)

                # output  = [output len, batch size]
                output[t+1] = one_hot_idx

        return predictions

In [21]:
class GruEncoder(nn.Module):
    """compress the request embeddings to meaning"""

    def __init__(self, hidden_size, input_size):
        super().__init__()
        self.gru = nn.GRU(input_size, hidden_size)

    def forward(self, input):
        output, hidden = self.gru(input)
        return hidden

In [22]:
class GruDecoder(nn.Module):
    def __init__(self, hidden_size, output_size):
        super().__init__()
        self.gru = nn.GRU(output_size, hidden_size)
        self.fc = nn.Linear(hidden_size, output_size)

    def forward(self, src, tgt, hidden):
        # first input to the decoder is the <CLS> tokens
        fc_output = src[0].unsqueeze(0)
        tgt_len = tgt.size(0)
        batch_size = tgt.size(1)

        # tensor to store decoder outputs
        outputs = torch.zeros(tgt_len, batch_size, g_bert_emb_dim).to(g_device)

        for t in range(0, tgt_len):
            # insert input token embedding, previous hidden state and the context state
            # receive output tensor (predictions) and new hidden state
            gru_output, hidden = self.gru(fc_output, hidden)

            fc_output = self.fc(gru_output)

            # place predictions in a tensor holding predictions for each token
            outputs[t] = fc_output
        return outputs

In [23]:
class DialogDNN(nn.Module):
    def __init__(self, input_size, hidden_size, output_size, dropout=0.5):
        super().__init__()
        self.fc1 = nn.Linear(input_size, hidden_size)
        self.fc2 = nn.Linear(hidden_size, hidden_size)
        self.fc3 = nn.Linear(hidden_size, hidden_size)

        self.dropout = nn.Dropout(dropout)

    def forward(self, input):
        # ResNet, dropout on first 3 layers
        input = self.dropout(input)

        output = input + F.relu(self.fc1(input))
        output = self.dropout(output)

        output = output + F.relu(self.fc2(output))
        output = self.dropout(output)

        output = output + self.fc3(output)  # no relu to keep negative values

        return output


In [24]:
class Seq2Seq(nn.Module):
    def __init__(self, transbert_encoder, transbert_decoder, gru_encoder, gru_decoder, dialog_dnn):
        super().__init__()

        self.transbert_encoder = transbert_encoder
        self.transbert_decoder = transbert_decoder

        self.gru_encoder = gru_encoder
        self.gru_decoder = gru_decoder

        self.dialog_dnn = dialog_dnn

    def forward(self, src, tgt, mask_src, mask_tgt, teacher_forcing_ratio):
        request_embeddings = self.transbert_encoder(src,mask_src)
        request_meaning = self.gru_encoder(request_embeddings)

        if TRAIN_DIALOG:
            response_meaning = self.dialog_dnn(request_meaning)
        else:
            response_meaning = request_meaning   

        response_embeddings = self.gru_decoder(request_embeddings, tgt, response_meaning)
        response = self.transbert_decoder(tgt, mask_tgt, response_embeddings, teacher_forcing_ratio)

        return response

In [25]:
def print_chat(sentences):
    print("chatbot: ", end="")
    for word_embeds in sentences:
        word_embed = word_embeds[0]
        # find one shot index from word embedding
        max_idx_t = word_embed.argmax()
        max_idx = max_idx_t.item()
        word = g_tokenizer.convert_ids_to_tokens(max_idx)
        print(word, end=" ")
    print("")  # new line at the end of sentence


def print_index_tensor(sentences):
    print("target: ", end="")
    for word_embeds in sentences:
        word_embed = word_embeds[0]
        max_idx = word_embed.item()
        word = g_tokenizer.convert_ids_to_tokens(max_idx)
        print(word, end=" ")
    print("")  # new line at the end of sentence

In [26]:
INPUT_DIM = g_vocab_size
OUTPUT_DIM = g_vocab_size
ENC_EMB_DIM = g_bert_emb_dim
DEC_EMB_DIM = g_bert_emb_dim
HID_DIM = 2048  # 5 * 200
ENC_DROPOUT = 0.5
DEC_DROPOUT = 0.5
TRANSFORMER_ENCODER_LAYER = 1  # bert fine-tune
TRANSFORMER_DECODER_LAYER = 3  # semantics -> morphology -> syntax
TRANSFORMER_HEAD = 8

transbert_encoder = TransBertEncoder(TRANSFORMER_HEAD, TRANSFORMER_ENCODER_LAYER, ENC_DROPOUT)
transbert_decoder = TransBertDecoder(TRANSFORMER_HEAD, TRANSFORMER_DECODER_LAYER, DEC_DROPOUT)
gru_encoder = GruEncoder(HID_DIM, ENC_EMB_DIM)
gru_decoder = GruDecoder(HID_DIM, DEC_EMB_DIM)
dialog_dnn = DialogDNN(HID_DIM, HID_DIM, HID_DIM)

In [27]:
g_model = Seq2Seq(transbert_encoder, transbert_decoder, gru_encoder, gru_decoder, dialog_dnn).to(g_device)

In [28]:
g_model

Seq2Seq(
  (transbert_encoder): TransBertEncoder(
    (bert): BertModel(
      (embeddings): BertEmbeddings(
        (word_embeddings): Embedding(21128, 768, padding_idx=0)
        (position_embeddings): Embedding(512, 768)
        (token_type_embeddings): Embedding(2, 768)
        (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
        (dropout): Dropout(p=0.1, inplace=False)
      )
      (encoder): BertEncoder(
        (layer): ModuleList(
          (0): BertLayer(
            (attention): BertAttention(
              (self): BertSelfAttention(
                (query): Linear(in_features=768, out_features=768, bias=True)
                (key): Linear(in_features=768, out_features=768, bias=True)
                (value): Linear(in_features=768, out_features=768, bias=True)
                (dropout): Dropout(p=0.1, inplace=False)
              )
              (output): BertSelfOutput(
                (dense): Linear(in_features=768, out_features=768, bias=True)
    

In [29]:
import torch.optim as optim

In [30]:
optimizer = optim.Adam(g_model.parameters())
loss_fn = nn.CrossEntropyLoss(ignore_index=g_tokenizer.pad_token_id)

In [31]:
# # src = [src len, batch size]
# # tgt = [output_len, batch size]

# with torch.no_grad():
#     x,y = train_set.__getitem__(1)
#     print(x.size(), y.size())

#     optimizer.zero_grad()
#     output =  g_model(x,y, 0.01)
#     print(output.size())
#     print("--------------")
#     output_dim = output.shape[-1]
#     print(output[:-1].view(-1, output_dim).size(),y[:-1].view(-1).size())
#     loss = loss_fn(output[:-1].view(-1, output_dim), y[:-1].view(-1))
#     print(loss)
#     print("--------------")
#     print_index_tensor(x)
#     print_chat(output)
#     print_index_tensor(y)



In [32]:
len(df)

774114

In [33]:
len(train_data)

625

In [34]:
iter = len(train_data)

In [39]:
# g_model = torch.load('./model_test2')

In [None]:
TRAIN_DIALOG = True
for e in range(6): #lucky number
    
    mean_loss = 0
    count = 0
    
    for input, target ,mask_Q ,mask_A in train_data:
        
        try:

            optimizer.zero_grad()

            target = target.view(target.shape[0],target.shape[1])
            input = input.view(input.shape[0],input.shape[1])
            mask_Q = mask_Q.view(mask_Q.shape[0],mask_Q.shape[1])
            mask_A = mask_A.view(mask_A.shape[0],mask_A.shape[1])

            output =  g_model(input, target, mask_Q, mask_A,0.01)
            output_dim = output.shape[-1]
            loss = loss_fn(output[:-1].view(-1, output_dim), target[:-1].view(-1))
            loss.backward()
            optimizer.step()

            count += 1
            mean_loss+=loss
     
        except RuntimeError: 
            print('error.....')
            print_index_tensor(input)
            print_index_tensor(target)
        else:
            print("e : " + str(e) + ' ' + str(count) + '/' + str(iter) + ' loss : ' + str(loss))

    print("e : "+str(e)+" L : "+str(mean_loss/count))
        
    torch.save(g_model, "./model_"+str(e))

e : 0 1/625 loss : tensor(7.3564, device='cuda:0', grad_fn=<NllLossBackward>)
e : 0 2/625 loss : tensor(7.0149, device='cuda:0', grad_fn=<NllLossBackward>)
e : 0 3/625 loss : tensor(7.0279, device='cuda:0', grad_fn=<NllLossBackward>)
e : 0 4/625 loss : tensor(6.9637, device='cuda:0', grad_fn=<NllLossBackward>)
error.....
target: 西 門 町 有 啥 可 以 吸 引 韓 國 人 的 [PAD] [PAD] [PAD] [PAD] [PAD] 
target: 別 的 不 說 挺 那 種 舔 中 的 台 商 幹 嘛 ? 被 韓 國 幹 翻 剛 好 啦 
e : 0 5/625 loss : tensor(6.5678, device='cuda:0', grad_fn=<NllLossBackward>)
e : 0 6/625 loss : tensor(6.8087, device='cuda:0', grad_fn=<NllLossBackward>)
e : 0 7/625 loss : tensor(6.8763, device='cuda:0', grad_fn=<NllLossBackward>)
e : 0 8/625 loss : tensor(6.7949, device='cuda:0', grad_fn=<NllLossBackward>)
e : 0 9/625 loss : tensor(6.8268, device='cuda:0', grad_fn=<NllLossBackward>)
e : 0 10/625 loss : tensor(6.7032, device='cuda:0', grad_fn=<NllLossBackward>)
e : 0 11/625 loss : tensor(7.2274, device='cuda:0', grad_fn=<NllLossBackward>)
e : 0 12/

e : 0 102/625 loss : tensor(6.9614, device='cuda:0', grad_fn=<NllLossBackward>)
e : 0 103/625 loss : tensor(6.6083, device='cuda:0', grad_fn=<NllLossBackward>)
e : 0 104/625 loss : tensor(6.7387, device='cuda:0', grad_fn=<NllLossBackward>)
e : 0 105/625 loss : tensor(7.0045, device='cuda:0', grad_fn=<NllLossBackward>)
e : 0 106/625 loss : tensor(7.4754, device='cuda:0', grad_fn=<NllLossBackward>)
e : 0 107/625 loss : tensor(7.0446, device='cuda:0', grad_fn=<NllLossBackward>)
e : 0 108/625 loss : tensor(6.8761, device='cuda:0', grad_fn=<NllLossBackward>)
e : 0 109/625 loss : tensor(7.7759, device='cuda:0', grad_fn=<NllLossBackward>)
e : 0 110/625 loss : tensor(7.0460, device='cuda:0', grad_fn=<NllLossBackward>)
e : 0 111/625 loss : tensor(7.0575, device='cuda:0', grad_fn=<NllLossBackward>)
e : 0 112/625 loss : tensor(7.0645, device='cuda:0', grad_fn=<NllLossBackward>)
e : 0 113/625 loss : tensor(7.2238, device='cuda:0', grad_fn=<NllLossBackward>)
e : 0 114/625 loss : tensor(6.7405, devi

e : 0 198/625 loss : tensor(6.7741, device='cuda:0', grad_fn=<NllLossBackward>)
e : 0 199/625 loss : tensor(6.8281, device='cuda:0', grad_fn=<NllLossBackward>)
e : 0 200/625 loss : tensor(6.9258, device='cuda:0', grad_fn=<NllLossBackward>)
e : 0 201/625 loss : tensor(6.6880, device='cuda:0', grad_fn=<NllLossBackward>)
e : 0 202/625 loss : tensor(6.7769, device='cuda:0', grad_fn=<NllLossBackward>)
e : 0 203/625 loss : tensor(6.6925, device='cuda:0', grad_fn=<NllLossBackward>)
e : 0 204/625 loss : tensor(6.8381, device='cuda:0', grad_fn=<NllLossBackward>)
e : 0 205/625 loss : tensor(6.8472, device='cuda:0', grad_fn=<NllLossBackward>)
e : 0 206/625 loss : tensor(7.2141, device='cuda:0', grad_fn=<NllLossBackward>)
e : 0 207/625 loss : tensor(7.4216, device='cuda:0', grad_fn=<NllLossBackward>)
e : 0 208/625 loss : tensor(6.6036, device='cuda:0', grad_fn=<NllLossBackward>)
e : 0 209/625 loss : tensor(6.9549, device='cuda:0', grad_fn=<NllLossBackward>)
e : 0 210/625 loss : tensor(7.2261, devi

e : 0 301/625 loss : tensor(7.0390, device='cuda:0', grad_fn=<NllLossBackward>)
e : 0 302/625 loss : tensor(6.8327, device='cuda:0', grad_fn=<NllLossBackward>)
e : 0 303/625 loss : tensor(7.2119, device='cuda:0', grad_fn=<NllLossBackward>)
e : 0 304/625 loss : tensor(7.1793, device='cuda:0', grad_fn=<NllLossBackward>)
e : 0 305/625 loss : tensor(6.7810, device='cuda:0', grad_fn=<NllLossBackward>)
e : 0 306/625 loss : tensor(6.6684, device='cuda:0', grad_fn=<NllLossBackward>)
e : 0 307/625 loss : tensor(6.9378, device='cuda:0', grad_fn=<NllLossBackward>)
e : 0 308/625 loss : tensor(6.8061, device='cuda:0', grad_fn=<NllLossBackward>)
e : 0 309/625 loss : tensor(6.7397, device='cuda:0', grad_fn=<NllLossBackward>)
e : 0 310/625 loss : tensor(6.5090, device='cuda:0', grad_fn=<NllLossBackward>)
e : 0 311/625 loss : tensor(6.8479, device='cuda:0', grad_fn=<NllLossBackward>)
e : 0 312/625 loss : tensor(6.7731, device='cuda:0', grad_fn=<NllLossBackward>)
e : 0 313/625 loss : tensor(6.9180, devi

e : 0 399/625 loss : tensor(7.1976, device='cuda:0', grad_fn=<NllLossBackward>)
e : 0 400/625 loss : tensor(6.8766, device='cuda:0', grad_fn=<NllLossBackward>)
e : 0 401/625 loss : tensor(7.0448, device='cuda:0', grad_fn=<NllLossBackward>)
e : 0 402/625 loss : tensor(6.9375, device='cuda:0', grad_fn=<NllLossBackward>)
e : 0 403/625 loss : tensor(6.9852, device='cuda:0', grad_fn=<NllLossBackward>)
e : 0 404/625 loss : tensor(7.1873, device='cuda:0', grad_fn=<NllLossBackward>)
e : 0 405/625 loss : tensor(7.3761, device='cuda:0', grad_fn=<NllLossBackward>)
error.....
target: 志 願 役 士 兵 罷 工 有 沒 有 搞 頭 ? [PAD] [PAD] [PAD] [PAD] 
target: 認 真 講 不 可 行 因 為 平 常 就 都 沒 在 做 事 幹 部 不 罷 工 都 沒 用 
e : 0 406/625 loss : tensor(7.1163, device='cuda:0', grad_fn=<NllLossBackward>)
e : 0 407/625 loss : tensor(7.2544, device='cuda:0', grad_fn=<NllLossBackward>)
e : 0 408/625 loss : tensor(6.8829, device='cuda:0', grad_fn=<NllLossBackward>)
e : 0 409/625 loss : tensor(6.9917, device='cuda:0', grad_fn=<NllLossBack

  "type " + obj.__name__ + ". It won't be checked "
  "type " + obj.__name__ + ". It won't be checked "
  "type " + obj.__name__ + ". It won't be checked "
  "type " + obj.__name__ + ". It won't be checked "
  "type " + obj.__name__ + ". It won't be checked "
  "type " + obj.__name__ + ". It won't be checked "
  "type " + obj.__name__ + ". It won't be checked "


e : 1 1/625 loss : tensor(6.5021, device='cuda:0', grad_fn=<NllLossBackward>)
e : 1 2/625 loss : tensor(6.9220, device='cuda:0', grad_fn=<NllLossBackward>)
e : 1 3/625 loss : tensor(7.1335, device='cuda:0', grad_fn=<NllLossBackward>)
e : 1 4/625 loss : tensor(7.3327, device='cuda:0', grad_fn=<NllLossBackward>)
e : 1 5/625 loss : tensor(7.4604, device='cuda:0', grad_fn=<NllLossBackward>)
e : 1 6/625 loss : tensor(6.6294, device='cuda:0', grad_fn=<NllLossBackward>)
e : 1 7/625 loss : tensor(7.3395, device='cuda:0', grad_fn=<NllLossBackward>)
e : 1 8/625 loss : tensor(6.9647, device='cuda:0', grad_fn=<NllLossBackward>)
e : 1 9/625 loss : tensor(6.9740, device='cuda:0', grad_fn=<NllLossBackward>)
e : 1 10/625 loss : tensor(7.2192, device='cuda:0', grad_fn=<NllLossBackward>)
e : 1 11/625 loss : tensor(6.9281, device='cuda:0', grad_fn=<NllLossBackward>)
e : 1 12/625 loss : tensor(7.0730, device='cuda:0', grad_fn=<NllLossBackward>)
e : 1 13/625 loss : tensor(7.1475, device='cuda:0', grad_fn=<

e : 1 105/625 loss : tensor(6.7122, device='cuda:0', grad_fn=<NllLossBackward>)
e : 1 106/625 loss : tensor(6.9313, device='cuda:0', grad_fn=<NllLossBackward>)
e : 1 107/625 loss : tensor(6.4441, device='cuda:0', grad_fn=<NllLossBackward>)
e : 1 108/625 loss : tensor(7.4888, device='cuda:0', grad_fn=<NllLossBackward>)
e : 1 109/625 loss : tensor(6.6597, device='cuda:0', grad_fn=<NllLossBackward>)
e : 1 110/625 loss : tensor(6.7598, device='cuda:0', grad_fn=<NllLossBackward>)
e : 1 111/625 loss : tensor(7.4437, device='cuda:0', grad_fn=<NllLossBackward>)
e : 1 112/625 loss : tensor(6.5585, device='cuda:0', grad_fn=<NllLossBackward>)
e : 1 113/625 loss : tensor(6.8776, device='cuda:0', grad_fn=<NllLossBackward>)
e : 1 114/625 loss : tensor(7.0595, device='cuda:0', grad_fn=<NllLossBackward>)
e : 1 115/625 loss : tensor(6.9951, device='cuda:0', grad_fn=<NllLossBackward>)
e : 1 116/625 loss : tensor(6.7630, device='cuda:0', grad_fn=<NllLossBackward>)
e : 1 117/625 loss : tensor(6.8450, devi

e : 1 204/625 loss : tensor(6.8764, device='cuda:0', grad_fn=<NllLossBackward>)
e : 1 205/625 loss : tensor(7.1254, device='cuda:0', grad_fn=<NllLossBackward>)
e : 1 206/625 loss : tensor(6.7885, device='cuda:0', grad_fn=<NllLossBackward>)
e : 1 207/625 loss : tensor(6.8491, device='cuda:0', grad_fn=<NllLossBackward>)
e : 1 208/625 loss : tensor(7.0322, device='cuda:0', grad_fn=<NllLossBackward>)
e : 1 209/625 loss : tensor(6.4990, device='cuda:0', grad_fn=<NllLossBackward>)
e : 1 210/625 loss : tensor(7.0226, device='cuda:0', grad_fn=<NllLossBackward>)
e : 1 211/625 loss : tensor(6.5720, device='cuda:0', grad_fn=<NllLossBackward>)
e : 1 212/625 loss : tensor(6.5796, device='cuda:0', grad_fn=<NllLossBackward>)
e : 1 213/625 loss : tensor(6.7841, device='cuda:0', grad_fn=<NllLossBackward>)
e : 1 214/625 loss : tensor(6.8606, device='cuda:0', grad_fn=<NllLossBackward>)
e : 1 215/625 loss : tensor(7.1759, device='cuda:0', grad_fn=<NllLossBackward>)
e : 1 216/625 loss : tensor(6.9123, devi

e : 1 303/625 loss : tensor(6.9732, device='cuda:0', grad_fn=<NllLossBackward>)
e : 1 304/625 loss : tensor(7.1789, device='cuda:0', grad_fn=<NllLossBackward>)
e : 1 305/625 loss : tensor(6.8477, device='cuda:0', grad_fn=<NllLossBackward>)
e : 1 306/625 loss : tensor(6.8409, device='cuda:0', grad_fn=<NllLossBackward>)
e : 1 307/625 loss : tensor(7.1421, device='cuda:0', grad_fn=<NllLossBackward>)
e : 1 308/625 loss : tensor(7.1709, device='cuda:0', grad_fn=<NllLossBackward>)
e : 1 309/625 loss : tensor(7.2699, device='cuda:0', grad_fn=<NllLossBackward>)
e : 1 310/625 loss : tensor(6.5404, device='cuda:0', grad_fn=<NllLossBackward>)
e : 1 311/625 loss : tensor(6.6609, device='cuda:0', grad_fn=<NllLossBackward>)
e : 1 312/625 loss : tensor(7.2849, device='cuda:0', grad_fn=<NllLossBackward>)
e : 1 313/625 loss : tensor(6.7024, device='cuda:0', grad_fn=<NllLossBackward>)
e : 1 314/625 loss : tensor(7.2270, device='cuda:0', grad_fn=<NllLossBackward>)
e : 1 315/625 loss : tensor(6.6824, devi

e : 1 403/625 loss : tensor(6.4192, device='cuda:0', grad_fn=<NllLossBackward>)
e : 1 404/625 loss : tensor(7.2382, device='cuda:0', grad_fn=<NllLossBackward>)
e : 1 405/625 loss : tensor(6.6361, device='cuda:0', grad_fn=<NllLossBackward>)
e : 1 406/625 loss : tensor(6.8931, device='cuda:0', grad_fn=<NllLossBackward>)
e : 1 407/625 loss : tensor(6.8883, device='cuda:0', grad_fn=<NllLossBackward>)
e : 1 408/625 loss : tensor(7.2113, device='cuda:0', grad_fn=<NllLossBackward>)
e : 1 409/625 loss : tensor(7.0628, device='cuda:0', grad_fn=<NllLossBackward>)
e : 1 410/625 loss : tensor(6.9881, device='cuda:0', grad_fn=<NllLossBackward>)
e : 1 411/625 loss : tensor(6.9800, device='cuda:0', grad_fn=<NllLossBackward>)
e : 1 412/625 loss : tensor(6.8679, device='cuda:0', grad_fn=<NllLossBackward>)
e : 1 413/625 loss : tensor(7.1693, device='cuda:0', grad_fn=<NllLossBackward>)
e : 1 414/625 loss : tensor(6.6277, device='cuda:0', grad_fn=<NllLossBackward>)
e : 1 415/625 loss : tensor(7.1683, devi

e : 2 11/625 loss : tensor(6.8615, device='cuda:0', grad_fn=<NllLossBackward>)
e : 2 12/625 loss : tensor(6.0953, device='cuda:0', grad_fn=<NllLossBackward>)
e : 2 13/625 loss : tensor(7.0873, device='cuda:0', grad_fn=<NllLossBackward>)
e : 2 14/625 loss : tensor(6.9222, device='cuda:0', grad_fn=<NllLossBackward>)
e : 2 15/625 loss : tensor(6.7032, device='cuda:0', grad_fn=<NllLossBackward>)
e : 2 16/625 loss : tensor(6.9669, device='cuda:0', grad_fn=<NllLossBackward>)
e : 2 17/625 loss : tensor(7.0304, device='cuda:0', grad_fn=<NllLossBackward>)
e : 2 18/625 loss : tensor(7.1591, device='cuda:0', grad_fn=<NllLossBackward>)
e : 2 19/625 loss : tensor(7.1365, device='cuda:0', grad_fn=<NllLossBackward>)
error.....
target: 有 甲 甲 是 被 女 森 傷 害 過 才 變 [UNK] 的 嗎 ？ [PAD] [PAD] [PAD] 
target: 有 ， 你 不 知 道 而 已 [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] 
e : 2 20/625 loss : tensor(6.8433, device='cuda:0', grad_fn=<NllLossBackward>)
e : 2 21/625 loss : tensor(7.5701, device='cuda:0',

e : 2 113/625 loss : tensor(6.9587, device='cuda:0', grad_fn=<NllLossBackward>)
e : 2 114/625 loss : tensor(7.1006, device='cuda:0', grad_fn=<NllLossBackward>)
e : 2 115/625 loss : tensor(7.2038, device='cuda:0', grad_fn=<NllLossBackward>)
e : 2 116/625 loss : tensor(6.8603, device='cuda:0', grad_fn=<NllLossBackward>)
e : 2 117/625 loss : tensor(6.9437, device='cuda:0', grad_fn=<NllLossBackward>)
e : 2 118/625 loss : tensor(7.1634, device='cuda:0', grad_fn=<NllLossBackward>)
e : 2 119/625 loss : tensor(7.0268, device='cuda:0', grad_fn=<NllLossBackward>)
e : 2 120/625 loss : tensor(6.6473, device='cuda:0', grad_fn=<NllLossBackward>)
e : 2 121/625 loss : tensor(7.0845, device='cuda:0', grad_fn=<NllLossBackward>)
e : 2 122/625 loss : tensor(7.1081, device='cuda:0', grad_fn=<NllLossBackward>)
e : 2 123/625 loss : tensor(6.8316, device='cuda:0', grad_fn=<NllLossBackward>)
e : 2 124/625 loss : tensor(7.0268, device='cuda:0', grad_fn=<NllLossBackward>)
e : 2 125/625 loss : tensor(7.1580, devi

e : 2 214/625 loss : tensor(6.8105, device='cuda:0', grad_fn=<NllLossBackward>)
e : 2 215/625 loss : tensor(6.7263, device='cuda:0', grad_fn=<NllLossBackward>)
e : 2 216/625 loss : tensor(6.3459, device='cuda:0', grad_fn=<NllLossBackward>)
e : 2 217/625 loss : tensor(6.8744, device='cuda:0', grad_fn=<NllLossBackward>)
e : 2 218/625 loss : tensor(6.4938, device='cuda:0', grad_fn=<NllLossBackward>)
e : 2 219/625 loss : tensor(6.8174, device='cuda:0', grad_fn=<NllLossBackward>)
e : 2 220/625 loss : tensor(7.4233, device='cuda:0', grad_fn=<NllLossBackward>)
e : 2 221/625 loss : tensor(7.0270, device='cuda:0', grad_fn=<NllLossBackward>)
e : 2 222/625 loss : tensor(7.2044, device='cuda:0', grad_fn=<NllLossBackward>)
e : 2 223/625 loss : tensor(7.1209, device='cuda:0', grad_fn=<NllLossBackward>)
e : 2 224/625 loss : tensor(7.1518, device='cuda:0', grad_fn=<NllLossBackward>)
e : 2 225/625 loss : tensor(6.9470, device='cuda:0', grad_fn=<NllLossBackward>)
e : 2 226/625 loss : tensor(7.4410, devi

e : 2 315/625 loss : tensor(6.7792, device='cuda:0', grad_fn=<NllLossBackward>)
e : 2 316/625 loss : tensor(6.7313, device='cuda:0', grad_fn=<NllLossBackward>)
e : 2 317/625 loss : tensor(7.2233, device='cuda:0', grad_fn=<NllLossBackward>)
e : 2 318/625 loss : tensor(7.0331, device='cuda:0', grad_fn=<NllLossBackward>)
e : 2 319/625 loss : tensor(6.8887, device='cuda:0', grad_fn=<NllLossBackward>)
e : 2 320/625 loss : tensor(7.1727, device='cuda:0', grad_fn=<NllLossBackward>)
e : 2 321/625 loss : tensor(6.7950, device='cuda:0', grad_fn=<NllLossBackward>)
e : 2 322/625 loss : tensor(6.6792, device='cuda:0', grad_fn=<NllLossBackward>)
e : 2 323/625 loss : tensor(7.2157, device='cuda:0', grad_fn=<NllLossBackward>)
e : 2 324/625 loss : tensor(7.3831, device='cuda:0', grad_fn=<NllLossBackward>)
e : 2 325/625 loss : tensor(6.8548, device='cuda:0', grad_fn=<NllLossBackward>)
e : 2 326/625 loss : tensor(7.4362, device='cuda:0', grad_fn=<NllLossBackward>)
e : 2 327/625 loss : tensor(7.3202, devi

e : 2 418/625 loss : tensor(6.9733, device='cuda:0', grad_fn=<NllLossBackward>)
e : 2 419/625 loss : tensor(7.3505, device='cuda:0', grad_fn=<NllLossBackward>)
e : 2 420/625 loss : tensor(6.8626, device='cuda:0', grad_fn=<NllLossBackward>)
e : 2 421/625 loss : tensor(7.4980, device='cuda:0', grad_fn=<NllLossBackward>)
e : 2 422/625 loss : tensor(7.0736, device='cuda:0', grad_fn=<NllLossBackward>)
e : 2 423/625 loss : tensor(6.7335, device='cuda:0', grad_fn=<NllLossBackward>)
e : 2 424/625 loss : tensor(7.1334, device='cuda:0', grad_fn=<NllLossBackward>)
e : 2 425/625 loss : tensor(6.7107, device='cuda:0', grad_fn=<NllLossBackward>)
e : 2 426/625 loss : tensor(7.0125, device='cuda:0', grad_fn=<NllLossBackward>)
e : 2 427/625 loss : tensor(6.9195, device='cuda:0', grad_fn=<NllLossBackward>)
e : 2 428/625 loss : tensor(7.0932, device='cuda:0', grad_fn=<NllLossBackward>)
e : 2 429/625 loss : tensor(6.7069, device='cuda:0', grad_fn=<NllLossBackward>)
e : 2 430/625 loss : tensor(6.5930, devi

e : 3 23/625 loss : tensor(7.1521, device='cuda:0', grad_fn=<NllLossBackward>)
e : 3 24/625 loss : tensor(6.8707, device='cuda:0', grad_fn=<NllLossBackward>)
e : 3 25/625 loss : tensor(6.6240, device='cuda:0', grad_fn=<NllLossBackward>)
e : 3 26/625 loss : tensor(6.7494, device='cuda:0', grad_fn=<NllLossBackward>)
e : 3 27/625 loss : tensor(7.3502, device='cuda:0', grad_fn=<NllLossBackward>)
e : 3 28/625 loss : tensor(6.6453, device='cuda:0', grad_fn=<NllLossBackward>)
e : 3 29/625 loss : tensor(6.4944, device='cuda:0', grad_fn=<NllLossBackward>)
e : 3 30/625 loss : tensor(6.8365, device='cuda:0', grad_fn=<NllLossBackward>)
e : 3 31/625 loss : tensor(6.8966, device='cuda:0', grad_fn=<NllLossBackward>)
e : 3 32/625 loss : tensor(6.8383, device='cuda:0', grad_fn=<NllLossBackward>)
e : 3 33/625 loss : tensor(6.7586, device='cuda:0', grad_fn=<NllLossBackward>)
e : 3 34/625 loss : tensor(6.8002, device='cuda:0', grad_fn=<NllLossBackward>)
e : 3 35/625 loss : tensor(6.9773, device='cuda:0', 

e : 3 127/625 loss : tensor(7.0409, device='cuda:0', grad_fn=<NllLossBackward>)
e : 3 128/625 loss : tensor(6.8490, device='cuda:0', grad_fn=<NllLossBackward>)
e : 3 129/625 loss : tensor(7.1948, device='cuda:0', grad_fn=<NllLossBackward>)
e : 3 130/625 loss : tensor(6.5625, device='cuda:0', grad_fn=<NllLossBackward>)
e : 3 131/625 loss : tensor(7.1155, device='cuda:0', grad_fn=<NllLossBackward>)
e : 3 132/625 loss : tensor(6.6142, device='cuda:0', grad_fn=<NllLossBackward>)
e : 3 133/625 loss : tensor(6.6627, device='cuda:0', grad_fn=<NllLossBackward>)
e : 3 134/625 loss : tensor(6.5393, device='cuda:0', grad_fn=<NllLossBackward>)
e : 3 135/625 loss : tensor(6.9275, device='cuda:0', grad_fn=<NllLossBackward>)
e : 3 136/625 loss : tensor(7.0320, device='cuda:0', grad_fn=<NllLossBackward>)
e : 3 137/625 loss : tensor(6.3669, device='cuda:0', grad_fn=<NllLossBackward>)
e : 3 138/625 loss : tensor(7.1534, device='cuda:0', grad_fn=<NllLossBackward>)
e : 3 139/625 loss : tensor(6.6030, devi

e : 3 228/625 loss : tensor(6.7916, device='cuda:0', grad_fn=<NllLossBackward>)
e : 3 229/625 loss : tensor(7.0606, device='cuda:0', grad_fn=<NllLossBackward>)
e : 3 230/625 loss : tensor(6.9090, device='cuda:0', grad_fn=<NllLossBackward>)
e : 3 231/625 loss : tensor(6.3336, device='cuda:0', grad_fn=<NllLossBackward>)
e : 3 232/625 loss : tensor(6.7325, device='cuda:0', grad_fn=<NllLossBackward>)
e : 3 233/625 loss : tensor(6.5269, device='cuda:0', grad_fn=<NllLossBackward>)
e : 3 234/625 loss : tensor(7.2391, device='cuda:0', grad_fn=<NllLossBackward>)
e : 3 235/625 loss : tensor(7.2781, device='cuda:0', grad_fn=<NllLossBackward>)
e : 3 236/625 loss : tensor(7.1114, device='cuda:0', grad_fn=<NllLossBackward>)
e : 3 237/625 loss : tensor(6.9806, device='cuda:0', grad_fn=<NllLossBackward>)
e : 3 238/625 loss : tensor(6.8997, device='cuda:0', grad_fn=<NllLossBackward>)
e : 3 239/625 loss : tensor(6.9634, device='cuda:0', grad_fn=<NllLossBackward>)
e : 3 240/625 loss : tensor(6.6642, devi

e : 3 329/625 loss : tensor(6.7082, device='cuda:0', grad_fn=<NllLossBackward>)
e : 3 330/625 loss : tensor(7.0061, device='cuda:0', grad_fn=<NllLossBackward>)
e : 3 331/625 loss : tensor(6.6082, device='cuda:0', grad_fn=<NllLossBackward>)
e : 3 332/625 loss : tensor(6.9078, device='cuda:0', grad_fn=<NllLossBackward>)
e : 3 333/625 loss : tensor(6.7248, device='cuda:0', grad_fn=<NllLossBackward>)
e : 3 334/625 loss : tensor(6.6528, device='cuda:0', grad_fn=<NllLossBackward>)
e : 3 335/625 loss : tensor(7.0876, device='cuda:0', grad_fn=<NllLossBackward>)
e : 3 336/625 loss : tensor(6.7146, device='cuda:0', grad_fn=<NllLossBackward>)
e : 3 337/625 loss : tensor(6.9251, device='cuda:0', grad_fn=<NllLossBackward>)
e : 3 338/625 loss : tensor(6.3667, device='cuda:0', grad_fn=<NllLossBackward>)
e : 3 339/625 loss : tensor(6.8637, device='cuda:0', grad_fn=<NllLossBackward>)
e : 3 340/625 loss : tensor(6.9354, device='cuda:0', grad_fn=<NllLossBackward>)
e : 3 341/625 loss : tensor(6.5863, devi

e : 3 426/625 loss : tensor(6.7326, device='cuda:0', grad_fn=<NllLossBackward>)
e : 3 427/625 loss : tensor(6.4070, device='cuda:0', grad_fn=<NllLossBackward>)
e : 3 428/625 loss : tensor(6.7218, device='cuda:0', grad_fn=<NllLossBackward>)
e : 3 429/625 loss : tensor(7.0126, device='cuda:0', grad_fn=<NllLossBackward>)
e : 3 430/625 loss : tensor(6.7226, device='cuda:0', grad_fn=<NllLossBackward>)
e : 3 431/625 loss : tensor(6.6419, device='cuda:0', grad_fn=<NllLossBackward>)
e : 3 432/625 loss : tensor(6.9699, device='cuda:0', grad_fn=<NllLossBackward>)
e : 3 433/625 loss : tensor(7.4059, device='cuda:0', grad_fn=<NllLossBackward>)
e : 3 434/625 loss : tensor(6.8947, device='cuda:0', grad_fn=<NllLossBackward>)
e : 3 435/625 loss : tensor(6.9585, device='cuda:0', grad_fn=<NllLossBackward>)
e : 3 436/625 loss : tensor(7.1301, device='cuda:0', grad_fn=<NllLossBackward>)
e : 3 437/625 loss : tensor(6.8943, device='cuda:0', grad_fn=<NllLossBackward>)
e : 3 438/625 loss : tensor(6.6931, devi

e : 4 30/625 loss : tensor(6.7614, device='cuda:0', grad_fn=<NllLossBackward>)
e : 4 31/625 loss : tensor(6.9963, device='cuda:0', grad_fn=<NllLossBackward>)
e : 4 32/625 loss : tensor(7.0675, device='cuda:0', grad_fn=<NllLossBackward>)
e : 4 33/625 loss : tensor(6.6310, device='cuda:0', grad_fn=<NllLossBackward>)
e : 4 34/625 loss : tensor(6.7151, device='cuda:0', grad_fn=<NllLossBackward>)
e : 4 35/625 loss : tensor(6.7451, device='cuda:0', grad_fn=<NllLossBackward>)
e : 4 36/625 loss : tensor(7.1476, device='cuda:0', grad_fn=<NllLossBackward>)
e : 4 37/625 loss : tensor(6.7531, device='cuda:0', grad_fn=<NllLossBackward>)
e : 4 38/625 loss : tensor(7.2929, device='cuda:0', grad_fn=<NllLossBackward>)
e : 4 39/625 loss : tensor(7.1619, device='cuda:0', grad_fn=<NllLossBackward>)
e : 4 40/625 loss : tensor(7.4402, device='cuda:0', grad_fn=<NllLossBackward>)
e : 4 41/625 loss : tensor(7.1180, device='cuda:0', grad_fn=<NllLossBackward>)
e : 4 42/625 loss : tensor(6.9271, device='cuda:0', 

e : 4 130/625 loss : tensor(6.5467, device='cuda:0', grad_fn=<NllLossBackward>)
e : 4 131/625 loss : tensor(6.9681, device='cuda:0', grad_fn=<NllLossBackward>)
e : 4 132/625 loss : tensor(7.0499, device='cuda:0', grad_fn=<NllLossBackward>)
e : 4 133/625 loss : tensor(6.8583, device='cuda:0', grad_fn=<NllLossBackward>)
e : 4 134/625 loss : tensor(7.2291, device='cuda:0', grad_fn=<NllLossBackward>)
e : 4 135/625 loss : tensor(6.6435, device='cuda:0', grad_fn=<NllLossBackward>)
e : 4 136/625 loss : tensor(7.0897, device='cuda:0', grad_fn=<NllLossBackward>)
e : 4 137/625 loss : tensor(7.0418, device='cuda:0', grad_fn=<NllLossBackward>)
e : 4 138/625 loss : tensor(6.9909, device='cuda:0', grad_fn=<NllLossBackward>)
e : 4 139/625 loss : tensor(6.8604, device='cuda:0', grad_fn=<NllLossBackward>)
e : 4 140/625 loss : tensor(7.4128, device='cuda:0', grad_fn=<NllLossBackward>)
e : 4 141/625 loss : tensor(7.3078, device='cuda:0', grad_fn=<NllLossBackward>)
e : 4 142/625 loss : tensor(6.9488, devi

e : 4 233/625 loss : tensor(7.0583, device='cuda:0', grad_fn=<NllLossBackward>)
e : 4 234/625 loss : tensor(7.1648, device='cuda:0', grad_fn=<NllLossBackward>)
e : 4 235/625 loss : tensor(6.9150, device='cuda:0', grad_fn=<NllLossBackward>)
e : 4 236/625 loss : tensor(6.7480, device='cuda:0', grad_fn=<NllLossBackward>)
e : 4 237/625 loss : tensor(6.5896, device='cuda:0', grad_fn=<NllLossBackward>)
e : 4 238/625 loss : tensor(7.0469, device='cuda:0', grad_fn=<NllLossBackward>)
e : 4 239/625 loss : tensor(7.3577, device='cuda:0', grad_fn=<NllLossBackward>)
e : 4 240/625 loss : tensor(7.0544, device='cuda:0', grad_fn=<NllLossBackward>)
e : 4 241/625 loss : tensor(6.8878, device='cuda:0', grad_fn=<NllLossBackward>)
e : 4 242/625 loss : tensor(6.9193, device='cuda:0', grad_fn=<NllLossBackward>)
e : 4 243/625 loss : tensor(6.9994, device='cuda:0', grad_fn=<NllLossBackward>)
e : 4 244/625 loss : tensor(7.3259, device='cuda:0', grad_fn=<NllLossBackward>)
e : 4 245/625 loss : tensor(7.2603, devi

e : 4 331/625 loss : tensor(6.8156, device='cuda:0', grad_fn=<NllLossBackward>)
e : 4 332/625 loss : tensor(7.3461, device='cuda:0', grad_fn=<NllLossBackward>)
e : 4 333/625 loss : tensor(6.9499, device='cuda:0', grad_fn=<NllLossBackward>)
e : 4 334/625 loss : tensor(6.6775, device='cuda:0', grad_fn=<NllLossBackward>)
e : 4 335/625 loss : tensor(7.3900, device='cuda:0', grad_fn=<NllLossBackward>)
e : 4 336/625 loss : tensor(7.0873, device='cuda:0', grad_fn=<NllLossBackward>)
e : 4 337/625 loss : tensor(7.0009, device='cuda:0', grad_fn=<NllLossBackward>)
e : 4 338/625 loss : tensor(6.6240, device='cuda:0', grad_fn=<NllLossBackward>)
e : 4 339/625 loss : tensor(6.8151, device='cuda:0', grad_fn=<NllLossBackward>)
e : 4 340/625 loss : tensor(6.8451, device='cuda:0', grad_fn=<NllLossBackward>)
e : 4 341/625 loss : tensor(6.7660, device='cuda:0', grad_fn=<NllLossBackward>)
e : 4 342/625 loss : tensor(6.9286, device='cuda:0', grad_fn=<NllLossBackward>)
e : 4 343/625 loss : tensor(7.0601, devi

e : 4 430/625 loss : tensor(6.2408, device='cuda:0', grad_fn=<NllLossBackward>)
e : 4 431/625 loss : tensor(6.6656, device='cuda:0', grad_fn=<NllLossBackward>)
e : 4 432/625 loss : tensor(6.9829, device='cuda:0', grad_fn=<NllLossBackward>)
e : 4 433/625 loss : tensor(6.4516, device='cuda:0', grad_fn=<NllLossBackward>)
e : 4 434/625 loss : tensor(6.6115, device='cuda:0', grad_fn=<NllLossBackward>)
e : 4 435/625 loss : tensor(7.1332, device='cuda:0', grad_fn=<NllLossBackward>)
e : 4 436/625 loss : tensor(6.7829, device='cuda:0', grad_fn=<NllLossBackward>)
e : 4 437/625 loss : tensor(6.7868, device='cuda:0', grad_fn=<NllLossBackward>)
e : 4 438/625 loss : tensor(6.3748, device='cuda:0', grad_fn=<NllLossBackward>)
e : 4 439/625 loss : tensor(7.3323, device='cuda:0', grad_fn=<NllLossBackward>)
e : 4 440/625 loss : tensor(6.6279, device='cuda:0', grad_fn=<NllLossBackward>)
e : 4 441/625 loss : tensor(6.6482, device='cuda:0', grad_fn=<NllLossBackward>)
e : 4 442/625 loss : tensor(6.9714, devi

e : 5 38/625 loss : tensor(7.1508, device='cuda:0', grad_fn=<NllLossBackward>)
e : 5 39/625 loss : tensor(7.0545, device='cuda:0', grad_fn=<NllLossBackward>)
e : 5 40/625 loss : tensor(6.6904, device='cuda:0', grad_fn=<NllLossBackward>)
e : 5 41/625 loss : tensor(7.1424, device='cuda:0', grad_fn=<NllLossBackward>)
e : 5 42/625 loss : tensor(6.6092, device='cuda:0', grad_fn=<NllLossBackward>)
e : 5 43/625 loss : tensor(6.8716, device='cuda:0', grad_fn=<NllLossBackward>)
e : 5 44/625 loss : tensor(6.9475, device='cuda:0', grad_fn=<NllLossBackward>)
e : 5 45/625 loss : tensor(6.8815, device='cuda:0', grad_fn=<NllLossBackward>)
e : 5 46/625 loss : tensor(7.1763, device='cuda:0', grad_fn=<NllLossBackward>)
e : 5 47/625 loss : tensor(6.4397, device='cuda:0', grad_fn=<NllLossBackward>)
e : 5 48/625 loss : tensor(6.9375, device='cuda:0', grad_fn=<NllLossBackward>)
e : 5 49/625 loss : tensor(6.7986, device='cuda:0', grad_fn=<NllLossBackward>)
e : 5 50/625 loss : tensor(6.6371, device='cuda:0', 

e : 5 138/625 loss : tensor(6.6478, device='cuda:0', grad_fn=<NllLossBackward>)
e : 5 139/625 loss : tensor(6.6217, device='cuda:0', grad_fn=<NllLossBackward>)
e : 5 140/625 loss : tensor(6.8796, device='cuda:0', grad_fn=<NllLossBackward>)
e : 5 141/625 loss : tensor(7.2172, device='cuda:0', grad_fn=<NllLossBackward>)
e : 5 142/625 loss : tensor(7.0280, device='cuda:0', grad_fn=<NllLossBackward>)
e : 5 143/625 loss : tensor(6.8155, device='cuda:0', grad_fn=<NllLossBackward>)
e : 5 144/625 loss : tensor(6.6707, device='cuda:0', grad_fn=<NllLossBackward>)
e : 5 145/625 loss : tensor(6.8230, device='cuda:0', grad_fn=<NllLossBackward>)
e : 5 146/625 loss : tensor(6.6454, device='cuda:0', grad_fn=<NllLossBackward>)
e : 5 147/625 loss : tensor(6.6818, device='cuda:0', grad_fn=<NllLossBackward>)
e : 5 148/625 loss : tensor(7.0070, device='cuda:0', grad_fn=<NllLossBackward>)
e : 5 149/625 loss : tensor(6.8752, device='cuda:0', grad_fn=<NllLossBackward>)
e : 5 150/625 loss : tensor(6.7464, devi

e : 5 239/625 loss : tensor(6.8934, device='cuda:0', grad_fn=<NllLossBackward>)
e : 5 240/625 loss : tensor(6.7519, device='cuda:0', grad_fn=<NllLossBackward>)
e : 5 241/625 loss : tensor(6.4923, device='cuda:0', grad_fn=<NllLossBackward>)
e : 5 242/625 loss : tensor(6.8189, device='cuda:0', grad_fn=<NllLossBackward>)
e : 5 243/625 loss : tensor(6.9918, device='cuda:0', grad_fn=<NllLossBackward>)
e : 5 244/625 loss : tensor(7.0374, device='cuda:0', grad_fn=<NllLossBackward>)
e : 5 245/625 loss : tensor(6.7327, device='cuda:0', grad_fn=<NllLossBackward>)
e : 5 246/625 loss : tensor(6.6089, device='cuda:0', grad_fn=<NllLossBackward>)
e : 5 247/625 loss : tensor(6.9075, device='cuda:0', grad_fn=<NllLossBackward>)
e : 5 248/625 loss : tensor(6.9567, device='cuda:0', grad_fn=<NllLossBackward>)
e : 5 249/625 loss : tensor(6.6951, device='cuda:0', grad_fn=<NllLossBackward>)
e : 5 250/625 loss : tensor(7.4271, device='cuda:0', grad_fn=<NllLossBackward>)
e : 5 251/625 loss : tensor(7.0482, devi

In [66]:
# torch.save(g_model, "./model_test2")

  "type " + obj.__name__ + ". It won't be checked "
  "type " + obj.__name__ + ". It won't be checked "
  "type " + obj.__name__ + ". It won't be checked "
  "type " + obj.__name__ + ". It won't be checked "
  "type " + obj.__name__ + ". It won't be checked "
  "type " + obj.__name__ + ". It won't be checked "
  "type " + obj.__name__ + ". It won't be checked "


In [45]:
test_data = DataLoader(v_set, shuffle=True, batch_size=10, collate_fn=create_mini_batch)

In [48]:
TRAIN_DIALOG = False
for input, target ,mask_Q ,mask_A in test_data:
    optimizer.zero_grad()

    target = target.view(target.shape[0],target.shape[1])
    input = input.view(input.shape[0],input.shape[1])
    mask_Q = mask_Q.view(mask_Q.shape[0],mask_Q.shape[1])
    mask_A = mask_A.view(mask_A.shape[0],mask_A.shape[1])

    output =  g_model(input, target, mask_Q, mask_A,0.01)
    
    print_index_tensor(input)
    print_index_tensor(target)
    print_chat(output)
    
    break

target: [UNK] 有 洩 漏 過 誰 的 個 資 嗎 ? [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] 
target: 小 時 候 不 念 書 長 大 當 記 者 沒 法 度 [PAD] [PAD] [PAD] [PAD] 
chatbot: 的 的 的 的 不 的 不 的 不 一 的 的 不 的 不 不 的 的 
