In [2]:
import pandas as pd
import numpy as np
import re
import sentencepiece as spm

In [3]:
# read train data
train_data = pd.read_csv('./dataset/ChatbotData.csv')
train_data.head()

Unnamed: 0,Q,A,label
0,12시 땡!,하루가 또 가네요.,0
1,1지망 학교 떨어졌어,위로해 드립니다.,0
2,3박4일 놀러가고 싶다,여행은 언제나 좋죠.,0
3,3박4일 정도 놀러가고 싶다,여행은 언제나 좋죠.,0
4,PPL 심하네,눈살이 찌푸려지죠.,0


In [4]:
# collecting questions and answers
questions = []

for sentence in train_data['Q']:
    sentence = re.sub(r"([?.!,])", r" \1 ", sentence)
    sentence = sentence.strip()
    questions.append(sentence)

answers = []

for sentence in train_data['A']:
    sentence = re.sub(r"([?.!,])", r" \1 ", sentence)
    sentence = sentence.strip()
    answers.append(sentence)

In [5]:
# store sentence in all.txt
with open('all.txt', 'w', encoding="utf8") as f:
    f.write('\n'.join(questions))
    f.write('\n'.join(answers))

In [6]:
# train sentencepiece with all.txt
corpus = "all.txt"
prefix = 'chatbot'
vocab_size = 8000
spm.SentencePieceTrainer.train(
    f"--input={corpus} --model_prefix={prefix} --vocab_size={vocab_size + 7}" + 
    " --model_type=bpe" +
    " --max_sentence_length=999999" + # 문장 최대 길이
    " --pad_id=0 --pad_piece=[PAD]" + # pad (0)
    " --unk_id=1 --unk_piece=[UNK]" + # unknown (1)
    " --bos_id=2 --bos_piece=[BOS]" + # begin of sequence (2)
    " --eos_id=3 --eos_piece=[EOS]" + # end of sequence (3)
    " --user_defined_symbols=[SEP],[CLS],[MASK]") # 사용자 정의 토큰

sentencepiece_trainer.cc(177) LOG(INFO) Running command: --input=all.txt --model_prefix=chatbot --vocab_size=8007 --model_type=bpe --max_sentence_length=999999 --pad_id=0 --pad_piece=[PAD] --unk_id=1 --unk_piece=[UNK] --bos_id=2 --bos_piece=[BOS] --eos_id=3 --eos_piece=[EOS] --user_defined_symbols=[SEP],[CLS],[MASK]
sentencepiece_trainer.cc(77) LOG(INFO) Starts training with : 
trainer_spec {
  input: all.txt
  input_format: 
  model_prefix: chatbot
  model_type: BPE
  vocab_size: 8007
  self_test_sample_size: 0
  character_coverage: 0.9995
  input_sentence_size: 0
  shuffle_input_sentence: 1
  seed_sentencepiece_size: 1000000
  shrinking_factor: 0.75
  max_sentence_length: 999999
  num_threads: 16
  num_sub_iterations: 2
  max_sentencepiece_length: 16
  split_by_unicode_script: 1
  split_by_number: 1
  split_by_whitespace: 1
  split_digits: 0
  treat_whitespace_as_suffix: 0
  user_defined_symbols: [SEP]
  user_defined_symbols: [CLS]
  user_defined_symbols: [MASK]
  required_chars: 
  

In [7]:
# test
vocab_file = "chatbot.model"
vocab = spm.SentencePieceProcessor()
vocab.Load(vocab_file)
line = "안녕하세요 만나서 반갑습니다."
pieces = vocab.EncodeAsPieces(line)
ibs = vocab.EncodeAsIds(line)

print(line)
print(pieces)
print(ibs)

안녕하세요 만나서 반갑습니다.
['▁안녕하세요', '▁만나서', '▁반갑', '습니다', '.']
[4626, 1930, 4849, 154, 6927]


In [8]:
MAX_LENGTH = 40

START_TOKEN = [2]
END_TOKEN = [3]

def tokenizeAndFilter(inputs, outputs):
    tokenized_inputs, tokenized_outputs = [], []

    for (sentence1, sentence2) in zip(inputs, outputs):
            zeros1 = np.zeros(MAX_LENGTH, dtype=int)
    zeros2 = np.zeros(MAX_LENGTH, dtype=int)
    sentence1 = START_TOKEN + vocab.encode_as_ids(sentence1) + END_TOKEN
    zeros1[:len(sentence1)] = sentence1[:MAX_LENGTH]

    sentence2 = START_TOKEN + vocab.encode_as_ids(sentence2) + END_TOKEN
    zeros2[:len(sentence2)] = sentence2[:MAX_LENGTH]

    tokenized_inputs.append(zeros1)
    tokenized_outputs.append(zeros2)
    
    return tokenized_inputs, tokenized_outputs

In [9]:
questions_encode, answers_encode = tokenizeAndFilter(questions, answers)
print(questions_encode[0])
print(answers_encode[0])

[   2 2477  218 4321    3    0    0    0    0    0    0    0    0    0
    0    0    0    0    0    0    0    0    0    0    0    0    0    0
    0    0    0    0    0    0    0    0    0    0    0    0]
[   2  198 7121 7119 3069  395 1477  636    7    3    0    0    0    0
    0    0    0    0    0    0    0    0    0    0    0    0    0    0
    0    0    0    0    0    0    0    0    0    0    0    0]


In [10]:
# dataset
from torch.utils.data import Dataset, DataLoader

class SquenceDataset(Dataset):
    def __init__(self, questions, answers):
        questions = np.array(questions)
        answers = np.array(answers)
        self.inputs = questions
        self.dec_inputs = answers[:, :-1]
        self.output = answers[:, 1:]
        self.length = len(questions)

    def __getitem__(self, idx):
        return (self.inputs[idx], self.dec_inputs[idx], self.output[idx])
    
    def __len__(self):
        return self.length
    
BATCH_SIZE = 64
dataset = SquenceDataset(questions_encode, answers_encode)
dataloader = DataLoader(dataset, shuffle=True, batch_size=BATCH_SIZE)    

  from .autonotebook import tqdm as notebook_tqdm


In [11]:
#model
from torch.nn import Transformer
from torch import nn
import torch
import math

class TFModel(nn.Module):
    #ntoken: vocab의 size
    #ninp: embedding할 차원의 크기
    #nhead: num head
    #nhid: feedforward의 차원
    #nlayers: layer의 개수
    def __init__(self, ntoken, ninp, nhead, nhid, nlayers, dropout=0.5):
        super(TFModel, self).__init__()
        self.transformer = Transformer(ninp, nhead, dim_feedforward=nhid, num_encoder_layers=nlayers, num_decoder_layers=nlayers,dropout=dropout)
        self.pos_encoder = PositionalEncoding(ninp, dropout)
        self.encoder = nn.Embedding(ntoken, ninp)

        self.pos_encoder_d = PositionalEncoding(ninp, dropout)
        self.encoder_d = nn.Embedding(ntoken, ninp)

        self.ninp = ninp
        self.ntoken = ntoken

        self.linear = nn.Linear(ninp, ntoken)
        self.init_weights()

    def generate_square_subsequent_mask(self, sz):
        mask = (torch.triu(torch.ones(sz, sz)) == 1).transpose(0, 1)
        mask = mask.float().masked_fill(mask == 0, float('-inf')).masked_fill(mask == 1, float(0.0))
        return mask

    def init_weights(self):
        initrange = 0.1
        self.encoder.weight.data.uniform_(-initrange, initrange)

    def forward(self, src, tgt, srcmask, tgtmask, srcpadmask, tgtpadmask):
        src = self.encoder(src) * math.sqrt(self.ninp)
        src = self.pos_encoder(src)

        tgt = self.encoder_d(tgt) * math.sqrt(self.ninp)
        tgt = self.pos_encoder_d(tgt)


        output = self.transformer(src.transpose(0,1), tgt.transpose(0,1), srcmask, tgtmask, src_key_padding_mask=srcpadmask, tgt_key_padding_mask=tgtpadmask)
        output = self.linear(output)
        return output

class PositionalEncoding(nn.Module):
    def __init__(self, d_model, dropout=0.1, max_len=5000):
        super(PositionalEncoding, self).__init__()
        self.dropout = nn.Dropout(p=dropout)

        pe = torch.zeros(max_len, d_model)
        position = torch.arange(0, max_len, dtype=torch.float).unsqueeze(1)
        div_term = torch.exp(torch.arange(0, d_model, 2).float() * (-math.log(10000.0) / d_model))
        pe[:, 0::2] = torch.sin(position * div_term)
        pe[:, 1::2] = torch.cos(position * div_term)
        pe = pe.unsqueeze(0).transpose(0, 1)
        self.register_buffer('pe', pe)

    def forward(self, x):
        x = x + self.pe[:x.size(0), :]
        return self.dropout(x)

def gen_attention_mask(x):
    mask = torch.eq(x, 0)
    return mask

In [12]:
device = torch.device('cpu')

lr = 1e-4
model = TFModel(vocab_size+7, 256, 8, 512, 2, 0.2).to(device)
criterion = nn.CrossEntropyLoss()
optimizer = torch.optim.Adam(model.parameters(), lr=lr)

In [14]:
# train and save model
epoch = 50
from tqdm import tqdm
import torch

FILE = "KoGPT.pht"

model.train()
for i in range(epoch):
    batch_loss = 0.0
    progress = tqdm(dataloader)

    for (inputs, dec_inputs, outputs) in progress:
        optimizer.zero_grad()

        src_mask = model.generate_square_subsequent_mask(MAX_LENGTH).to(device)
        src_padding_mask = gen_attention_mask(inputs).to(device)
        tgt_mask = model.generate_square_subsequent_mask(MAX_LENGTH - 1).to(device)
        tgt_padding_mask = gen_attention_mask(dec_inputs).to(device)

        result = model(inputs.to(device), dec_inputs.to(device), src_mask, tgt_mask, src_padding_mask, tgt_padding_mask)
        loss = criterion(result.permute(1, 2, 0), outputs.to(device).long())

        progress.set_description("{:0.3f}".format(loss))
        loss.backward()
        optimizer.step()
        batch_loss += loss

    print(f'epoch: {i+1} | loss: {batch_loss.cpu().item() / len(dataloader)}')

# save for resume
torch.save(model.state_dict(), FILE)

2.086: 100%|██████████| 1/1 [00:00<00:00,  4.56it/s]


epoch: 1 | loss: 2.0855815410614014


2.056: 100%|██████████| 1/1 [00:00<00:00,  9.82it/s]


epoch: 2 | loss: 2.055856466293335


2.020: 100%|██████████| 1/1 [00:00<00:00, 12.48it/s]


epoch: 3 | loss: 2.020354747772217


2.067: 100%|██████████| 1/1 [00:00<00:00, 11.96it/s]


epoch: 4 | loss: 2.0669915676116943


2.003: 100%|██████████| 1/1 [00:00<00:00,  9.64it/s]


epoch: 5 | loss: 2.00283145904541


2.002: 100%|██████████| 1/1 [00:00<00:00, 11.09it/s]


epoch: 6 | loss: 2.002164602279663


1.959: 100%|██████████| 1/1 [00:00<00:00,  9.16it/s]


epoch: 7 | loss: 1.9586443901062012


1.978: 100%|██████████| 1/1 [00:00<00:00, 13.01it/s]


epoch: 8 | loss: 1.9780303239822388


1.946: 100%|██████████| 1/1 [00:00<00:00, 12.61it/s]


epoch: 9 | loss: 1.9457825422286987


1.912: 100%|██████████| 1/1 [00:00<00:00, 11.60it/s]


epoch: 10 | loss: 1.911641001701355


1.914: 100%|██████████| 1/1 [00:00<00:00, 13.20it/s]


epoch: 11 | loss: 1.914300560951233


1.902: 100%|██████████| 1/1 [00:00<00:00, 12.73it/s]


epoch: 12 | loss: 1.9020342826843262


1.886: 100%|██████████| 1/1 [00:00<00:00, 12.78it/s]


epoch: 13 | loss: 1.885556936264038


1.904: 100%|██████████| 1/1 [00:00<00:00, 12.54it/s]


epoch: 14 | loss: 1.9042072296142578


1.837: 100%|██████████| 1/1 [00:00<00:00, 13.63it/s]


epoch: 15 | loss: 1.8371268510818481


1.840: 100%|██████████| 1/1 [00:00<00:00, 13.69it/s]


epoch: 16 | loss: 1.8397167921066284


1.796: 100%|██████████| 1/1 [00:00<00:00, 12.98it/s]


epoch: 17 | loss: 1.795773983001709


1.775: 100%|██████████| 1/1 [00:00<00:00, 13.64it/s]


epoch: 18 | loss: 1.7753276824951172


1.835: 100%|██████████| 1/1 [00:00<00:00, 13.67it/s]


epoch: 19 | loss: 1.8350836038589478


1.778: 100%|██████████| 1/1 [00:00<00:00,  9.74it/s]


epoch: 20 | loss: 1.777685523033142


1.729: 100%|██████████| 1/1 [00:00<00:00, 13.37it/s]


epoch: 21 | loss: 1.728629231452942


1.717: 100%|██████████| 1/1 [00:00<00:00, 11.35it/s]


epoch: 22 | loss: 1.7173019647598267


1.701: 100%|██████████| 1/1 [00:00<00:00, 13.74it/s]


epoch: 23 | loss: 1.7013330459594727


1.622: 100%|██████████| 1/1 [00:00<00:00, 13.40it/s]


epoch: 24 | loss: 1.6222760677337646


1.634: 100%|██████████| 1/1 [00:00<00:00, 13.66it/s]


epoch: 25 | loss: 1.6339713335037231


1.673: 100%|██████████| 1/1 [00:00<00:00, 12.83it/s]


epoch: 26 | loss: 1.6726434230804443


1.666: 100%|██████████| 1/1 [00:00<00:00, 13.24it/s]


epoch: 27 | loss: 1.6662427186965942


1.652: 100%|██████████| 1/1 [00:00<00:00, 13.62it/s]


epoch: 28 | loss: 1.65191650390625


1.605: 100%|██████████| 1/1 [00:00<00:00, 12.94it/s]


epoch: 29 | loss: 1.6052638292312622


1.554: 100%|██████████| 1/1 [00:00<00:00, 13.77it/s]


epoch: 30 | loss: 1.5542080402374268


1.547: 100%|██████████| 1/1 [00:00<00:00, 12.91it/s]


epoch: 31 | loss: 1.5466265678405762


1.541: 100%|██████████| 1/1 [00:00<00:00, 13.28it/s]


epoch: 32 | loss: 1.5413155555725098


1.511: 100%|██████████| 1/1 [00:00<00:00,  9.43it/s]


epoch: 33 | loss: 1.510892391204834


1.539: 100%|██████████| 1/1 [00:00<00:00, 13.65it/s]


epoch: 34 | loss: 1.539451241493225


1.471: 100%|██████████| 1/1 [00:00<00:00, 13.32it/s]


epoch: 35 | loss: 1.470510482788086


1.450: 100%|██████████| 1/1 [00:00<00:00, 12.93it/s]


epoch: 36 | loss: 1.450482726097107


1.415: 100%|██████████| 1/1 [00:00<00:00, 13.56it/s]


epoch: 37 | loss: 1.4150453805923462


1.426: 100%|██████████| 1/1 [00:00<00:00, 13.04it/s]


epoch: 38 | loss: 1.4259854555130005


1.387: 100%|██████████| 1/1 [00:00<00:00, 13.19it/s]


epoch: 39 | loss: 1.3868860006332397


1.401: 100%|██████████| 1/1 [00:00<00:00, 13.32it/s]


epoch: 40 | loss: 1.401314377784729


1.358: 100%|██████████| 1/1 [00:00<00:00, 13.31it/s]


epoch: 41 | loss: 1.3583272695541382


1.360: 100%|██████████| 1/1 [00:00<00:00, 12.89it/s]


epoch: 42 | loss: 1.3596508502960205


1.321: 100%|██████████| 1/1 [00:00<00:00, 13.13it/s]


epoch: 43 | loss: 1.3205617666244507


1.293: 100%|██████████| 1/1 [00:00<00:00, 12.63it/s]


epoch: 44 | loss: 1.2928578853607178


1.264: 100%|██████████| 1/1 [00:00<00:00, 12.62it/s]


epoch: 45 | loss: 1.2644387483596802


1.249: 100%|██████████| 1/1 [00:00<00:00, 10.03it/s]


epoch: 46 | loss: 1.2491650581359863


1.238: 100%|██████████| 1/1 [00:00<00:00, 13.28it/s]


epoch: 47 | loss: 1.238361120223999


1.264: 100%|██████████| 1/1 [00:00<00:00, 13.12it/s]


epoch: 48 | loss: 1.2641711235046387


1.265: 100%|██████████| 1/1 [00:00<00:00, 13.25it/s]


epoch: 49 | loss: 1.264575481414795


1.255: 100%|██████████| 1/1 [00:00<00:00, 13.69it/s]

epoch: 50 | loss: 1.254701018333435



