In [1]:
import pandas as pd
import numpy as np
import re

In [2]:
train_data = pd.read_csv('ChatBotData.csv')
train_data.head()

Unnamed: 0,Q,A,label
0,12시 땡!,하루가 또 가네요.,0
1,1지망 학교 떨어졌어,위로해 드립니다.,0
2,3박4일 놀러가고 싶다,여행은 언제나 좋죠.,0
3,3박4일 정도 놀러가고 싶다,여행은 언제나 좋죠.,0
4,PPL 심하네,눈살이 찌푸려지죠.,0


In [3]:
questions = []
for sentence in train_data['Q']:
    sentence = re.sub(r"([?.!,])", r" \1 ", sentence)
    sentence = sentence.strip()
    questions.append(sentence)

In [4]:
answers = []
for sentence in train_data['A']:
    sentence = re.sub(r"([?.!,])", r" \1 ", sentence)
    sentence = sentence.strip()
    answers.append(sentence)

In [5]:
print(questions[:5])
print(answers[:5])

['12시 땡 !', '1지망 학교 떨어졌어', '3박4일 놀러가고 싶다', '3박4일 정도 놀러가고 싶다', 'PPL 심하네']
['하루가 또 가네요 .', '위로해 드립니다 .', '여행은 언제나 좋죠 .', '여행은 언제나 좋죠 .', '눈살이 찌푸려지죠 .']


In [6]:
import sentencepiece as spm

In [7]:
with open('all.txt', 'w', encoding='utf8') as f:
    f.write('\n'.join(questions))
    f.write('\n'.join(answers))

In [10]:
corpus = "all.txt"
prefix = "chatbot"
vocab_size = 16000
spm.SentencePieceTrainer.train(
    f"--input={corpus} --model_prefix={prefix} --vocab_size={vocab_size + 7}" + 
    " --model_type=bpe" +
    " --max_sentence_length=999999" + # 문장 최대 길이
    " --pad_id=0 --pad_piece=[PAD]" + # pad (0)
    " --unk_id=1 --unk_piece=[UNK]" + # unknown (1)
    " --bos_id=2 --bos_piece=[BOS]" + # begin of sequence (2)
    " --eos_id=3 --eos_piece=[EOS]" + # end of sequence (3)
    " --user_defined_symbols=[SEP],[CLS],[MASK]") # 사용자 정의 토큰

In [11]:
vocab_file = "chatbot.model"
vocab = spm.SentencePieceProcessor()
vocab.load(vocab_file)
line = "안녕하세요 만나서 반갑습니다"
pieces = vocab.encode_as_pieces(line)
ids = vocab.encode_as_ids(line)


print(line)
print(pieces)
print(ids)

안녕하세요 만나서 반갑습니다
['▁안녕하세요', '▁만나서', '▁반갑습니다']
[4626, 1930, 8499]


In [12]:
# 최대 길이를 40으로 정의
MAX_LENGTH = 40

START_TOKEN = [2]
END_TOKEN = [3]

# 토큰화 / 정수 인코딩 / 시작 토큰과 종료 토큰 추가 / 패딩
def tokenize_and_filter(inputs, outputs):
  tokenized_inputs, tokenized_outputs = [], []

  for (sentence1, sentence2) in zip(inputs, outputs):
    # encode(토큰화 + 정수 인코딩), 시작 토큰과 종료 토큰 추가
    zeros1 = np.zeros(MAX_LENGTH, dtype=int)
    zeros2 = np.zeros(MAX_LENGTH, dtype=int)
    sentence1 = START_TOKEN + vocab.encode_as_ids(sentence1) + END_TOKEN
    zeros1[:len(sentence1)] = sentence1[:MAX_LENGTH]

    sentence2 = START_TOKEN + vocab.encode_as_ids(sentence2) + END_TOKEN
    zeros2[:len(sentence2)] = sentence2[:MAX_LENGTH]

    tokenized_inputs.append(zeros1)
    tokenized_outputs.append(zeros2)
  return tokenized_inputs, tokenized_outputs

In [13]:
questions_encode, answers_encode = tokenize_and_filter(questions, answers)

In [14]:
print(questions_encode[0])
print(answers_encode[0])

[    2  5566 14968  3210   111     3     0     0     0     0     0     0
     0     0     0     0     0     0     0     0     0     0     0     0
     0     0     0     0     0     0     0     0     0     0     0     0
     0     0     0     0]
[   2 5192  217 5936    7    3    0    0    0    0    0    0    0    0
    0    0    0    0    0    0    0    0    0    0    0    0    0    0
    0    0    0    0    0    0    0    0    0    0    0    0]


In [15]:
from torch.utils.data import Dataset, DataLoader

class SequenceDataset(Dataset):
    def __init__(self, questions, answers):
        questions = np.array(questions)
        answers = np.array(answers)
        self.inputs = questions
        self.dec_inputs = answers[:,:-1]
        self.outputs = answers[:,1:]
        self.length = len(questions)
    
    def __getitem__(self,idx):
        return (self.inputs[idx], self.dec_inputs[idx], self.outputs[idx])

    def __len__(self):
        return self.length

BATCH_SIZE = 64
dataset = SequenceDataset(questions_encode, answers_encode)
dataloader = DataLoader(dataset, shuffle=True, batch_size=BATCH_SIZE)

In [27]:
from torch.nn import Transformer
from torch import nn
import torch
import math

class TFModel(nn.Module):
    def __init__(self, ntoken, ninp, nhead, nhid, nlayers, dropout=0.5):
        super(TFModel, self).__init__()
        self.transformer = Transformer(ninp, nhead, dim_feedforward=nhid, num_encoder_layers=nlayers, num_decoder_layers=nlayers,dropout=dropout)
        self.pos_encoder = PositionalEncoding(ninp, dropout)
        self.encoder = nn.Embedding(ntoken, ninp)

        self.pos_encoder_d = PositionalEncoding(ninp, dropout)
        self.encoder_d = nn.Embedding(ntoken, ninp)

        self.ninp = ninp
        self.ntoken = ntoken

        self.linear = nn.Linear(ninp, ntoken)
        self.init_weights()

    def generate_square_subsequent_mask(self, sz):
        mask = (torch.triu(torch.ones(sz, sz)) == 1).transpose(0, 1)
        mask = mask.float().masked_fill(mask == 0, float('-inf')).masked_fill(mask == 1, float(0.0))
        return mask

    def init_weights(self):
        initrange = 0.1
        self.encoder.weight.data.uniform_(-initrange, initrange)

    def forward(self, src, tgt, srcmask, tgtmask, srcpadmask, tgtpadmask):
        src = self.encoder(src) * math.sqrt(self.ninp)
        src = self.pos_encoder(src)

        tgt = self.encoder_d(tgt) * math.sqrt(self.ninp)
        tgt = self.pos_encoder_d(tgt)


        output = self.transformer(src.transpose(0,1), tgt.transpose(0,1), srcmask, tgtmask, src_key_padding_mask=srcpadmask, tgt_key_padding_mask=tgtpadmask)
        output = self.linear(output)
        return output

class PositionalEncoding(nn.Module):
    def __init__(self, d_model, dropout=0.1, max_len=5000):
        super(PositionalEncoding, self).__init__()
        self.dropout = nn.Dropout(p=dropout)

        pe = torch.zeros(max_len, d_model)
        position = torch.arange(0, max_len, dtype=torch.float).unsqueeze(1)
        div_term = torch.exp(torch.arange(0, d_model, 2).float() * (-math.log(10000.0) / d_model))
        pe[:, 0::2] = torch.sin(position * div_term)
        pe[:, 1::2] = torch.cos(position * div_term)
        pe = pe.unsqueeze(0).transpose(0, 1)
        self.register_buffer('pe', pe)

    def forward(self, x):
        x = x + self.pe[:x.size(0), :]
        return self.dropout(x)

def gen_attention_mask(x):
    mask = torch.eq(x, 0)
    return mask

In [478]:
device = torch.device("cuda", index=1)

lr = 1e-4
model = TFModel(vocab_size+7, 256, 8, 512, 2, 0.1).to(device)
criterion = nn.CrossEntropyLoss()
optimizer = torch.optim.Adam(model.parameters(), lr=lr)


In [479]:
epoch = 30
from tqdm import tqdm

model.train()
for i in range(epoch):
    batchloss = 0.0
    progress = tqdm(dataloader)
    for (inputs, dec_inputs, outputs) in progress:
        optimizer.zero_grad()
        src_mask = model.generate_square_subsequent_mask(MAX_LENGTH).to(device)
        src_padding_mask = gen_attention_mask(inputs).to(device)
        tgt_mask = model.generate_square_subsequent_mask(MAX_LENGTH-1).to(device)
        tgt_padding_mask = gen_attention_mask(dec_inputs).to(device)

        result = model(inputs.to(device), dec_inputs.to(device), src_mask, tgt_mask, src_padding_mask,tgt_padding_mask)
        loss = criterion(result.permute(1,2,0), outputs.to(device).long())
        progress.set_description("{:0.3f}".format(loss))
        loss.backward()
        optimizer.step()
        batchloss += loss
    print("epoch:",i+1,"|","loss:",batchloss.cpu().item() / len(dataloader))

0.959: 100%|██████████| 185/185 [00:09<00:00, 20.16it/s]


epoch: 1 | loss: 1.8828087059227196


0.942: 100%|██████████| 185/185 [00:08<00:00, 20.82it/s]


epoch: 2 | loss: 0.9739796406513935


0.923: 100%|██████████| 185/185 [00:08<00:00, 22.55it/s]


epoch: 3 | loss: 0.9115835653769003


0.912: 100%|██████████| 185/185 [00:08<00:00, 21.44it/s]


epoch: 4 | loss: 0.8829720986855997


0.959: 100%|██████████| 185/185 [00:08<00:00, 21.49it/s]


epoch: 5 | loss: 0.861990686365076


0.868: 100%|██████████| 185/185 [00:08<00:00, 20.68it/s]


epoch: 6 | loss: 0.8425723514041386


0.904: 100%|██████████| 185/185 [00:08<00:00, 22.06it/s]


epoch: 7 | loss: 0.8240846376161317


0.839: 100%|██████████| 185/185 [00:08<00:00, 21.13it/s]


epoch: 8 | loss: 0.8053272144214527


0.861: 100%|██████████| 185/185 [00:08<00:00, 21.16it/s]


epoch: 9 | loss: 0.7861398232949747


0.793: 100%|██████████| 185/185 [00:08<00:00, 21.97it/s]


epoch: 10 | loss: 0.7659728179106842


0.676: 100%|██████████| 185/185 [00:08<00:00, 21.00it/s]


epoch: 11 | loss: 0.7451116613439611


0.636: 100%|██████████| 185/185 [00:08<00:00, 21.84it/s]


epoch: 12 | loss: 0.7248309986011402


0.743: 100%|██████████| 185/185 [00:09<00:00, 19.96it/s]


epoch: 13 | loss: 0.7038217080605996


0.596: 100%|██████████| 185/185 [00:08<00:00, 20.68it/s]


epoch: 14 | loss: 0.6829455813846073


0.667: 100%|██████████| 185/185 [00:08<00:00, 21.80it/s]


epoch: 15 | loss: 0.6613200007258235


0.633: 100%|██████████| 185/185 [00:08<00:00, 21.33it/s]


epoch: 16 | loss: 0.6399022695180532


0.615: 100%|██████████| 185/185 [00:08<00:00, 21.90it/s]


epoch: 17 | loss: 0.6175418750659839


0.601: 100%|██████████| 185/185 [00:08<00:00, 22.43it/s]


epoch: 18 | loss: 0.5956226245777027


0.630: 100%|██████████| 185/185 [00:09<00:00, 19.72it/s]


epoch: 19 | loss: 0.5732904382654138


0.496: 100%|██████████| 185/185 [00:08<00:00, 21.69it/s]


epoch: 20 | loss: 0.5513939110008446


0.507: 100%|██████████| 185/185 [00:08<00:00, 21.30it/s]


epoch: 21 | loss: 0.5286682541305955


0.480: 100%|██████████| 185/185 [00:08<00:00, 21.46it/s]


epoch: 22 | loss: 0.5066682970201647


0.460: 100%|██████████| 185/185 [00:08<00:00, 21.40it/s]


epoch: 23 | loss: 0.48444976806640627


0.428: 100%|██████████| 185/185 [00:08<00:00, 22.04it/s]


epoch: 24 | loss: 0.4617009755727407


0.467: 100%|██████████| 185/185 [00:08<00:00, 22.28it/s]


epoch: 25 | loss: 0.43948953989389783


0.500: 100%|██████████| 185/185 [00:08<00:00, 21.45it/s]


epoch: 26 | loss: 0.4179861945074958


0.331: 100%|██████████| 185/185 [00:08<00:00, 22.72it/s]


epoch: 27 | loss: 0.39681738776129644


0.342: 100%|██████████| 185/185 [00:08<00:00, 20.57it/s]


epoch: 28 | loss: 0.37577456912478885


0.333: 100%|██████████| 185/185 [00:08<00:00, 21.42it/s]


epoch: 29 | loss: 0.35584098197318415


0.345: 100%|██████████| 185/185 [00:08<00:00, 21.59it/s]

epoch: 30 | loss: 0.33466512319203967





In [352]:
torch.save(model.state_dict(), "chatbot.pth")

In [390]:
def preprocess_sentence(sentence):
    sentence = re.sub(r"([?.!,])", r" \1 ", sentence)
    sentence = sentence.strip()
    return sentence

def evaluate(sentence):
    sentence = preprocess_sentence(sentence)
    input = torch.tensor([START_TOKEN + vocab.encode_as_ids(sentence) + END_TOKEN]).to(device)
    output = torch.tensor([START_TOKEN]).to(device)

    # 디코더의 예측 시작
    model.eval()
    for i in range(MAX_LENGTH):
        src_mask = model.generate_square_subsequent_mask(input.shape[1]).to(device)
        tgt_mask = model.generate_square_subsequent_mask(output.shape[1]).to(device)

        src_padding_mask = gen_attention_mask(input).to(device)
        tgt_padding_mask = gen_attention_mask(output).to(device)

        predictions = model(input, output, src_mask, tgt_mask, src_padding_mask, tgt_padding_mask).transpose(0,1)
        # 현재(마지막) 시점의 예측 단어를 받아온다.
        predictions = predictions[:, -1:, :]
        predicted_id = torch.LongTensor(torch.argmax(predictions.cpu(), axis=-1))


        # 만약 마지막 시점의 예측 단어가 종료 토큰이라면 예측을 중단
        if torch.equal(predicted_id[0][0], torch.tensor(END_TOKEN[0])):
            break

        # 마지막 시점의 예측 단어를 출력에 연결한다.
        # 이는 for문을 통해서 디코더의 입력으로 사용될 예정이다.
        output = torch.cat([output, predicted_id.to(device)], axis=1)

    return torch.squeeze(output, axis=0).cpu().numpy()

def predict(sentence):
    prediction = evaluate(sentence)
    predicted_sentence = vocab.Decode(list(map(int,[i for i in prediction if i < vocab_size+7])))

    print('Input: {}'.format(sentence))
    print('Output: {}'.format(predicted_sentence))

    return predicted_sentence

In [499]:
model.load_state_dict(torch.load("chatbot.pth"))
result = predict("난 뭘 해야 할까?")

Input: 난 뭘 해야 할까?
Output: 가장 중요한 것 같아요 .
