<a href="https://colab.research.google.com/github/CAVASOL/aiffel_quest/blob/main/Exploration_quest/exploration_5/xp5_recap_pytorch.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

## A Korean-Language Chatbot

**Creator: Yeon Kim**

**Goal**

* Recap `xp5_project.ipynb` with `PyTorch`
* Transformer chatbot model implementation

**Index**

    Set up
    Transformer Model
    Import Dataset
    Data field settings
    Train Transformer model
    Run Transformer chatbot for real sentence

### Set up

In [None]:
from __future__ import absolute_import
from __future__ import division
from __future__ import print_function
from __future__ import unicode_literals

import torch
from torch.jit import script, trace
import torch.nn as nn
from torch import optim
import torch.nn.functional as F
import csv
import random
import re
import os
import unicodedata
import codecs
from io import open
import itertools
import math
import json


# Pick the compute device once for the whole notebook: CUDA when PyTorch
# reports a usable GPU, otherwise the CPU.
USE_CUDA = torch.cuda.is_available()
device = torch.device("cuda") if USE_CUDA else torch.device("cpu")

### Transformer Model

In [None]:
class PositionalEncoder(nn.Module):
    """Scale embeddings and add a fixed sinusoidal positional encoding.

    The table holds ``sin(pos / 10000 ** (i / d_model))`` in every even
    column ``i`` and the matching cosine in column ``i + 1``.

    Args:
        position: Number of positions (rows) to precompute; inputs longer
            than this cannot be encoded.
        d_model: Width of the embeddings the encoding is added to.
    """

    def __init__(self, position, d_model):
        super().__init__()

        self.d_model = d_model

        positions = torch.arange(position, dtype=torch.float32).unsqueeze(1)
        # ``even_dims`` already enumerates 0, 2, 4, ... so it is used as the
        # exponent numerator directly (multiplying it by 2 again would
        # double every frequency exponent).
        even_dims = torch.arange(0, d_model, 2, dtype=torch.float32)
        angles = positions / torch.pow(10000.0, even_dims / d_model)

        pe = torch.zeros(position, d_model)
        pe[:, 0::2] = torch.sin(angles)
        # For an odd d_model there is one fewer odd column than even ones.
        pe[:, 1::2] = torch.cos(angles[:, : d_model // 2])

        # A buffer follows the module through .to()/.cuda() and never takes
        # gradients; persistent=False keeps it out of state_dict so saved
        # checkpoints keep the same set of keys.
        self.register_buffer("pe", pe.unsqueeze(0), persistent=False)

    def forward(self, x):
        """Return ``sqrt(d_model) * x`` plus the encoding for ``x.size(1)`` positions."""
        return math.sqrt(self.d_model) * x + self.pe[:, : x.size(1)]

In [None]:
def scaled_dot_product_attention(query, key, value, mask):
    """Compute softmax(Q K^T / sqrt(depth)) V.

    The last two axes of each tensor are treated as (sequence, depth); any
    leading axes are batched, so both 3-D and 4-D inputs are accepted.

    Args:
        query: Tensor whose last two axes are (query_len, depth).
        key: Tensor whose last two axes are (key_len, depth).
        value: Tensor whose last two axes are (key_len, value_depth).
        mask: ``None`` or a tensor broadcastable against the attention
            logits; entries equal to 1 are pushed to -1e9 before the
            softmax so they receive (almost) no weight.

    Returns:
        Tuple ``(output, attention_weights)``.
    """
    depth = key.shape[-1]
    logits = torch.matmul(query, key.transpose(-2, -1)) / math.sqrt(depth)

    if mask is not None:
        # Out-of-place add: also works when the mask broadcasts the logits
        # up to a larger shape, which an in-place += cannot do.
        logits = logits + mask * -1e9

    attention_weights = F.softmax(logits, dim=-1)
    output = torch.matmul(attention_weights, value)

    return output, attention_weights

In [None]:
class MultiheadAttention(nn.Module):
    """Multi-head attention built on ``scaled_dot_product_attention``.

    ``d_model`` must be divisible by ``num_heads``; each head works on a
    slice of width ``d_model / num_heads``.
    """

    def __init__(self, d_model, num_heads):
        super().__init__()
        self.d_model = d_model
        self.num_heads = num_heads

        assert d_model % self.num_heads == 0

        self.depth = int(d_model / self.num_heads)
        # Layers are created in the order q, v, k, out so that seeded
        # initialisation draws random numbers in the same sequence.
        self.q_linear = nn.Linear(d_model, d_model)
        self.v_linear = nn.Linear(d_model, d_model)
        self.k_linear = nn.Linear(d_model, d_model)
        self.out = nn.Linear(d_model, d_model)

    def split_heads(self, inputs, batch_size):
        """Reshape (batch, seq, d_model) into (batch, num_heads, seq, depth)."""
        per_head = inputs.reshape(batch_size, -1, self.num_heads, self.depth)
        return per_head.transpose(1, 2)

    def forward(self, inputs):
        """Attend using a dict with keys 'query', 'key', 'value' and 'mask'."""
        raw_query, raw_key, raw_value, mask = (
            inputs['query'], inputs['key'], inputs['value'], inputs['mask'])
        batch_size = raw_query.shape[0]

        # Project first, then carve each projection into heads.
        head_query = self.split_heads(self.q_linear(raw_query), batch_size)
        head_key = self.split_heads(self.k_linear(raw_key), batch_size)
        head_value = self.split_heads(self.v_linear(raw_value), batch_size)

        context, _ = scaled_dot_product_attention(head_query, head_key, head_value, mask)

        # Bring the head axis back next to depth and merge them into d_model.
        merged = context.transpose(1, 2).reshape(batch_size, -1, self.d_model)
        return self.out(merged)

In [None]:
class FeedForward(nn.Module):
    """Two-layer feed-forward network: Linear(d_model, d_ff) -> ReLU -> Linear(d_ff, d_model)."""

    def __init__(self, d_model, d_ff):
        super().__init__()
        self.linear_1 = nn.Linear(d_model, d_ff)
        self.linear_2 = nn.Linear(d_ff, d_model)

    def forward(self, attention):
        """Apply the expand / ReLU / project sequence to ``attention``."""
        hidden = torch.relu(self.linear_1(attention))
        return self.linear_2(hidden)

In [None]:
class EncoderBlock(nn.Module):
    """One encoder layer: self-attention and a feed-forward net, each
    followed by dropout, a residual connection and layer normalisation."""

    def __init__(self, d_ff, d_model, num_heads, dropout):
        super().__init__()

        self.attn = MultiheadAttention(d_model, num_heads)
        self.dropout_1 = nn.Dropout(dropout)
        self.norm_1 = nn.LayerNorm(d_model)
        self.ff = FeedForward(d_model, d_ff)
        self.dropout_2 = nn.Dropout(dropout)
        self.norm_2 = nn.LayerNorm(d_model)

    def forward(self, inputs, padding_mask):
        """Run the layer on ``inputs``, passing ``padding_mask`` to the attention."""
        self_attention_args = {
            'query': inputs,
            'key': inputs,
            'value': inputs,
            'mask': padding_mask,
        }
        # Sub-layer 1: self-attention + residual.
        attended = self.norm_1(inputs + self.dropout_1(self.attn(self_attention_args)))
        # Sub-layer 2: feed-forward + residual.
        transformed = self.dropout_2(self.ff(attended))
        return self.norm_2(attended + transformed)

In [None]:
class Encoder(nn.Module):
    """Token embedding + positional encoding followed by a stack of encoder layers.

    Args:
        text_embedding_vectors: Number of rows in the embedding table
            (passed to ``nn.Embedding`` as ``num_embeddings``).
        vocab_size: Number of positions handed to ``PositionalEncoder``;
            despite its name it bounds the sequence length, not the
            vocabulary.
        num_layers: Number of ``EncoderBlock`` layers in the stack.
        d_ff: Hidden width of each block's feed-forward network.
        d_model: Embedding / model width.
        num_heads: Attention heads per block.
        dropout: Dropout probability.
    """

    def __init__(self, text_embedding_vectors, vocab_size, num_layers, d_ff, d_model, num_heads, dropout):
        super().__init__()
        self.vocab_size = vocab_size
        self.d_model = d_model
        self.num_layers = num_layers
        self.embb = nn.Embedding(text_embedding_vectors, d_model)
        self.dropout_1 = nn.Dropout(dropout)
        self.PE = PositionalEncoder(vocab_size, d_model)
        # One independent block per layer. Re-applying a single block
        # num_layers times would tie every layer to the same weights.
        # NOTE: this changes the state_dict keys, so checkpoints written by
        # the single-block version cannot be loaded into this module.
        self.encoder_blocks = nn.ModuleList(
            [EncoderBlock(d_ff, d_model, num_heads, dropout) for _ in range(num_layers)]
        )

    def forward(self, x, padding_mask):
        """Encode token ids ``x``; ``padding_mask`` is forwarded to every block."""
        # PositionalEncoder.forward already multiplies by sqrt(d_model), so
        # the embeddings are not scaled a second time here.
        output = self.dropout_1(self.PE(self.embb(x)))

        for block in self.encoder_blocks:
            output = block(output, padding_mask)

        return output

In [None]:
class DecoderBlock(nn.Module):
    """One decoder layer: masked self-attention, attention over the encoder
    output, and a feed-forward net. Each sub-layer is followed by dropout,
    a residual connection and layer normalisation."""

    def __init__(self, d_ff, d_model, num_heads, dropout):
        super().__init__()

        self.attn = MultiheadAttention(d_model, num_heads)
        self.attn_2 = MultiheadAttention(d_model, num_heads)
        self.dropout_1 = nn.Dropout(dropout)
        self.norm_1 = nn.LayerNorm(d_model)
        self.ff = FeedForward(d_model, d_ff)
        self.dropout_2 = nn.Dropout(dropout)
        self.dropout_3 = nn.Dropout(dropout)
        self.norm_2 = nn.LayerNorm(d_model)
        self.norm_3 = nn.LayerNorm(d_model)

    def forward(self, inputs, enc_outputs, padding_mask, look_ahead_mask):
        """Decode one layer.

        Args:
            inputs: Decoder-side activations.
            enc_outputs: Encoder output used as key and value of the
                second attention.
            padding_mask: Mask for the attention over ``enc_outputs``.
            look_ahead_mask: Mask for the decoder self-attention.
        """
        # Sub-layer 1: self-attention. Dropout is applied here as in the
        # other sub-layers (it was previously skipped, leaving dropout_2
        # unused).
        attention1 = self.attn({'query': inputs, 'key': inputs, 'value': inputs, 'mask': look_ahead_mask})
        attention1 = self.dropout_1(attention1)
        attention1 = self.norm_1(inputs + attention1)

        # Sub-layer 2: attention over the encoder output.
        attention2 = self.attn_2({'query': attention1, 'key': enc_outputs, 'value': enc_outputs, 'mask': padding_mask})
        attention2 = self.dropout_2(attention2)
        attention2 = self.norm_2(attention1 + attention2)

        # Sub-layer 3: position-wise feed-forward network.
        outputs = self.ff(attention2)
        outputs = self.dropout_3(outputs)
        outputs = self.norm_3(attention2 + outputs)

        return outputs

In [None]:
class Decoder(nn.Module):
    """Token embedding + positional encoding followed by a stack of decoder layers.

    Args:
        text_embedding_vectors: Number of rows in the embedding table
            (passed to ``nn.Embedding`` as ``num_embeddings``).
        vocab_size: Number of positions handed to ``PositionalEncoder``;
            despite its name it bounds the sequence length, not the
            vocabulary.
        num_layers: Number of ``DecoderBlock`` layers in the stack.
        d_ff: Hidden width of each block's feed-forward network.
        d_model: Embedding / model width.
        num_heads: Attention heads per block.
        dropout: Dropout probability.
    """

    def __init__(self, text_embedding_vectors, vocab_size, num_layers, d_ff, d_model, num_heads, dropout):
        super().__init__()
        self.vocab_size = vocab_size
        self.d_model = d_model
        self.num_layers = num_layers
        self.embb = nn.Embedding(text_embedding_vectors, d_model)
        self.dropout_1 = nn.Dropout(dropout)
        self.PE = PositionalEncoder(vocab_size, d_model)
        # One independent block per layer. Re-applying a single block
        # num_layers times would tie every layer to the same weights.
        # NOTE: this changes the state_dict keys, so checkpoints written by
        # the single-block version cannot be loaded into this module.
        self.decoder_blocks = nn.ModuleList(
            [DecoderBlock(d_ff, d_model, num_heads, dropout) for _ in range(num_layers)]
        )

    def forward(self, enc_output, dec_input, padding_mask, look_ahead_mask):
        """Decode token ids ``dec_input`` against ``enc_output``."""
        # PositionalEncoder.forward already multiplies by sqrt(d_model), so
        # the embeddings are not scaled a second time here.
        output = self.dropout_1(self.PE(self.embb(dec_input)))

        for block in self.decoder_blocks:
            output = block(output, enc_output, padding_mask, look_ahead_mask)

        return output

In [None]:
class transformer(nn.Module):
    """Encoder-decoder model that returns un-normalised scores over the
    ``text_embedding_vectors`` output classes for every decoder position."""

    def __init__(self, text_embedding_vectors, vocab_size, num_layers, d_ff, d_model, num_heads, dropout):
        super().__init__()
        self.vocab_size = vocab_size

        shared_args = (text_embedding_vectors, vocab_size, num_layers, d_ff, d_model, num_heads, dropout)
        self.enc_outputs = Encoder(*shared_args)
        self.dec_outputs = Decoder(*shared_args)
        self.output = nn.Linear(d_model, text_embedding_vectors)
        # Registered but not applied in forward(); the raw linear output is returned.
        self.softmax = nn.LogSoftmax(dim=-1)

    def forward(self, input, dec_input):
        """Encode ``input``, decode ``dec_input`` against it, and project.

        Both padding masks are derived from the encoder input: one for the
        encoder's self-attention and one for the decoder's attention over
        the encoder output.
        """
        source_mask = create_padding_mask(input)
        memory_mask = create_padding_mask(input)
        causal_mask = create_look_ahead_mask(dec_input)

        memory = self.enc_outputs(input, source_mask)
        decoded = self.dec_outputs(memory, dec_input, memory_mask, causal_mask)
        return self.output(decoded)

### Import Dataset

In [None]:
pip install soynlp

Note: you may need to restart the kernel to use updated packages.


In [None]:
import re
import os
import pandas as pd
import urllib.request
from torchtext import data, datasets
import time

In [None]:
# Load the question/answer corpus into a DataFrame.
# NOTE(review): the file name contains a space before ".csv" — presumably
# that is how the file is named on disk; confirm before "fixing" it.
train_data = pd.read_csv('/aiffel/data/ChatbotData .csv')

# Quick sanity check: (number of rows, number of columns).
print("Data shape:", train_data.shape)

Data shape: (11823, 3)


In [None]:
# Preview the first five rows of the corpus.
display(train_data.head(5))

Unnamed: 0,Q,A,label
0,12시 땡!,하루가 또 가네요.,0
1,1지망 학교 떨어졌어,위로해 드립니다.,0
2,3박4일 놀러가고 싶다,여행은 언제나 좋죠.,0
3,3박4일 정도 놀러가고 싶다,여행은 언제나 좋죠.,0
4,PPL 심하네,눈살이 찌푸려지죠.,0


In [None]:
from soynlp.tokenizer import LTokenizer

# Default-constructed tokenizer; the same object is later handed to both
# torchtext fields as their `tokenize` callable.
tokenizer = LTokenizer()

In [None]:
# Smoke-test the tokenizer on a sample Korean sentence.
tokenizer("내일 역 앞의 식당에서 밥 먹으러 나갈래 ?")

['내일', '역', '앞의', '식당에서', '밥', '먹으러', '나갈래', '?']

In [None]:
# Despite the name this is a sequence length, not a vocabulary size: it is
# used as `fix_length` for both fields and as the number of positions of
# the positional encoding (via the model's `vocab_size` argument).
VOCAB_SIZE = 40

### Data field settings

In [None]:
pip install torchtext==0.8.1

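Note: starting with torchtext 0.9 the classes listed below moved to `torchtext.legacy`, so `torchtext.data.Field` raises an `AttributeError` on newer versions. Pinning `torchtext==0.8.1` (above) keeps the original import paths working. Reference: [AttributeError: module 'torchtext.data' has no attribute 'Field'](https://itsourcecode.com/attributeerror/attributeerror-module-torchtext-data-has-no-attribute-field-solved/)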

torchtext.data.Pipeline -> torchtext.legacy.data.Pipeline  
torchtext.data.Batch -> torchtext.legacy.data.Batch  
torchtext.data.Example -> torchtext.legacy.data.Example  
torchtext.data.Field -> torchtext.legacy.data.Field  
torchtext.data.Iterator -> torchtext.legacy.data.Iterator  
torchtext.data.Dataset -> torchtext.legacy.data.Dataset  

In [None]:
from torchtext.data import Field

# Field for the question column. Sentences are tokenized with the soynlp
# tokenizer, lower-cased, wrapped in <SOS>/<EOS> and padded or truncated to
# a fixed length of VOCAB_SIZE tokens; batches come out batch-first.
Q = data.Field(
    sequential=True,
    use_vocab=True,
    lower=True,
    tokenize=tokenizer,
    batch_first=True,
    init_token="<SOS>",
    eos_token="<EOS>",
    fix_length=VOCAB_SIZE
)

# Field for the answer column, configured identically to Q.
A = data.Field(
    sequential=True,
    use_vocab=True,
    lower=True,
    tokenize=tokenizer,
    batch_first=True,
    init_token="<SOS>",
    eos_token="<EOS>",
    fix_length=VOCAB_SIZE
)

In [None]:
# TabularDataset reads from a file path, not from an in-memory DataFrame,
# so it is pointed at the same CSV that was loaded with pandas.
# skip_header=True keeps the "Q,A,label" header row out of the examples,
# and ('label', None) tells torchtext to ignore the third column.
trainset = data.TabularDataset(
        path='/aiffel/data/ChatbotData .csv', format='csv', skip_header=True,
        fields=[('Q', Q), ('A', A), ('label', None)])

In [None]:
# Inspect one tokenized example. This must index the torchtext dataset:
# `train_data` is the pandas DataFrame, where `train_data[2]` is a column
# lookup and raises KeyError.
print(vars(trainset[2]))

In [None]:
# Number of rows in the raw DataFrame.
print(f'Number of sample for train_data : {len(train_data)}')

In [None]:
# Build a single vocabulary from both the question and answer columns,
# keeping only tokens that occur at least twice (min_freq=2).
Q.build_vocab(trainset.Q, trainset.A, min_freq = 2)
# Share that vocabulary with the answer field so both sides use the same ids.
A.vocab = Q.vocab

In [None]:
# Integer ids of the special tokens in the shared vocabulary.
PAD_TOKEN = Q.vocab.stoi['<pad>']
START_TOKEN = Q.vocab.stoi['<SOS>']
END_TOKEN = Q.vocab.stoi['<EOS>']
UNK_TOKEN = Q.vocab.stoi['<unk>']

In [None]:
# Model / training hyper-parameters. VOCAB_SIZE (the fixed sequence length)
# is defined in an earlier cell and reused unchanged.
text_embedding_vectors = len(Q.vocab)  # number of distinct token ids
NUM_LAYERS = 4
D_FF = 512
D_MODEL = 128
NUM_HEADS = 4
DROPOUT = 0.3
BATCH_SIZE = 64

device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

# Batch the torchtext dataset. Each batch exposes the numericalized columns
# as `batch.Q` and `batch.A`. A torch DataLoader over the raw DataFrame
# cannot be used here: it would yield neither tensors nor those attributes.
train_iter = data.BucketIterator(
        trainset, batch_size=BATCH_SIZE,
        shuffle=True, repeat=False, sort=False, device=device)

# Alias kept for code that still refers to the iterator as `train_loader`.
train_loader = train_iter

In [None]:
# Show the vocabulary size that sizes the embedding table and output layer.
print(text_embedding_vectors)
# Instantiate the model that will be trained.
net = transformer(text_embedding_vectors = text_embedding_vectors,
                  vocab_size=VOCAB_SIZE, num_layers=NUM_LAYERS, d_ff=D_FF, d_model=D_MODEL,
                  num_heads=NUM_HEADS, dropout=DROPOUT)

def weights_init(m):
    """Re-initialise ``m`` if its class name contains 'Linear'.

    Matching modules get Kaiming-normal weights and, when they have a bias,
    a bias of zero. Any other module is left untouched. Intended to be used
    through ``Module.apply``.
    """
    if 'Linear' not in type(m).__name__:
        return

    nn.init.kaiming_normal_(m.weight)
    if m.bias is not None:
        nn.init.constant_(m.bias, 0.0)

# Switch to training mode (enables dropout).
net.train()

# Apply weights_init to every sub-module of the network.
net.apply(weights_init)

print("Completed")

In [None]:
# Targets are padded out to the fixed sequence length, so padded positions
# are excluded from the loss; otherwise the average would be dominated by
# the trivial "predict <pad>" positions.
criterion = nn.CrossEntropyLoss(ignore_index=PAD_TOKEN)

learning_rate = 2e-4
optimizer = torch.optim.Adam(net.parameters(), lr=learning_rate)

In [None]:
def create_padding_mask(x, pad_idx=None):
    """Return a float mask that is 1.0 wherever ``x`` holds the padding id.

    Args:
        x: Tensor of token ids of shape (batch, seq_len).
        pad_idx: Id of the padding token. Defaults to the notebook-level
            ``PAD_TOKEN`` looked up from the vocabulary. A literal 0 would
            be wrong here because the torchtext vocabulary places ``<unk>``
            at 0 and ``<pad>`` at 1.

    Returns:
        Tensor of shape (batch, 1, 1, seq_len), broadcastable against
        attention logits.
    """
    if pad_idx is None:
        pad_idx = PAD_TOKEN
    return (x == pad_idx).float().unsqueeze(1).unsqueeze(1)

In [None]:
def create_look_ahead_mask(x):
    """Return a mask that hides future positions and padding in ``x``.

    The strictly-upper-triangular (seq_len, seq_len) matrix marks, for each
    position, all later positions. It is combined element-wise (maximum)
    with the padding mask of ``x``, so an entry is 1 when it is either in
    the future or padding.

    Args:
        x: Tensor of token ids of shape (batch, seq_len).
    """
    seq_len = x.shape[1]
    # Build the mask on the input's own device rather than on a globally
    # chosen one, so CPU tensors work even when a GPU is present.
    future_positions = torch.triu(
        torch.ones(seq_len, seq_len, device=x.device), diagonal=1)

    padding_mask = create_padding_mask(x).to(x.device)
    return torch.maximum(future_positions, padding_mask)

### Train Transformer model

In [None]:
from IPython.display import clear_output
import datetime

def _train_one_epoch(net, batches, criterion, optimizer, device):
    """Run one optimisation pass over ``batches``; return (summed loss, batch count)."""
    total_loss = 0.0
    batch_count = 0

    for batch in batches:
        # Attribute names follow the dataset's field names ('Q' and 'A').
        questions = batch.Q.to(device)
        answers = batch.A.to(device)

        optimizer.zero_grad()
        preds = net(questions, answers)

        # Target = decoder input shifted left by one token, with a padding
        # column appended so the length stays the same.
        pad = answers.new_full((answers.size(0), 1), PAD_TOKEN)
        targets = torch.cat((answers[:, 1:], pad), -1)

        # Swap the last two axes so the class scores sit on axis 1, which
        # is where nn.CrossEntropyLoss (the criterion used in this
        # notebook) expects them.
        loss = criterion(torch.transpose(preds, 1, 2), targets)
        loss.backward()
        torch.nn.utils.clip_grad_norm_(net.parameters(), 0.5)
        optimizer.step()

        total_loss += loss.item()
        batch_count += 1

    return total_loss, batch_count


def _plot_loss_curve(epochs, losses):
    """Draw the average training loss per epoch."""
    # Imported here because the notebook never imports pyplot at top level.
    import matplotlib.pyplot as plt

    fig = plt.figure(figsize=(8, 8))
    fig.set_facecolor('white')
    ax = fig.add_subplot()

    ax.plot(epochs, losses, label='Average loss')
    ax.legend()
    ax.set_xlabel('epoch')
    ax.set_ylabel('loss')

    plt.show()


def train_model(net, train_iter, criterion, optimizer, num_epochs):
    """Train ``net`` and checkpoint the weights of the best epoch.

    After every epoch whose average loss improves on the best so far, the
    model's state_dict is written to ``./snapshot/transformermodel.pt``.
    A loss curve is plotted and the elapsed time printed at the end.

    Args:
        net: Model called as ``net(questions, answers)``.
        train_iter: Iterable of batches exposing ``.Q`` and ``.A`` tensors.
        criterion: Loss called as ``criterion(scores, targets)``.
        optimizer: Optimizer over ``net``'s parameters.
        num_epochs: Number of passes over ``train_iter``.

    Raises:
        ValueError: If ``train_iter`` yields no batches.
    """
    start_time = time.time()

    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    print("Device:", device)
    print('-----start-------')
    net.to(device)

    torch.backends.cudnn.benchmark = True

    net.train()

    epoch_ = []
    epoch_train_loss = []
    best_epoch_loss = float("inf")

    for epoch in range(num_epochs):
        total_loss, batch_count = _train_one_epoch(
            net, train_iter, criterion, optimizer, device)
        if batch_count == 0:
            raise ValueError("train_iter produced no batches")

        epoch_loss = total_loss / batch_count
        if epoch_loss < best_epoch_loss:
            os.makedirs("snapshot", exist_ok=True)
            torch.save(net.state_dict(), './snapshot/transformermodel.pt')
            best_epoch_loss = epoch_loss

        epoch_.append(epoch)
        epoch_train_loss.append(epoch_loss)
        print('Epoch {0}/{1} Average Loss: {2}'.format(epoch + 1, num_epochs, epoch_loss))
        # wait=True defers the clear until the next output arrives, so the
        # line above stays visible while the next epoch runs.
        clear_output(wait=True)

    _plot_loss_curve(epoch_, epoch_train_loss)

    end_time = time.time() - start_time
    times = str(datetime.timedelta(seconds=end_time)).split(".")
    print('Finished in {0}'.format(times[0]))

In [None]:
# Launch training for a fixed number of epochs.
num_epochs = 100
train_model(net, train_iter, criterion, optimizer, num_epochs=num_epochs)

In [None]:
# Rebuild the architecture and restore the best checkpoint for inference.
net_trained = transformer(text_embedding_vectors = text_embedding_vectors, vocab_size=VOCAB_SIZE, num_layers=NUM_LAYERS, d_ff=D_FF, d_model=D_MODEL, num_heads=NUM_HEADS, dropout=DROPOUT).to(device)
# Inference mode: without this, dropout stays active and replies become random.
net_trained.eval()
# map_location lets a checkpoint saved on a GPU load on a CPU-only machine.
net_trained.load_state_dict(torch.load('./snapshot/transformermodel.pt', map_location=device))

### Run Transformer chatbot for real sentence

In [None]:
def stoi(vocab, token, max_len):
    """Convert a token list into a (1, max_len) LongTensor of vocabulary ids.

    Args:
        vocab: Sequence of vocabulary strings; a token's id is its position.
        token: List of token strings. It is not modified.
        max_len: Output length. Shorter inputs are padded with '<pad>';
            longer inputs are truncated.

    Returns:
        LongTensor of shape (1, max_len). Tokens missing from ``vocab``
        map to id 0.
    """
    # Build the lookup once instead of scanning the list for every token.
    # setdefault keeps the first position of a duplicate, like list.index.
    index_of = {}
    for position, entry in enumerate(vocab):
        index_of.setdefault(entry, position)

    # Work on a copy so the caller's list is left untouched.
    padded = list(token[:max_len])
    padded.extend(['<pad>'] * (max_len - len(padded)))

    indices = [index_of.get(piece, 0) for piece in padded]
    return torch.LongTensor(indices).unsqueeze(0)

def itos(vocab, indices):
    """Turn the first row of ``indices`` into a space-joined string.

    Reading stops at the first id equal to 1. Ids equal to PAD_TOKEN,
    START_TOKEN or END_TOKEN are skipped, UNK_TOKEN is rendered as '??',
    and every other id is looked up in ``vocab``.
    """
    words = []
    for idx in indices.cpu()[0].tolist():
        if idx == 1:
            break
        if idx in [PAD_TOKEN, START_TOKEN, END_TOKEN]:
            continue
        words.append('??' if idx == UNK_TOKEN else vocab[idx])
    return " ".join(words)

In [None]:
def evaluate(input_sentence):
    """Greedily decode a reply to ``input_sentence`` with ``net_trained``.

    The sentence is prepared the way the question field prepares training
    data: tokenized, lower-cased, wrapped in the field's init/eos tokens
    and padded to the fixed length.

    Returns:
        LongTensor of shape (1, n) holding the start token followed by the
        generated ids (the end token is not included).
    """
    max_len = VOCAB_SIZE  # fixed sequence length used throughout the notebook
    model_device = next(net_trained.parameters()).device

    # Leave room for the two wrapping tokens.
    tokens = [piece.lower() for piece in tokenizer(input_sentence)][:max_len - 2]
    tokens = [Q.init_token] + tokens + [Q.eos_token]
    encoder_input = stoi(Q.vocab.itos, tokens, max_len).to(model_device)

    output = torch.full((1, 1), START_TOKEN, dtype=torch.long, device=model_device)

    net_trained.eval()  # disable dropout while decoding
    with torch.no_grad():
        for _ in range(max_len):
            predictions = net_trained(encoder_input, output)
            predictions = predictions[:, -1:, :]
            # Only ids >= 3 are candidates, so ids 0-2 can never be emitted.
            # NOTE(review): assumes <unk>, <pad> and <SOS> occupy ids 0-2.
            predicted_id = torch.argmax(predictions[:, :, 3:], dim=-1) + 3
            if predicted_id.item() == END_TOKEN:
                break
            output = torch.cat((output, predicted_id), -1)
    return output

In [None]:
def predict(sentence):
    """Generate a reply for ``sentence``, print both, and return the reply text."""
    reply = itos(Q.vocab.itos, evaluate(sentence))
    print(f'input = [{sentence}]')
    print(f'output = [{reply}]')
    return reply

# Sample query, roughly: "Shall we watch a movie together tomorrow?"
out = predict('우리 내일 같이 영화 볼래?')

In [None]:
# Sample query, roughly: "That movie was really bad."
out = predict('그 영화 너무 별로더라')