# Chat with U
> seq2seq 모델과 TensorFlow로 제작한 Chatbot 튜토리얼입니다.

### Dataset Download
Chatbot 학습에 필요한 Dataset을 불러옵니다.
- [songys/Chatbot_data](https://github.com/songys/Chatbot_data.git)
- 문답 페어 11,876개
- `Q`: 질문
- `A`: 답변

In [None]:
!git clone https://github.com/songys/Chatbot_data.git

In [None]:
import pandas as pd
corpus = pd.read_csv('/content/Chatbot_data/ChatbotData.csv')

In [None]:
corpus.head()

In [None]:
corpus.Q.head()

In [None]:
corpus.A.head()

In [None]:
# Convert the question and answer columns of the dataset into plain lists
q_list = list(corpus.Q)
a_list = list(corpus.A)

In [None]:
# Keep only the first 1,000 pairs to stay within the RAM limit
q_list = q_list[:1000]
a_list = a_list[:1000]

In [None]:
q_list[:5]

### Preprocess
형태소 분석
- KoNLPy의 Okt 분석기를 사용합니다.
   

토큰 추가
- `SOS`: Start Of Sentence
- `EOS`: End Of Sentence

In [None]:
!pip install konlpy

In [None]:
from konlpy.tag import Okt
okt = Okt()

In [None]:
sentence = "오늘은 먹고싶은게 딱히 없지만, 딱새우는 먹고싶어요"
okt.morphs(sentence)

In [None]:
def process_morph(sentence):
    """Split `sentence` into morphemes with Okt and join them with single spaces."""
    morphemes = okt.morphs(sentence)
    return ' '.join(morphemes)

In [None]:
def morph_and_token(sentence, is_question=True):
    """Morph-split `sentence`, handling questions and answers differently.

    Args:
        sentence: Raw sentence text.
        is_question: When truthy, only the morph-split sentence is returned.

    Returns:
        For a question, the morph-split sentence as a string.
        For an answer, a `(decoder_input, decoder_output)` tuple where the
        input is prefixed with `<SOS>` and the output is suffixed with `<EOS>`.
    """
    tokens = process_morph(sentence)
    if not is_question:
        # Answers are emitted twice: once as decoder input, once as decoder target.
        return ('<SOS> ' + tokens, tokens + ' <EOS>')
    return tokens

In [None]:
def preprocess(q_list, a_list):
    """Preprocess raw question/answer lists into encoder and decoder text.

    Args:
        q_list: Iterable of raw question sentences.
        a_list: Iterable of raw answer sentences.

    Returns:
        A `(questions, answer_input, answer_output)` tuple of lists, where
        `answer_input` holds the `<SOS>`-prefixed answers and `answer_output`
        holds the `<EOS>`-suffixed answers.
    """
    questions = [morph_and_token(q, is_question=True) for q in q_list]

    # Each answer yields a (decoder input, decoder output) pair.
    answer_pairs = [morph_and_token(a, is_question=False) for a in a_list]
    answer_input = [decoder_in for decoder_in, _ in answer_pairs]
    answer_output = [decoder_out for _, decoder_out in answer_pairs]

    return questions, answer_input, answer_output

### Dataset Split
Encoder, Decoder의 관점으로 Dataset을 재구성합니다.
- `questions`: Encoder input  
- `answer_input`: Decoder input  
- `answer_output`: Decoder output

In [None]:
questions, answer_input, answer_output = preprocess(q_list, a_list)

In [None]:
questions[:5]

In [None]:
answer_input[:5]

In [None]:
answer_output[:5]

In [None]:
# Concatenate the questions and both answer variants; used to build the vocab
all_sentences = questions + answer_input + answer_output

### Tokenization
- Vocab을 만들어줍니다.
- Text를 Sequence로 Encoding합니다.
- Padding으로 문장의 길이를 일정하게 맞춰줍니다.

In [None]:
import tensorflow as tf
import numpy as np

from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences

In [None]:
# Define the tokenizer options.
# filters='' disables character filtering and lower=False disables lowercasing,
# so tokens are kept exactly as written (presumably so that '<SOS>'/'<EOS>'
# survive intact, since Keras' default filters include '<' and '>' — confirm).
# oov_token='<OOV>' is the placeholder used for out-of-vocabulary words.
tokenizer = Tokenizer(filters='', lower=False, oov_token='<OOV>')

In [None]:
# Build the vocab from all question and answer sentences
tokenizer.fit_on_texts(all_sentences)

In [None]:
# Inspect the vocab: print word/index pairs and stop right after printing
# the first entry whose index is greater than 20
for word, index in tokenizer.word_index.items():
    print(f'{word}\t\t\t{index}')
    if index > 20:
        break

In [None]:
# One more than the number of words in the vocab
# (presumably to leave room for index 0, which is used for padding — confirm)
VOCAB_SIZE = len(tokenizer.word_index) + 1

In [None]:
VOCAB_SIZE

In [None]:
# Text to Sequence Encoding: convert each sentence into a list of vocab indices
questions_sequence = tokenizer.texts_to_sequences(questions)
answer_input_sequence = tokenizer.texts_to_sequences(answer_input)
answer_output_sequence = tokenizer.texts_to_sequences(answer_output)

In [None]:
questions[:5]

In [None]:
questions_sequence[:5]

In [None]:
# Padding Hyperparameter
# Sentences are truncated from the end, and padding is added at the end
# NOTE: `TRUCATING` is a misspelling of "TRUNCATING"; the name is left as-is
# because the pad_sequences calls that follow refer to it.
MAX_LENGTH = 30
TRUCATING = 'post'
PADDING = 'post'

In [None]:
# Pad or truncate every sequence to MAX_LENGTH using the settings defined above
questions_padded = pad_sequences(questions_sequence, maxlen=MAX_LENGTH, truncating=TRUCATING, padding=PADDING)
answer_input_padded = pad_sequences(answer_input_sequence, maxlen=MAX_LENGTH, truncating=TRUCATING, padding=PADDING)
answer_output_padded = pad_sequences(answer_output_sequence, maxlen=MAX_LENGTH, truncating=TRUCATING, padding=PADDING)

In [None]:
questions_padded[:5]

In [None]:
questions_padded.shape, answer_input_padded.shape, answer_output_padded.shape

### Vectorization
- 각 단어들을 One-Hot Encoding 변환
- Vocab의 index를 참조해 다시 text 형태로 변환 (예측 과정에서 호출)

In [None]:
def convert_to_one_hot(padded):
    """One-hot encode padded index sequences.

    Allocates a zero array of shape `(len(padded), MAX_LENGTH, VOCAB_SIZE)`
    and sets entry `[i, j, padded[i][j]]` to 1 for every token position.

    Args:
        padded: Sequence of index sequences (e.g. the output of pad_sequences).

    Returns:
        The filled one-hot NumPy array.
    """
    encoded = np.zeros((len(padded), MAX_LENGTH, VOCAB_SIZE))

    for row, sequence in enumerate(padded):
        # Mark, for each time step of this row, the slot of its vocab index.
        positions = np.arange(len(sequence))
        encoded[row, positions, sequence] = 1

    return encoded

In [None]:
# One-hot encode the padded decoder input and decoder output sequences
answer_input_one_hot = convert_to_one_hot(answer_input_padded)
answer_output_one_hot = convert_to_one_hot(answer_output_padded)

In [None]:
answer_input_one_hot[0].shape, answer_output_one_hot[0].shape

In [None]:
def index_to_text(indexs, end_token):
    """Look up predicted vocab indices and turn them into a string.

    Args:
        indexs: Iterable of vocab indices.
        end_token: Index at which conversion stops; it and everything after
            it are left out of the result.

    Returns:
        A string that starts with a single space, followed by each word and
        one trailing space per consumed index. Indices that are not positive
        or are missing from `tokenizer.index_word` contribute an empty word.
    """
    words = []
    for i in indexs:
        if i == end_token:
            break
        # Use `.get` rather than `[...]`: subscripting raises KeyError for an
        # unknown index, so the former `is not None` guard could never skip it.
        word = tokenizer.index_word.get(i) if i > 0 else None
        words.append(word if word is not None else '')

    # Same layout as before: a leading space, then every word followed by a space.
    return ' ' + ''.join(word + ' ' for word in words)

### Generate Model
- Encoder 정의
- Decoder 정의

In [None]:
from tensorflow.keras.layers import Embedding, LSTM, Dense, Dropout
from tensorflow.keras.models import Model

In [None]:
class Encoder(tf.keras.Model):
    """Seq2seq encoder: embeds token indices and runs them through an LSTM.

    Args:
        units: Number of LSTM units.
        vocab_size: Input dimension of the embedding layer.
        embedding_dim: Output dimension of the embedding layer.
        time_steps: Forwarded to the embedding layer as `input_length`.
    """

    def __init__(self, units, vocab_size, embedding_dim, time_steps):
        super().__init__()
        self.embedding = Embedding(vocab_size, embedding_dim, input_length=time_steps)
        self.dropout = Dropout(0.2)
        # return_state=True makes the LSTM return (output, hidden_state, cell_state).
        self.lstm = LSTM(units, return_state=True)

    def call(self, inputs):
        """Encode `inputs` and return the LSTM states as `[hidden_state, cell_state]`."""
        x = self.embedding(inputs)
        x = self.dropout(x)
        # The LSTM output itself is discarded; only the two states are returned.
        # (Previously this unpacked three values into two targets and then
        # returned names that were never bound.)
        _, hidden_state, cell_state = self.lstm(x)
        return [hidden_state, cell_state]

In [None]:
class Decoder(tf.keras.Model):
    """Seq2seq decoder: embedding -> dropout -> LSTM -> dense projection.

    Args:
        units: Number of LSTM units.
        vocab_size: Input dimension of the embedding layer and output
            dimension of the final dense layer.
        embedding_dim: Output dimension of the embedding layer.
        time_steps: Forwarded to the embedding layer as `input_length`.
    """

    def __init__(self, units, vocab_size, embedding_dim, time_steps):
        super().__init__()
        self.embedding = Embedding(vocab_size, embedding_dim, input_length=time_steps)
        self.dropout = Dropout(0.2)
        # return_sequences=True yields an output for every time step;
        # return_state=True additionally returns the hidden and cell states.
        self.lstm = LSTM(units, return_state=True, return_sequences=True)
        # `call` uses `self.dense`, which was never created before.
        # NOTE(review): softmax is chosen on the assumption that training uses
        # one-hot targets with a categorical cross-entropy loss — confirm
        # against the training cell.
        self.dense = Dense(vocab_size, activation='softmax')

    def call(self, inputs, initial_state):
        """Decode `inputs` starting from `initial_state`.

        Args:
            inputs: Decoder input token indices.
            initial_state: Initial LSTM state passed as `initial_state`
                (e.g. the `[hidden_state, cell_state]` list an encoder returns).

        Returns:
            A `(x, hidden_state, cell_state)` tuple: the dense-layer output
            for each time step plus the LSTM states.
        """
        x = self.embedding(inputs)
        x = self.dropout(x)
        x, hidden_state, cell_state = self.lstm(x, initial_state=initial_state)
        x = self.dense(x)
        return x, hidden_state, cell_state

### Train