In [1]:
import tensorflow as tf
import pandas as pd
import numpy as np

import re

## Data 호출하기
- pandas 를 이용해서 데이터를 불러오고, 데이터 전처리 및 tokenize를 진행

In [2]:
df = pd.read_csv("dialogs.txt",delimiter = "\t", encoding = "UTF-8", names = ["question", "answer"])
df.head()

Unnamed: 0,question,answer
0,"hi, how are you doing?",i'm fine. how about yourself?
1,i'm fine. how about yourself?,i'm pretty good. thanks for asking.
2,i'm pretty good. thanks for asking.,no problem. so how have you been?
3,no problem. so how have you been?,i've been great. what about you?
4,i've been great. what about you?,i've been good. i'm in school right now.


In [21]:
def preprocess_string(sentence):
    # 줄임말 늘려놓기
    sentence = re.sub(pattern = "i'm",    repl = "i am",      string = sentence)  
    sentence = re.sub(pattern = "you're", repl = "you are",   string = sentence)
    sentence = re.sub(pattern = "it's",   repl = "it is",     string = sentence)
    sentence = re.sub(pattern = "he's",   repl = "he is",     string = sentence)
    sentence = re.sub(pattern = "she's",  repl = "she is",    string = sentence)
    
    sentence = re.sub(pattern = "where's",repl = "where is",  string = sentence)
    sentence = re.sub(pattern = "what's", repl = "what is",   string = sentence)
    sentence = re.sub(pattern = "that's", repl = "that is",   string = sentence)
    
    sentence = re.sub(pattern = "'ve",    repl = " have",     string = sentence)
    sentence = re.sub(pattern = "'ll",    repl = " will",     string = sentence)
        
    sentence = re.sub(pattern = r"([?!,.])",    repl = r" \1",     string = sentence) # !?,. 특수문자 글자와 뗴어놓기
    sentence = re.sub(pattern = r"([' ']+)",    repl = r" ",       string = sentence) # 띄어쓰기 중복되는거 하나로
    sentence = re.sub(pattern = r"([^A-z1-9?!,.]+)",    repl = r" ",       string = sentence) # 영, 숫자 및 !?,. 빼곤 다 공백으로 처리
    
    sentence = sentence.strip() # 최종적으로 문자열 양옆의 공백 처리
    sentence = "<start> " + sentence + " <end>"
    
    return sentence.split() 

In [22]:
df["pro_que"] = df['question'].map(lambda x: preprocess_string(x))
df["pro_ans"] = df['answer'].map(lambda x: preprocess_string(x))
df.head()

Unnamed: 0,question,answer,pro_que,pro_ans,inputs_token,targets_token
0,"hi, how are you doing?",i'm fine. how about yourself?,"[<start>, hi, ,, how, are, you, doing, ?, <end>]","[<start>, i, am, fine, ., how, about, yourself...","[1, 3, 4, 5, 6, 7, 8, 9, 2, 0, 0, 0, 0, 0, 0, ...","[1, 10, 11, 12, 13, 5, 14, 15, 9, 2, 0, 0, 0, ..."
1,i'm fine. how about yourself?,i'm pretty good. thanks for asking.,"[<start>, i, am, fine, ., how, about, yourself...","[<start>, i, am, pretty, good, ., thanks, for,...","[1, 10, 11, 12, 13, 5, 14, 15, 9, 2, 0, 0, 0, ...","[1, 10, 11, 16, 17, 13, 18, 19, 20, 13, 2, 0, ..."
2,i'm pretty good. thanks for asking.,no problem. so how have you been?,"[<start>, i, am, pretty, good, ., thanks, for,...","[<start>, no, problem, ., so, how, have, you, ...","[1, 10, 11, 16, 17, 13, 18, 19, 20, 13, 2, 0, ...","[1, 21, 22, 13, 23, 5, 24, 7, 25, 9, 2, 0, 0, ..."
3,no problem. so how have you been?,i've been great. what about you?,"[<start>, no, problem, ., so, how, have, you, ...","[<start>, i, have, been, great, ., what, about...","[1, 21, 22, 13, 23, 5, 24, 7, 25, 9, 2, 0, 0, ...","[1, 10, 24, 25, 26, 13, 27, 14, 7, 9, 2, 0, 0,..."
4,i've been great. what about you?,i've been good. i'm in school right now.,"[<start>, i, have, been, great, ., what, about...","[<start>, i, have, been, good, ., i, am, in, s...","[1, 10, 24, 25, 26, 13, 27, 14, 7, 9, 2, 0, 0,...","[1, 10, 24, 25, 17, 13, 10, 11, 28, 29, 30, 31..."


### Data Tokenize and Padding
- 데이터를 숫자로 Tokenize, 이후에 길이를 맞추기 위해 padding을 해준다.

In [5]:
word_to_token = {"<padding>" : 0, "<start>" : 1, "<end>" : 2}
max_length = 25

def tokenize(sentence, training = True):
    tokenized_sentence = []
    
    if training:
        for word in sentence:
            try :
                tokenized_sentence.append(word_to_token[word])
            except:
                word_to_token[word] = len(word_to_token)
                tokenized_sentence.append(word_to_token[word])
    else:
        for word in sentence:
            try :
                tokenized_sentence.append(word_to_token[word])
            except:
                print("<Error!> : There is no Token for "+word+"! Please try again")
                raise NotImplementedError
    
    return tokenized_sentence

def padding(tokenized_sentence, max_length):
    if len(tokenized_sentence) > max_length:
        print("<Error!> : max_length is small then sentence! please input bigger max_length !")
        raise NotImplementedError
    
    while len(tokenized_sentence) < max_length:
        tokenized_sentence.append(0)
    return tokenized_sentence

In [6]:
df['inputs_token'] = df['pro_que'].map(tokenize).map(lambda x: padding(x, max_length))
df['targets_token'] = df['pro_ans'].map(tokenize).map(lambda x: padding(x, max_length))

In [7]:
token_to_word = {word_to_token[i] : i for i in word_to_token}

## Model 만들기
- Encoder Model
- Attention Model
- Decoder Model

### Encoder 모델 정의
- Encoder는 입력 sequence 를 받아들이고, hidden state 및 output을 생성한다.
- 즉 채팅시에 상대방이 입력하는 입력값

In [8]:
class Encoder(tf.keras.Model):
    def __init__(self, 
                 units, # encoder로 들어오는 unit의 개수
                 vocab_size, # 임베딩전 단어의 개수
                 embedding_units): # 임베딩 한 유닛의 개수
        super(Encoder, self).__init__()
        self.enc_units = units 
        self.embedding = tf.keras.layers.Embedding(vocab_size, embedding_units) 
                
        self.gru = tf.keras.layers.GRU(self.enc_units,
                                       return_sequences=True,
                                       return_state=True,
                                       recurrent_initializer='glorot_uniform')
        
    def __call__(self, x, hidden = None):
        x = self.embedding(x)
        if hidden == None:
            hidden = tf.zeros([x.shape[0],self.enc_units])
        output, state = self.gru(x, initial_state = hidden)
        return output, state

### Attention 모델 정의
- Attention은 Encoder의 output(hidden state) 들에서 주의해야할 부분을 추출해 Decoder에 입력으로써 사용된다.
- RNN의 특징인 gradient banishing problem 을 제거하기 위한 방법
- BahdanauAttention을 사용한다.

In [9]:
class BahdanauAttention(tf.keras.layers.Layer):
    def __init__(self, units):
        super(BahdanauAttention, self).__init__()
        self.w1 = tf.keras.layers.Dense(units)
        self.w2 = tf.keras.layers.Dense(units)
        self.v = tf.keras.layers.Dense(1)
        
    def __call__(self, 
                 query, # decoder의 t-1에서의 output
                 values): # encoder의 output들
        
        query = tf.expand_dims(query, 1) # query 는 decoder의 output 1개이므로, 이후에 연산을 위해 dimention 1 추가
        score = self.v(tf.nn.tanh(self.w1(query) + self.w2(values))) # Encoder Cell 별 점수
        
        attention_weights = tf.nn.softmax(score, axis = 1) # Encoder cell 별 weight(점수 기반)
        
        context_vector = attention_weights * values 
        context_vector = tf.reduce_sum(context_vector, axis = 1) # 
        
        return context_vector, attention_weights

### Decoder 모델 정의
- Decoder 는 이전의 output(output of t-1)의 값을 입력으로 다시 받는 RNN 모델
- BahdanauAttention 사용 시에는 각 입력값에 context_vector 가 추가된 값이 들어가 Encoder의 중요 cell의 가중치를 확인할 수 있다.

In [10]:
class Decoder(tf.keras.Model):
    def __init__(self, 
                 units, # encoder로 들어오는 unit의 개수
                 vocab_size, # 임베딩전 단어의 개수
                 embedding_units): # 임베딩 한 유닛의 개수
        
        super(Decoder, self).__init__()
        self.dec_units = units 
        self.embedding = tf.keras.layers.Embedding(vocab_size, embedding_units) 
                
        self.gru = tf.keras.layers.GRU(self.dec_units,
                                       return_sequences=True,
                                       return_state=True,
                                       recurrent_initializer='glorot_uniform')
        
        self.fc = tf.keras.layers.Dense(vocab_size) # hidden_state 에서 특정 단어로 만들어주는 layer
                
        self.attention = BahdanauAttention(units) 
        
    def __call__(self, x, # 시간 t에 대한 1개의 sequence를 가진 input, shape of (batch_size, 1, embedding_units)
                 hidden, # t-1 시간의 hidden_state, initial hidden_state 는 encoder의 마지막 hidden_state
                 enc_output): # attention에 넣을 encoder output들(hidden_state들)
        
        context_vector, attention_weights = self.attention(hidden, enc_output)
        
        x = self.embedding(x) # shape of (batch_size, 1, embedding_units)
                
        x = tf.concat([tf.expand_dims(context_vector, 1),x],axis = -1)
        # context vector shape (batch_size, hidden_state) 이므로 x와 concat 하기 위해 축 1개 추가
        
        output, state = self.gru(x)        
        output = tf.reshape(output, (-1, output.shape[2])) # fc에 넣기 위해 shape 조정
        
        x = self.fc(output) # shape of (batch_size, vocab_size)
        
        return x, state

### 정의한 모델 확인
- Encoder 
- Attention
- Decoder

In [35]:
batch_size = 64
units = 1024
vocab_size = len(word_to_token)
embedding_units = 256

inputs = pd.DataFrame(df.inputs_token.tolist(), index= df.index).values
targets = pd.DataFrame(df.targets_token.tolist(), index= df.index).values

steps = len(inputs)//batch_size

In [36]:
encoder = Encoder(units,vocab_size,embedding_units)
enc_output, enc_hidden = encoder(inputs[:batch_size])

print("enc_output shape : ", enc_output.shape)
print("enc_hidden shape : ", enc_hidden.shape)

enc_output shape :  (64, 25, 1024)
enc_hidden shape :  (64, 1024)


In [37]:
attention = BahdanauAttention(units)
context_vector, attention_weights = attention(enc_hidden, enc_output)

print("context_vector shape : ", context_vector.shape)
print("attention_weights shape : ", attention_weights.shape)

context_vector shape :  (64, 1024)
attention_weights shape :  (64, 25, 1)


In [38]:
decoder = Decoder(units, vocab_size, embedding_units)
dec_output, dec_hidden = decoder(targets[:batch_size,:1], enc_hidden, enc_output)


print("dec_output shape : ", dec_output.shape)
print("dec_hidden shape : ", dec_hidden.shape)

dec_output shape :  (64, 2442)
dec_hidden shape :  (64, 1024)


## Loss Function 정의 및 Train step 정의
- Loss Function : 
- Train step :

In [43]:
optimizer = tf.keras.optimizers.Adam()
loss_object = tf.keras.losses.SparseCategoricalCrossentropy(from_logits=True, reduction='none')

def loss_function(real, hypothesis):    
    #mask = tf.math.logical_not(tf.math.equal(real, 0))   # <padding> 일 경우에는 loss값을 얻지 않기 위한 mask
    loss_ = loss_object(real, hypothesis)
    
    #mask = tf.cast(mask, dtype=loss_.dtype) # mask 와 loss 를 곱해주기 위해 type 변경
    #loss_ *= mask
    return tf.reduce_mean(loss_)

In [44]:
@tf.function
def train_step(dataset):
    loss = 0        
    
    train_input, train_target = dataset        
    max_length = train_target.shape[1]
    
    with tf.GradientTape() as tape:
        enc_output, enc_hidden = encoder(train_input)
        
        dec_hidden = enc_hidden 
        dec_input = tf.ones([train_target.shape[0], 1])
        
        for t in range(max_length):
            prediction, dec_hidden = decoder(dec_input, dec_hidden, enc_output)
            
            loss += loss_function(train_target[:,t:t+1], prediction)
            
            dec_input = train_target[:, t:t+1]
    
    batch_loss = loss / max_length
    variables = encoder.trainable_variables + decoder.trainable_variables
    
    gradients = tape.gradient(loss, variables)
    optimizer.apply_gradients(zip(gradients,variables))
    
    return batch_loss

In [45]:
def shuffle(inputs, targets):
    random_index = np.arange(inputs.shape[0])
    np.random.shuffle(random_index)
    
    return inputs[random_index], targets[random_index]

In [46]:
epochs = 20

for e in range(epochs+1):
    total_loss = 0
    
    for s in range(steps):
        batched_dataset = (inputs[batch_size * s: batch_size * s+1], targets[batch_size * s: batch_size * s+1])
        loss = train_step(batched_dataset)
        total_loss += loss
        
    print("At epochs {}, total loss is {}".format(e, total_loss))    

At epochs 0, total loss is 201.13304138183594
At epochs 1, total loss is 106.64305877685547
At epochs 2, total loss is 93.79425811767578
At epochs 3, total loss is 88.33040618896484
At epochs 4, total loss is 82.79292297363281
At epochs 5, total loss is 77.77149963378906
At epochs 6, total loss is 72.59521484375
At epochs 7, total loss is 67.55438232421875
At epochs 8, total loss is 61.37772750854492
At epochs 9, total loss is 55.68754196166992
At epochs 10, total loss is 50.69232177734375
At epochs 11, total loss is 44.8255500793457
At epochs 12, total loss is 41.03865051269531
At epochs 13, total loss is 67.57832336425781
At epochs 14, total loss is 39.26240158081055
At epochs 15, total loss is 34.52922821044922
At epochs 16, total loss is 34.31454086303711
At epochs 17, total loss is 29.90668296813965
At epochs 18, total loss is 25.68297576904297
At epochs 19, total loss is 22.920162200927734
At epochs 20, total loss is 20.527727127075195


## Predict with question
- 모델 학습이 되었는지 채팅으로 확인하기

In [47]:
def detokenize(tokenized_sentence):
    sentence = []
    for index in tokenized_sentence:
        index = int(index)
        sentence.append(token_to_word[index])
    return sentence

def cut_from_start_to_end(sentence):
    answer = []
    for word in sentence:
        if word != '<start>' and word != "<end>":
            answer.append(word)
        elif word == "<end>":
            break
    return " ".join(answer)
    
def chat(question):
    question = preprocess_string(question)
    question = tokenize(question,training = False)
    question = padding(question, max_length)
    
    question = tf.expand_dims(question,0)
    
    answer = []
    
    enc_output, enc_hidden = encoder(question)

    dec_hidden = enc_hidden 
    dec_input = tf.ones([1, 1]) # <start> as input

    for t in range(max_length):
        answer.append(dec_input[0][0])
        prediction, dec_hidden = decoder(dec_input, dec_hidden, enc_output)
        dec_input = tf.expand_dims(tf.argmax(prediction,axis = 1),0)
    
    answer = detokenize(answer)
    answer = cut_from_start_to_end(answer)
    
    return answer

In [48]:
for i in range(20):
    question = df.question.loc[i]
    print("question:",question,"\nanswer\t:",chat(question))

question: hi, how are you doing? 
answer	: then why does everyone lock their doors ?
question: i'm fine. how about yourself? 
answer	: i would wait to graduate .
question: i'm pretty good. thanks for asking. 
answer	: i would wait to graduate .
question: no problem. so how have you been? 
answer	: i thought i thought i thought i thought i thought i thought i thought i thought i thought i thought i thought i
question: i've been great. what about you? 
answer	: i would wait until october .
question: i've been good. i'm in school right now. 
answer	: i would wait until october .
question: what school do you go to? 
answer	: i thought i don t like it ?
question: i go to pcc. 
answer	: i would wait until october .
question: do you like it there? 
answer	: all night long we heard people are a good reason .
question: it's okay. it's a really big campus. 
answer	: i would buy her cd if you would buy her cd if you would buy her cd if you would buy her cd
question: good luck with school. 
answer