In [1]:
pip install keras


Note: you may need to restart the kernel to use updated packages.


In [77]:
import tensorflow as tf
import pandas as pd
import numpy as np

In [78]:
lines = pd.read_table(r'C:\Users\Hp\Downloads\korean.txt', names=['english', 'korean'])
lines = lines[:8000]
lines.sample(5)

Unnamed: 0,english,korean
3633,The fact that Tom didn't do that can't be denied.,톰이 그걸 안 했다는 사실은 부인할 수 없는 사실이야.
2492,All men are equal under the law.,모든 사람은 법 아래에 동등해.
3783,You should try to see things from the practica...,실용적 관점으로 보려고 노력해야 할 것 같아.
3671,"Well, you obviously have romantic feelings for...","흠, 너는 확실히 톰에게 연애감정을 가지고 있군."
2839,How long will you be staying there?,거기서 얼마나 기다리고 있을 거야?


In [79]:
lines.shape

(3833, 2)

In [80]:
lines.english = lines.english.apply(lambda x: x.lower())


In [81]:
import re
lines.english = lines.english.apply(lambda x: re.sub("'", '', x)).apply(lambda x: re.sub(",", 'COMMA', x))
lines.korean = lines.korean.apply(lambda x: re.sub("'", '', x)).apply(lambda x: re.sub(",", 'COMMA', x))

In [82]:
import string
exclude = set(string.punctuation)
lines.english = lines.english.apply(lambda x: ''.join(ch for ch in x if ch not in exclude))
lines.korean = lines.korean.apply(lambda x: ''.join(ch for ch in x if ch not in exclude))

In [83]:
from string import digits
remove_digits = str.maketrans('', '', digits)
lines.english = lines.english.apply(lambda x: x.translate(remove_digits))
lines.korean = lines.korean.apply(lambda x: x.translate(remove_digits))

In [84]:
lines.sample(10)

Unnamed: 0,english,korean
587,keep tom there,톰은 여기에 두세요
3265,is there anything youd like to tell me,뭔가 하고 싶은 얘기 없어
2972,tom bought this horse at an auction,톰은 이 말을 경매에서 샀어
2430,music is an artCOMMA not a science,음악은 예술이지COMMA 과학이 아니야
1969,whats your skype username,네 스카이프 사용자 이름은 뭐야
1385,someone stole my watch,누군가 내 시계를 훔쳤어
2143,her faith in god is unshaken,그녀의 신앙심은 흔들림이 없다
1747,i think you both know tom,너희들 둘 다 톰을 아는 것 같은데
1945,tom often sings in his car,톰은 자기 차 안에서 가끔 노래해
122,fantastic,끝내주네


In [85]:
# applying start and end tokens in french sentences
lines.korean = lines.korean.apply(lambda x: 'START_' + ' ' + x + ' ' + '_END')
lines.head()

Unnamed: 0,english,korean
0,go,START_ 가 _END
1,hi,START_ 안녕 _END
2,run,START_ 뛰어 _END
3,run,START_ 뛰어 _END
4,who,START_ 누구 _END


In [86]:
# collecting all unique english words to create a vocabulary
all_english_words = set()
for eng in lines.english:
  for word in eng.split():
    if word not in all_english_words:
      all_english_words.add(word)

# collecting all unique french words to create a vocabulary
all_korean_words = set()
for kre in lines.korean:
  for word in kre.split():
    if word not in all_korean_words:
      all_korean_words.add(word)

In [87]:
# printing length of words in each language
print('length of english words: ', len(all_english_words))
print('length of korean words: ', len(all_korean_words))

length of english words:  2707
length of korean words:  5816


In [88]:
# getting maximum sentence length of english sentences
length_list = []
for l in lines.english:
  length_list.append(len(l.split(' ')))

max_input_length = np.max(length_list)
print('max_input_length: ', max_input_length)

max_input_length:  101


In [89]:
# getting maximum sentence length of french sentences
length_list = []
for l in lines.korean:
  length_list.append(len(l.split(' ')))

max_output_length = np.max(length_list)
print('max_output_length: ', max_output_length)

max_output_length:  91


tokenization

In [90]:
# making a list of all input and output words and sorting them out
input_words = sorted(list(all_english_words))
output_words = sorted(list(all_korean_words))
print('all input words: ', input_words)
print('all output words: ', output_words)

#getting total tokens(words) from input and output
num_encoder_tokens = len(all_english_words)
num_decoder_tokens = len(all_korean_words)
print('encoder tokens: ', num_encoder_tokens)
print('decoder tokens: ', num_encoder_tokens)

all output words:  ['A와', 'Birthday', 'B의', 'COMMA', 'COMMA°C일', 'DNA', 'D를', 'Happy', 'Mary가', 'START_', 'TV로', 'Tom과', 'Tom은', 'Tom이', '_END', '가', '가게', '가격은', '가격을', '가격이', '가고', '가곤', '가기', '가기로', '가까운', '가까이', '가까이서', '가끔', '가난한', '가난했었다', '가난했었어', '가는', '가는지', '가능성이', '가능하지', '가능하지는', '가능한', '가능해', '가도', '가라고', '가려', '가려면', '가르쳐', '가르쳤는지', '가르쳤어', '가르친', '가르칠', '가리는', '가만히', '가방', '가방에', '가방을', '가방이', '가버려서', '가벼워', '가본', '가봐야', '가봤지', '가상해', '가서', '가설이', '가수가', '가수는', '가수야', '가야할지', '가야해', '가위', '가을보다', '가을이', '가자', '가장', '가정', '가져', '가져갈', '가져도', '가져와', '가졌다', '가족', '가족과', '가족은', '가지', '가지고', '가지를', '가지세요', '가진', '가진것을', '가질', '가짜같진', '가치', '가치가', '가호가', '각도의', '각이', '간', '간다', '간다고', '간단하게', '간단해', '간신히', '간에', '갇혀본', '갈', '갈거래', '갈거야', '갈게', '갈래에', '갈색', '감기', '감기가', '감기는', '감기에', '감당할', '감동한', '감사드립니다', '감사함을', '감염과', '감염되었어', '감옥에', '감옥에서', '감옥처럼', '감으세요', '감자에', '감자와', '감자칩을', '감정을', '갑자기', '값어치가', '값을', '갔다', '갔다고', '갔어', '갔어야', '갔었던', '갔었습니까', '갔었어', '갔었어야지', '갔으면', '갔을

In [91]:
# getting index for words as these indexes will behave as words for machine interactions
input_token_index = dict([(word,i) for i,word in enumerate(input_words)])
output_token_index = dict([(word,i) for i,word in enumerate(output_words)])

print('input token index: ', input_token_index)
print('output token index: ', output_token_index)

output token index:  {'A와': 0, 'Birthday': 1, 'B의': 2, 'COMMA': 3, 'COMMA°C일': 4, 'DNA': 5, 'D를': 6, 'Happy': 7, 'Mary가': 8, 'START_': 9, 'TV로': 10, 'Tom과': 11, 'Tom은': 12, 'Tom이': 13, '_END': 14, '가': 15, '가게': 16, '가격은': 17, '가격을': 18, '가격이': 19, '가고': 20, '가곤': 21, '가기': 22, '가기로': 23, '가까운': 24, '가까이': 25, '가까이서': 26, '가끔': 27, '가난한': 28, '가난했었다': 29, '가난했었어': 30, '가는': 31, '가는지': 32, '가능성이': 33, '가능하지': 34, '가능하지는': 35, '가능한': 36, '가능해': 37, '가도': 38, '가라고': 39, '가려': 40, '가려면': 41, '가르쳐': 42, '가르쳤는지': 43, '가르쳤어': 44, '가르친': 45, '가르칠': 46, '가리는': 47, '가만히': 48, '가방': 49, '가방에': 50, '가방을': 51, '가방이': 52, '가버려서': 53, '가벼워': 54, '가본': 55, '가봐야': 56, '가봤지': 57, '가상해': 58, '가서': 59, '가설이': 60, '가수가': 61, '가수는': 62, '가수야': 63, '가야할지': 64, '가야해': 65, '가위': 66, '가을보다': 67, '가을이': 68, '가자': 69, '가장': 70, '가정': 71, '가져': 72, '가져갈': 73, '가져도': 74, '가져와': 75, '가졌다': 76, '가족': 77, '가족과': 78, '가족은': 79, '가지': 80, '가지고': 81, '가지를': 82, '가지세요': 83, '가진': 84, '가진것을': 85, '가질': 86, '가짜같진': 87, '가치'

In [93]:
# creating arrays of input and output data
encoder_input_data = np.zeros((len(lines.english), max_input_length), dtype='float32')
decoder_input_data = np.zeros((len(lines.korean), max_output_length), dtype='float32')

#one hot encoding the target data as Dense layer only gives one output through softmax layer
decoder_target_data = np.zeros((len(lines.korean), max_output_length, num_decoder_tokens))

MemoryError: Unable to allocate 15.1 GiB for an array with shape (3833, 91, 5816) and data type float64

In [94]:
print(encoder_input_data.shape)
print(decoder_input_data.shape)
print(decoder_target_data.shape)

(3833, 101)
(3833, 91)
(3833, 91, 5816)


In [95]:
# putting all the integer values in input, output data and target data
for i,(input_text, output_text) in enumerate(zip(lines.english, lines.korean)):
  for t, word in enumerate(input_text.split()):
    encoder_input_data[i,t] = input_token_index[word]
  for t,word in enumerate(output_text.split()):
    decoder_input_data[i,t] = output_token_index[word]
    # as decoder target data is ahead of decoder input data, it will not include start_ character(which will be given to decoder model at prediction)
    if t > 0:
      decoder_target_data[i,t-1,output_token_index[word]] = 1

In [96]:
print("encoder input data: ", encoder_input_data[1])
print('decoder input data: ', decoder_input_data[1])
print('decoder target data: ',decoder_target_data[1])
print('shape of sample decoder target data: ', decoder_target_data[1].shape)

encoder input data:  [1094.    0.    0.    0.    0.    0.    0.    0.    0.    0.    0.    0.
    0.    0.    0.    0.    0.    0.    0.    0.    0.    0.    0.    0.
    0.    0.    0.    0.    0.    0.    0.    0.    0.    0.    0.    0.
    0.    0.    0.    0.    0.    0.    0.    0.    0.    0.    0.    0.
    0.    0.    0.    0.    0.    0.    0.    0.    0.    0.    0.    0.
    0.    0.    0.    0.    0.    0.    0.    0.    0.    0.    0.    0.
    0.    0.    0.    0.    0.    0.    0.    0.    0.    0.    0.    0.
    0.    0.    0.    0.    0.    0.    0.    0.    0.    0.    0.    0.
    0.    0.    0.    0.    0.]
decoder input data:  [   9. 3330.   14.    0.    0.    0.    0.    0.    0.    0.    0.    0.
    0.    0.    0.    0.    0.    0.    0.    0.    0.    0.    0.    0.
    0.    0.    0.    0.    0.    0.    0.    0.    0.    0.    0.    0.
    0.    0.    0.    0.    0.    0.    0.    0.    0.    0.    0.    0.
    0.    0.    0.    0.    0.    0.    0.    0.  

create and training the model

In [97]:
from tensorflow.keras.layers import Input, Embedding, LSTM, Dense
from tensorflow.keras.models import Model
from tensorflow.keras.utils import plot_model

In [98]:
# setting hyperparameters
embedding_size = 120
lstm_dim = 324

In [99]:
# building model for training stage
#encoder model

encoder_inputs = Input(shape=(None,))
en_x = Embedding(num_encoder_tokens, embedding_size)(encoder_inputs)
encoder = LSTM(lstm_dim, return_state=True)
encoder_outputs, state_h, state_c = encoder(en_x)
encoder_states = [state_h, state_c]

In [100]:
# decoder model

decoder_inputs = Input(shape=(None,))
final_dex = Embedding(num_decoder_tokens, embedding_size)(decoder_inputs)

decoder_lstm = LSTM(lstm_dim, return_sequences=True, return_state=True)

decoder_outputs, _, _ = decoder_lstm(final_dex, initial_state=encoder_states)

decoder_dense = Dense(num_decoder_tokens, activation='softmax') 

decoder_outputs = decoder_dense(decoder_outputs)

In [101]:
model = Model([encoder_inputs, decoder_inputs], decoder_outputs)

In [102]:
model.compile(optimizer='rmsprop',
              loss = 'categorical_crossentropy',
              metrics=['accuracy'])

In [103]:
pip install pydot

Note: you may need to restart the kernel to use updated packages.


In [104]:
pip install graphviz

Note: you may need to restart the kernel to use updated packages.


In [105]:
model.summary()
plot_model(model, to_file='Model_plot.png', show_shapes=True, show_layer_names=True)

Model: "model_3"
__________________________________________________________________________________________________
 Layer (type)                   Output Shape         Param #     Connected to                     
 input_5 (InputLayer)           [(None, None)]       0           []                               
                                                                                                  
 input_6 (InputLayer)           [(None, None)]       0           []                               
                                                                                                  
 embedding_3 (Embedding)        (None, None, 120)    324840      ['input_5[0][0]']                
                                                                                                  
 embedding_4 (Embedding)        (None, None, 120)    697920      ['input_6[0][0]']                
                                                                                            

In [31]:
model.summary()

Model: "model"
__________________________________________________________________________________________________
 Layer (type)                   Output Shape         Param #     Connected to                     
 input_1 (InputLayer)           [(None, None)]       0           []                               
                                                                                                  
 input_2 (InputLayer)           [(None, None)]       0           []                               
                                                                                                  
 embedding (Embedding)          (None, None, 120)    324840      ['input_1[0][0]']                
                                                                                                  
 embedding_1 (Embedding)        (None, None, 120)    697920      ['input_2[0][0]']                
                                                                                              

In [32]:
r = model.fit([encoder_input_data, decoder_input_data], decoder_target_data, batch_size=16, epochs=10, validation_split=0.10)

Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


In [106]:
#Inference Stage

#encoder model
encoder_model = Model(encoder_inputs, encoder_states)
encoder_model.summary()

Model: "model_4"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 input_5 (InputLayer)        [(None, None)]            0         
                                                                 
 embedding_3 (Embedding)     (None, None, 120)         324840    
                                                                 
 lstm_2 (LSTM)               [(None, 324),             576720    
                              (None, 324),                       
                              (None, 324)]                       
                                                                 
Total params: 901,560
Trainable params: 901,560
Non-trainable params: 0
_________________________________________________________________


In [107]:
#decoder model
decoder_state_input_h = Input(shape=(lstm_dim,))
decoder_state_input_c = Input(shape=(lstm_dim,))
decoder_state_inputs = [decoder_state_input_h, decoder_state_input_c]

final_dex2 = Embedding(num_decoder_tokens, embedding_size)(decoder_inputs)

decoder_outputs2, state_h2, state_c2 = decoder_lstm(final_dex2, initial_state=decoder_state_inputs)
decoder_states2 = [state_h2, state_c2]
decoder_outputs2 = decoder_dense(decoder_outputs2)

decoder_model = Model([decoder_inputs] + decoder_state_inputs, [decoder_outputs2] + decoder_states2)

In [108]:
# reversing the word index dictionary to get words from index values
reverse_input_char_index = dict((i,char) for char, i in input_token_index.items())
reverse_output_char_index = dict((i,char) for char, i in output_token_index.items())
print(reverse_input_char_index)
print(reverse_output_char_index)

{0: 'A와', 1: 'Birthday', 2: 'B의', 3: 'COMMA', 4: 'COMMA°C일', 5: 'DNA', 6: 'D를', 7: 'Happy', 8: 'Mary가', 9: 'START_', 10: 'TV로', 11: 'Tom과', 12: 'Tom은', 13: 'Tom이', 14: '_END', 15: '가', 16: '가게', 17: '가격은', 18: '가격을', 19: '가격이', 20: '가고', 21: '가곤', 22: '가기', 23: '가기로', 24: '가까운', 25: '가까이', 26: '가까이서', 27: '가끔', 28: '가난한', 29: '가난했었다', 30: '가난했었어', 31: '가는', 32: '가는지', 33: '가능성이', 34: '가능하지', 35: '가능하지는', 36: '가능한', 37: '가능해', 38: '가도', 39: '가라고', 40: '가려', 41: '가려면', 42: '가르쳐', 43: '가르쳤는지', 44: '가르쳤어', 45: '가르친', 46: '가르칠', 47: '가리는', 48: '가만히', 49: '가방', 50: '가방에', 51: '가방을', 52: '가방이', 53: '가버려서', 54: '가벼워', 55: '가본', 56: '가봐야', 57: '가봤지', 58: '가상해', 59: '가서', 60: '가설이', 61: '가수가', 62: '가수는', 63: '가수야', 64: '가야할지', 65: '가야해', 66: '가위', 67: '가을보다', 68: '가을이', 69: '가자', 70: '가장', 71: '가정', 72: '가져', 73: '가져갈', 74: '가져도', 75: '가져와', 76: '가졌다', 77: '가족', 78: '가족과', 79: '가족은', 80: '가지', 81: '가지고', 82: '가지를', 83: '가지세요', 84: '가진', 85: '가진것을', 86: '가질', 87: '가짜같진', 88: '가치', 89: '가치가', 90: 

In [109]:
# function to predict translation
def decode_seq(input_seq):
  state_values = encoder_model.predict(input_seq)

  target_seq = np.zeros((1,1))

  target_seq[0,0] = output_token_index['START_']

  stop_condition = False
  decoded_sentence = ''

  while not stop_condition:
    output_tokens, h, c = decoder_model.predict([target_seq] + state_values)

    sampled_token_index = np.argmax(output_tokens[0,-1,:])
    sampled_char = reverse_output_char_index[sampled_token_index]

    decoded_sentence += ' ' + sampled_char

    if(sampled_char == '_END' or len(decoded_sentence) >100 ):
      stop_condition = True

    target_seq = np.zeros((1,1))
    target_seq[0,0] = sampled_token_index

    state_values = [h,c] 

  return decoded_sentence

In [110]:
# testing the model for a sample from existing data
for seq_index in [1234, 356, 565, 34, 2345,656]:
  input_seq = encoder_input_data[seq_index:seq_index+1]
  decoded_sentence = decode_seq(input_seq)
  print('----')
  print('Input_sentence: ', lines.english[seq_index:seq_index+1])
  print('decoded sentence: ', decoded_sentence)

----
Input_sentence:  1234    dont forget to floss
Name: english, dtype: object
decoded sentence:   돌아올거야 돌아올거야 돌아올거야 야옹하고 야옹하고 가질 가질 만나는 가질 신고 가질 신고 두 인내심을 내야한다 시작한지 해야한다는 필요합니다 파티에서 톰만 전부니 전부니 프랑스어만큼
----
Input_sentence:  356    tom drowned
Name: english, dtype: object
decoded sentence:   돌아올거야 돌아올거야 돌아올거야 야옹하고 야옹하고 가질 가질 만나는 가질 신고 가질 신고 두 인내심을 내야한다 시작한지 해야한다는 필요합니다 파티에서 톰만 전부니 전부니 프랑스어만큼
----
Input_sentence:  565    do you hear me
Name: english, dtype: object
decoded sentence:   돌아올거야 돌아올거야 돌아올거야 야옹하고 야옹하고 가질 가질 만나는 가질 신고 가질 신고 두 인내심을 내야한다 시작한지 해야한다는 필요합니다 파티에서 톰만 전부니 전부니 프랑스어만큼
----
Input_sentence:  34    we try
Name: english, dtype: object
decoded sentence:   돌아올거야 돌아올거야 돌아올거야 야옹하고 야옹하고 가질 가질 만나는 가질 신고 가질 신고 두 인내심을 내야한다 시작한지 해야한다는 필요합니다 파티에서 톰만 전부니 전부니 프랑스어만큼
----
Input_sentence:  2345    tom is a great french teacher
Name: english, dtype: object
decoded sentence:   돌아올거야 돌아올거야 돌아올거야 야옹하고 야옹하고 가질 가질 만나는 가질 신고 가질 신고 두 인내심을 내야한다 시작한지 해야한다는 필요합니다 파티에서 톰만 전부니 전부니 프랑스어만큼
----
Input_sen