In [3]:
from __future__ import print_function

import os
import numpy as np
import pandas as pd

from keras.models import Model
from keras.layers import Input, Embedding, LSTM, TimeDistributed, Dense
from keras.callbacks import EarlyStopping
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences

In [4]:
data = pd.read_csv('../data/eng-katakana.csv', header=None, names=['eng', 'katakana'])
data = data.sample(frac=1, random_state=0)

In [5]:
data.head()

Unnamed: 0,eng,katakana
11206,Dorogobuzh,ドロゴブージ
80376,Gail Hopkins,ゲイル・ホプキンス
38108,Novatek,ノヴァテク
29960,Gyula Cseszneky,チェスネキー・ジュラ
22295,Occhieppo Superiore,オッキエッポ・スペリオーレ


In [6]:
data_input = [s.lower() for s in data['eng']]
data_output = [s for s in data['katakana']]

print(data_input[0:3])
print(data_output[0:3])

['dorogobuzh', 'gail hopkins', 'novatek']
['ドロゴブージ', 'ゲイル・ホプキンス', 'ノヴァテク']


In [7]:
# Split train and test
training_rate = 0.7
train_len = int(len(data) * training_rate)
training_input = data_input[:train_len]
training_output = data_output[:train_len]
validation_input = data_input[train_len:]
validation_output = data_output[train_len:]

print(len(training_input))
print(len(validation_input))

75082
32179


### Encoding character input

We will create a character dictionary and encode the title from a string (a sequence of character) into a sequence of IDs. We will also create the reverse dictionary that will be used for getting the result later.

Note that in practice, we must not build the dictionary from all data (`data_input` and `data_output`), but only use the training set (`training_input` and `training_output`). We also have to handle out-of-dictionary characters. However, for now, I will skip that part.

Note:
- We will use 0 for padding and 1 for 'START'. So, `count` starts from 2. 
- This is to take advantage of `mask_zero=True` feature for Embedding Layer in Keras

In [8]:
START_CHAR_CODE = 1

def encode_characters(titles):
    count = 2
    encoding = {}
    decoding = {1: 'START'}
    for c in set([c for title in titles for c in title]):
        encoding[c] = count
        decoding[count] = c
        count += 1
    return encoding, decoding, count


input_encoding, input_decoding, input_dict_size = encode_characters(data_input)
output_encoding, output_decoding, output_dict_size = encode_characters(data_output)


print('English character dict size:', input_dict_size)
print('Katakana character dict size:', output_dict_size)

print(input_encoding)
print(input_decoding)

English character dict size: 54
Katakana character dict size: 89
{'l': 2, 'a': 3, 'x': 4, 'u': 5, 'ŵ': 6, 'ê': 7, 'ź': 8, 'ý': 9, '9': 10, 'j': 11, 'w': 12, 'v': 13, 'ò': 14, 'ż': 15, 'õ': 16, 'ó': 17, 't': 18, 'm': 19, 'ú': 20, 'ù': 21, '0': 22, 'c': 23, 'ü': 24, 'q': 25, '5': 26, 's': 27, 'þ': 28, 'o': 29, '3': 30, 'r': 31, '1': 32, ' ': 33, '2': 34, '8': 35, 'ž': 36, 'd': 37, 'k': 38, 'y': 39, '4': 40, 'p': 41, 'e': 42, '6': 43, 'n': 44, '7': 45, 'f': 46, 'b': 47, 'i': 48, 'z': 49, 'ļ': 50, 'ľ': 51, 'g': 52, 'h': 53}
{1: 'START', 2: 'l', 3: 'a', 4: 'x', 5: 'u', 6: 'ŵ', 7: 'ê', 8: 'ź', 9: 'ý', 10: '9', 11: 'j', 12: 'w', 13: 'v', 14: 'ò', 15: 'ż', 16: 'õ', 17: 'ó', 18: 't', 19: 'm', 20: 'ú', 21: 'ù', 22: '0', 23: 'c', 24: 'ü', 25: 'q', 26: '5', 27: 's', 28: 'þ', 29: 'o', 30: '3', 31: 'r', 32: '1', 33: ' ', 34: '2', 35: '8', 36: 'ž', 37: 'd', 38: 'k', 39: 'y', 40: '4', 41: 'p', 42: 'e', 43: '6', 44: 'n', 45: '7', 46: 'f', 47: 'b', 48: 'i', 49: 'z', 50: 'ļ', 51: 'ľ', 52: 'g', 53: 'h'}


### Transforming the titles

In [9]:
def transform(encoding, data, vector_size):
    transformed_data = np.zeros(shape=(len(data), vector_size))
    for i in range(len(data)):
        for j in range(min(len(data[i]), vector_size)):
            transformed_data[i][j] = encoding[data[i][j]]
    return transformed_data

INPUT_LENGTH = 20
OUTPUT_LENGTH = 20

encoded_training_input = transform(input_encoding, training_input, vector_size=INPUT_LENGTH)
encoded_training_output = transform(output_encoding, training_output, vector_size=OUTPUT_LENGTH)
encoded_validation_input = transform(input_encoding, validation_input, vector_size=INPUT_LENGTH)
encoded_validation_output = transform(output_encoding, validation_output, vector_size=OUTPUT_LENGTH)

print('input', encoded_training_input)
print('output', encoded_training_output)

input [[37. 29. 31. ...  0.  0.  0.]
 [52.  3. 48. ...  0.  0.  0.]
 [44. 29. 13. ...  0.  0.  0.]
 ...
 [11. 29. 53. ...  0.  0.  0.]
 [52. 44.  3. ...  0.  0.  0.]
 [42. 53. 31. ...  0.  0.  0.]]
output [[85. 61. 41. ...  0.  0.  0.]
 [21. 30. 39. ...  0.  0.  0.]
 [46. 12.  9. ...  0.  0.  0.]
 ...
 [55.  3. 35. ...  0.  0.  0.]
 [58.  5.  6. ...  0.  0.  0.]
 [33. 88. 54. ...  0.  0.  0.]]


# Sequence-to-Sequence in Keras

In [10]:
encoder_input = Input(shape=(INPUT_LENGTH,))
decoder_input = Input(shape=(OUTPUT_LENGTH,))

### Encoder

First, we will use [Embedding layer](https://keras.io/layers/embeddings/) to transform input char-id sequence into dense vectors.  

The input vectors will be passed to a [Recurrent layer](https://keras.io/layers/recurrent/) (we use LSTM) that will transform the vectors of each input character to a single output vector.

In [11]:
encoder_input = Input(shape=(INPUT_LENGTH,))
decoder_input = Input(shape=(OUTPUT_LENGTH,))
print(encoder_input)

Tensor("input_3:0", shape=(?, 20), dtype=float32)


In [12]:
# Encoder
encoder = Embedding(input_dict_size, 64, input_length=INPUT_LENGTH, mask_zero=True)(encoder_input)
print(encoder.get_shape())
encoder = LSTM(64)(encoder)
print(encoder.get_shape())

(?, 20, 64)
(?, 64)


### Decoder

Our decoder generate Katakana sequence (as a softmax prediction) on characrter at the time. Every generated output at decoding step will be passed back as an input of the decoder to generate the next output.

Similar to the encoder, the input will be passed to an Embedding layer to transform the input into dense vectors and pass them to LSTM.

We will use the encoder's output to initialize decoder state (`initial_state`).

The final layer will be (time distributed) Dense layer that will produce the softmax prediction.

In [13]:
decoder = Embedding(output_dict_size, 64, input_length=OUTPUT_LENGTH, mask_zero=True)(decoder_input)
print(decoder.get_shape())
decoder = LSTM(64, return_sequences=True)(decoder, initial_state=[encoder, encoder])
print(decoder.get_shape())
decoder = TimeDistributed(Dense(output_dict_size, activation="softmax"))(decoder)

print(decoder.get_shape())

(?, 20, 64)
(?, ?, 64)
(?, 20, 89)


In [14]:
model = Model(inputs=[encoder_input, decoder_input], outputs=[decoder])
model.compile(optimizer='adam', loss='binary_crossentropy')

In [15]:
print(encoded_training_input.shape)
print(encoded_training_output.shape)

(75082, 20)
(75082, 20)


In [16]:
# Encoder Input
training_encoder_input = encoded_training_input

# Decoder Input (need padding py START_CHAR_CODE)
training_decoder_input = np.zeros_like(encoded_training_output)
training_decoder_input[:, 1:] = encoded_training_output[:,:-1] # offset one timestpe
training_decoder_input[:, 0] = START_CHAR_CODE # first timestep is 1, means START

# Decoder Output (one-hot encode)
training_decoder_output = np.eye(output_dict_size)[encoded_training_output.astype('int')]

print('encoder input', training_encoder_input[:1])
print('decoder input', training_decoder_input[:1])
print('decoder output', training_decoder_output[:1].argmax(axis=2))
print('decoder output (one-hot)', training_decoder_output[:1])

encoder input [[37. 29. 31. 29. 52. 29. 47.  5. 49. 53.  0.  0.  0.  0.  0.  0.  0.  0.
   0.  0.]]
decoder input [[ 1. 85. 61. 41. 75. 19. 55.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.
   0.  0.]]
decoder output [[85 61 41 75 19 55  0  0  0  0  0  0  0  0  0  0  0  0  0  0]]
decoder output (one-hot) [[[0. 0. 0. ... 0. 0. 0.]
  [0. 0. 0. ... 0. 0. 0.]
  [0. 0. 0. ... 0. 0. 0.]
  ...
  [1. 0. 0. ... 0. 0. 0.]
  [1. 0. 0. ... 0. 0. 0.]
  [1. 0. 0. ... 0. 0. 0.]]]


In [17]:
print(training_encoder_input.shape)
print(training_decoder_input.shape)
print(training_decoder_output.shape)

(75082, 20)
(75082, 20)
(75082, 20, 89)


In [18]:
validation_encoder_input = encoded_validation_input
validation_decoder_input = np.zeros_like(encoded_validation_output)
validation_decoder_input[:, 1:] = encoded_validation_output[:,:-1]
validation_decoder_input[:, 0] = START_CHAR_CODE
validation_decoder_output = np.eye(output_dict_size)[encoded_validation_output.astype('int')]

## Training the model

In [None]:
# if os.path.isfile('s2s_modify.h5'):
#     model = load_model('s2s_modify.h5')
# else:
#     model.fit(x=[training_encoder_input, training_decoder_input], y=[training_decoder_output],
#           validation_split=0.11,
#           verbose=1,
#           batch_size=64,
#           epochs=2)
    
# model.save('s2s_modify.h5')

In [22]:
earlystopper = EarlyStopping(monitor='val_loss', patience=5, verbose=1)

model.fit(x=[training_encoder_input, training_decoder_input], y=[training_decoder_output],
      validation_split=0.11,
      verbose=1,
      batch_size=64,
      epochs=20,
      callbacks=[earlystopper])

Train on 66822 samples, validate on 8260 samples
Epoch 1/20
Epoch 2/20
Epoch 3/20
Epoch 4/20
Epoch 5/20
Epoch 6/20
Epoch 7/20
Epoch 8/20
Epoch 9/20
Epoch 10/20
Epoch 11/20
Epoch 12/20
Epoch 13/20
Epoch 14/20
Epoch 15/20
Epoch 16/20
Epoch 17/20
Epoch 18/20
Epoch 19/20
Epoch 20/20


<keras.callbacks.History at 0x117591828>

In [23]:
from keras.utils import plot_model 
plot_model(model, show_shapes=True, to_file='s2s_modify.png')

### Testing the model

During the testing or after deploy the model, to generate the output we will use "greedy" generating approach, which is generating one output at a time by maximize softmax score and feed the output back as the next decoder input character. 

We won't use [beam-search decoding](https://www.quora.com/Why-is-beam-search-required-in-sequence-to-sequence-transduction-using-recurrent-neural-networks)

In [24]:
def generate(text):
    encoder_input = transform(input_encoding, [text.lower()], 20)
    decoder_input = np.zeros(shape=(len(encoder_input), OUTPUT_LENGTH))
    decoder_input[:,0] = START_CHAR_CODE
    for i in range(1, OUTPUT_LENGTH):
        output = model.predict([encoder_input, decoder_input]).argmax(axis=2)
        decoder_input[:,i] = output[:,i]
    return decoder_input[:,1:]

def decode(decoding, sequence):
    text = ''
    for i in sequence:
        if i == 0:
            break
        text += output_decoding[i]
    return text

def to_katakana(text):
    decoder_output = generate(text)
    return decode(output_decoding, decoder_output[0])

If the model is trained correctly, typical names should be translate correctly.

In [25]:
common_american_names = ['James', 'John', 'Robert', 'Mary', 'Patricia', 'Linda']
for name in common_american_names:
    print(name, to_katakana(name))

James ジェームス
John ジョン
Robert ロバート
Mary マリー
Patricia パトリシア
Linda リンダ


Because we train the model with mostly people and places names, some English words may not be written correctly.

In [26]:
print(to_katakana('computer'))
print(to_katakana('taxi'))

コンプター
タッキー
