In [68]:
import pandas as pd
import numpy as np
from keras.models import Model
from keras.layers import Input, LSTM, Dense, Embedding
from keras.callbacks import EarlyStopping

In [32]:
data = pd.read_csv('../data/eng-katakana.csv', header=None, names=['eng', 'katakana'])
data = data.sample(frac=1, random_state=0)

In [33]:
data.head()

Unnamed: 0,eng,katakana
11206,Dorogobuzh,ドロゴブージ
80376,Gail Hopkins,ゲイル・ホプキンス
38108,Novatek,ノヴァテク
29960,Gyula Cseszneky,チェスネキー・ジュラ
22295,Occhieppo Superiore,オッキエッポ・スペリオーレ


In [41]:
input_texts = [s.lower() for s in data['eng']]
target_texts = [s for s in data['katakana']]

print(input_texts[0:3])
print(target_texts[0:3])

['dorogobuzh', 'gail hopkins', 'novatek']
['ドロゴブージ', 'ゲイル・ホプキンス', 'ノヴァテク']


In [62]:
# Split train and test
training_rate = 0.7
train_len = int(len(data) * training_rate)
training_input = data_input[:train_len]
training_output = data_output[:train_len]
validation_input = data_input[train_len:]
validation_output = data_output[train_len:]

print(len(training_input))
print(len(validation_input))

75082
32179


### Encoding character input

We will create a character dictionary and encode the title from a string (a sequence of character) into a sequence of IDs. We will also create the reverse dictionary that will be used for getting the result later.

Note that in practice, we must not build the dictionary from all data (`data_input` and `data_output`), but only use the training set (`training_input` and `training_output`). We also have to handle out-of-dictionary characters. However, for now, I will skip that part.

Note:
- We will use 0 for padding and 1 for 'START'. So, `count` starts from 2. 
- This is to take advantage of `mask_zero=True` feature for Embedding Layer in Keras

In [63]:
START_CHAR_CODE = 1

def encode_characters(titles):
    count = 2
    encoding = {}
    decoding = {1: 'START'}
    for c in set([c for title in titles for c in title]):
        encoding[c] = count
        decoding[count] = c
        count += 1
    return encoding, decoding, count


input_encoding, input_decoding, input_dict_size = encode_characters(data_input)
output_encoding, output_decoding, output_dict_size = encode_characters(data_output)


print('English character dict size:', input_dict_size)
print('Katakana character dict size:', output_dict_size)

print(input_encoding)
print(input_decoding)

English character dict size: 54
Katakana character dict size: 89
{'a': 2, '5': 3, '2': 4, 'g': 5, 'h': 6, 't': 7, 'ó': 8, '4': 9, 'ò': 10, 'þ': 11, 'p': 12, 'ľ': 13, 'ü': 14, '7': 15, ' ': 16, 'ù': 17, 'r': 18, 's': 19, 'v': 20, 'c': 21, 'ž': 22, 'x': 23, 'n': 24, 'ý': 25, 'ź': 26, 'o': 27, 'k': 28, '0': 29, 'õ': 30, 'w': 31, 'e': 32, 'q': 33, 'ż': 34, '1': 35, '6': 36, 'ú': 37, 'ê': 38, 'j': 39, 'f': 40, 'ŵ': 41, '8': 42, 'l': 43, 'd': 44, '9': 45, 'u': 46, 'b': 47, 'm': 48, 'i': 49, 'z': 50, 'y': 51, '3': 52, 'ļ': 53}
{1: 'START', 2: 'a', 3: '5', 4: '2', 5: 'g', 6: 'h', 7: 't', 8: 'ó', 9: '4', 10: 'ò', 11: 'þ', 12: 'p', 13: 'ľ', 14: 'ü', 15: '7', 16: ' ', 17: 'ù', 18: 'r', 19: 's', 20: 'v', 21: 'c', 22: 'ž', 23: 'x', 24: 'n', 25: 'ý', 26: 'ź', 27: 'o', 28: 'k', 29: '0', 30: 'õ', 31: 'w', 32: 'e', 33: 'q', 34: 'ż', 35: '1', 36: '6', 37: 'ú', 38: 'ê', 39: 'j', 40: 'f', 41: 'ŵ', 42: '8', 43: 'l', 44: 'd', 45: '9', 46: 'u', 47: 'b', 48: 'm', 49: 'i', 50: 'z', 51: 'y', 52: '3', 53: 'ļ'}


### Transforming the titles


In [64]:
def transform(encoding, data, vector_size):
    transformed_data = np.zeros(shape=(len(data), vector_size))
    for i in range(len(data)):
        for j in range(min(len(data[i]), vector_size)):
            transformed_data[i][j] = encoding[data[i][j]]
    return transformed_data

INPUT_LENGTH = 20
OUTPUT_LENGTH = 20

encoded_training_input = transform(input_encoding, training_input, vector_size=INPUT_LENGTH)
encoded_training_output = transform(output_encoding, training_output, vector_size=OUTPUT_LENGTH)
encoded_validation_input = transform(input_encoding, validation_input, vector_size=INPUT_LENGTH)
encoded_validation_output = transform(output_encoding, validation_output, vector_size=OUTPUT_LENGTH)

print('input', encoded_training_input)
print('output', encoded_training_output)

input [[44. 27. 18. ...  0.  0.  0.]
 [ 5.  2. 49. ...  0.  0.  0.]
 [24. 27. 20. ...  0.  0.  0.]
 ...
 [39. 27.  6. ...  0.  0.  0.]
 [ 5. 24.  2. ...  0.  0.  0.]
 [32.  6. 18. ...  0.  0.  0.]]
output [[26. 50. 15. ...  0.  0.  0.]
 [11. 32.  9. ...  0.  0.  0.]
 [84. 56. 69. ...  0.  0.  0.]
 ...
 [24. 25. 20. ...  0.  0.  0.]
 [54. 66.  8. ...  0.  0.  0.]
 [37. 40. 72. ...  0.  0.  0.]]


In [86]:
# Encoder Input
training_encoder_input = encoded_training_input

# Decoder Input (need padding py START_CHAR_CODE)
training_decoder_input = np.zeros_like(encoded_training_output)
training_decoder_input[:, 1:] = encoded_training_output[:,:-1] # offset one timestpe
training_decoder_input[:, 0] = START_CHAR_CODE # first timestep is 1, means START

# Decoder Output (one-hot encode)
training_decoder_output = np.eye(output_dict_size)[encoded_training_output.astype('int')]

print('encoder input', training_encoder_input[:1])
print('decoder input', training_decoder_input[:1])
print('decoder output', training_decoder_output[:1].argmax(axis=2))
print('decoder output (one-hot)', training_decoder_output[:1])

encoder input [[44. 27. 18. 27.  5. 27. 47. 46. 50.  6.  0.  0.  0.  0.  0.  0.  0.  0.
   0.  0.]]
decoder input [[ 1. 26. 50. 15. 68. 59. 24.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.
   0.  0.]]
decoder output [[26 50 15 68 59 24  0  0  0  0  0  0  0  0  0  0  0  0  0  0]]
decoder output (one-hot) [[[0. 0. 0. ... 0. 0. 0.]
  [0. 0. 0. ... 0. 0. 0.]
  [0. 0. 0. ... 0. 0. 0.]
  ...
  [1. 0. 0. ... 0. 0. 0.]
  [1. 0. 0. ... 0. 0. 0.]
  [1. 0. 0. ... 0. 0. 0.]]]


In [None]:
validation_encoder_input = encoded_validation_input
validation_decoder_input = np.zeros_like(encoded_validation_output)
validation_decoder_input[:, 1:] = encoded_validation_output[:,:-1]
validation_decoder_input[:, 0] = START_CHAR_CODE
validation_decoder_output = np.eye(output_dict_size)[encoded_validation_output.astype('int')]

# Sequence-to-Sequence in Keras

In [74]:
batch_size = 64  # Batch size for training.
epochs = 100  # Number of epochs to train for.
latent_dim = 64  # Latent dimensionality of the encoding space.

### Encoder

In [79]:
encoder_input = Input(shape=(INPUT_LENGTH,))
print(encoder_input.shape)
encoder = Embedding(input_dict_size, 64, input_length=INPUT_LENGTH, mask_zero=True)(encoder_input)
print(encoder.shape)
encoder_outputs, state_h, state_c = LSTM(latent_dim, return_state=True)(encoder)
print(state_h.shape)
print(state_c.shape)

# We discard `encoder_outputs` and only keep the states.
encoder_states = [state_h, state_c]

(?, 20)
(?, 20, 64)
(?, 64)
(?, 64)


### Decoder


In [81]:
decoder_input = Input(shape=(OUTPUT_LENGTH,))
print(decoder_input.shape)
decoder = Embedding(output_dict_size, 64, input_length=OUTPUT_LENGTH, mask_zero=True)(decoder_input)
print(decoder.shape)

# We set up our decoder to return full output sequences,
# and to return internal states as well. We don't use the
# return states in the training model, but we will use them in inference.
decoder_lstm = LSTM(latent_dim, return_sequences=True, return_state=True)
decoder_outputs, _, _ = decoder_lstm(decoder,
                                     initial_state=encoder_states)
print(decoder_outputs.shape)

decoder_dense = Dense(num_decoder_tokens, activation='softmax')
decoder_outputs = decoder_dense(decoder_outputs)
print(decoder_outputs.shape)

(?, 20)
(?, 20, 64)
(?, ?, 64)
(?, 20, 89)


In [83]:
# Define the model that will turn
# `encoder_input_data` & `decoder_input_data` into `decoder_target_data`
model = Model([encoder_input, decoder_input], decoder_outputs)
model.compile(optimizer='rmsprop', loss='categorical_crossentropy')

In [89]:
earlystopper = EarlyStopping(monitor='val_loss', patience=3, verbose=1)

model.fit(x=[training_encoder_input, training_decoder_input], y=[training_decoder_output],
      validation_split=0.11,
      verbose=1,
      batch_size=64,
      epochs=2,
      callbacks=[earlystopper])

Train on 66822 samples, validate on 8260 samples
Epoch 1/2
Epoch 2/2


<keras.callbacks.History at 0x1827a74518>

In [90]:
def generate(text):
    encoder_input = transform(input_encoding, [text.lower()], 20)
    decoder_input = np.zeros(shape=(len(encoder_input), OUTPUT_LENGTH))
    decoder_input[:,0] = START_CHAR_CODE
    for i in range(1, OUTPUT_LENGTH):
        output = model.predict([encoder_input, decoder_input]).argmax(axis=2)
        decoder_input[:,i] = output[:,i]
    return decoder_input[:,1:]

def decode(decoding, sequence):
    text = ''
    for i in sequence:
        if i == 0:
            break
        text += output_decoding[i]
    return text

def to_katakana(text):
    decoder_output = generate(text)
    return decode(output_decoding, decoder_output[0])

In [91]:
common_american_names = ['James', 'John', 'Robert', 'Mary', 'Patricia', 'Linda']
for name in common_american_names:
    print(name, to_katakana(name))

James ジャメス
John ジョン
Robert ロベルト
Mary マリー
Patricia パトリカイ
Linda リンダ
