# English to Katakana seq2seq model (credit to wanasit)

The data is a .csv file, containing english words on the first column and their translation in katakana in the second column (built by wanasit, see https://wanasit.github.io/english-to-katakana-using-sequence-to-sequence-in-keras.html for more details). We're going to build a model that automatically does the conversion from english to katakana.

In [48]:
import os
import pandas as pd
import numpy as np

from keras.layers import Input, Embedding, LSTM, TimeDistributed, Dense
from keras.models import Model, load_model

In [49]:
data = pd.read_csv('./trainingdata/joined_titles.csv', header=None)
print(data.sample(20, random_state=1))

                        0                1
76132           Chewbacca           チューバッカ
88968         Marin Bikes         マリン・バイクス
12942             Greyout           グレイアウト
58630   Georges Rodenbach    ジョルジュ・ローデンバック
20202        Mike Maignan         マイク・メニャン
44311    Jacob Ellehammer        ヤコブ・エレハマー
90928        Samurai Jack         サムライジャック
23014          Jony López  ジョナタン・ロペス・ロドリゲス
44733          Ben Revere           ベン・リビア
63059       Streptokinase        ストレプトキナーゼ
16678              Ossian             オシアン
92801   Gouverneur Morris        ガバヌーア・モリス
61466           Stralsund        シュトラールズント
102620   Friedemann Layer      フリーデマン・レイヤー
32974      Dorothy Malone        ドロシー・マローン
43889           Sally Yeh          サリー・イップ
88822            Parndorf           パルンドルフ
60547            Landshut           ランツフート
86368      Heike Makatsch         ハイケ・マカチュ
12315               Evraz        エブラズ・グループ


Let's turn this data into a X and Y vectors for training first.

In [50]:
X = [word.lower() for word in data[0]]
Y = [word for word in data[1]]

We're not done yet. Our model only takes numerical data and we cannot input strings directly. We have to build a sort of encoding for each character in english and in katakana, but also store the way to decode the characters so we can read the output of the model at the end. Also remember that a model only takes input with same size, so we have to use padding. Let's use 0 for padding, 1 for start of sequence, and just code all the characters as int based on the order they appeared.

In [51]:
SOW_token = 1


class CharEncoder:
    def __init__(self, name):
        self.name = name
        self.char2index = {} #the index will be our encoding of the char
        self.char2count = {}
        self.index2char = {1: "SOS"}
        self.n_chars = 2  # Count SOS 

    def addWord(self, word):
        for char in word:
            self.addChar(char)

    def addChar(self, char):
        if char not in self.char2index:
            self.char2index[char] = self.n_chars
            self.char2count[char] = 1
            self.index2char[self.n_chars] = char
            self.n_chars += 1
        else:
            self.char2count[char] += 1

In [52]:
englishEncoder = CharEncoder("english")
katakanaEncoder = CharEncoder("katakana")
for word in X:
    englishEncoder.addWord(word)
for word in Y:
    katakanaEncoder.addWord(word)


In [53]:
print(katakanaEncoder.char2index)
print(englishEncoder.index2char)

{'ア': 2, 'ン': 3, 'ス': 4, 'ク': 5, 'ー': 6, 'リ': 7, 'グ': 8, 'ロ': 9, 'ヴ': 10, 'ォ': 11, 'シ': 12, 'ツ': 13, 'ェ': 14, 'ミ': 15, 'ル': 16, 'ヒ': 17, 'ユ': 18, 'ァ': 19, 'ブ': 20, 'レ': 21, 'ビ': 22, 'ッ': 23, 'ラ': 24, 'サ': 25, 'パ': 26, 'マ': 27, 'ノ': 28, 'ザ': 29, 'ポ': 30, 'ト': 31, 'デ': 32, 'フ': 33, 'テ': 34, 'エ': 35, 'ヘ': 36, 'タ': 37, 'ィ': 38, 'バ': 39, 'キ': 40, 'ペ': 41, 'ソ': 42, 'ナ': 43, 'イ': 44, 'ゼ': 45, 'ョ': 46, 'ダ': 47, 'ゴ': 48, 'ボ': 49, 'カ': 50, 'ガ': 51, 'ハ': 52, 'ベ': 53, 'コ': 54, '・': 55, 'ヌ': 56, 'オ': 57, 'ネ': 58, 'ド': 59, 'ズ': 60, 'モ': 61, 'ウ': 62, 'ジ': 63, 'ニ': 64, 'ュ': 65, 'メ': 66, 'ゲ': 67, 'ャ': 68, 'ピ': 69, 'プ': 70, 'セ': 71, 'ゾ': 72, 'チ': 73, 'ヨ': 74, 'ギ': 75, 'ヤ': 76, 'ホ': 77, 'ム': 78, 'ゥ': 79, 'ワ': 80, 'ケ': 81, ' ': 82, 'ヅ': 83, 'ヂ': 84, 'ヱ': 85, 'ヮ': 86, 'ヰ': 87, 'ヲ': 88}
{1: 'SOS', 2: 'u', 3: 'n', 4: 's', 5: 'c', 6: 'h', 7: 'o', 8: 'l', 9: 'i', 10: 'g', 11: 'v', 12: 'e', 13: 'm', 14: 'j', 15: 'a', 16: 'b', 17: 'r', 18: ' ', 19: 'p', 20: 't', 21: 'd', 22: 'k', 23: 'y', 24: 'z', 25: 'f', 26: '

Now we have succesfully created our encoding based on the input data. We now have to convert the data using the encoding we created. We also have to pad both the input and the output.

In [54]:
def indexesFromWord(encoder, word, word_length):
    transformed_word = [0]*word_length
    for i in range(len(word)):
        transformed_word[i] = encoder.char2index[word[i]]
    return transformed_word

In [55]:
INPUT_LENGTH = max(len(word) for word in X)
OUTPUT_LENGTH = max(len(word) for word in Y)
X = [indexesFromWord(englishEncoder, word, INPUT_LENGTH) for word in X]
Y = [indexesFromWord(katakanaEncoder, word, OUTPUT_LENGTH) for word in Y]

In [56]:
print(X[0:2])

[[2, 3, 4, 5, 6, 7, 7, 8, 9, 3, 10, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0], [8, 7, 11, 7, 4, 9, 5, 12, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]]


**Encoder Model**

Now we can start building our model. We first need to turn our input vector to a dense vector. For this, we have to use and Embedding layer. We then add one or more LSTM layers. These layers will serve as our encoder.

In [57]:
encoder_input = Input(shape=(INPUT_LENGTH,))
encoder = Embedding(englishEncoder.n_chars, 64, input_length=INPUT_LENGTH, mask_zero=True)(encoder_input)
encoder = LSTM(64, return_sequences=False)(encoder)

**Decoder Model**

The decoder model will generate a sequence of Katakanas. Every generated character will be used as the input of the decoder to generate the next one.
The input will be passed to an Embedding layer to transform the input into dense vectors , just as we did for the encoder.
We also need the encoder output to initialize the decoder.
The final layer will be (time distributed) Dense layer that will produce the softmax prediction.

In [58]:
decoder_input = Input(shape=(OUTPUT_LENGTH,))
decoder = Embedding(katakanaEncoder.n_chars, 64, input_length=OUTPUT_LENGTH, mask_zero=True)(decoder_input)
decoder = LSTM(64, return_sequences=True)(decoder, initial_state=[encoder, encoder])
decoder = TimeDistributed(Dense(katakanaEncoder.n_chars, activation="softmax"))(decoder)

What we need to do in order to use this decoder model is to have a one-hot encoded structure for the softmax prediction. Also, we need to add the start sequence char at the beginning of every word in order to have our first input character for the decoder. We will also use crossvalidation to prevent overfitting.

In [59]:
from sklearn.model_selection import train_test_split
X_train, X_val, Y_train, Y_val = train_test_split(X, Y, test_size = 0.1, random_state=2)

In [60]:
# Encoder Input Train
encoder_input_train = X_train

# Decoder Input Train
decoder_input_train = np.zeros_like(Y_train)
decoder_input_train[:][1:] = Y_train[:][:-1]
decoder_input_train[:][0] = SOW_token

# Decoder OutputTrain
decoder_output_train = np.eye(katakanaEncoder.n_chars)[Y_train]

In [61]:
# Encoder Input Validation
encoder_input_validation = X_val

# Decoder Input Validation
decoder_input_validation = np.zeros_like(Y_val)
decoder_input_validation[:][1:] = Y_val[:][:-1]
decoder_input_validation[:][0] = SOW_token

# Decoder Output Validation
decoder_output_validation = np.eye(katakanaEncoder.n_chars)[Y_val]

And we're done! We can now train our model.

**Model training**

In [62]:
from keras.optimizers import RMSprop
from keras.callbacks import ReduceLROnPlateau, ModelCheckpoint
from keras.callbacks import CSVLogger, ModelCheckpoint, EarlyStopping
from keras.callbacks import ReduceLROnPlateau

if os.path.isfile('model.h5'):
    model = load_model('model.h5')
else:
    model = Model(inputs=[encoder_input, decoder_input], outputs=[decoder])
    model.compile(optimizer='adam', loss='categorical_crossentropy')
    model.fit(x=[encoder_input_train, decoder_input_train], y=[decoder_output_train],
              validation_data=([encoder_input_validation, decoder_input_validation], [decoder_output_validation]),
                                epochs = 60,
                                 batch_size = 64)
   
    
model.save('model.h5')

Train on 96534 samples, validate on 10727 samples
Epoch 1/60
Epoch 2/60
Epoch 3/60
21632/96534 [=====>........................] - ETA: 8:51 - loss: 2.6405

MemoryError: 