In [1]:
from keras.preprocessing.sequence import pad_sequences
from keras.layers import Embedding, Input, LSTM, Dense
from keras.models import Model, load_model
import numpy as np

Using TensorFlow backend.


In [2]:
import nltk
import jieba

In [3]:
def use_gpu(gpu_id):
    import os
    os.environ['CUDA_DEVICE_ORDER'] = 'PCI_BUS_ID'
    os.environ['CUDA_VISIBLE_DEVICES'] = str(gpu_id)

In [4]:
# for multiple gpus
use_gpu(3)

In [5]:
DATA = 'cmn.txt'
ENG_EMBEDDING = 'glove.840B.300d.txt'
EMBEDDING_DIM = 300
LATENT_DIM = 256

In [6]:
BATCH_SIZE = 64
EPOCHS = 100
SAVED_MODEL = 'nmt.h5'

In [None]:
def download_word_embedding():
    import os
    if not os.path.exists(ENG_EMBEDDING):
        if not os.path.exists('glove.840B.300d.zip'):
            os.system('wget http://nlp.stanford.edu/data/glove.840B.300d.zip')
        os.system('unzip cmn-eng.zip')

In [None]:
def download_dataset():
    import os
    if not os.path.exists(DATA):
        if not os.path.exists('cmn-eng.zip'):
            os.system('wget http://www.manythings.org/anki/cmn-eng.zip')
        os.system('unzip cmn-eng.zip')

In [7]:
def load_word_embedding(filename, dimension):
    embeddings_index = {}
    with open(filename, encoding='utf-8') as f:
        for line in f:
            values = line.split()
            word = ''.join(values[:-dimension])
            embeddings_index[word] = np.asarray(values[-dimension:], dtype='float32')
    return embeddings_index

In [8]:
def load_dataset(filename):
    with open(filename, encoding='utf-8') as f:
        lines = [line.strip().split('\t') for line in f]
        return zip(*lines)

In [9]:
def get_word_index(lines, tokenizer, append_start=False, append_end=False):
    # default word index (stopping token is 0)
    word_index = {
        'unk': 1,
        '<S>': 2,
    }
    index = 3
    # each line to a sequence of index
    seq = []
    for line in lines:
        preprocessed_line = tokenizer(line)
        if append_start:
            preprocessed_line = ['<S>'] + preprocessed_line
        if append_end:
            preprocessed_line = preprocessed_line + ['<S>']
        # register in word_index
        for word in preprocessed_line:
            if word not in word_index:
                word_index[word] = index
                index += 1
        seq.append([word_index[word] for word in preprocessed_line])
    # pad sequence with 0 (stopping token)
    data = pad_sequences(seq, padding='post')
    return data, word_index

## Load Data

In [None]:
# download_word_embedding()
# download_dataset()

In [10]:
embeddings_index = load_word_embedding(ENG_EMBEDDING, EMBEDDING_DIM)

In [11]:
en_lines, ch_lines = load_dataset(DATA)

## English

In [12]:
en_data, en_word_index = get_word_index(en_lines, nltk.word_tokenize, append_end=True)

In [13]:
# eng embedding
zeros = np.zeros(EMBEDDING_DIM)
embedding_matrix = np.zeros((len(en_word_index) + 1, EMBEDDING_DIM))
for word, i in en_word_index.items():
    embedding_matrix[i] = embeddings_index.get(word, zeros)

In [14]:
embedding_1 = Embedding(*embedding_matrix.shape,
                        weights=[embedding_matrix],
                        mask_zero=True,
                        trainable=False)

## Chinese

In [15]:
ch_data, ch_word_index = get_word_index(ch_lines, jieba.lcut, append_start=True)

Building prefix dict from the default dictionary ...
Loading model from cache /tmp/jieba.cache
Dumping model to file cache /tmp/jieba.cache
Dump cache file failed.
Traceback (most recent call last):
  File "/home/chengscott/venv-nlp/lib/python3.6/site-packages/jieba/__init__.py", line 152, in initialize
    _replace_file(fpath, cache_file)
PermissionError: [Errno 1] Operation not permitted: '/tmp/tmpe5fm0g54' -> '/tmp/jieba.cache'
Loading model cost 1.901 seconds.
Prefix dict has been built succesfully.


In [16]:
target_data, _ = get_word_index(ch_lines, jieba.lcut, append_end=True)
target_data = target_data[:, :, np.newaxis]

In [17]:
# ch embedding
embedding_matrix = np.random.random(size=(len(ch_word_index) + 1, EMBEDDING_DIM))
embedding_matrix[0] = np.zeros(EMBEDDING_DIM)

In [18]:
embedding_2 = Embedding(*embedding_matrix.shape,
                        weights=[embedding_matrix],
                        mask_zero=True,
                        trainable=True)

## NMT Training Model

In [19]:
max_en_seq_len = en_data.shape[0]
max_ch_seq_len = ch_data.shape[0]
num_decoder_tokens = len(ch_word_index)

In [20]:
en_seq_inputs = Input(shape=(None,))
encoder_inputs = embedding_1(en_seq_inputs)

In [21]:
ch_seq_inputs = Input(shape=(None,))
decoder_inputs = embedding_2(ch_seq_inputs)

In [22]:
encoder = LSTM(LATENT_DIM, return_state=True)
encoder_outputs, state_h, state_c = encoder(encoder_inputs)
encoder_states = [state_h, state_c]

In [23]:
decoder_lstm = LSTM(LATENT_DIM, return_sequences=True, return_state=True)
decoder_outputs, _, _ = decoder_lstm(decoder_inputs, initial_state=encoder_states)

In [24]:
decoder_dense = Dense(num_decoder_tokens, activation='softmax')
dense_outputs = decoder_dense(decoder_outputs)

In [25]:
model = Model([en_seq_inputs, ch_seq_inputs], dense_outputs)

In [26]:
model.summary()

__________________________________________________________________________________________________
Layer (type)                    Output Shape         Param #     Connected to                     
input_1 (InputLayer)            (None, None)         0                                            
__________________________________________________________________________________________________
input_2 (InputLayer)            (None, None)         0                                            
__________________________________________________________________________________________________
embedding_1 (Embedding)         (None, None, 300)    2067000     input_1[0][0]                    
__________________________________________________________________________________________________
embedding_2 (Embedding)         (None, None, 300)    4095300     input_2[0][0]                    
__________________________________________________________________________________________________
lstm_1 (LS

## NMT Training

In [28]:
# Run training
model.compile(optimizer='adadelta', loss='sparse_categorical_crossentropy')
model.fit([en_data, ch_data], target_data,
          batch_size=BATCH_SIZE,
          epochs=EPOCHS,
          validation_split=0.2)

Train on 16235 samples, validate on 4059 samples
Epoch 1/100
Epoch 2/100
Epoch 3/100
Epoch 4/100
Epoch 5/100
Epoch 6/100
Epoch 10/100
Epoch 11/100
Epoch 12/100
Epoch 13/100
Epoch 14/100
Epoch 15/100
Epoch 16/100
Epoch 17/100
Epoch 18/100
Epoch 19/100
Epoch 20/100
Epoch 21/100
  960/16235 [>.............................] - ETA: 42s - loss: 0.7674

IOPub message rate exceeded.
The notebook server will temporarily stop sending output
to the client in order to avoid crashing it.
To change this limit, set the config variable
`--NotebookApp.iopub_msg_rate_limit`.

Current values:
NotebookApp.iopub_msg_rate_limit=1000.0 (msgs/sec)
NotebookApp.rate_limit_window=3.0 (secs)



Epoch 27/100
Epoch 28/100
Epoch 29/100
Epoch 30/100
Epoch 31/100
Epoch 32/100
Epoch 33/100
Epoch 34/100
Epoch 35/100
Epoch 36/100
Epoch 37/100
Epoch 38/100
Epoch 39/100
Epoch 40/100
Epoch 41/100
Epoch 42/100
Epoch 43/100
Epoch 44/100
Epoch 45/100
Epoch 46/100
Epoch 47/100
Epoch 48/100
Epoch 49/100
Epoch 50/100
Epoch 51/100
Epoch 52/100
Epoch 53/100
Epoch 54/100
Epoch 55/100
Epoch 56/100
Epoch 57/100
Epoch 58/100
Epoch 59/100
Epoch 60/100
Epoch 61/100
Epoch 62/100
Epoch 63/100
Epoch 64/100
Epoch 65/100
Epoch 66/100
Epoch 67/100
Epoch 68/100
Epoch 69/100
Epoch 70/100
Epoch 71/100
Epoch 72/100
Epoch 73/100
Epoch 74/100
Epoch 75/100
Epoch 76/100
Epoch 77/100
Epoch 78/100
Epoch 79/100
Epoch 80/100
Epoch 81/100
Epoch 82/100
Epoch 83/100
Epoch 84/100
Epoch 88/100
Epoch 89/100
Epoch 90/100
Epoch 91/100
Epoch 92/100
Epoch 93/100
Epoch 94/100
Epoch 95/100
Epoch 96/100
Epoch 97/100
Epoch 98/100
Epoch 99/100
Epoch 100/100


<keras.callbacks.History at 0x7f76c27cde80>

In [29]:
model.save(SAVED_MODEL)

  '. They will not be included '


## NMT Inference Model

In [30]:
model = load_model(SAVED_MODEL)

In [31]:
en_seq_inputs = model.input[0]
encoder_inputs = model.layers[2](en_seq_inputs)

In [32]:
ch_seq_inputs = model.input[1]
decoder_inputs = model.layers[3](ch_seq_inputs)

In [33]:
encoder = model.layers[4]
encoder_outputs, state_h, state_c = encoder(encoder_inputs)
encoder_states = [state_h, state_c]

In [34]:
encoder_model = Model(en_seq_inputs, encoder_states)

In [35]:
encoder_model.summary()

_________________________________________________________________
Layer (type)                 Output Shape              Param #   
input_1 (InputLayer)         (None, None)              0         
_________________________________________________________________
embedding_1 (Embedding)      (None, None, 300)         2067000   
_________________________________________________________________
lstm_1 (LSTM)                [(None, 256), (None, 256) 570368    
Total params: 2,637,368
Trainable params: 570,368
Non-trainable params: 2,067,000
_________________________________________________________________


In [36]:
decoder_state_input_h = Input(shape=(LATENT_DIM,), name='input_h')
decoder_state_input_c = Input(shape=(LATENT_DIM,), name='input_c')
decoder_states_inputs = [decoder_state_input_h, decoder_state_input_c]

In [37]:
decoder_lstm = model.layers[5]
decoder_outputs, state_h_dec, state_c_dec = decoder_lstm(decoder_inputs,
                                     initial_state=decoder_states_inputs)
decoder_states = [state_h_dec, state_c_dec]

In [38]:
decoder_dense = model.layers[6]
dense_outputs = decoder_dense(decoder_outputs)

In [39]:
decoder_model = Model(
    [ch_seq_inputs] + decoder_states_inputs,
    [dense_outputs] + decoder_states)

In [40]:
decoder_model.summary()

__________________________________________________________________________________________________
Layer (type)                    Output Shape         Param #     Connected to                     
input_2 (InputLayer)            (None, None)         0                                            
__________________________________________________________________________________________________
embedding_2 (Embedding)         (None, None, 300)    4095300     input_2[0][0]                    
__________________________________________________________________________________________________
input_h (InputLayer)            (None, 256)          0                                            
__________________________________________________________________________________________________
input_c (InputLayer)            (None, 256)          0                                            
__________________________________________________________________________________________________
lstm_2 (LS

## NMT Inference

In [44]:
ch_index_word = {c: i for i, c in ch_word_index.items()}

In [47]:
def nmt_inference(input_seq):
    states_value = encoder_model.predict(input_seq)
    # <s> starting word
    target_seq = np.array([[2.]])
    decoded_sentence = []
    while True:
        output_tokens, h, c = decoder_model.predict([target_seq] + states_value)
        output_index = np.argmax(output_tokens[0, -1, :])
        word = ch_index_word[output_index]
        decoded_sentence.append(word)
        if word == '<S>':
            break
        # update states
        target_seq = np.array([[output_index]])
        states_value = [h, c]
    return ''.join(decoded_sentence)

In [48]:
line_no = [4077, 2122, 3335, 1464, 8956, 7168, 3490, 4495, 5100, 119]
line_no = [n - 1 for n in line_no]
for n in line_no:
    input_seq = en_data[n: n + 1]
    decoded_sentence = nmt_inference(input_seq)
    print('-')
    print('Input sentence:', en_lines[n], sep='\t')
    # print('Train sentence:', ch_lines[n], sep='\t')
    print('Decoded sentence:', decoded_sentence, sep='\t')

-
Input sentence:	He is afraid of snakes.
Decoded sentence:	他害怕蛇。<S>
-
Input sentence:	I miss you so much.
Decoded sentence:	好想見到你<S>
-
Input sentence:	We're going by train.
Decoded sentence:	我们要乘火车去。<S>
-
Input sentence:	The sky is clear.
Decoded sentence:	天空很晴朗。<S>
-
Input sentence:	Wearing a suit, he stood out.
Decoded sentence:	他穿著西裝站了出來。<S>
-
Input sentence:	She made a serious mistake.
Decoded sentence:	她犯了一個嚴重的錯誤。<S>
-
Input sentence:	Have you eaten dinner?
Decoded sentence:	你吃晚飯了嗎？<S>
-
Input sentence:	What do you want to be?
Decoded sentence:	你想成为什么？<S>
-
Input sentence:	Tom is going to help us.
Decoded sentence:	汤姆要帮助我们。<S>
-
Input sentence:	He's lazy.
Decoded sentence:	他很懒。<S>
