In [None]:
# Mount Google Drive (Colab only): the dataset and the saved model used below
# are read from paths under /content/drive.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


### Data preprocessing

In [None]:
from tensorflow.keras.preprocessing import sequence
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.utils import to_categorical

In [None]:
import pickle as pkl
import numpy as np

In [None]:
# Load the pickled sentence pairs; the cells below read pair[0] as the source
# sentence and pair[1] as the target sentence.
# NOTE(review): pickle.load can execute arbitrary code while unpickling, so
# only load this file if it comes from a trusted source.
with open("/content/drive/MyDrive/cmn-eng/eng-cn.pkl", "rb") as f:
    seq_pairs = pkl.load(f)

In [None]:
# RAM is limited, so only the first N_TRAIN_PAIRS sentence pairs of the
# dataset are used (10,000, as before). Change the constant to use more or fewer.
N_TRAIN_PAIRS = 10000
selected_pairs = seq_pairs[:N_TRAIN_PAIRS]   # slice once, reuse for both sides
src_sentences = [pair[0] for pair in selected_pairs]
tgt_sentences = [pair[1] for pair in selected_pairs]

In [None]:
src_sentences[:5]

['<START> hi . <END>',
 '<START> hi . <END>',
 '<START> run . <END>',
 '<START> stop ! <END>',
 '<START> wait ! <END>']

In [None]:
tgt_sentences[:5]

['<START> 嗨 。 <END>',
 '<START> 你 好 。 <END>',
 '<START> 你 用 跑 的 。 <END>',
 '<START> 住 手 ！ <END>',
 '<START> 等 等 ！ <END>']

In [None]:
def create_tokeniser(sentences):
    """
    Fit a tokeniser on `sentences` and convert them to sequences of token ids.

    Parameters
    ----------
    sentences : list of str
        The texts to fit on and encode.

    Returns
    -------
    tuple
        (sequences, tokeniser) where `sequences` is the result of
        `tokeniser.texts_to_sequences(sentences)` and `tokeniser` is the
        fitted `Tokenizer`, whose `word_index` holds the vocabulary.
    """
    # create a tokeniser specific to texts
    tokeniser = Tokenizer(filters = ' ')
    tokeniser.fit_on_texts(sentences)

    # Encode the corpus exactly once; the preview and the return value share it
    # (re-encoding every sentence for each preview line was wasted work).
    sequences = tokeniser.texts_to_sequences(sentences)

    # preview up to the first 3 sentences next to their token ids;
    # slicing keeps this safe when fewer than 3 sentences are given
    for sentence, sequence in zip(sentences[:3], sequences[:3]):
        print("original: {} - word tokenised: {}".format(sentence, sequence))

    return sequences, tokeniser

# word tokenise source and target sentences
# (each side gets its own tokeniser, so the two vocabularies are independent)
src_word_tokenised, src_tokeniser = create_tokeniser(src_sentences)
tgt_word_tokenised, tgt_tokeniser = create_tokeniser(tgt_sentences)

original: <START> hi . <END> - word tokenised: [1, 730, 3, 2]
original: <START> hi . <END> - word tokenised: [1, 730, 3, 2]
original: <START> run . <END> - word tokenised: [1, 322, 3, 2]
original: <START> 嗨 。 <END> - word tokenised: [1, 1284, 3, 2]
original: <START> 你 好 。 <END> - word tokenised: [1, 6, 25, 3, 2]
original: <START> 你 用 跑 的 。 <END> - word tokenised: [1, 6, 138, 268, 7, 3, 2]


In [None]:
# source and target vocabulary dictionaries
src_vocab_dict = src_tokeniser.word_index  # maps every word to its token ID
tgt_vocab_dict = tgt_tokeniser.word_index

# + 1 leaves room for id 0, which no word uses and which the padding step
# below fills in (the preview above shows word ids starting at 1)
src_vocab_size = len(src_vocab_dict) + 1 # 3080 tokens in total
tgt_vocab_size = len(tgt_vocab_dict) + 1 # 2455 tokens in total

In [None]:
tgt_vocab_size

2455

In [None]:
# length (in tokens) of the longest tokenised sentence on each side
src_max_seq_length = max(map(len, src_word_tokenised)) # 11  (longest source sentence)
tgt_max_seq_length = max(map(len, tgt_word_tokenised)) # 22  (longest target sentence)

In [None]:
tgt_max_seq_length

22

In [None]:
# Bring every source / target sequence to a common length by filling the end
# with zeros, then append one trailing axis of size 1.
# NOTE(review): these *_padded arrays are not referenced again in the visible
# part of this notebook.
src_sentences_padded = np.expand_dims(
    pad_sequences(src_word_tokenised, maxlen = src_max_seq_length, padding = "post"),  # shape: (10000, 11)
    axis = -1,
)  # shape: (10000, 11, 1)
tgt_sentences_padded = np.expand_dims(
    pad_sequences(tgt_word_tokenised, maxlen = tgt_max_seq_length, padding = "post"),  # shape: (10000, 22)
    axis = -1,
)  # shape: (10000, 22, 1)

In [None]:
tgt_sentences_padded.shape

(10000, 22, 1)

In [None]:
def encode_input_sequences(tokeniser, max_seq_length, sentences):
    """
    Build the integer feature matrix X for a list of sentences.

    Every sentence is converted to token ids with `tokeniser` and the
    resulting sequences are passed to `pad_sequences` with
    `maxlen = max_seq_length` and padding applied at the end ("post").
    """
    token_id_sequences = tokeniser.texts_to_sequences(sentences)
    return pad_sequences(token_id_sequences, maxlen = max_seq_length, padding = "post")


def encode_output_labels(sequences, vocab_size):
    """
    One-hot encode target sequences to create labels y

    Parameters
    ----------
    sequences : array-like of int, shape (n_samples, seq_length)
        Token ids. A trailing axis of size 1, i.e. (n_samples, seq_length, 1),
        is also accepted and dropped.
    vocab_size : int
        Number of classes; every id must be smaller than this.

    Returns
    -------
    numpy.ndarray of float32, shape (n_samples, seq_length, vocab_size)
        y[i, t, k] is 1.0 where sequences[i, t] == k and 0.0 elsewhere.

    Raises
    ------
    ValueError
        If `sequences` is not 2-D (after dropping a trailing size-1 axis).
    IndexError
        If an id is out of range for `vocab_size`.
    """
    token_ids = np.asarray(sequences).astype(np.int64, copy = False)
    if token_ids.ndim == 3 and token_ids.shape[-1] == 1:
        token_ids = token_ids[..., 0]
    if token_ids.ndim != 2:
        raise ValueError(
            "sequences must have shape (n_samples, seq_length), got {}".format(token_ids.shape)
        )

    n_samples, seq_length = token_ids.shape
    # Allocate the result once, already in float32, and switch on the hot
    # positions in place. Building one array per sentence in a list and then
    # copying them all into a second array keeps two full copies alive at once.
    y = np.zeros((n_samples, seq_length, vocab_size), dtype = np.float32)
    rows = np.arange(n_samples)[:, np.newaxis]
    cols = np.arange(seq_length)[np.newaxis, :]
    y[rows, cols, token_ids] = 1.0
    return y

# create encoder inputs, decoder inputs and decoder outputs    # n_samples = 10000
enc_inputs = encode_input_sequences(src_tokeniser, src_max_seq_length, src_sentences) # shape: (n_samples, src_max_seq_length)
dec_inputs = encode_input_sequences(tgt_tokeniser, tgt_max_seq_length, tgt_sentences) # shape: (n_samples, tgt_max_seq_length)

# Teacher forcing: the label at step t must be the decoder input at step t + 1
# (the next token). Using the very same sequence as both decoder input and
# label lets the model reach a perfect score by copying its input, without
# learning to translate. Shift left by one and leave the freed last position
# as 0 (padding) so the length stays tgt_max_seq_length.
# NOTE: arrays already saved to the .npz keep the old labels until the file is
# regenerated, and a model trained on them must be retrained.
dec_targets = np.zeros_like(dec_inputs)
dec_targets[:, :-1] = dec_inputs[:, 1:]
dec_outputs = encode_output_labels(dec_targets, tgt_vocab_size) # shape: (n_samples, tgt_max_seq_length, tgt_vocab_size)

In [None]:
dec_outputs[0].shape

(22, 2455)

### Save dataset

In [None]:
# save required data to a compressed file
# (currently disabled: the call is wrapped in a string literal, so running this
#  cell writes nothing; unwrap it to regenerate the .npz loaded in the next section)
'''
np.savez_compressed("/content/drive/MyDrive/cmn-eng/eng-cn_data.npz", enc_inputs = enc_inputs, dec_inputs = dec_inputs, dec_outputs = dec_outputs, src_vocab_size = src_vocab_size)
'''

'\nnp.savez_compressed("/content/drive/MyDrive/cmn-eng/eng-cn_data.npz", enc_inputs = enc_inputs, dec_inputs = dec_inputs, dec_outputs = dec_outputs, src_vocab_size = src_vocab_size)\n'

### Create train data and test data

In [None]:
# Load the saved arrays. The archive is opened in a `with` block so the
# underlying file is closed once the arrays have been read into memory.
with np.load("/content/drive/MyDrive/cmn-eng/eng-cn_data.npz") as data:
    print(data.files) # ['enc_inputs', 'dec_inputs', 'dec_outputs', 'src_vocab_size']

    # Extract our desired data
    enc_inputs = data["enc_inputs"]
    dec_inputs = data["dec_inputs"]
    dec_outputs = data["dec_outputs"]
    src_vocab_size = data["src_vocab_size"].item(0)  #type is int 3080

['enc_inputs', 'dec_inputs', 'dec_outputs', 'src_vocab_size']


In [None]:
# shuffle X and y in unison
# A fixed seed makes the permutation, and therefore the train/test split made
# from it, identical on every run.
# NOTE(review): the pre-trained model loaded later may have been trained on a
# different split, in which case some test rows were seen in training — confirm.
SHUFFLE_SEED = 42
shuffler = np.random.default_rng(SHUFFLE_SEED).permutation(enc_inputs.shape[0])
enc_inputs = enc_inputs[shuffler]
dec_inputs = dec_inputs[shuffler]
dec_outputs = dec_outputs[shuffler]

In [None]:
from sklearn.model_selection import train_test_split


# prepare training and test data
# All three arrays are split in one call and in their current order
# (shuffle = False), so row i still refers to the same sentence pair in each.
test_ratio = .2
(
    enc_inputs_train, enc_inputs_test,
    dec_inputs_train, dec_inputs_test,
    y_train, y_test,
) = train_test_split(enc_inputs, dec_inputs, dec_outputs, test_size = test_ratio, shuffle = False)
X_train = [enc_inputs_train, dec_inputs_train]
X_test = [enc_inputs_test, dec_inputs_test]

### Create Model

In [None]:
from tensorflow.keras.layers import Input, LSTM, Embedding, Dense, TimeDistributed, Activation, dot, concatenate
from tensorflow.keras.models import Model
import tensorflow as tf

In [None]:
from tensorflow.keras.callbacks import ModelCheckpoint, EarlyStopping
import matplotlib.pyplot as plt

In [None]:
# Re-derive the sequence lengths and the target vocabulary size from the
# shapes of the loaded arrays (this overwrites the values computed during
# preprocessing).
src_max_seq_length = enc_inputs.shape[1]
tgt_max_seq_length = dec_outputs.shape[1]
tgt_vocab_size = dec_outputs.shape[2]

In [None]:
# hyperparameters
src_wordEmbed_dim = 96   # output_dim of the encoder Embedding layer
tgt_wordEmbed_dim = 100  # output_dim of the decoder Embedding layer
latent_dim = 256         # units of every LSTM layer and of the attentional Dense layer

def build_seq2seq(src_max_seq_length, src_vocab_size, src_wordEmbed_dim, tgt_max_seq_length, tgt_vocab_size, tgt_wordEmbed_dim, latent_dim, model_name = None):
    """
    Builds and compiles an LSTM seq2seq model with Luong attention

    Parameters
    ----------
    src_max_seq_length, tgt_max_seq_length : int
        Fixed length of the encoder / decoder input sequences.
    src_vocab_size, tgt_vocab_size : int
        input_dim of the encoder / decoder embedding; tgt_vocab_size is also
        the size of the output softmax.
    src_wordEmbed_dim, tgt_wordEmbed_dim : int
        output_dim of the encoder / decoder embedding.
    latent_dim : int
        Units of each LSTM layer and of the attentional Dense layer.
    model_name : str, optional
        Name given to the Keras model.

    Returns
    -------
    A compiled Keras Model mapping [encoder token ids, decoder token ids] to a
    softmax over the target vocabulary at every decoder timestep.

    NOTE(review): the embeddings do not mask the padding id and the loss /
    accuracy are unmasked, so padded positions count towards both.
    """
    # Build an encoder: embedding followed by two stacked LSTMs
    enc_inputs = Input(shape = (src_max_seq_length, ))
    enc_vectors = Embedding(input_dim = src_vocab_size, output_dim = src_wordEmbed_dim, name = "embedding_enc")(enc_inputs)
    enc_outputs_1, enc_h1, enc_c1 = LSTM(latent_dim, return_sequences = True, return_state = True, name = "1st_layer_enc_LSTM")(enc_vectors)
    enc_outputs_2, enc_h2, enc_c2 = LSTM(latent_dim, return_sequences = True, return_state = True, name = "2nd_layer_enc_LSTM")(enc_outputs_1)

    # Build a decoder: each decoder LSTM starts from the final state of the
    # encoder LSTM at the same depth
    dec_inputs = Input(shape = (tgt_max_seq_length, ))
    dec_vectors = Embedding(input_dim = tgt_vocab_size, output_dim = tgt_wordEmbed_dim, name = "embedding_dec")(dec_inputs)
    # the first decoder layer's own final states are not needed afterwards
    dec_outputs_1, _, _ = LSTM(latent_dim, return_sequences = True, return_state = True, name = "1st_layer_dec_LSTM")(dec_vectors, initial_state = [enc_h1, enc_c1])
    dec_outputs_2 = LSTM(latent_dim, return_sequences = True, return_state = False, name = "2nd_layer_dec_LSTM")(dec_outputs_1, initial_state = [enc_h2, enc_c2])

    # evaluate attention score: dot product of every decoder step with every encoder step
    attention_scores = dot([dec_outputs_2, enc_outputs_2], axes = [2, 2])   # (batch, tgt_len, src_len)
    # softmax acts on the last axis, i.e. over the source positions
    attention_weights = Activation("softmax")(attention_scores)
    # weighted sum of the encoder outputs for every decoder step
    context_vec = dot([attention_weights, enc_outputs_2], axes = [2, 1])    # (batch, tgt_len, latent_dim)
    ht_context_vec = concatenate([context_vec, dec_outputs_2], name = "concatentated_vector")
    attention_vec = Dense(latent_dim, use_bias = False, activation = "tanh", name = "attentional_vector")(ht_context_vec)
    logits = TimeDistributed(Dense(tgt_vocab_size))(attention_vec)
    dec_outputs_final = Activation("softmax", name = "softmax")(logits)

    # integrate as a model
    model = Model([enc_inputs, dec_inputs], dec_outputs_final, name = model_name)
    # compile model
    model.compile(
        optimizer = tf.keras.optimizers.Adam(learning_rate = 1e-3),
        loss = tf.keras.losses.CategoricalCrossentropy(),   #metrics=[masked_acc, masked_loss]
        metrics = [tf.keras.metrics.CategoricalAccuracy(name='acc')]
    )
    return model

# build our seq2seq model
# NOTE: this instance is untrained; the "Predict the model" section rebinds
# eng_cn_translator to a model loaded from disk.
eng_cn_translator = build_seq2seq(
    src_max_seq_length = src_max_seq_length,
    src_vocab_size = src_vocab_size,
    src_wordEmbed_dim = src_wordEmbed_dim,
    tgt_max_seq_length = tgt_max_seq_length,
    tgt_vocab_size = tgt_vocab_size,
    tgt_wordEmbed_dim = tgt_wordEmbed_dim,
    latent_dim = latent_dim,
    model_name = "eng-cn_translator_v1"
    )
eng_cn_translator.summary()

### Predict with the model

In [None]:
from tensorflow.keras.models import load_model

# load pre-trained model
# NOTE: this rebinds eng_cn_translator, replacing the untrained model built in
# the previous section.
# NOTE(review): the file is "eng-cn_translator_v2.keras" while the model built
# above is named "eng-cn_translator_v1" — confirm both use the same architecture.
eng_cn_translator = load_model('/content/drive/MyDrive/Colab Notebooks/eng-cmn project/Save_model/eng-cn_translator_v2.keras')

In [None]:
# Run the model over the whole test split in batches of 60, then show the
# raw output for the first test sentence.
trans_seqs = eng_cn_translator.predict(X_test, batch_size = 60, verbose = 1)
print(trans_seqs[0])

[1m34/34[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m10s[0m 249ms/step
[[3.2128755e-10 9.9999803e-01 3.4186824e-12 ... 1.4705118e-15
  5.8408575e-18 1.6769500e-16]
 [3.1389467e-11 4.7334123e-11 6.0229860e-10 ... 8.9220291e-14
  1.5915183e-14 5.7733267e-12]
 [3.7189602e-11 4.2853042e-13 3.0316141e-10 ... 4.4653253e-14
  6.2684561e-17 2.3180808e-12]
 ...
 [9.9999791e-01 2.2203528e-08 5.8465798e-07 ... 5.8586985e-14
  6.7402501e-17 1.7426949e-12]
 [9.9999791e-01 2.2180460e-08 5.8612119e-07 ... 5.8399554e-14
  6.7350329e-17 1.7424256e-12]
 [9.9999791e-01 2.2165869e-08 5.8596862e-07 ... 5.8348779e-14
  6.7333369e-17 1.7418043e-12]]


In [None]:
# Invert the vocabularies: word -> token id becomes token id -> word
reverse_src_vocab_dict = {token_id: word for word, token_id in src_vocab_dict.items()}
reverse_tgt_vocab_dict = {token_id: word for word, token_id in tgt_vocab_dict.items()}

In [None]:
def pred_seq(model, single_seq_pair, reverse_tgt_vocab_dict):
    """
    Predicts a single sentence
    ---------------------------
    model:
        object whose predict(inputs, verbose = 0) returns an array with one
        row of per-position class scores for every input sample
    single_seq_pair:
        sequence pair that is made up of only one source sequence and one target sequence [(src_max_seq_length, ), (tgt_max_seq_length, )]
        type: list of NumPy arrays
    reverse_tgt_vocab_dict:
        maps target token id -> token
        type: dict

    Returns the predicted tokens joined by single spaces. Decoding stops at
    the first predicted id that is not a key of reverse_tgt_vocab_dict
    (e.g. 0, which the ground-truth printing below also skips).
    """
    # verbose = 0: this is called once per sentence, so the default per-call
    # progress bar would flood the cell output
    probabilities = model.predict(single_seq_pair, verbose = 0)[0]
    # most likely class at every output position (label-encoded word ids)
    token_ids = np.argmax(probabilities, axis = -1)

    pred_tokens = []
    for token_id in token_ids:
        word = reverse_tgt_vocab_dict.get(int(token_id))
        if word is None:
            break
        pred_tokens.append(word)
    return ' '.join(pred_tokens)



In [None]:
# show source, reference and predicted sentence for the first 10 rows of X_test
for i in range(10):
    src_ids = X_test[0][i]
    tgt_ids = X_test[1][i]
    # ground truth sentences (id 0 entries are skipped)
    print("actual source sentence: {}".format([reverse_src_vocab_dict[tok] for tok in src_ids if tok != 0]))
    print("actual target sentence: {}".format([reverse_tgt_vocab_dict[tok] for tok in tgt_ids if tok != 0]))
    # slices of length 1 keep the leading batch axis that the model expects
    one_pair = [X_test[0][i:i+1], X_test[1][i:i+1]]
    print("predicted target sentence: {}".format(pred_seq(eng_cn_translator, one_pair, reverse_tgt_vocab_dict)))
    print("-" * 100)

actual source sentence: ['<start>', 'how', 'much', 'did', 'this', 'cost', '?', '<end>']
actual target sentence: ['<start>', '多', '少', '錢', '？', '<end>']
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 37ms/step
predicted target sentence: <start> 多 少 錢 ？ <end>
----------------------------------------------------------------------------------------------------
actual source sentence: ['<start>', 'keep', 'the', 'dog', 'out', '.', '<end>']
actual target sentence: ['<start>', '别', '让', '狗', '进', '来', '。', '<end>']
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 40ms/step
predicted target sentence: <start> 别 让 狗 进 来 。 <end>
----------------------------------------------------------------------------------------------------
actual source sentence: ['<start>', 'there', 'were', 'no', 'mistakes', '.', '<end>']
actual target sentence: ['<start>', '没', '有', '错', '误', '。', '<end>']
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 42ms/step
predicted target s

### Use BLEU score to evaluate model


In [None]:
from nltk.translate.bleu_score import corpus_bleu

In [None]:
X_train[0][1:2]

array([[  1, 236, 402,   7,  14,   5,   2,   0,   0,   0,   0]],
      dtype=int32)

In [None]:
def eval_NMT(model, X_input, seq_pairs_input, reverse_tgt_vocab_dict, n_samples = 10):
    """
    Evaluates trained NMT model on a given dataset
    ------------------------------------------------
    model:
        trained model, forwarded to pred_seq
    X_input:
        [enc_inputs, dec_inputs]
        data type: list of two numpy arrays of shape [(n_sentences, src_max_seq_length), (n_sentences, tgt_max_seq_length)]
    seq_pairs_input:
        source and target sentences
        data type: list of list of strings
        (not used by the evaluation; kept so existing calls keep working)
    reverse_tgt_vocab_dict:
        maps target token id -> token
    n_samples:
        number of sentences, taken from the start of X_input, to translate and
        score. Defaults to 10 (the previous fixed value); None scores every
        sentence. Capped at the number of sentences available.

    Prints the individual and cumulative 1- to 4-gram corpus BLEU scores and
    returns the default (cumulative 4-gram) corpus BLEU score.

    Source tokens are looked up in the notebook-level reverse_src_vocab_dict.
    NOTE(review): pred_seq receives the reference target as decoder input, so
    the scores describe teacher-forced predictions — confirm this is intended.
    """
    # Step 0: Check shape and decide how many sentences to evaluate
    print("shape of src_seqs: [{}, {}]".format(X_input[0].shape, X_input[1].shape)) # e.g. [(2000, 11), (2000, 22)]
    n_available = X_input[0].shape[0]
    n_eval = n_available if n_samples is None else min(n_samples, n_available)
    true, predicted = [], []

    # Step 1: Translate each sentence
    for i in range(n_eval):
        # Step 2: Prepare the model input for one sample (current sentence);
        # the length-1 slices keep the batch axis
        single_seq_pair = [X_input[0][i:i+1], X_input[1][i:i+1]]   # (source ids, target ids)
        # Step 3: Predict a single sample and creates a string of tokens
        translated_sentence = pred_seq(model, single_seq_pair, reverse_tgt_vocab_dict)

        # Step 4: Collect ground truth sentences and predicted sentences
        src_sentence = [reverse_src_vocab_dict[tok] for tok in X_input[0][i] if tok != 0]
        tgt_sentence = [reverse_tgt_vocab_dict[tok] for tok in X_input[1][i] if tok != 0]

        # lists translation results of first five sentences
        if i < 5:
            print("source: {}\ntarget: {}\ntranslated: {}".format(src_sentence, tgt_sentence, translated_sentence))

        # corpus_bleu accepts several reference translations per sentence,
        # hence the extra enclosing list around the single reference
        true.append([tgt_sentence])
        predicted.append(translated_sentence.split())

    # Step 5: Calculate corpus BLEU scores on the evaluated sentences
    ## Individual n-gram scores: all the weight on exactly one n-gram order
    individual_weights = [(1, 0, 0, 0), (0, 1, 0, 0), (0, 0, 1, 0), (0, 0, 0, 1)]
    for order, weights in enumerate(individual_weights, start = 1):
        print("Individual {}-gram score: {:.6f}".format(order, corpus_bleu(true, predicted, weights = weights)))

    ## Cumulative n-gram scores: equal weights over orders 1..n, summing to 1
    cumulative_weights = [(1, 0, 0, 0), (.5, .5, 0, 0), (1 / 3, 1 / 3, 1 / 3, 0), (.25, .25, .25, .25)]
    for order, weights in enumerate(cumulative_weights, start = 1):
        print("Cumulative {}-gram score: {:.6f}".format(order, corpus_bleu(true, predicted, weights = weights)))

    bleu_score = corpus_bleu(true, predicted)
    print("Corpus BLEU score: {:.6f}".format(bleu_score))
    return bleu_score

# evaluate model on the held-out test split (X_test)
eval_NMT(eng_cn_translator, X_test, seq_pairs, reverse_tgt_vocab_dict)

shape of src_seqs: [(2000, 11), (2000, 22)]
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 41ms/step
source: ['<start>', 'how', 'much', 'did', 'this', 'cost', '?', '<end>']
target: ['<start>', '多', '少', '錢', '？', '<end>']
translated: <start> 多 少 錢 ？ <end>
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 41ms/step
source: ['<start>', 'keep', 'the', 'dog', 'out', '.', '<end>']
target: ['<start>', '别', '让', '狗', '进', '来', '。', '<end>']
translated: <start> 别 让 狗 进 来 。 <end>
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 45ms/step
source: ['<start>', 'there', 'were', 'no', 'mistakes', '.', '<end>']
target: ['<start>', '没', '有', '错', '误', '。', '<end>']
translated: <start> 没 有 错 误 。 <end>
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 39ms/step
source: ['<start>', 'there', 'isn', "'t", 'any', 'soap', '.', '<end>']
target: ['<start>', '沒', '有', '任', '何', '肥', '皂', '。', '<end>']
translated: <start> 沒 有 任 何 肥 通 。 <end>
[1m1/1[0m [32m━━