In [1]:
import tensorflow as tf
import numpy as np
import unicodedata
import re
import matplotlib.pyplot as plt
import os
import imageio
import requests
from zipfile import ZipFile

In [34]:
# Mode can be either 'train' or 'infer'
# Set to 'infer' will skip the training
MODE = 'train'
URL = 'http://www.manythings.org/anki/fra-eng.zip'
FILENAME = 'yue-eng.zip'
BATCH_SIZE = 32
EMBEDDING_SIZE = 256
RNN_SIZE = 512
NUM_EPOCHS = 300

In [35]:
# Set the score function to compute alignment vectors
# Can choose between 'dot', 'general' or 'concat'
ATTENTION_FUNC = 'concat'

In [36]:
def maybe_download_and_read_file(url, filename):
    if not os.path.exists(filename):
        session = requests.Session()
        response = session.get(url, stream=True)

        CHUNK_SIZE = 32768
        with open(filename, "wb") as f:
            for chunk in response.iter_content(CHUNK_SIZE):
                if chunk:
                    f.write(chunk)

    zipf = ZipFile(filename)
    filename = zipf.namelist()
    with zipf.open('yue.txt') as f:
        lines = f.read()

    return lines

In [37]:
lines = maybe_download_and_read_file(URL, FILENAME)
lines = lines.decode('utf-8')

In [38]:
raw_data = []
for line in lines.split('\n'):
    raw_data.append(line.split('\t'))

print(raw_data[-5:])

[['Why do you insist on paying for your school expenses yourself, when your parents are willing to give you financial support?', '你阿爸阿媽都肯喺財政上支持你啦，點解你仲係都要自己俾學費啫？', 'CC-BY 2.0 (France) Attribution: tatoeba.org #325648 (CK) & #5777423 (nickyeow)'], ["You only notice those who are loud and obnoxious. You don't notice the majority who are just normal people and keep to themselves.", '你淨係會留意到嗰啲嘈嘈閉、好乞人憎嘅人，唔會留意到絕大部份安安靜靜咁自己做自己嘢嘅正常人。', 'CC-BY 2.0 (France) Attribution: tatoeba.org #6045678 (mailohilohi) & #6488761 (nickyeow)'], ['If you take a child outside and point at the moon, they will look at the moon. If you do the same thing with a dog, it will look at your finger.', '如果你帶個細路出街，指住個月亮，佢會望住個月亮。如果你對隻狗做同樣嘅嘢，佢會望住你隻手指。', 'CC-BY 2.0 (France) Attribution: tatoeba.org #5264594 (Cithara) & #5798410 (nickyeow)'], ["The people here are particular about what they eat, so even if a restaurant is inexpensive, it'll soon go out of business if the food doesn't taste good.", '呢到嘅人對食好講究，所以就算係幾平嘅餐廳都好，如果啲嘢食唔好食

In [39]:
# The last element is empty, so omit it
raw_data = raw_data[:-1]
print(raw_data[-5:])

[["When I was studying to become a lawyer, my teachers told me to never ask a question that I didn't know the answer to.", '我讀法律嗰陣，啲老師教我唔好問啲自己唔知道答案嘅問題。', 'CC-BY 2.0 (France) Attribution: tatoeba.org #1989692 (CK) & #5752926 (nickyeow)'], ['Why do you insist on paying for your school expenses yourself, when your parents are willing to give you financial support?', '你阿爸阿媽都肯喺財政上支持你啦，點解你仲係都要自己俾學費啫？', 'CC-BY 2.0 (France) Attribution: tatoeba.org #325648 (CK) & #5777423 (nickyeow)'], ["You only notice those who are loud and obnoxious. You don't notice the majority who are just normal people and keep to themselves.", '你淨係會留意到嗰啲嘈嘈閉、好乞人憎嘅人，唔會留意到絕大部份安安靜靜咁自己做自己嘢嘅正常人。', 'CC-BY 2.0 (France) Attribution: tatoeba.org #6045678 (mailohilohi) & #6488761 (nickyeow)'], ['If you take a child outside and point at the moon, they will look at the moon. If you do the same thing with a dog, it will look at your finger.', '如果你帶個細路出街，指住個月亮，佢會望住個月亮。如果你對隻狗做同樣嘅嘢，佢會望住你隻手指。', 'CC-BY 2.0 (France) Attribution: tatoeba.o

In [40]:
def unicode_to_ascii(s):
    return ''.join(
        c for c in unicodedata.normalize('NFD', s)
        if unicodedata.category(c) != 'Mn'
    )

In [41]:
def normalize_string(s):
    s = unicode_to_ascii(s)
    s = re.sub(r'([!.?])', r' \1', s)
    s = re.sub(r'[^a-zA-Z.!?]+', r' ', s)
    s = re.sub(r'\s+', r' ', s)
    return s

In [42]:
raw_data_en, raw_data_cantonese, info = list(zip(*raw_data))
raw_data_en = [normalize_string(data) for data in raw_data_en]
raw_data_cantonese_in = ['<start> ' + normalize_string(data) for data in raw_data_cantonese]
raw_data_cantonese_out = [normalize_string(data) + ' <end>' for data in raw_data_cantonese]

In [43]:
en_tokenizer = tf.keras.preprocessing.text.Tokenizer(filters='')
en_tokenizer.fit_on_texts(raw_data_en)
data_en = en_tokenizer.texts_to_sequences(raw_data_en)
data_en = tf.keras.preprocessing.sequence.pad_sequences(data_en, padding='post')
print('English sequences')
print(data_en[:2])

English sequences
[[ 960    1    0    0    0    0    0    0    0    0    0    0    0    0
     0    0    0    0    0    0    0    0    0    0    0    0    0    0
     0    0    0    0    0    0]
 [1381  135    0    0    0    0    0    0    0    0    0    0    0    0
     0    0    0    0    0    0    0    0    0    0    0    0    0    0
     0    0    0    0    0    0]]


In [44]:
cantonese_tokenizer = tf.keras.preprocessing.text.Tokenizer(filters='')
cantonese_tokenizer.fit_on_texts(raw_data_cantonese_in)
cantonese_tokenizer.fit_on_texts(raw_data_cantonese_out)
data_cantonese_in = cantonese_tokenizer.texts_to_sequences(raw_data_cantonese_in)
data_cantonese_in = tf.keras.preprocessing.sequence.pad_sequences(data_cantonese_in, padding='post')
print('cantonese input sequences')
print(data_cantonese_in[:2])

cantonese input sequences
[[1 0 0 0]
 [1 0 0 0]]


In [45]:
data_cantonese_out = fr_tokenizer.texts_to_sequences(raw_data_cantonese_out)
data_cantonese_out = tf.keras.preprocessing.sequence.pad_sequences(data_cantonese_out,
                                                            padding='post')
print('French output sequences')
print(data_cantonese_out[:2])

French output sequences
[[2 0 0 0]
 [2 0 0 0]]


In [46]:
dataset = tf.data.Dataset.from_tensor_slices(
    (data_en, data_cantonese_in, data_cantonese_out))
dataset = dataset.shuffle(len(raw_data_en)).batch(
    BATCH_SIZE, drop_remainder=True)

In [47]:
class Encoder(tf.keras.Model):
    def __init__(self, vocab_size, embedding_size, rnn_size):
        super(Encoder, self).__init__()
        self.rnn_size = rnn_size
        self.embedding = tf.keras.layers.Embedding(vocab_size, embedding_size)
        self.lstm = tf.keras.layers.LSTM(
            rnn_size, return_sequences=True, return_state=True)

    def call(self, sequence, states):
        embed = self.embedding(sequence)
        output, state_h, state_c = self.lstm(embed, initial_state=states)

        return output, state_h, state_c

    def init_states(self, batch_size):
        return (tf.zeros([batch_size, self.rnn_size]),
                tf.zeros([batch_size, self.rnn_size]))

In [48]:
en_vocab_size = len(en_tokenizer.word_index) + 1

encoder = Encoder(en_vocab_size, EMBEDDING_SIZE, RNN_SIZE)

In [49]:
class LuongAttention(tf.keras.Model):
    def __init__(self, rnn_size, attention_func):
        super(LuongAttention, self).__init__()
        self.attention_func = attention_func

        if attention_func not in ['dot', 'general', 'concat']:
            raise ValueError(
                'Unknown attention score function! Must be either dot, general or concat.')

        if attention_func == 'general':
            # General score function
            self.wa = tf.keras.layers.Dense(rnn_size)
        elif attention_func == 'concat':
            # Concat score function
            self.wa = tf.keras.layers.Dense(rnn_size, activation='tanh')
            self.va = tf.keras.layers.Dense(1)

    def call(self, decoder_output, encoder_output):
        if self.attention_func == 'dot':
            # Dot score function: decoder_output (dot) encoder_output
            # decoder_output has shape: (batch_size, 1, rnn_size)
            # encoder_output has shape: (batch_size, max_len, rnn_size)
            # => score has shape: (batch_size, 1, max_len)
            score = tf.matmul(decoder_output, encoder_output, transpose_b=True)
        elif self.attention_func == 'general':
            # General score function: decoder_output (dot) (Wa (dot) encoder_output)
            # decoder_output has shape: (batch_size, 1, rnn_size)
            # encoder_output has shape: (batch_size, max_len, rnn_size)
            # => score has shape: (batch_size, 1, max_len)
            score = tf.matmul(decoder_output, self.wa(
                encoder_output), transpose_b=True)
        elif self.attention_func == 'concat':
            # Concat score function: va (dot) tanh(Wa (dot) concat(decoder_output + encoder_output))
            # Decoder output must be broadcasted to encoder output's shape first
            decoder_output = tf.tile(
                decoder_output, [1, encoder_output.shape[1], 1])

            # Concat => Wa => va
            # (batch_size, max_len, 2 * rnn_size) => (batch_size, max_len, rnn_size) => (batch_size, max_len, 1)
            score = self.va(
                self.wa(tf.concat((decoder_output, encoder_output), axis=-1)))

            # Transpose score vector to have the same shape as other two above
            # (batch_size, max_len, 1) => (batch_size, 1, max_len)
            score = tf.transpose(score, [0, 2, 1])

        # alignment a_t = softmax(score)
        alignment = tf.nn.softmax(score, axis=2)

        # context vector c_t is the weighted average sum of encoder output
        context = tf.matmul(alignment, encoder_output)

        return context, alignment

In [50]:
class Decoder(tf.keras.Model):
    def __init__(self, vocab_size, embedding_size, rnn_size, attention_func):
        super(Decoder, self).__init__()
        self.attention = LuongAttention(rnn_size, attention_func)
        self.rnn_size = rnn_size
        self.embedding = tf.keras.layers.Embedding(vocab_size, embedding_size)
        self.lstm = tf.keras.layers.LSTM(
            rnn_size, return_sequences=True, return_state=True)
        self.wc = tf.keras.layers.Dense(rnn_size, activation='tanh')
        self.ws = tf.keras.layers.Dense(vocab_size)

    def call(self, sequence, state, encoder_output):
        # Remember that the input to the decoder
        # is now a batch of one-word sequences,
        # which means that its shape is (batch_size, 1)
        embed = self.embedding(sequence)

        # Therefore, the lstm_out has shape (batch_size, 1, rnn_size)
        lstm_out, state_h, state_c = self.lstm(embed, initial_state=state)

        # Use self.attention to compute the context and alignment vectors
        # context vector's shape: (batch_size, 1, rnn_size)
        # alignment vector's shape: (batch_size, 1, source_length)
        context, alignment = self.attention(lstm_out, encoder_output)

        # Combine the context vector and the LSTM output
        # Before combined, both have shape of (batch_size, 1, rnn_size),
        # so let's squeeze the axis 1 first
        # After combined, it will have shape of (batch_size, 2 * rnn_size)
        lstm_out = tf.concat(
            [tf.squeeze(context, 1), tf.squeeze(lstm_out, 1)], 1)

        # lstm_out now has shape (batch_size, rnn_size)
        lstm_out = self.wc(lstm_out)

        # Finally, it is converted back to vocabulary space: (batch_size, vocab_size)
        logits = self.ws(lstm_out)

        return logits, state_h, state_c, alignment

In [51]:
cantonese_vocab_size = len(cantonese_tokenizer.word_index) + 1

decoder = Decoder(cantonese_vocab_size, EMBEDDING_SIZE, RNN_SIZE, ATTENTION_FUNC)

In [52]:
# These lines can be used for debugging purpose
# Or can be seen as a way to build the models
initial_state = encoder.init_states(1)
encoder_outputs = encoder(tf.constant([[1]]), initial_state)
decoder_outputs = decoder(tf.constant(
    [[1]]), encoder_outputs[1:], encoder_outputs[0])

In [53]:
def loss_func(targets, logits):
    crossentropy = tf.keras.losses.SparseCategoricalCrossentropy(
        from_logits=True)
    mask = tf.math.logical_not(tf.math.equal(targets, 0))
    mask = tf.cast(mask, dtype=tf.int64)
    loss = crossentropy(targets, logits, sample_weight=mask)

    return loss

In [54]:
optimizer = tf.keras.optimizers.Adam(clipnorm=5.0)

In [55]:
def predict(test_source_text=None):
    if test_source_text is None:
        test_source_text = raw_data_en[np.random.choice(len(raw_data_en))]
    print(test_source_text)
    test_source_seq = en_tokenizer.texts_to_sequences([test_source_text])
    print(test_source_seq)

    en_initial_states = encoder.init_states(1)
    en_outputs = encoder(tf.constant(test_source_seq), en_initial_states)

    de_input = tf.constant([[fr_tokenizer.word_index['<start>']]])
    de_state_h, de_state_c = en_outputs[1:]
    out_words = []
    alignments = []

    while True:
        de_output, de_state_h, de_state_c, alignment = decoder(
            de_input, (de_state_h, de_state_c), en_outputs[0])
        de_input = tf.expand_dims(tf.argmax(de_output, -1), 0)
        out_words.append(fr_tokenizer.index_word[de_input.numpy()[0][0]])

        alignments.append(alignment.numpy())

        if out_words[-1] == '<end>' or len(out_words) >= 20:
            break

    print(' '.join(out_words))
    return np.array(alignments), test_source_text.split(' '), out_words

In [56]:
@tf.function
def train_step(source_seq, target_seq_in, target_seq_out, en_initial_states):
    loss = 0
    with tf.GradientTape() as tape:
        en_outputs = encoder(source_seq, en_initial_states)
        en_states = en_outputs[1:]
        de_state_h, de_state_c = en_states

        # We need to create a loop to iterate through the target sequences
        for i in range(target_seq_out.shape[1]):
            # Input to the decoder must have shape of (batch_size, length)
            # so we need to expand one dimension
            decoder_in = tf.expand_dims(target_seq_in[:, i], 1)
            logit, de_state_h, de_state_c, _ = decoder(
                decoder_in, (de_state_h, de_state_c), en_outputs[0])

            # The loss is now accumulated through the whole batch
            loss += loss_func(target_seq_out[:, i], logit)

    variables = encoder.trainable_variables + decoder.trainable_variables
    gradients = tape.gradient(loss, variables)
    optimizer.apply_gradients(zip(gradients, variables))

    return loss / target_seq_out.shape[1]

In [57]:
if not os.path.exists('checkpoints_luong/encoder'):
    os.makedirs('checkpoints_luong/encoder')
if not os.path.exists('checkpoints_luong/decoder'):
    os.makedirs('checkpoints_luong/decoder')

In [58]:
# Uncomment these lines for inference mode
encoder_checkpoint = tf.train.latest_checkpoint('checkpoints_luong/encoder')
decoder_checkpoint = tf.train.latest_checkpoint('checkpoints_luong/decoder')

In [59]:
if encoder_checkpoint is not None and decoder_checkpoint is not None:
    encoder.load_weights(encoder_checkpoint)
    decoder.load_weights(decoder_checkpoint)

if MODE == 'train':
    for e in range(NUM_EPOCHS):
        en_initial_states = encoder.init_states(BATCH_SIZE)
        encoder.save_weights(
            'checkpoints_luong/encoder/encoder_{}.h5'.format(e + 1))
        decoder.save_weights(
            'checkpoints_luong/decoder/decoder_{}.h5'.format(e + 1))
        for batch, (source_seq, target_seq_in, target_seq_out) in enumerate(dataset.take(-1)):
            loss = train_step(source_seq, target_seq_in,
                              target_seq_out, en_initial_states)

            if batch % 100 == 0:
                print('Epoch {} Batch {} Loss {:.4f}'.format(
                    e + 1, batch, loss.numpy()))

        try:
            predict()

            predict("How are you today ?")
        except Exception:
            continue

Epoch 1 Batch 0 Loss 0.9257
Are you going to eat all of that yourself ?
[[29, 3, 94, 5, 102, 63, 16, 15, 192, 6]]
<end>
How are you today ?
[[42, 29, 3, 118, 6]]
<end>
Epoch 2 Batch 0 Loss 0.0258
You can take a horse to water but you can t make him drink .
[[3, 24, 80, 7, 659, 5, 199, 97, 3, 24, 9, 150, 48, 234, 1]]
<end>
How are you today ?
[[42, 29, 3, 118, 6]]
<end>
Epoch 3 Batch 0 Loss 0.0048
Your composition was good except for the spelling .
[[25, 2278, 30, 70, 764, 19, 4, 1354, 1]]
<end>
How are you today ?
[[42, 29, 3, 118, 6]]
<end>
Epoch 4 Batch 0 Loss 0.0014
He forgot to lock the door .
[[10, 319, 5, 1075, 4, 206, 1]]
<end>
How are you today ?
[[42, 29, 3, 118, 6]]
<end>
Epoch 5 Batch 0 Loss 0.0008
I ve been looking for a job a long time .
[[2, 77, 107, 211, 19, 7, 165, 7, 154, 46, 1]]
<end>
How are you today ?
[[42, 29, 3, 118, 6]]
<end>
Epoch 6 Batch 0 Loss 0.0019
We took turns driving the car .
[[26, 188, 1137, 691, 4, 119, 1]]
<end>
How are you today ?
[[42, 29, 3, 118, 

Epoch 52 Batch 0 Loss 0.0001
Is she a taxi driver ?
[[8, 33, 7, 446, 560, 6]]
<end>
How are you today ?
[[42, 29, 3, 118, 6]]
<end>
Epoch 53 Batch 0 Loss 0.0000
You don t understand .
[[3, 20, 9, 329, 1]]
<end>
How are you today ?
[[42, 29, 3, 118, 6]]
<end>
Epoch 54 Batch 0 Loss 0.0000
That man knows how to get on the president s good side .
[[15, 143, 688, 42, 5, 49, 23, 4, 822, 14, 70, 750, 1]]
<end>
How are you today ?
[[42, 29, 3, 118, 6]]
<end>
Epoch 55 Batch 0 Loss 0.0001
His car has just been repaired .
[[37, 119, 54, 98, 107, 1148, 1]]
<end>
How are you today ?
[[42, 29, 3, 118, 6]]
<end>
Epoch 56 Batch 0 Loss 0.0001
I liked walking alone on the deserted beach .
[[2, 1293, 945, 390, 23, 4, 2096, 579, 1]]
<end>
How are you today ?
[[42, 29, 3, 118, 6]]
<end>
Epoch 57 Batch 0 Loss 0.0000
Guess how tall I am .
[[787, 42, 788, 2, 123, 1]]
<end>
How are you today ?
[[42, 29, 3, 118, 6]]
<end>
Epoch 58 Batch 0 Loss 0.0000
He is always giving presents to his wife .
[[10, 8, 103, 1983

KeyboardInterrupt: 

In [None]:
if not os.path.exists('heatmap'):
    os.makedirs('heatmap')

test_sents = (
    'What a ridiculous concept!',
    'Your idea is not entirely crazy.',
    "A man's worth lies in what he is.",
    'What he did is very wrong.',
    "All three of you need to do that.",
    "Are you giving me another chance?",
    "Both Tom and Mary work as models.",
    "Can I have a few minutes, please?",
    "Could you close the door, please?",
    "Did you plant pumpkins this year?",
    "Do you ever study in the library?",
    "Don't be deceived by appearances.",
    "Excuse me. Can you speak English?",
    "Few people know the true meaning.",
    "Germany produced many scientists.",
    "Guess whose birthday it is today.",
    "He acted like he owned the place.",
    "Honesty will pay in the long run.",
    "How do we know this isn't a trap?",
    "I can't believe you're giving up.",
)

filenames = []

for i, test_sent in enumerate(test_sents):
    test_sequence = normalize_string(test_sent)
    alignments, source, prediction = predict(test_sequence)
    attention = np.squeeze(alignments, (1, 2))
    fig = plt.figure(figsize=(10, 10))
    ax = fig.add_subplot(1, 1, 1)
    ax.matshow(attention, cmap='jet')
    ax.set_xticklabels([''] + source, rotation=90)
    ax.set_yticklabels([''] + prediction)

    filenames.append('heatmap/test_{}.png'.format(i))
    plt.savefig('heatmap/test_{}.png'.format(i))
    plt.close()

with imageio.get_writer('translation_heatmaps.gif', mode='I', duration=2) as writer:
    for filename in filenames:
        image = imageio.imread(filename)
        writer.append_data(image)

In [None]:
test_sents = (
    'What a ridiculous concept!',
    'Your idea is not entirely crazy.',
    "A man's worth lies in what he is.",
    'What he did is very wrong.',
    "All three of you need to do that.",
    "Are you giving me another chance?",
    "Both Tom and Mary work as models.",
    "Can I have a few minutes, please?",
    "Could you close the door, please?",
    "Did you plant pumpkins this year?",
    "Do you ever study in the library?",
    "Don't be deceived by appearances.",
    "Excuse me. Can you speak English?",
    "Few people know the true meaning.",
    "Germany produced many scientists.",
    "Guess whose birthday it is today.",
    "He acted like he owned the place.",
    "Honesty will pay in the long run.",
    "How do we know this isn't a trap?",
    "I can't believe you're giving up.",
)

for test_sent in test_sents:
    test_sequence = normalize_string(test_sent)
    predict(test_sequence)