This repository has been archived by the owner on Mar 3, 2024. It is now read-only.

Commit

Add translation example to README.md
CyberZHG committed May 21, 2019
1 parent b5aa38e commit c67f172
Showing 4 changed files with 252 additions and 2 deletions.
82 changes: 82 additions & 0 deletions README.md
@@ -91,3 +91,85 @@ token_dict_rev = {v: k for k, v in token_dict.items()}
for i in range(len(decoded)):
    print(' '.join(map(lambda x: token_dict_rev[x], decoded[i][1:-1])))
```

### Translation

```python
import numpy as np
from keras_transformer import get_model, decode

source_tokens = [
    'i need more power'.split(' '),
    'eat jujube and pill'.split(' '),
]
target_tokens = [
    list('我要更多的抛瓦'),
    list('吃枣💊'),
]

# Generate dictionaries
def build_token_dict(token_list):
    token_dict = {
        '<PAD>': 0,
        '<START>': 1,
        '<END>': 2,
    }
    for tokens in token_list:
        for token in tokens:
            if token not in token_dict:
                token_dict[token] = len(token_dict)
    return token_dict

source_token_dict = build_token_dict(source_tokens)
target_token_dict = build_token_dict(target_tokens)
target_token_dict_inv = {v: k for k, v in target_token_dict.items()}

# Add special tokens
encode_tokens = [['<START>'] + tokens + ['<END>'] for tokens in source_tokens]
decode_tokens = [['<START>'] + tokens + ['<END>'] for tokens in target_tokens]
output_tokens = [tokens + ['<END>', '<PAD>'] for tokens in target_tokens]

# Padding
source_max_len = max(map(len, encode_tokens))
target_max_len = max(map(len, decode_tokens))

encode_tokens = [tokens + ['<PAD>'] * (source_max_len - len(tokens)) for tokens in encode_tokens]
decode_tokens = [tokens + ['<PAD>'] * (target_max_len - len(tokens)) for tokens in decode_tokens]
output_tokens = [tokens + ['<PAD>'] * (target_max_len - len(tokens)) for tokens in output_tokens]

encode_input = [list(map(lambda x: source_token_dict[x], tokens)) for tokens in encode_tokens]
decode_input = [list(map(lambda x: target_token_dict[x], tokens)) for tokens in decode_tokens]
decode_output = [list(map(lambda x: [target_token_dict[x]], tokens)) for tokens in output_tokens]
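# The extra list wrapped around each id gives the targets shape
# (batch_size, seq_len, 1), which is what sparse_categorical_crossentropy
# expects for per-timestep integer labels.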

# Build & fit model
model = get_model(
    token_num=max(len(source_token_dict), len(target_token_dict)),
    embed_dim=32,
    encoder_num=2,
    decoder_num=2,
    head_num=4,
    hidden_dim=128,
    dropout_rate=0.05,
    use_same_embed=False,  # Use different embeddings for different languages
)
model.compile('adam', 'sparse_categorical_crossentropy')
model.summary()

model.fit(
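    # The two sample pairs are repeated 1024 times so this toy dataset is
    # large enough for the model to memorize the mapping.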
    x=[np.array(encode_input * 1024), np.array(decode_input * 1024)],
    y=np.array(decode_output * 1024),
    epochs=10,
    batch_size=32,
)

# Predict
decoded = decode(
    model,
    encode_input,
    start_token=target_token_dict['<START>'],
    end_token=target_token_dict['<END>'],
    pad_token=target_token_dict['<PAD>'],
)
print(''.join(map(lambda x: target_token_dict_inv[x], decoded[0][1:-1])))
print(''.join(map(lambda x: target_token_dict_inv[x], decoded[1][1:-1])))
```
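Because the two training pairs are repeated so many times, the model effectively memorizes them, and the two `print` calls above should reproduce the target sentences. The helper below is a minimal sketch of how the same `decode` call could be reused for a new input sentence; it is not part of the original example, and it assumes the sentence only contains tokens already present in `source_token_dict` (unknown tokens would raise a `KeyError`):

```python
def translate(sentence):
    # Tokenize, add the special tokens, and pad exactly as during training.
    tokens = ['<START>'] + sentence.split(' ') + ['<END>']
    tokens += ['<PAD>'] * (source_max_len - len(tokens))
    ids = [source_token_dict[token] for token in tokens]
    decoded = decode(
        model,
        [ids],
        start_token=target_token_dict['<START>'],
        end_token=target_token_dict['<END>'],
        pad_token=target_token_dict['<PAD>'],
    )
    # Strip <START> and <END> before mapping the ids back to characters.
    return ''.join(target_token_dict_inv[x] for x in decoded[0][1:-1])

print(translate('i need more power'))  # expected: 我要更多的抛瓦
```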
84 changes: 83 additions & 1 deletion README.zh-CN.md
@@ -4,7 +4,7 @@
[![Coverage](https://coveralls.io/repos/github/CyberZHG/keras-transformer/badge.svg?branch=master)](https://coveralls.io/github/CyberZHG/keras-transformer)
[![Version](https://img.shields.io/pypi/v/keras-transformer.svg)](https://pypi.org/project/keras-transformer/)

- [Transformer](https://arxiv.org/pdf/1706.03762.pdf)实现
+ [Transformer](https://arxiv.org/pdf/1706.03762.pdf)的实现

## Installation

@@ -91,3 +91,85 @@ token_dict_rev = {v: k for k, v in token_dict.items()}
for i in range(len(decoded)):
    print(' '.join(map(lambda x: token_dict_rev[x], decoded[i][1:-1])))
```

### Translation

```python
import numpy as np
from keras_transformer import get_model, decode

source_tokens = [
    'i need more power'.split(' '),
    'eat jujube and pill'.split(' '),
]
target_tokens = [
    list('我要更多的抛瓦'),
    list('吃枣💊'),
]

# Build a token dictionary for each language
def build_token_dict(token_list):
    token_dict = {
        '<PAD>': 0,
        '<START>': 1,
        '<END>': 2,
    }
    for tokens in token_list:
        for token in tokens:
            if token not in token_dict:
                token_dict[token] = len(token_dict)
    return token_dict

source_token_dict = build_token_dict(source_tokens)
target_token_dict = build_token_dict(target_tokens)
target_token_dict_inv = {v: k for k, v in target_token_dict.items()}

# Add special tokens
encode_tokens = [['<START>'] + tokens + ['<END>'] for tokens in source_tokens]
decode_tokens = [['<START>'] + tokens + ['<END>'] for tokens in target_tokens]
output_tokens = [tokens + ['<END>', '<PAD>'] for tokens in target_tokens]

# Pad to the same length
source_max_len = max(map(len, encode_tokens))
target_max_len = max(map(len, decode_tokens))

encode_tokens = [tokens + ['<PAD>'] * (source_max_len - len(tokens)) for tokens in encode_tokens]
decode_tokens = [tokens + ['<PAD>'] * (target_max_len - len(tokens)) for tokens in decode_tokens]
output_tokens = [tokens + ['<PAD>'] * (target_max_len - len(tokens)) for tokens in output_tokens]

encode_input = [list(map(lambda x: source_token_dict[x], tokens)) for tokens in encode_tokens]
decode_input = [list(map(lambda x: target_token_dict[x], tokens)) for tokens in decode_tokens]
decode_output = [list(map(lambda x: [target_token_dict[x]], tokens)) for tokens in output_tokens]

# Build and train the model
model = get_model(
    token_num=max(len(source_token_dict), len(target_token_dict)),
    embed_dim=32,
    encoder_num=2,
    decoder_num=2,
    head_num=4,
    hidden_dim=128,
    dropout_rate=0.05,
    use_same_embed=False,  # Different languages need their own embeddings
)
model.compile('adam', 'sparse_categorical_crossentropy')
model.summary()

model.fit(
    x=[np.array(encode_input * 1024), np.array(decode_input * 1024)],
    y=np.array(decode_output * 1024),
    epochs=10,
    batch_size=32,
)

# Predict
decoded = decode(
    model,
    encode_input,
    start_token=target_token_dict['<START>'],
    end_token=target_token_dict['<END>'],
    pad_token=target_token_dict['<PAD>'],
)
print(''.join(map(lambda x: target_token_dict_inv[x], decoded[0][1:-1])))
print(''.join(map(lambda x: target_token_dict_inv[x], decoded[1][1:-1])))
```
2 changes: 1 addition & 1 deletion setup.py
@@ -12,7 +12,7 @@

setup(
    name='keras-transformer',
-   version='0.23.0',
+   version='0.24.0',
    packages=find_packages(),
    url='https://github.com/CyberZHG/keras-transformer',
    license='MIT',
86 changes: 86 additions & 0 deletions tests/test_translate.py
@@ -0,0 +1,86 @@
# encoding: utf-8
from __future__ import unicode_literals

import unittest
import numpy as np
from keras_transformer import get_model, decode


class TestTranslate(unittest.TestCase):

    @staticmethod
    def _build_token_dict(token_list):
        token_dict = {
            '<PAD>': 0,
            '<START>': 1,
            '<END>': 2,
        }
        for tokens in token_list:
            for token in tokens:
                if token not in token_dict:
                    token_dict[token] = len(token_dict)
        return token_dict

    def test_translate(self):
        source_tokens = [
            'i need more power'.split(' '),
            'eat jujube and pill'.split(' '),
        ]
        target_tokens = [
            list('我要更多的抛瓦'),
            list('吃枣💊'),
        ]

        # Generate dictionaries
        source_token_dict = self._build_token_dict(source_tokens)
        target_token_dict = self._build_token_dict(target_tokens)
        target_token_dict_inv = {v: k for k, v in target_token_dict.items()}

        # Add special tokens
        encode_tokens = [['<START>'] + tokens + ['<END>'] for tokens in source_tokens]
        decode_tokens = [['<START>'] + tokens + ['<END>'] for tokens in target_tokens]
        output_tokens = [tokens + ['<END>', '<PAD>'] for tokens in target_tokens]

        # Padding
        source_max_len = max(map(len, encode_tokens))
        target_max_len = max(map(len, decode_tokens))

        encode_tokens = [tokens + ['<PAD>'] * (source_max_len - len(tokens)) for tokens in encode_tokens]
        decode_tokens = [tokens + ['<PAD>'] * (target_max_len - len(tokens)) for tokens in decode_tokens]
        output_tokens = [tokens + ['<PAD>'] * (target_max_len - len(tokens)) for tokens in output_tokens]

        encode_input = [list(map(lambda x: source_token_dict[x], tokens)) for tokens in encode_tokens]
        decode_input = [list(map(lambda x: target_token_dict[x], tokens)) for tokens in decode_tokens]
        decode_output = [list(map(lambda x: [target_token_dict[x]], tokens)) for tokens in output_tokens]

        # Build & fit model
        model = get_model(
            token_num=max(len(source_token_dict), len(target_token_dict)),
            embed_dim=32,
            encoder_num=2,
            decoder_num=2,
            head_num=4,
            hidden_dim=128,
            dropout_rate=0.05,
            use_same_embed=False,  # Use different embeddings for different languages
        )
        model.compile('adam', 'sparse_categorical_crossentropy')
        model.summary()
        model.fit(
            x=[np.array(encode_input * 1024), np.array(decode_input * 1024)],
            y=np.array(decode_output * 1024),
            epochs=10,
            batch_size=32,
        )

        # Predict
        decoded = decode(
            model,
            encode_input,
            start_token=target_token_dict['<START>'],
            end_token=target_token_dict['<END>'],
            pad_token=target_token_dict['<PAD>'],
        )
        for i in range(len(encode_input)):
            predicted = ''.join(map(lambda x: target_token_dict_inv[x], decoded[i][1:-1]))
            self.assertEqual(''.join(target_tokens[i]), predicted)
