This repository has been archived by the owner on Mar 3, 2024. It is now read-only.

Commit

Add translation example to README.md
CyberZHG committed May 21, 2019
1 parent b5aa38e commit c67f172
Showing 4 changed files with 252 additions and 2 deletions.
82 changes: 82 additions & 0 deletions README.md
@@ -91,3 +91,85 @@ token_dict_rev = {v: k for k, v in token_dict.items()}
for i in range(len(decoded)):
    print(' '.join(map(lambda x: token_dict_rev[x], decoded[i][1:-1])))
```

### Translation

```python
import numpy as np
from keras_transformer import get_model, decode

source_tokens = [
    'i need more power'.split(' '),
    'eat jujube and pill'.split(' '),
]
target_tokens = [
    list('我要更多的抛瓦'),
    list('吃枣💊'),
]

# Generate dictionaries
def build_token_dict(token_list):
    token_dict = {
        '<PAD>': 0,
        '<START>': 1,
        '<END>': 2,
    }
    for tokens in token_list:
        for token in tokens:
            if token not in token_dict:
                token_dict[token] = len(token_dict)
    return token_dict

source_token_dict = build_token_dict(source_tokens)
target_token_dict = build_token_dict(target_tokens)
target_token_dict_inv = {v: k for k, v in target_token_dict.items()}

# Add special tokens
encode_tokens = [['<START>'] + tokens + ['<END>'] for tokens in source_tokens]
decode_tokens = [['<START>'] + tokens + ['<END>'] for tokens in target_tokens]
output_tokens = [tokens + ['<END>', '<PAD>'] for tokens in target_tokens]

# Padding
source_max_len = max(map(len, encode_tokens))
target_max_len = max(map(len, decode_tokens))

encode_tokens = [tokens + ['<PAD>'] * (source_max_len - len(tokens)) for tokens in encode_tokens]
decode_tokens = [tokens + ['<PAD>'] * (target_max_len - len(tokens)) for tokens in decode_tokens]
output_tokens = [tokens + ['<PAD>'] * (target_max_len - len(tokens)) for tokens in output_tokens]

encode_input = [list(map(lambda x: source_token_dict[x], tokens)) for tokens in encode_tokens]
decode_input = [list(map(lambda x: target_token_dict[x], tokens)) for tokens in decode_tokens]
decode_output = [list(map(lambda x: [target_token_dict[x]], tokens)) for tokens in output_tokens]
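# The extra list wrapped around each id gives the targets shape
# (batch_size, seq_len, 1), which is what sparse_categorical_crossentropy
# expects for per-timestep integer labels.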

# Build & fit model
model = get_model(
    token_num=max(len(source_token_dict), len(target_token_dict)),
    embed_dim=32,
    encoder_num=2,
    decoder_num=2,
    head_num=4,
    hidden_dim=128,
    dropout_rate=0.05,
    use_same_embed=False,  # Use different embeddings for different languages
)
model.compile('adam', 'sparse_categorical_crossentropy')
model.summary()

model.fit(
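    # The two sample pairs are repeated 1024 times so this toy dataset is
    # large enough for the model to memorize the mapping.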
    x=[np.array(encode_input * 1024), np.array(decode_input * 1024)],
    y=np.array(decode_output * 1024),
    epochs=10,
    batch_size=32,
)

# Predict
decoded = decode(
    model,
    encode_input,
    start_token=target_token_dict['<START>'],
    end_token=target_token_dict['<END>'],
    pad_token=target_token_dict['<PAD>'],
)
print(''.join(map(lambda x: target_token_dict_inv[x], decoded[0][1:-1])))
print(''.join(map(lambda x: target_token_dict_inv[x], decoded[1][1:-1])))
```
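Because the two training pairs are repeated so many times, the model effectively memorizes them, and the two `print` calls above should reproduce the target sentences. The helper below is a minimal sketch of how the same `decode` call could be reused for a new input sentence; it is not part of the original example, and it assumes the sentence only contains tokens already present in `source_token_dict` (unknown tokens would raise a `KeyError`):

```python
def translate(sentence):
    # Tokenize, add the special tokens, and pad exactly as during training.
    tokens = ['<START>'] + sentence.split(' ') + ['<END>']
    tokens += ['<PAD>'] * (source_max_len - len(tokens))
    ids = [source_token_dict[token] for token in tokens]
    decoded = decode(
        model,
        [ids],
        start_token=target_token_dict['<START>'],
        end_token=target_token_dict['<END>'],
        pad_token=target_token_dict['<PAD>'],
    )
    # Strip <START> and <END> before mapping the ids back to characters.
    return ''.join(target_token_dict_inv[x] for x in decoded[0][1:-1])

print(translate('i need more power'))  # expected: 我要更多的抛瓦
```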
84 changes: 83 additions & 1 deletion README.zh-CN.md
@@ -4,7 +4,7 @@
[![Coverage](https://coveralls.io/repos/github/CyberZHG/keras-transformer/badge.svg?branch=master)](https://coveralls.io/github/CyberZHG/keras-transformer)
[![Version](https://img.shields.io/pypi/v/keras-transformer.svg)](https://pypi.org/project/keras-transformer/)

- [Transformer](https://arxiv.org/pdf/1706.03762.pdf)实现
+ [Transformer](https://arxiv.org/pdf/1706.03762.pdf)的实现

## Installation

@@ -91,3 +91,85 @@ token_dict_rev = {v: k for k, v in token_dict.items()}
for i in range(len(decoded)):
    print(' '.join(map(lambda x: token_dict_rev[x], decoded[i][1:-1])))
```

### Translation

```python
import numpy as np
from keras_transformer import get_model, decode

source_tokens = [
    'i need more power'.split(' '),
    'eat jujube and pill'.split(' '),
]
target_tokens = [
    list('我要更多的抛瓦'),
    list('吃枣💊'),
]

# Build a token dictionary for each language
def build_token_dict(token_list):
    token_dict = {
        '<PAD>': 0,
        '<START>': 1,
        '<END>': 2,
    }
    for tokens in token_list:
        for token in tokens:
            if token not in token_dict:
                token_dict[token] = len(token_dict)
    return token_dict

source_token_dict = build_token_dict(source_tokens)
target_token_dict = build_token_dict(target_tokens)
target_token_dict_inv = {v: k for k, v in target_token_dict.items()}

# Add special tokens
encode_tokens = [['<START>'] + tokens + ['<END>'] for tokens in source_tokens]
decode_tokens = [['<START>'] + tokens + ['<END>'] for tokens in target_tokens]
output_tokens = [tokens + ['<END>', '<PAD>'] for tokens in target_tokens]

# Pad to the same length
source_max_len = max(map(len, encode_tokens))
target_max_len = max(map(len, decode_tokens))

encode_tokens = [tokens + ['<PAD>'] * (source_max_len - len(tokens)) for tokens in encode_tokens]
decode_tokens = [tokens + ['<PAD>'] * (target_max_len - len(tokens)) for tokens in decode_tokens]
output_tokens = [tokens + ['<PAD>'] * (target_max_len - len(tokens)) for tokens in output_tokens]

encode_input = [list(map(lambda x: source_token_dict[x], tokens)) for tokens in encode_tokens]
decode_input = [list(map(lambda x: target_token_dict[x], tokens)) for tokens in decode_tokens]
decode_output = [list(map(lambda x: [target_token_dict[x]], tokens)) for tokens in output_tokens]

# Build and train the model
model = get_model(
    token_num=max(len(source_token_dict), len(target_token_dict)),
    embed_dim=32,
    encoder_num=2,
    decoder_num=2,
    head_num=4,
    hidden_dim=128,
    dropout_rate=0.05,
    use_same_embed=False,  # Different languages need their own embeddings
)
model.compile('adam', 'sparse_categorical_crossentropy')
model.summary()

model.fit(
    x=[np.array(encode_input * 1024), np.array(decode_input * 1024)],
    y=np.array(decode_output * 1024),
    epochs=10,
    batch_size=32,
)

# Predict
decoded = decode(
    model,
    encode_input,
    start_token=target_token_dict['<START>'],
    end_token=target_token_dict['<END>'],
    pad_token=target_token_dict['<PAD>'],
)
print(''.join(map(lambda x: target_token_dict_inv[x], decoded[0][1:-1])))
print(''.join(map(lambda x: target_token_dict_inv[x], decoded[1][1:-1])))
```
2 changes: 1 addition & 1 deletion setup.py
@@ -12,7 +12,7 @@

setup(
    name='keras-transformer',
-   version='0.23.0',
+   version='0.24.0',
    packages=find_packages(),
    url='https://github.com/CyberZHG/keras-transformer',
    license='MIT',
86 changes: 86 additions & 0 deletions tests/test_translate.py
@@ -0,0 +1,86 @@
# encoding: utf-8
from __future__ import unicode_literals

import unittest
import numpy as np
from keras_transformer import get_model, decode


class TestTranslate(unittest.TestCase):

    @staticmethod
    def _build_token_dict(token_list):
        token_dict = {
            '<PAD>': 0,
            '<START>': 1,
            '<END>': 2,
        }
        for tokens in token_list:
            for token in tokens:
                if token not in token_dict:
                    token_dict[token] = len(token_dict)
        return token_dict

    def test_translate(self):
        source_tokens = [
            'i need more power'.split(' '),
            'eat jujube and pill'.split(' '),
        ]
        target_tokens = [
            list('我要更多的抛瓦'),
            list('吃枣💊'),
        ]

        # Generate dictionaries
        source_token_dict = self._build_token_dict(source_tokens)
        target_token_dict = self._build_token_dict(target_tokens)
        target_token_dict_inv = {v: k for k, v in target_token_dict.items()}

        # Add special tokens
        encode_tokens = [['<START>'] + tokens + ['<END>'] for tokens in source_tokens]
        decode_tokens = [['<START>'] + tokens + ['<END>'] for tokens in target_tokens]
        output_tokens = [tokens + ['<END>', '<PAD>'] for tokens in target_tokens]

        # Padding
        source_max_len = max(map(len, encode_tokens))
        target_max_len = max(map(len, decode_tokens))

        encode_tokens = [tokens + ['<PAD>'] * (source_max_len - len(tokens)) for tokens in encode_tokens]
        decode_tokens = [tokens + ['<PAD>'] * (target_max_len - len(tokens)) for tokens in decode_tokens]
        output_tokens = [tokens + ['<PAD>'] * (target_max_len - len(tokens)) for tokens in output_tokens]

        encode_input = [list(map(lambda x: source_token_dict[x], tokens)) for tokens in encode_tokens]
        decode_input = [list(map(lambda x: target_token_dict[x], tokens)) for tokens in decode_tokens]
        decode_output = [list(map(lambda x: [target_token_dict[x]], tokens)) for tokens in output_tokens]

        # Build & fit model
        model = get_model(
            token_num=max(len(source_token_dict), len(target_token_dict)),
            embed_dim=32,
            encoder_num=2,
            decoder_num=2,
            head_num=4,
            hidden_dim=128,
            dropout_rate=0.05,
            use_same_embed=False,  # Use different embeddings for different languages
        )
        model.compile('adam', 'sparse_categorical_crossentropy')
        model.summary()
        model.fit(
            x=[np.array(encode_input * 1024), np.array(decode_input * 1024)],
            y=np.array(decode_output * 1024),
            epochs=10,
            batch_size=32,
        )

        # Predict
        decoded = decode(
            model,
            encode_input,
            start_token=target_token_dict['<START>'],
            end_token=target_token_dict['<END>'],
            pad_token=target_token_dict['<PAD>'],
        )
        for i in range(len(encode_input)):
            predicted = ''.join(map(lambda x: target_token_dict_inv[x], decoded[i][1:-1]))
            self.assertEqual(''.join(target_tokens[i]), predicted)
