In [0]:
%tensorflow_version 1.x 

In [0]:
# 下載想要的語言版本 -q: 安靜模式
!wget -q http://www.manythings.org/anki/cmn-eng.zip
# 解壓縮zip -o: 覆蓋原本
!unzip -o cmn-eng.zip

Archive:  cmn-eng.zip
  inflating: cmn.txt                 
  inflating: _about.txt              


In [0]:
import pandas as pd
df = pd.read_csv("cmn.txt", 
                 sep="\t",
                 header=None)
df.columns = ["English", "Chinese", "By"]
df

Unnamed: 0,English,Chinese,By
0,Hi.,嗨。,CC-BY 2.0 (France) Attribution: tatoeba.org #5...
1,Hi.,你好。,CC-BY 2.0 (France) Attribution: tatoeba.org #5...
2,Run.,你用跑的。,CC-BY 2.0 (France) Attribution: tatoeba.org #4...
3,Wait!,等等！,CC-BY 2.0 (France) Attribution: tatoeba.org #1...
4,Hello!,你好。,CC-BY 2.0 (France) Attribution: tatoeba.org #3...
...,...,...,...
21200,"Last year in the Philippines, earthquakes and ...",去年在菲律宾，地震和海啸造成了超过6000人的死亡。,CC-BY 2.0 (France) Attribution: tatoeba.org #1...
21201,My mother speaks French better than my father ...,我母亲的法语比我父亲的英语要好，所以他们通常用法语交流。,CC-BY 2.0 (France) Attribution: tatoeba.org #4...
21202,"Tom didn't know how to translate the word ""com...",汤姆不知如何翻译“计算机”一词，因为同他谈话的人从未见过一台。,CC-BY 2.0 (France) Attribution: tatoeba.org #1...
21203,"Even now, I occasionally think I'd like to see...",即使是现在，我偶尔还是想见到你。不是今天的你，而是我记忆中曾经的你。,CC-BY 2.0 (France) Attribution: tatoeba.org #1...


In [0]:
# 使用opencc將簡中翻譯成繁中
!pip install opencc-python-reimplemented

Collecting opencc-python-reimplemented
[?25l  Downloading https://files.pythonhosted.org/packages/53/0c/c499c86a719c925a08586085a56f92f3235c03ee8b4db2e59c1e9aab3f55/opencc-python-reimplemented-0.1.5.tar.gz (482kB)
[K     |▊                               | 10kB 25.4MB/s eta 0:00:01[K     |█▍                              | 20kB 1.7MB/s eta 0:00:01[K     |██                              | 30kB 2.5MB/s eta 0:00:01[K     |██▊                             | 40kB 1.7MB/s eta 0:00:01[K     |███▍                            | 51kB 2.1MB/s eta 0:00:01[K     |████                            | 61kB 2.5MB/s eta 0:00:01[K     |████▊                           | 71kB 2.9MB/s eta 0:00:01[K     |█████▍                          | 81kB 3.2MB/s eta 0:00:01[K     |██████                          | 92kB 3.6MB/s eta 0:00:01[K     |██████▉                         | 102kB 2.8MB/s eta 0:00:01[K     |███████▌                        | 112kB 2.8MB/s eta 0:00:01[K     |████████▏               

In [0]:
from opencc import OpenCC
t = OpenCC("s2tw")
df["Chinese"] = df["Chinese"].apply(lambda s:t.convert(s))
df = df.drop(["By"], axis=1)
df.to_csv("en_zh.csv")
df

Unnamed: 0,English,Chinese
0,Hi.,嗨。
1,Hi.,你好。
2,Run.,你用跑的。
3,Wait!,等等！
4,Hello!,你好。
...,...,...
21200,"Last year in the Philippines, earthquakes and ...",去年在菲律賓，地震和海嘯造成了超過6000人的死亡。
21201,My mother speaks French better than my father ...,我母親的法語比我父親的英語要好，所以他們通常用法語交流。
21202,"Tom didn't know how to translate the word ""com...",湯姆不知如何翻譯“計算機”一詞，因為同他談話的人從未見過一臺。
21203,"Even now, I occasionally think I'd like to see...",即使是現在，我偶爾還是想見到你。不是今天的你，而是我記憶中曾經的你。


In [0]:
# 預處理Step1. 數出有多少種不同的詞彙
inputs = []
targets = []
input_characters = set()
target_characters = set()

for target_text, input_text in zip(df["Chinese"], df["English"]):
    # 這裡比較特別, 在真正最後預測的時候你是喂start token給decoder
    # 再等到decoder輸出end token結束
    # 所以我們自定義 \t 為 start token, \n 為 decode token
    target_text = '\t' + target_text + '\n'
    # encoder 吃的不用加 \t \n
    inputs.append(input_text)
    # decoder 吃的加 \t \n
    targets.append(target_text)
    # 把不同種的詞紀錄, 由於我們使用set, 所以會自動把重複的去掉
    for char in input_text:
        input_characters.add(char)
    for char in target_text:
        target_characters.add(char)


In [0]:
# 把我們的一些參數列出來一下
# 順便先計算最長的輸入和輸出長度, 等等喂給模型
input_characters = sorted(list(input_characters))
target_characters = sorted(list(target_characters))
num_encoder_tokens = len(input_characters) + 1
num_decoder_tokens = len(target_characters) + 1
max_encoder_seq_length = max([len(txt) for txt in inputs])
max_decoder_seq_length = max([len(txt) for txt in targets])

print("資料集筆數:", len(inputs))
print("輸入的所有字彙:", num_encoder_tokens)
print("輸出的所有字彙:", num_decoder_tokens)
print("最長的輸入句子有多長:", max_encoder_seq_length)
print('最長的輸出句子有多長:', max_decoder_seq_length)

資料集筆數: 21205
輸入的所有字彙: 76
輸出的所有字彙: 2761
最長的輸入句子有多長: 163
最長的輸出句子有多長: 46


In [0]:
# 給每一個字彙一個編號, 輸入輸出都要
# 建立 文字 -> 數字 的查詢辭典 
# 養成習慣留一個 <PAD> 字符, 我們習慣給0, 到時候不夠長度的句子我們就補0
inputs_char_int = {char:i+1 for i, char in enumerate(input_characters)}
inputs_char_int["<PAD>"] = 0
targets_char_int = {char:i+1 for i, char in enumerate(target_characters)}
targets_char_int["<PAD>"] = 0

In [0]:
# 建立 數字 -> 文字 的查詢辭典 
inputs_int_char = {i:char for char, i in inputs_char_int.items()}
targets_int_char = {i:char for char, i in targets_char_int.items()}

In [0]:
# 訓練參數
# batch: 多少筆資料更新一次
# embedding dimension: 先把每一種不同的東西降維
# dim: LSTM/RNN的維度
batch_size = 64 
embedding_dim = 256
dim = 256

In [0]:
from keras.models import Model
from keras.layers import Input, Embedding, LSTM, Dense, TimeDistributed

input1 = Input(shape=(None,))
x = Embedding(num_encoder_tokens, embedding_dim, mask_zero=True)(input1)
x, state_h, state_c = LSTM(dim, return_state=True)(x)

input2 = Input(shape=(None,))
y = Embedding(num_decoder_tokens, embedding_dim, name="decoder_embedding", mask_zero=True)(input2)
y, _, _ = LSTM(dim, return_sequences=True, 
               return_state=True, 
               name="decoder_lstm")(y, initial_state=[state_h, state_c])
y = Dense(num_decoder_tokens, activation='softmax', name="decoder_dense")(y)

model = Model(inputs=[input1, input2], outputs=y)
model.summary()

Model: "model_7"
__________________________________________________________________________________________________
Layer (type)                    Output Shape         Param #     Connected to                     
input_9 (InputLayer)            (None, None)         0                                            
__________________________________________________________________________________________________
input_10 (InputLayer)           (None, None)         0                                            
__________________________________________________________________________________________________
embedding_3 (Embedding)         (None, None, 256)    19456       input_9[0][0]                    
__________________________________________________________________________________________________
decoder_embedding (Embedding)   (None, None, 256)    706816      input_10[0][0]                   
____________________________________________________________________________________________

In [0]:
model.compile(loss='sparse_categorical_crossentropy', optimizer="adam")

In [0]:
from keras.preprocessing.sequence import pad_sequences
from keras.utils.np_utils import to_categorical
import numpy as np
en = []
dein = []
deout = []
for sentence in inputs:
    l = [inputs_char_int[c] for c in sentence]
    en.append(l)
en = pad_sequences(en, maxlen=max_encoder_seq_length, padding="post")
for sentence in targets:
    l = [targets_char_int[c] for c in sentence[:-1]]
    dein.append(l)
    o = [targets_char_int[c] for c in sentence[1:]]
    deout.append(o)
    
dein = pad_sequences(dein, maxlen=max_decoder_seq_length, padding="post")
deout = pad_sequences(deout, maxlen=max_decoder_seq_length, padding="post")
deout = deout.reshape(len(deout), -1, 1)

In [0]:
# 輸入資料長的樣子
for seq in en[:10]:
    print([inputs_int_char[s] for s in seq])

['H', 'i', '.', '<PAD>', '<PAD>', '<PAD>', '<PAD>', '<PAD>', '<PAD>', '<PAD>', '<PAD>', '<PAD>', '<PAD>', '<PAD>', '<PAD>', '<PAD>', '<PAD>', '<PAD>', '<PAD>', '<PAD>', '<PAD>', '<PAD>', '<PAD>', '<PAD>', '<PAD>', '<PAD>', '<PAD>', '<PAD>', '<PAD>', '<PAD>', '<PAD>', '<PAD>', '<PAD>', '<PAD>', '<PAD>', '<PAD>', '<PAD>', '<PAD>', '<PAD>', '<PAD>', '<PAD>', '<PAD>', '<PAD>', '<PAD>', '<PAD>', '<PAD>', '<PAD>', '<PAD>', '<PAD>', '<PAD>', '<PAD>', '<PAD>', '<PAD>', '<PAD>', '<PAD>', '<PAD>', '<PAD>', '<PAD>', '<PAD>', '<PAD>', '<PAD>', '<PAD>', '<PAD>', '<PAD>', '<PAD>', '<PAD>', '<PAD>', '<PAD>', '<PAD>', '<PAD>', '<PAD>', '<PAD>', '<PAD>', '<PAD>', '<PAD>', '<PAD>', '<PAD>', '<PAD>', '<PAD>', '<PAD>', '<PAD>', '<PAD>', '<PAD>', '<PAD>', '<PAD>', '<PAD>', '<PAD>', '<PAD>', '<PAD>', '<PAD>', '<PAD>', '<PAD>', '<PAD>', '<PAD>', '<PAD>', '<PAD>', '<PAD>', '<PAD>', '<PAD>', '<PAD>', '<PAD>', '<PAD>', '<PAD>', '<PAD>', '<PAD>', '<PAD>', '<PAD>', '<PAD>', '<PAD>', '<PAD>', '<PAD>', '<PAD>', '<P

In [0]:
# 正確答案長的樣子
for seq in deout[:10]:
    print([targets_int_char[s[0]] for s in seq])

['嗨', '。', '\n', '<PAD>', '<PAD>', '<PAD>', '<PAD>', '<PAD>', '<PAD>', '<PAD>', '<PAD>', '<PAD>', '<PAD>', '<PAD>', '<PAD>', '<PAD>', '<PAD>', '<PAD>', '<PAD>', '<PAD>', '<PAD>', '<PAD>', '<PAD>', '<PAD>', '<PAD>', '<PAD>', '<PAD>', '<PAD>', '<PAD>', '<PAD>', '<PAD>', '<PAD>', '<PAD>', '<PAD>', '<PAD>', '<PAD>', '<PAD>', '<PAD>', '<PAD>', '<PAD>', '<PAD>', '<PAD>', '<PAD>', '<PAD>', '<PAD>', '<PAD>']
['你', '好', '。', '\n', '<PAD>', '<PAD>', '<PAD>', '<PAD>', '<PAD>', '<PAD>', '<PAD>', '<PAD>', '<PAD>', '<PAD>', '<PAD>', '<PAD>', '<PAD>', '<PAD>', '<PAD>', '<PAD>', '<PAD>', '<PAD>', '<PAD>', '<PAD>', '<PAD>', '<PAD>', '<PAD>', '<PAD>', '<PAD>', '<PAD>', '<PAD>', '<PAD>', '<PAD>', '<PAD>', '<PAD>', '<PAD>', '<PAD>', '<PAD>', '<PAD>', '<PAD>', '<PAD>', '<PAD>', '<PAD>', '<PAD>', '<PAD>', '<PAD>']
['你', '用', '跑', '的', '。', '\n', '<PAD>', '<PAD>', '<PAD>', '<PAD>', '<PAD>', '<PAD>', '<PAD>', '<PAD>', '<PAD>', '<PAD>', '<PAD>', '<PAD>', '<PAD>', '<PAD>', '<PAD>', '<PAD>', '<PAD>', '<PAD>', '<

In [0]:
# 大約訓練50~100次
model.fit([en, dein], deout,
          batch_size=batch_size,
          epochs=50,
          validation_split=0.1,
          verbose=2)

Train on 19084 samples, validate on 2121 samples
Epoch 1/50
 - 159s - loss: 0.1071 - val_loss: 6.7358
Epoch 2/50
 - 159s - loss: 0.3344 - val_loss: 6.6526
Epoch 3/50
 - 158s - loss: 0.1115 - val_loss: 6.6531
Epoch 4/50
 - 158s - loss: 0.0477 - val_loss: 6.6519
Epoch 5/50
 - 156s - loss: 0.0304 - val_loss: 6.6803
Epoch 6/50
 - 158s - loss: 0.0254 - val_loss: 6.7038
Epoch 7/50
 - 157s - loss: 0.0236 - val_loss: 6.7032
Epoch 8/50
 - 156s - loss: 0.0231 - val_loss: 6.7218
Epoch 9/50
 - 156s - loss: 0.0231 - val_loss: 6.7436
Epoch 10/50
 - 158s - loss: 0.0237 - val_loss: 6.7407
Epoch 11/50
 - 156s - loss: 0.0251 - val_loss: 6.7696
Epoch 12/50
 - 156s - loss: 0.0766 - val_loss: 6.8499
Epoch 13/50
 - 157s - loss: 0.3360 - val_loss: 6.7466
Epoch 14/50
 - 157s - loss: 0.1113 - val_loss: 6.7286
Epoch 15/50
 - 157s - loss: 0.0466 - val_loss: 6.7277
Epoch 16/50
 - 157s - loss: 0.0287 - val_loss: 6.7469
Epoch 17/50
 - 157s - loss: 0.0234 - val_loss: 6.7655
Epoch 18/50
 - 158s - loss: 0.0216 - val_l

<keras.callbacks.History at 0x7fc1f4ee2fd0>

In [0]:
# 測試一下原本的他有沒有學會
import numpy as np
infer_encoder = Model(inputs=input1, outputs=[state_h, state_c])

de_h = Input(shape=(dim,))
de_c = Input(shape=(dim,))
de = model.get_layer("decoder_embedding")(input2)
dex, next_h, next_c = model.get_layer("decoder_lstm")(de, initial_state=[de_h, de_c])
dex = model.get_layer("decoder_dense")(dex)
infer_decoder = Model(inputs=[input2, de_h, de_c],
                      outputs=[dex, next_h, next_c])

test = df.head(20)
for e, z in zip(test["English"], test["Chinese"]):
    print("Encode:", e)
    test_input1 = [inputs_char_int[c] for c in e]
    test_input1 = np.array([test_input1])
    h, c = infer_encoder.predict(test_input1)

    result = ""
    i = targets_char_int["\t"]
    while True:
        o, nexth, nextc = infer_decoder.predict([np.array([i]), h, c])
        index = o.argmax()
        result = result + targets_int_char[index]
        i, h, c = index, nexth, nextc
        if len(result) >= 100 or index == targets_char_int["\n"]:
            print("Decode:", result)
            break

Encode: Hi.
Decode: 你好。

Encode: Hi.
Decode: 你好。

Encode: Run.
Decode: 你用跑的。

Encode: Wait!
Decode: 等等！

Encode: Hello!
Decode: 你好。

Encode: I try.
Decode: 讓我來。

Encode: I won!
Decode: 我贏了。

Encode: Oh no!
Decode: 不會吧。

Encode: Cheers!
Decode: 乾杯!

Encode: Got it?
Decode: 你懂了嗎？

Encode: He ran.
Decode: 他跑了。

Encode: Hop in.
Decode: 跳進來。

Encode: I lost.
Decode: 我迷失了。

Encode: I quit.
Decode: 我退出。

Encode: I'm OK.
Decode: 我沒事。

Encode: Listen.
Decode: 聽著。

Encode: No way!
Decode: 沒門！

Encode: No way!
Decode: 沒門！

Encode: Really?
Decode: 你確定？

Encode: Try it.
Decode: 試試吧。



In [0]:
# 讓他試一下從沒看過的句子
finaltest = ["This is room 666", 
             "This is my money",
             "I love your pencil", 
             "This is my pencil", 
             "I love Tom",
             "Tom love me"]
for e in finaltest:
    print("Encode:", e)
    test_input1 = [inputs_char_int[c] for c in e]
    test_input1 = pad_sequences(np.array([test_input1]), maxlen=max_encoder_seq_length, padding="post")
    h, c = infer_encoder.predict(test_input1)

    result = ""
    i = targets_char_int["\t"]
    while True:
        o, nexth, nextc = infer_decoder.predict([np.array([i]), h, c])
        index = o.argmax()
        result = result + targets_int_char[index]
        i, h, c = index, nexth, nextc
        if len(result) >= 100 or index == targets_char_int["\n"]:
            print("Decode:", result)
            break

Encode: This is room 666
Decode: 這本書是個大的。

Encode: This is my money
Decode: 這是我的鑰匙。

Encode: I love your pencil
Decode: 我愛你的鋼筆。

Encode: This is my pencil
Decode: 這是我的鋼筆。

Encode: I love Tom
Decode: 我愛湯姆。

Encode: Tom love me
Decode: 湯姆愛我的他擁有的。

