In [1]:
# 下載想要的語言版本 -q: 安靜模式
!wget -q http://www.manythings.org/anki/cmn-eng.zip
# 解壓縮zip -o: 覆蓋原本
!unzip -o cmn-eng.zip

Archive:  cmn-eng.zip
  inflating: cmn.txt                 
  inflating: _about.txt              


In [2]:
import pandas as pd
df = pd.read_csv("cmn.txt",
                 sep="\t",
                 header=None)
df.columns = ["English", "Chinese", "By"]
df

Unnamed: 0,English,Chinese,By
0,Hi.,嗨。,CC-BY 2.0 (France) Attribution: tatoeba.org #5...
1,Hi.,你好。,CC-BY 2.0 (France) Attribution: tatoeba.org #5...
2,Run.,你用跑的。,CC-BY 2.0 (France) Attribution: tatoeba.org #4...
3,Stop!,住手！,CC-BY 2.0 (France) Attribution: tatoeba.org #4...
4,Wait!,等等！,CC-BY 2.0 (France) Attribution: tatoeba.org #1...
...,...,...,...
29663,"If you don't want to put on sunscreen, that's ...",你不想涂防晒霜是你的问题，但是晒伤了不要来抱怨。,CC-BY 2.0 (France) Attribution: tatoeba.org #1...
29664,"Even now, I occasionally think I'd like to see...",即使是现在，我偶尔还是想见到你。不是今天的你，而是我记忆中曾经的你。,CC-BY 2.0 (France) Attribution: tatoeba.org #1...
29665,It's very easy to sound natural in your own na...,你很容易把母语说得通顺流畅，却很容易把非母语说得不自然。,CC-BY 2.0 (France) Attribution: tatoeba.org #1...
29666,"I got fired from the company, but since I have...",虽然我被公司解雇了，但是我还有点存款，所以目前不用担心生计问题。,CC-BY 2.0 (France) Attribution: tatoeba.org #1...


In [3]:
# 使用opencc將簡中翻譯成繁中
!pip install opencc-python-reimplemented

Collecting opencc-python-reimplemented
  Downloading opencc_python_reimplemented-0.1.7-py2.py3-none-any.whl (481 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m481.8/481.8 kB[0m [31m5.6 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: opencc-python-reimplemented
Successfully installed opencc-python-reimplemented-0.1.7


In [4]:
from opencc import OpenCC
t = OpenCC("s2tw")
df["Chinese"] = df["Chinese"].apply(lambda s:t.convert(s))
df = df.drop(["By"], axis=1)
df.to_csv("en_zh.csv")
df

Unnamed: 0,English,Chinese
0,Hi.,嗨。
1,Hi.,你好。
2,Run.,你用跑的。
3,Stop!,住手！
4,Wait!,等等！
...,...,...
29663,"If you don't want to put on sunscreen, that's ...",你不想塗防曬霜是你的問題，但是曬傷了不要來抱怨。
29664,"Even now, I occasionally think I'd like to see...",即使是現在，我偶爾還是想見到你。不是今天的你，而是我記憶中曾經的你。
29665,It's very easy to sound natural in your own na...,你很容易把母語說得通順流暢，卻很容易把非母語說得不自然。
29666,"I got fired from the company, but since I have...",雖然我被公司解僱了，但是我還有點存款，所以目前不用擔心生計問題。


In [5]:
# 預處理Step1. 數出有多少種不同的詞彙
inputs = []
targets = []
input_characters = set()
target_characters = set()

for target_text, input_text in zip(df["Chinese"], df["English"]):
    # 這裡比較特別, 在真正最後預測的時候你是喂start token給decoder
    # 再等到decoder輸出end token結束
    # 所以我們自定義 \t 為 start token, \n 為 decode token
    target_text = '\t' + target_text + '\n'
    # encoder 吃的不用加 \t \n
    inputs.append(input_text)
    # decoder 吃的加 \t \n
    targets.append(target_text)
    # 把不同種的詞紀錄, 由於我們使用set, 所以會自動把重複的去掉
    for char in input_text:
        input_characters.add(char)
    for char in target_text:
        target_characters.add(char)


In [6]:
# 把我們的一些參數列出來一下
# 順便先計算最長的輸入和輸出長度, 等等喂給模型
input_characters = sorted(list(input_characters))
target_characters = sorted(list(target_characters))
num_encoder_tokens = len(input_characters) + 1
num_decoder_tokens = len(target_characters) + 1
max_encoder_seq_length = max([len(txt) for txt in inputs])
max_decoder_seq_length = max([len(txt) for txt in targets])

print("資料集筆數:", len(inputs))
print("輸入的所有字彙:", num_encoder_tokens)
print("輸出的所有字彙:", num_decoder_tokens)
print("最長的輸入句子有多長:", max_encoder_seq_length)
print('最長的輸出句子有多長:', max_decoder_seq_length)

資料集筆數: 29668
輸入的所有字彙: 79
輸出的所有字彙: 2923
最長的輸入句子有多長: 163
最長的輸出句子有多長: 46


In [7]:
# 給每一個字彙一個編號, 輸入輸出都要
# 建立 文字 -> 數字 的查詢辭典
# 養成習慣留一個 <PAD> 字符, 我們習慣給0, 到時候不夠長度的句子我們就補0
inputs_char_int = {char:i+1 for i, char in enumerate(input_characters)}
inputs_char_int["<PAD>"] = 0
targets_char_int = {char:i+1 for i, char in enumerate(target_characters)}
targets_char_int["<PAD>"] = 0

In [8]:
# 建立 數字 -> 文字 的查詢辭典
inputs_int_char = {i:char for char, i in inputs_char_int.items()}
targets_int_char = {i:char for char, i in targets_char_int.items()}

In [9]:
# 訓練參數
# batch: 多少筆資料更新一次
# embedding dimension: 先把每一種不同的東西降維
# dim: LSTM/RNN的維度
batch_size = 64
embedding_dim = 256
dim = 256

In [25]:
from tensorflow.keras.models import Model
from tensorflow.keras.layers import Input, Embedding, LSTM, Dense, TimeDistributed

input1 = Input(shape=(None,))
x = Embedding(num_encoder_tokens, embedding_dim, mask_zero=True)(input1)
x, state_h, state_c = LSTM(dim, return_state=True)(x)

input2 = Input(shape=(None,))
y = Embedding(num_decoder_tokens, embedding_dim, name="decoder_embedding", mask_zero=True)(input2)
y, _, _ = LSTM(dim, return_sequences=True,
               return_state=True,
               name="decoder_lstm")(y, initial_state=[state_h, state_c])
y = Dense(num_decoder_tokens, activation='softmax', name="decoder_dense")(y)

model = Model(inputs=[input1, input2], outputs=y)
model.summary()

Model: "model_11"
__________________________________________________________________________________________________
 Layer (type)                Output Shape                 Param #   Connected to                  
 input_15 (InputLayer)       [(None, None)]               0         []                            
                                                                                                  
 input_16 (InputLayer)       [(None, None)]               0         []                            
                                                                                                  
 embedding_3 (Embedding)     (None, None, 256)            20224     ['input_15[0][0]']            
                                                                                                  
 decoder_embedding (Embeddi  (None, None, 256)            748288    ['input_16[0][0]']            
 ng)                                                                                       

In [26]:
model.compile(loss='sparse_categorical_crossentropy', optimizer="adam")

In [27]:
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.utils import to_categorical
import numpy as np
en = []
dein = []
deout = []
for sentence in inputs:
    l = [inputs_char_int[c] for c in sentence]
    en.append(l)
en = pad_sequences(en, maxlen=max_encoder_seq_length, padding="post")
for sentence in targets:
    l = [targets_char_int[c] for c in sentence[:-1]]
    dein.append(l)
    o = [targets_char_int[c] for c in sentence[1:]]
    deout.append(o)

dein = pad_sequences(dein, maxlen=max_decoder_seq_length, padding="post")
deout = pad_sequences(deout, maxlen=max_decoder_seq_length, padding="post")
deout = deout.reshape(len(deout), -1, 1)

In [28]:
# 輸入資料長的樣子
for seq in en[:10]:
    print([inputs_int_char[s] for s in seq])

['H', 'i', '.', '<PAD>', '<PAD>', '<PAD>', '<PAD>', '<PAD>', '<PAD>', '<PAD>', '<PAD>', '<PAD>', '<PAD>', '<PAD>', '<PAD>', '<PAD>', '<PAD>', '<PAD>', '<PAD>', '<PAD>', '<PAD>', '<PAD>', '<PAD>', '<PAD>', '<PAD>', '<PAD>', '<PAD>', '<PAD>', '<PAD>', '<PAD>', '<PAD>', '<PAD>', '<PAD>', '<PAD>', '<PAD>', '<PAD>', '<PAD>', '<PAD>', '<PAD>', '<PAD>', '<PAD>', '<PAD>', '<PAD>', '<PAD>', '<PAD>', '<PAD>', '<PAD>', '<PAD>', '<PAD>', '<PAD>', '<PAD>', '<PAD>', '<PAD>', '<PAD>', '<PAD>', '<PAD>', '<PAD>', '<PAD>', '<PAD>', '<PAD>', '<PAD>', '<PAD>', '<PAD>', '<PAD>', '<PAD>', '<PAD>', '<PAD>', '<PAD>', '<PAD>', '<PAD>', '<PAD>', '<PAD>', '<PAD>', '<PAD>', '<PAD>', '<PAD>', '<PAD>', '<PAD>', '<PAD>', '<PAD>', '<PAD>', '<PAD>', '<PAD>', '<PAD>', '<PAD>', '<PAD>', '<PAD>', '<PAD>', '<PAD>', '<PAD>', '<PAD>', '<PAD>', '<PAD>', '<PAD>', '<PAD>', '<PAD>', '<PAD>', '<PAD>', '<PAD>', '<PAD>', '<PAD>', '<PAD>', '<PAD>', '<PAD>', '<PAD>', '<PAD>', '<PAD>', '<PAD>', '<PAD>', '<PAD>', '<PAD>', '<PAD>', '<P

In [29]:
# 正確答案長的樣子
for seq in deout[:10]:
    print([targets_int_char[s[0]] for s in seq])

['嗨', '。', '\n', '<PAD>', '<PAD>', '<PAD>', '<PAD>', '<PAD>', '<PAD>', '<PAD>', '<PAD>', '<PAD>', '<PAD>', '<PAD>', '<PAD>', '<PAD>', '<PAD>', '<PAD>', '<PAD>', '<PAD>', '<PAD>', '<PAD>', '<PAD>', '<PAD>', '<PAD>', '<PAD>', '<PAD>', '<PAD>', '<PAD>', '<PAD>', '<PAD>', '<PAD>', '<PAD>', '<PAD>', '<PAD>', '<PAD>', '<PAD>', '<PAD>', '<PAD>', '<PAD>', '<PAD>', '<PAD>', '<PAD>', '<PAD>', '<PAD>', '<PAD>']
['你', '好', '。', '\n', '<PAD>', '<PAD>', '<PAD>', '<PAD>', '<PAD>', '<PAD>', '<PAD>', '<PAD>', '<PAD>', '<PAD>', '<PAD>', '<PAD>', '<PAD>', '<PAD>', '<PAD>', '<PAD>', '<PAD>', '<PAD>', '<PAD>', '<PAD>', '<PAD>', '<PAD>', '<PAD>', '<PAD>', '<PAD>', '<PAD>', '<PAD>', '<PAD>', '<PAD>', '<PAD>', '<PAD>', '<PAD>', '<PAD>', '<PAD>', '<PAD>', '<PAD>', '<PAD>', '<PAD>', '<PAD>', '<PAD>', '<PAD>', '<PAD>']
['你', '用', '跑', '的', '。', '\n', '<PAD>', '<PAD>', '<PAD>', '<PAD>', '<PAD>', '<PAD>', '<PAD>', '<PAD>', '<PAD>', '<PAD>', '<PAD>', '<PAD>', '<PAD>', '<PAD>', '<PAD>', '<PAD>', '<PAD>', '<PAD>', '<

In [49]:
# 大約訓練50~100次
model.fit([en, dein], deout,
          batch_size=batch_size,
          epochs=50,
          validation_split=0.1,
          verbose=2)

Epoch 1/50
418/418 - 8s - loss: 0.1806 - val_loss: 6.1153 - 8s/epoch - 19ms/step
Epoch 2/50
418/418 - 9s - loss: 0.1736 - val_loss: 6.1564 - 9s/epoch - 21ms/step
Epoch 3/50
418/418 - 9s - loss: 0.1641 - val_loss: 6.2389 - 9s/epoch - 21ms/step
Epoch 4/50
418/418 - 8s - loss: 0.1590 - val_loss: 6.2940 - 8s/epoch - 18ms/step
Epoch 5/50
418/418 - 9s - loss: 0.1499 - val_loss: 6.3696 - 9s/epoch - 21ms/step
Epoch 6/50
418/418 - 9s - loss: 0.1406 - val_loss: 6.4162 - 9s/epoch - 20ms/step
Epoch 7/50
418/418 - 8s - loss: 0.1393 - val_loss: 6.4699 - 8s/epoch - 18ms/step
Epoch 8/50
418/418 - 9s - loss: 0.1319 - val_loss: 6.5291 - 9s/epoch - 21ms/step
Epoch 9/50
418/418 - 8s - loss: 0.1250 - val_loss: 6.6220 - 8s/epoch - 20ms/step
Epoch 10/50
418/418 - 8s - loss: 0.1183 - val_loss: 6.6762 - 8s/epoch - 19ms/step
Epoch 11/50
418/418 - 9s - loss: 0.1170 - val_loss: 6.6920 - 9s/epoch - 21ms/step
Epoch 12/50
418/418 - 8s - loss: 0.1124 - val_loss: 6.7444 - 8s/epoch - 19ms/step
Epoch 13/50
418/418 - 9s 

<keras.src.callbacks.History at 0x792e7bf7d330>

In [50]:
# 測試一下原本的他有沒有學會
import numpy as np
infer_encoder = Model(inputs=input1, outputs=[state_h, state_c])

de_h = Input(shape=(dim,))
de_c = Input(shape=(dim,))
de = model.get_layer("decoder_embedding")(input2)
dex, next_h, next_c = model.get_layer("decoder_lstm")(de, initial_state=[de_h, de_c])
dex = model.get_layer("decoder_dense")(dex)
infer_decoder = Model(inputs=[input2, de_h, de_c],
                      outputs=[dex, next_h, next_c])

test = df.head(20)
for e, z in zip(test["English"], test["Chinese"]):
    print("Encode:", e)
    test_input1 = [inputs_char_int[c] for c in e]
    test_input1 = np.array([test_input1])
    h, c = infer_encoder.predict(test_input1, verbose=0)

    result = ""
    i = targets_char_int["\t"]
    while True:
        o, nexth, nextc = infer_decoder.predict([np.array([[i]]), h, c], verbose=0)
        index = o.argmax()
        result = result + targets_int_char[index]
        i, h, c = index, nexth, nextc
        if len(result) >= 100 or index == targets_char_int["\n"]:
            print("Decode:", result)
            break

Encode: Hi.
Decode: 嗨。

Encode: Hi.
Decode: 嗨。

Encode: Run.
Decode: 你用跑的。

Encode: Stop!
Decode: 住手！

Encode: Wait!
Decode: 等等！

Encode: Wait!
Decode: 等等！

Encode: Begin.
Decode: 開始！

Encode: Hello!
Decode: 你好。

Encode: I try.
Decode: 我試試。

Encode: I won!
Decode: 我贏了。

Encode: Oh no!
Decode: 不會吧。

Encode: Cheers!
Decode: 乾杯!

Encode: Got it?
Decode: 知道了沒有？

Encode: Got it?
Decode: 知道了沒有？

Encode: Got it?
Decode: 知道了沒有？

Encode: He ran.
Decode: 他跑了。

Encode: Hop in.
Decode: 跳進來。

Encode: I know.
Decode: 我知道。

Encode: I quit.
Decode: 我退出。

Encode: I quit.
Decode: 我退出。



In [52]:
# 讓他試一下從沒看過的句子
finaltest = ["This is room 666",
             "This is my money",
             "I love your pencil",
             "This is my pencil",
             "I love Tom",
             "Tom love me"]
for e in finaltest:
    print("Encode:", e)
    test_input1 = [inputs_char_int[c] for c in e]
    test_input1 = pad_sequences(np.array([test_input1]), maxlen=max_encoder_seq_length, padding="post")
    h, c = infer_encoder.predict(test_input1, verbose=0)

    result = ""
    i = targets_char_int["\t"]
    while True:
        o, nexth, nextc = infer_decoder.predict([np.array([[i]]), h, c], verbose=0)
        index = o.argmax()
        result = result + targets_int_char[index]
        i, h, c = index, nexth, nextc
        if len(result) >= 100 or index == targets_char_int["\n"]:
            print("Decode:", result)
            break

Encode: This is room 666
Decode: 這是箇爸。

Encode: This is my money
Decode: 這是我的錢。

Encode: I love your pencil
Decode: 我喜歡你的意見。

Encode: This is my pencil
Decode: 這是我的電話。

Encode: I love Tom
Decode: 我愛謝！

Encode: Tom love me
Decode: 湯姆愛我。

