In [1]:
import json
from opencc import OpenCC
cc = OpenCC('s2t')

In [2]:
textsC = []
textsE = []
fileName = './translation2019zh_valid.json'
file = open(fileName, 'r', encoding='utf-8')
dataR = []
for line in file.readlines():
    dataLine = json.loads(line)
    dataR.append(dataLine)

for data in dataR:
    textsE.append(data['english'])
    textsC.append(cc.convert(data['chinese']))

In [3]:
import pandas as pd

In [4]:
df = pd.DataFrame(columns = ['English'])
df

Unnamed: 0,English


In [5]:
df['English'] = textsE
df['Chinese'] = textsC

In [6]:
df

Unnamed: 0,English,Chinese
0,"Slowly and not without struggle, America began...",美國緩慢地開始傾聽，但並非沒有艱難曲折。
1,I didn't own a Thesaurus until four years ago ...,直到四年前我纔有了一本詞典。我使用的是用89美分在K市場裏買來的一本韋氏小詞典。我從來不使用...
2,"portlet, you must write three short deployment...",portlet 之後，您必須編寫三個簡短的部署描述符：web.xml、portlet.xml...
3,Dithering is a technique that blends your colo...,抖動是關於顏色混合的技術，使你的作品看起來更圓滑，或者只是創作有趣的材質。
4,This paper discusses the petrologic characteri...,本文以琿春早第三紀含煤盆地的地質構違背景爲依據，分析了煤系地層的岩石學特徵。
...,...,...
39318,The bill will now have to be passed by the Upp...,目前，這項法案還必須獲得印度議會上院的通過。
39319,The influences of thermal debinding temperatur...,研究了熱脫脂溫度、時間、不同脫脂方式以及粘結劑組成對脫脂坯碳含量的影響。
39320,The man and woman are short and heigh .,這個男人，和這個女人都和瘦長。
39321,That same old-fashioned car that I had seen ea...,我之前看見的那輛老式轎車還停在原來的地方，也就是警察局對面。


In [7]:
from tensorflow import keras 

In [8]:
from __future__ import print_function

In [9]:
import numpy as np

In [10]:
# 參數設定
batch_size = 64  # Batch size for training.
epochs = 100  # Number of epochs to train for.
latent_dim = 256  # Latent dimensionality of the encoding space.
num_samples = 10000  # Number of samples to train on.

In [12]:
# 讀取資料檔，並將所有單字整理為字典，分別為英文及中文字典，注意，英文為字母的集合，非單字(Word)
input_texts = []
target_texts = []
input_characters = set()
target_characters = set()

for item in textsE:
    input_text = item
    input_texts.append(input_text)
    for char in input_text:
        if char not in input_characters:
            input_characters.add(char)

for item in textsC:
    target_text = '\t' + item + '\n'
    target_texts.append(target_text)
    for char in target_text:
        if char not in target_characters:
            target_characters.add(char)

In [13]:
print(target_texts[:10])

['\t美國緩慢地開始傾聽，但並非沒有艱難曲折。\n', '\t直到四年前我纔有了一本詞典。我使用的是用89美分在K市場裏買來的一本韋氏小詞典。我從來不使用單詞處理程序。\n', '\tportlet 之後，您必須編寫三個簡短的部署描述符：web.xml、portlet.xml 和 geronimo-web.xml（這其中的一些文件可能已經由 IDE 生成）。\n', '\t抖動是關於顏色混合的技術，使你的作品看起來更圓滑，或者只是創作有趣的材質。\n', '\t本文以琿春早第三紀含煤盆地的地質構違背景爲依據，分析了煤系地層的岩石學特徵。\n', '\t55歲以上的女人們對自己伴侶更爲挑剔。\n', '\t所以，要“治療“他人你必須沒有任何偏好。\n', '\t第二次事件跟我爺爺的寶貝匣子有關。\n', '\t將 <ejb-link> 標記的值更改爲 MyEJB，即在 ejb-jar.xml 文件中定義的 EJB 名稱。\n', '\t解決這些挑戰的途徑包括依照麻瓜在南非的經驗設立真相與和解委員會。\n']


In [14]:
print(input_texts[:10])

['Slowly and not without struggle, America began to listen.', "I didn't own a Thesaurus until four years ago and I use a small Webster's dictionary that I'd bought at K-Mart for 89 cents.", 'portlet, you must write three short deployment descriptors: web.xml, portlet.xml, and geronimo-web.xml. (Some of these may have been generated by your IDE.)', 'Dithering is a technique that blends your colors together, making them look smoother, or just creating interesting textures.', 'This paper discusses the petrologic characteristics of the coal-bearing strata under the geologic structural background of the Tertiary coal basin in Hunchun.', 'Women over 55 are pickier about their partners than at any other time in their lives.', 'Ruben: So, to heal (with capital letters) you need to have no predilections.', "The second encounter relates to my grandfather's treasure box.", 'Change the value for the <ejb-link> tag to MyEJB, which is the name of the EJB as defined in the ejb-jar.xml file.', 'One wa

In [15]:
# 字典排序            
input_characters = sorted(list(input_characters))
target_characters = sorted(list(target_characters))
num_encoder_tokens = len(input_characters)
num_decoder_tokens = len(target_characters)

In [16]:

# 計算編碼器、解碼器的最大長度
max_encoder_seq_length = max([len(txt) for txt in input_texts])
max_decoder_seq_length = max([len(txt) for txt in target_texts])

In [17]:
print('Number of samples:', len(input_texts))
print('Number of unique input tokens:', num_encoder_tokens)
print('Number of unique output tokens:', num_decoder_tokens)
print('Max sequence length for inputs:', max_encoder_seq_length)
print('Max sequence length for outputs:', max_decoder_seq_length)

Number of samples: 39323
Number of unique input tokens: 849
Number of unique output tokens: 5346
Max sequence length for inputs: 373
Max sequence length for outputs: 199


In [18]:
# 以dict儲存字典單字及序號
input_token_index = dict([(char, i) for i, char in enumerate(input_characters)])
target_token_index = dict([(char, i) for i, char in enumerate(target_characters)])

In [21]:
# 設定編碼器、解碼器input起始值(均為0矩陣)
encoder_input_data = np.zeros((len(input_texts), max_encoder_seq_length, num_encoder_tokens), dtype='float32')
decoder_input_data = np.zeros((len(input_texts), max_decoder_seq_length, num_decoder_tokens), dtype='float32')
decoder_target_data = np.zeros((len(input_texts), max_decoder_seq_length, num_decoder_tokens), dtype='float32')

MemoryError: Unable to allocate 46.4 GiB for an array with shape (39323, 373, 849) and data type float32