In [20]:
import jieba
import tensorflow as tf
import pandas as pd
import numpy as np
from tensorflow import keras
from tensorflow.nn import embedding_lookup
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences

In [2]:
# Relative locations of the two corpus files (Chinese and English, going by
# the variable and file names). Later cells pair their contents by position:
# row i of one corpus is used as the input and row i of the other as the target.
path_cn = '../Resource/archive_/chinese.zh'
path_en = '../Resource/archive_/english.en'

In [13]:
def load_data(path):
    """Read a UTF-8 text file into a pandas Series holding one entry per line.

    Every newline character is swapped for a single space, so a line that was
    terminated by a newline in the file ends with a trailing space instead.

    Parameters
    ----------
    path : str or path-like
        Location of the text file to read.

    Returns
    -------
    pandas.Series
        The lines of the file, in file order, with each '\\n' replaced by ' '.
    """
    with open(path, encoding="utf-8") as handle:
        raw_lines = handle.readlines()

    # The file is already closed here; the replacement only touches the
    # in-memory strings.
    return pd.Series(raw_lines).str.replace('\n', ' ')

In [14]:
# Read both corpora, one entry per line. Despite the `df_` prefix these are
# pandas Series (that is what load_data returns), not DataFrames.
df_cn = load_data(path_cn)
df_en = load_data(path_en)

In [15]:
# Segment every sentence with jieba and re-join the pieces with spaces, so the
# whitespace-splitting Keras Tokenizer used later sees one token per segment.
# NOTE(review): this overwrites df_cn, so the cell is not idempotent -- running
# it a second time without re-loading would segment already-segmented text.
df_cn = df_cn.apply(lambda x: ' '.join(jieba.cut(x)))

Building prefix dict from the default dictionary ...
Dumping model to file cache C:\Users\User\AppData\Local\Temp\jieba.cache
Loading model cost 0.691 seconds.
Prefix dict has been built successfully.


In [26]:
# Share of the sentence pairs that goes into the training subset.
train_fraction = 0.7

# The two corpora are paired purely by position, so a length mismatch would
# silently pair sentences with the wrong translation. Fail loudly instead.
assert len(df_cn) == len(df_en), (
    "parallel corpora differ in length: {} Chinese vs {} English lines".format(
        len(df_cn), len(df_en)
    )
)

# Compute the cut-off once and apply the same index to both sides so the
# training pairs stay aligned. The rows after the cut-off are not assigned to
# any variable here.
n_train = int(len(df_cn) * train_fraction)
df_cn_train = df_cn[:n_train]
df_en_train = df_en[:n_train]

In [73]:
# Every sentence is padded or truncated to exactly this many token ids.
max_len = 140


def to_vector(data):
    """Turn sentences into fixed-length sequences of integer word ids.

    A fresh ``Tokenizer`` is fitted on ``data``; ``num_words=None`` means that
    every word is kept in the vocabulary.

    Example: for the single sentence "I love my dog", ``fit_on_texts`` builds
    the index ``{'i': 1, 'love': 2, 'my': 3, 'dog': 4}`` (the Tokenizer
    lower-cases text by default) and ``texts_to_sequences`` turns the sentence
    into ``[1, 2, 3, 4]``. ``pad_sequences`` then gives every sequence the
    same length, ``max_len``, by appending zeros to the shorter ones.

    Parameters
    ----------
    data : iterable of str
        Sentences whose words are separated by whitespace.

    Returns
    -------
    vector : numpy.ndarray
        Integer array with one row per sentence and ``max_len`` columns.
    vocab_size : int
        Number of distinct words in the fitted vocabulary
        (``len(tokenizer.word_index)``). Word ids start at 1; id 0 is
        reserved and is the value used for padding, so a layer indexing these
        ids needs ``vocab_size + 1`` entries.
    """
    tokenizer = Tokenizer(num_words=None, char_level=False)
    tokenizer.fit_on_texts(data)
    seq = tokenizer.texts_to_sequences(data)
    # NOTE(review): padding goes at the end ('post') but pad_sequences
    # truncates from the front by default (truncating='pre'), so sentences
    # longer than max_len lose their first tokens -- confirm this is intended.
    vector = pad_sequences(seq, padding='post', maxlen=max_len)
    return vector, len(tokenizer.word_index)

In [74]:
# Vectorise the training subsets. Each call fits its own Tokenizer, so the
# Chinese and English id spaces are independent; total_cn / total_en are the
# respective vocabulary sizes (number of distinct words found).
cn_vector, total_cn = to_vector(df_cn_train)
en_vector, total_en = to_vector(df_en_train)

In [80]:
# Quick sanity checks on the padded arrays. Only the value of the last
# expression is rendered as the cell output; the first two lines are
# evaluated and their results discarded.
len(cn_vector)
en_vector[0].size
en_vector.shape

(176943, 140)

In [89]:
# Padded sequence lengths, read from the first row of each array.
max_sequence_length_cn = cn_vector[0].size
max_sequence_length_en = en_vector[0].size
learning_rate = 0.01


def initial_model(embedding_dim=None, lstm_units=32):
    """Build and compile the per-timestep Chinese -> English token classifier.

    Architecture:

    1. ``Embedding`` maps each Chinese word id (``total_cn + 1`` possible
       values, the extra one being the padding id 0) to a dense vector.
    2. A bidirectional ``LSTM`` with ``return_sequences=True`` emits one
       feature vector per input position.
    3. ``Dense`` with softmax scores every English word id
       (``total_en + 1`` classes) at each position.

    Because the model emits exactly one prediction per *input* position, the
    target sequences must be padded to the same length as the inputs.

    Parameters
    ----------
    embedding_dim : int, optional
        Width of the learned word vectors. When omitted it falls back to
        ``max_sequence_length_en``, which is the value this notebook has
        always used; note that the embedding width is conceptually unrelated
        to the sequence length, so pass an explicit value to decouple them.
    lstm_units : int, optional
        Number of units in each LSTM direction (default 32).

    Returns
    -------
    keras.Sequential
        Model compiled with Adam (using the module-level ``learning_rate``),
        ``sparse_categorical_crossentropy`` loss and an accuracy metric.
    """
    if embedding_dim is None:
        # Preserve the historical default so existing calls build the same model.
        embedding_dim = max_sequence_length_en

    model = keras.Sequential([
        keras.layers.Embedding(total_cn + 1, embedding_dim, input_length=max_sequence_length_cn),
        keras.layers.Bidirectional(keras.layers.LSTM(lstm_units, return_sequences=True)),
        keras.layers.Dense(total_en + 1, activation='softmax'),
    ])
    # The sparse loss lets the targets stay as integer ids instead of one-hot
    # vectors over the whole English vocabulary.
    model.compile(
        optimizer=keras.optimizers.Adam(learning_rate=learning_rate),
        loss='sparse_categorical_crossentropy',
        metrics=['accuracy'],
    )
    return model

In [90]:
# Build and compile a fresh, untrained model.
model = initial_model()

In [91]:
# Print each layer's output shape and parameter count.
model.summary()

Model: "sequential_8"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_9 (Embedding)      (None, 140, 140)          10868760  
_________________________________________________________________
bidirectional_9 (Bidirection (None, 140, 64)           44288     
_________________________________________________________________
dense_8 (Dense)              (None, 140, 58632)        3811080   
Total params: 14,724,128
Trainable params: 14,724,128
Non-trainable params: 0
_________________________________________________________________


In [None]:
# Train with the padded Chinese id sequences as inputs and the padded English
# id sequences as targets. The targets stay as integer ids because the model
# was compiled with sparse_categorical_crossentropy.
# NOTE(review): the History object returned by fit() is not stored and no
# validation data is passed, so only training loss/accuracy is reported.
model.fit(cn_vector, en_vector, batch_size=25, epochs=10)

Epoch 1/10