In [1]:
import os, sys
import jieba  
from config import MyConfig, MyError
    
class DataUtility(object):    
    def __init__(self):
        self.gConfig = MyConfig.get_config(config_file="./config/seq2seq.ini") 
        self.conv_path = self.gConfig["resource_data"]
        self.sep_token = self.gConfig["sep_token"]
        if not os.path.exists(self.conv_path):
            raise MyError("檔案不存在")
            exit()
        self.dir_path = self.gConfig["train_data"]
        self.file_path = self.gConfig["seq_data"]        
        os.makedirs(self.dir_path, exist_ok=True)
        if os.path.isfile(self.file_path):
            print(self.file_path, "Exist")
        else:  
            self.preprocess_train_data()
            
    def get_conv_data(self):        
        convs = []
        M = self.gConfig["m"]
        E = self.gConfig["e"]
        with open(self.conv_path, encoding="utf8") as f:
            one_conv = []
            for line in f:
                line = line.strip("\n").replace("/","")
                if line == "":
                    continue
                if line[0] == M: #是問答，放入問答
                    one_conv.append(line.split(" ")[1])
                elif line[0] == E: #是空白
                    if one_conv: #問答裡面有資料
                        convs.append(one_conv)
                    one_conv = []
        return convs
    
    def tokenize_convs(self, convs):
        seq = []
        for conv in convs:
            if len(conv) == 1:
                continue
            if len(conv)%2 != 0:
                conv = conv[:-1]
            for i in range(len(conv)):
                if i%2 ==0:
                    conv[i] = " ".join(jieba.cut(conv[i]))
                    conv[i+1] = " ".join(jieba.cut(conv[i+1]))
                    seq.append(conv[i]+self.sep_token+conv[i+1])
        return seq
    def save_tokenize_data(self, seq):        
        content = ""
        for i in range(len(seq)):
            content += seq[i] + "\n"
            if i%1000 == 0:
                print(".",end="")
        with open(self.file_path, "w", encoding="utf8") as f:
            f.write(content)
            print("finish preprocessing seq data")
        
    def preprocess_train_data(self):        
        convs = self.get_conv_data()
        seq = self.tokenize_convs(convs)
        self.save_tokenize_data(seq)
        return seq
    

In [2]:
DataUtility()

train_data/seq.data Exist


<__main__.DataUtility at 0x242c9e9a048>

In [3]:
import os, sys
import tensorflow as tf
from config import MyConfig, MyError

from tensorflow.keras.models import Model
from tensorflow.keras.layers import Dense, GRU, Embedding

In [4]:
class Encoder(Model):
    def __init__(self):
        super(Encoder, self).__init__()
        self.gConfig = MyConfig.get_config("./config/seq2seq.ini")
        self.enc_vocab_size = self.gConfig["enc_vocab_size"]

        self.embedding_dim = self.gConfig["embedding_dim"]
        self.layer_size = self.gConfig["layer_size"]
        self.batch_size = self.gConfig["batch_size"]
        self.embedding_layer = Embedding(self.enc_vocab_size, self.embedding_dim)
        self.GRU = GRU(self.layer_size, return_sequences=True, 
                       return_state=True, recurrent_initializer="glorot_uniform")
        
    def call(self, x, hidden):
        x = self.embedding_layer(x)
        output, state = self.GRU(x, initial_state=hidden)
        return output, state
    
    def initialize_hidden_state(self):
        return tf.zeros((self.batch_size, self.layer_size))

class BahdanauAttention(Model): #定義 Attention 機制
    def __init__(self, layer_size):
        super(BahdanauAttention, self).__init__()
        self.W1 = Dense(layer_size)
        self.W2 = Dense(layer_size)
        self.V = Dense(1) #最後的評分網路層V，最終評分結果作為注意力的權重值
        
    def call(self, query, values):
        # hidden shape == (batch_size, hidden size)
        # hidden_with_time_axis shape == (batch_size, 1, hidden size)
        # we are doing this to perform addition to calculate the score
        hidden_with_time_axis = tf.expand_dims(query, 1)
        # score shape == (batch_size, max_length, hidden_size)
        score = self.V(tf.nn.tanh(self.W1(values) + self.W2(hidden_with_time_axis)))
        # 計算完 W1, W2，將結果輸入評分網路層
        # attention_weights shape == (batch_size, max_length, 1)
        # we get 1 at the last axis because we are applying score to self.V
        attention_weights = tf.nn.softmax(score, axis=1) #得到各個score值的機率分布
         # 文字向量 context_vector shape after sum == (batch_size, hidden_size)
        context_vector = attention_weights * values # 得到加權後的文字向量
        context_vector = tf.reduce_sum(context_vector, axis=1)
        
        return context_vector, attention_weights #返回加權後的文字向量 和 注意力權重
        
    
class Decoder(Model):
    def __init__(self):
        super(Decoder, self).__init__()
        self.gConfig = MyConfig.get_config("./config/seq2seq.ini")
        self.dec_vocab_size = self.gConfig["dec_vocab_size"] 
        self.layer_size = self.gConfig["layer_size"]
        self.embedding_dim = self.gConfig["embedding_dim"]
        self.embedding_layer = Embedding(self.dec_vocab_size, self.embedding_dim)
        self.GRU = GRU(self.layer_size, return_sequences=True,
                       return_state=True, recurrent_initializer="glorot_uniform")
        self.fc = Dense(self.dec_vocab_size)
        self.attention = BahdanauAttention(self.layer_size)
        
    def call(self, x, hidden, enc_output):
        context_vector, attention_weights = self.attention(hidden, enc_output)
        x = self.embedding_layer(x)
        x = tf.concat([tf.expand_dims(context_vector, 1), x], axis=-1)
        output, state = self.GRU(x)
        output = tf.reshape(output, (-1, output.shape[2]))
        outputs = self.fc(output)
        return outputs, state, attention_weights
    
gConfig = MyConfig.get_config("./config/seq2seq.ini")

def loss_function(real, pred):    
    mask = tf.math.logical_not(tf.math.equal(real, 0))
    loss_ = loss_object(real, pred)
    mask = tf.cast(mask, dtype=loss_.dtype)
    loss_ *= mask
    return tf.reduce_mean(loss_)

def train_step(inp, targ, targ_lang, enc_hidden):    
    loss = 0
    with tf.GradientTape() as tape:
        enc_output, enc_hidden = encoder(inp, enc_hidden)
        dec_hidden = enc_hidden
        dec_input = tf.expand_dims([targ_lang.word_index['start']] * gConfig["batch_size"], 1)
        for t in range(1, targ.shape[1]):
            predictions, dec_hidden, _ = decoder(dec_input, dec_hidden, enc_output)
            loss += loss_function(targ[:,t], predictions)
            dec_input = tf.expand_dims(targ[:,t], 1)
    batch_loss = (loss/int(targ.shape[1]))
    variables = encoder.trainable_variables + encoder.trainable_variables
    gradients = tape.gradient(loss, variables)
    optimizer.apply_gradients(zip(gradients, variables))
    
    return batch_loss      

encoder = Encoder()
attention_layer = BahdanauAttention(10)
decoder = Decoder()

optimizer = tf.keras.optimizers.Adam()
loss_object = tf.keras.losses.SparseCategoricalCrossentropy(from_logits=True)    

checkpoint = tf.train.Checkpoint(optimizer=optimizer,encoder=encoder,decoder=decoder)

In [5]:

import os, sys, io, time
from config import MyConfig, MyError
import tensorflow as tf
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences

In [12]:
from sklearn.model_selection import train_test_split

def preprocess_sentence(w):
    w = start_token+" "+w+" "+end_token
    return w

def create_dataset(path, num_examples):
    lines = io.open(path, encoding='utf-8').read().strip().split('\n')
    word_pairs = [[preprocess_sentence(w) for w in l.split('\t')] for l in lines[:num_examples]]
    return zip(*word_pairs)

def max_length(tensor):
    return max(len(t) for t in tensor)

def tokenize(lang):
    lang_tokenizer = Tokenizer(num_words=vocab_inp_size, oov_token=3)
    lang_tokenizer.fit_on_texts(lang)
    tensor = lang_tokenizer.texts_to_sequences(lang)
    tensor = pad_sequences(tensor, padding='post')
    return tensor, lang_tokenizer

def load_dataset(path, num_examples):
    input_lang, target_lang = create_dataset(path, num_examples)
    input_tensor, input_tokenizer = tokenize(input_lang)
    target_tensor, target_tokenizer = tokenize(target_lang)
    return input_tensor, input_tokenizer, target_tensor, target_tokenizer


gConfig = MyConfig.get_config("./config/seq2seq.ini")
file_path = gConfig["seq_data"]
num_examples = gConfig["max_train_data_size"]
start_token = gConfig["start_token"]
end_token = gConfig["end_token"]
sep_token = gConfig["sep_token"]
vocab_inp_size = gConfig["enc_vocab_size"]
vocab_tar_size = gConfig["dec_vocab_size"]
embedding_dim = gConfig["embedding_dim"]
layer_size = gConfig["layer_size"]
batch_size = gConfig["batch_size"]
checkpoint_dir = gConfig["model_data"]
epochs = gConfig["epochs"]
# global epoch_idnex
# epoch_idnex = 0 


    
input_tensor,input_tokenizer,target_tensor,target_tokenizer = load_dataset(file_path, num_examples)
max_length_inp,max_length_tar = max_length(input_tensor), max_length(target_tensor)
#input_tensor,input_token,target_tensor,target_token= read_data(gConfig['seq_data'], gConfig['max_train_data_size'])

def train():
    epoch_index = 0
    print("Preparing data in %s" % gConfig['train_data'])
    input_tensor_train, input_tensor_val, target_tensor_train, target_tensor_val = train_test_split(input_tensor,
                                                                                target_tensor, test_size=0.2)
    steps_per_epoch = len(input_tensor_train) // batch_size
    BUFFER_SIZE = len(input_tensor_train)    
    dataset = tf.data.Dataset.from_tensor_slices((input_tensor_train, target_tensor_train)).shuffle(BUFFER_SIZE)
    dataset = dataset.batch(batch_size, drop_remainder=True)

    #
#     ckpt = tf.io.gfile.listdir(checkpoint_dir)
#     if ckpt:
#         print("reload pretrained model")
#         checkpoint.restore(tf.train.latest_checkpoint(checkpoint_dir))

    
    start_time = time.time()    

    current_steps = 0
    while epoch_index < epochs:
    #while True:
        start_time_epoch = time.time()
        enc_hidden = encoder.initialize_hidden_state()
        total_loss = 0
        for (batch, (inp,targ)) in enumerate(dataset.take(steps_per_epoch)):
            batch_loss = train_step(inp, targ, target_tokenizer, enc_hidden)
            total_loss += batch_loss
            print("batch_loss", batch, batch_loss.numpy())
        step_time_epoch = (time.time() - start_time_epoch) / steps_per_epoch
        step_loss = total_loss / steps_per_epoch
        current_steps += steps_per_epoch
        step_time_total = (time.time()-start_time)/current_steps 
        print('訓練總步數: {} 每步耗時: {}  最新每步耗時: {} 最新每步loss {:.4f}'\
              .format(current_steps, step_time_total, step_time_epoch, step_loss.numpy()))        
        epoch_index += 1

train() 
checkpoint_prefix = os.path.join(checkpoint_dir, "ckpt")
checkpoint.save(file_prefix=checkpoint_prefix)
print("checkpoint saved")

Preparing data in train_data
batch_loss 0 0.49792975
batch_loss 1 0.5232644
batch_loss 2 0.45578578
batch_loss 3 0.4307389
batch_loss 4 0.43929917
batch_loss 5 0.4689792
訓練總步數: 6 每步耗時: 36.87430588404337  最新每步耗時: 36.87414010365804 最新每步loss 0.4693
batch_loss 0 0.56297344
batch_loss 1 0.4261917
batch_loss 2 0.41870934
batch_loss 3 0.49716806
batch_loss 4 0.45919925
batch_loss 5 0.48236874
訓練總步數: 12 每步耗時: 43.73586151997248  最新每步耗時: 50.59741715590159 最新每步loss 0.4744
batch_loss 0 0.48051623
batch_loss 1 0.46108463
batch_loss 2 0.51002973
batch_loss 3 0.44710365
batch_loss 4 0.49057743
batch_loss 5 0.46804363
訓練總步數: 18 每步耗時: 49.28294071886275  最新每步耗時: 60.37693258126577 最新每步loss 0.4762
checkpoint saved


In [32]:
def reload_model():
    model = checkpoint.restore(tf.train.latest_checkpoint(checkpoint_dir))
    return model


import jieba  
import numpy as np
def predict(sentence):
    checkpoint.restore(tf.train.latest_checkpoint(checkpoint_dir))
    sentence = preprocess_sentence(sentence)
#     print("sentence",sentence)
#     input_words = []
#     input_words.append(start_token)
#     jieba_dicts = jieba.cut(sentence)
#     for word in jieba_dicts:
#         input_words.append(word)
#     input_words.append(end_token)
#     print("input_words",input_words)
    inputs = [input_tokenizer.word_index.get(i,3) for i in sentence.split(" ")]
    
    inputs = tf.keras.preprocessing.sequence.pad_sequences([inputs],maxlen=max_length_inp,padding='post')
   
    inputs = tf.convert_to_tensor(inputs)  
    print("inputs",inputs) 
    result = ""
    hidden = [tf.zeros((1, layer_size))]
    enc_out, enc_hidden = encoder(inputs, hidden)
    dec_hidden = enc_hidden
    dec_input = tf.expand_dims([target_tokenizer.word_index[start_token]], 0)
    for t in range(max_length_tar):
        predictions, dec_hidden, attention_weights = decoder(dec_input, dec_hidden, enc_out)        
        predicted_id = tf.argmax(predictions[0]).numpy()        
       # print("predicted_id",predicted_id)
        ran = np.random.randint(low=10, high=500)
#         if target_tokenizer.index_word[predicted_id] == end_token:
#             break
        result += target_tokenizer.index_word[predicted_id + ran] + " "
        #print("pre",target_tokenizer.index_word[predicted_id + ran])
        dec_input = tf.expand_dims([predicted_id], 0)
    print("finished predict")
    return result



In [33]:
req_msg = "第一个反应就是检查门窗是否关好"
req_msg = " ".join(jieba.cut(req_msg))
res_msg = predict(req_msg)
print(res_msg)

inputs tf.Tensor([[ 2  3  3 71  3  3  3  3 17  3  0  0  0  0  0  0  0  0  0  0  0  0]], shape=(1, 22), dtype=int32)
finished predict
我要 这样 多少 一切 喵 老实 玩 美丽 叫 多 嗯 针 真心 味儿 应该 猪 倒霉 不想 π 比 那么 教 玩 小样 晚安 看不清 以为 喵 呵 要是 宇宙 我爱你 这个 一样 买不起 粉 一些 办 ' 那个 讨厌 毛 招魂 代表 把 ╰ 5 没 关注 比较 热   得 声音 呢 王子 ╮ 吗 粉 隔壁 走 耶 以为 然后 o 饭 用力 难不倒 咬 吃 帮 没 多 v 针 ╭ 看不清 的哥 注意 稀罕 一些 不无 腩 善良 老娘 在 我错 啥 宇宙 治 真 … 认识 关注 起来 tm 怪 没错 边 不理 声音 难不倒 全世界 改 按 了 发现 里 直接 啦 嘛 人家 想 善良 起来 还是 兄弟 赤峰 买 回来 高富帅 ～ 本来 当 爸爸 听 有人 后妈 你好 点饭 废话 一个 滚 性感 远 中 工作 强 就是 饭 


In [None]:
predict("第一个反应就是检查门窗是否关好")

In [None]:
aa = jieba.cut("第一个反应就是检查门窗是否关好")