# Part 1. Data Preprocessing

In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split

import pickle
from gensim.models import word2vec
import logging
import random

In [None]:
from OpenFabLibrary import JeibaCutWords
from OpenFabLibrary import AppendKeywordCheck

In [None]:
# Integer labels of the two advertisement classes (legal = 0, violating = 1).
LEAGAL_CLASS, VIOLATE_CLASS = 0, 1

### 讀取training set data

In [None]:
# Location of the labelled training set.
data_dir = "/".join((".", "data"))
data_source = "train.csv"

# Give pandas the path (plus an explicit encoding) so that it opens AND closes
# the file itself; the handle previously created with open(...) was never closed.
data_df = pd.read_csv(data_dir + '/' + data_source, encoding='utf8', delimiter=',')

# Class distribution of the raw data: "Class" == 1 is a violating ad, 0 a legal ad.
print("違法廣告: %d則" % (data_df[data_df["Class"] == 1].shape[0]))
print("合法廣告: %d則" % (data_df[data_df["Class"] == 0].shape[0]))
print(data_df.head())

### 斷詞方法選擇

In [None]:
# Word segmentation: run the project helper JeibaCutWords over the raw ads
# and preview the first rows of its result.
train_data_df = JeibaCutWords(data_df)
print(train_data_df.head())

### 關鍵字檢查

In [None]:
# Keyword check: AppendKeywordCheck returns a pair; only its first element is
# kept, stored as the 'keyword_flag' column (the second element is discarded).
train_data_df['keyword_flag'], _ = AppendKeywordCheck(train_data_df)
print(train_data_df.head(5))

# Persist the tokenized DataFrame as a pickle file so later cells can reload it.
with open(data_dir + '/' + 'train_tokenized', 'wb') as file:
    pickle.dump(train_data_df, file)

## 製作文字雲

In [None]:
from OpenFabLibrary import ShowWordCloud

# Reload the tokenized training set from its pickle file.
# NOTE: pickle.load can execute arbitrary code - only load files this notebook wrote itself.
with open(data_dir + '/' + 'train_tokenized', 'rb') as file:
    train_tokenized_df = pickle.load(file)

# Draw the word cloud for the tokenized ads (project helper).
ShowWordCloud(train_tokenized_df)

## Word2Vec轉換

In [None]:
# Reload the tokenized training set from its pickle file.
# NOTE: pickle.load can execute arbitrary code - only load files this notebook wrote itself.
with open(data_dir + '/' + 'train_tokenized', 'rb') as file:
    train_tokenized_df = pickle.load(file)

# The 'sentence' column is the corpus handed to the Word2Vec training below.
corpus_source = train_tokenized_df['sentence']
print(corpus_source)

### 設定Word2Vec參數並訓練詞向量

In [None]:
# Dimensionality of the word vectors trained by TrainWord2VecModel.
WORD2VEC_DIMENTION = 128
# Show gensim's INFO-level progress messages while training.
logging.basicConfig(format='%(asctime)s: %(levelname)s: %(message)s')
logging.root.setLevel(level=logging.INFO)

def TrainWord2VecModel(input_corpus):
    """Train a CBOW Word2Vec model and save it to 'word2vec_model/CBOW'.

    Args:
        input_corpus: iterable of tokenized sentences (e.g. the 'sentence'
            column of the tokenized DataFrame). It is copied into a private
            list first, so the caller's object is never reordered.

    Returns:
        None. The trained model is written to disk.
    """
    import os

    # Shuffle a private copy: random.shuffle() on the caller's pandas Series
    # reorders the caller's data in place and relies on a 0..n-1 index.
    sentences = list(input_corpus)

    # build word2vec
    # sg=0 CBOW ; sg=1 skip-gram
    model = word2vec.Word2Vec(size=WORD2VEC_DIMENTION, min_count=5, window=5, sg=0)

    # build vocabulary
    model.build_vocab(sentences)

    # train word2vec model ; shuffle data every epoch
    for i in range(20):
        print("%d-th training" % (i))
        random.shuffle(sentences)
        model.train(sentences, total_examples=len(sentences), epochs=1)

    # save model (create the target folder first so a fresh checkout does not
    # fail only after all the training work is done)
    os.makedirs('word2vec_model', exist_ok=True)
    model.save('word2vec_model/CBOW')

In [None]:
# This step can be skipped when already-trained word vectors are used.
# To train new word vectors, remove the comment marker from the call.
# NOTE(review): the call below is currently active (not commented out).
# At this stage the notebook uses the pre-trained model: zh, 300d, 50101 words.
TrainWord2VecModel(corpus_source)

In [None]:
w2v = word2vec.Word2Vec.load('word2vec_model/CBOW')  # load the Word2Vec model that was just trained
print(" \"%s\" 字詞相似度: " % ('改善'))
#print(w2v.wv['改善'])
# Last expression of the cell, so the notebook displays the most similar words.
w2v.wv.most_similar('改善')

In [None]:
# Dump the trained word vectors to a text file.
print("詞向量維度:", w2v.wv.vectors.shape)
# The context manager closes the file even when a write fails; the earlier
# explicit open()/close() pair leaked the handle on any error in between.
with open("./word2vec.txt", "w+") as fo:
    for vector in w2v.wv.vectors:
        fo.write(str(vector))

# Part 2. Train Model

In [None]:
import tensorflow as tf
import matplotlib.pyplot as plt
import pickle
import numpy as np
import pandas as pd
from gensim.models import word2vec
from sklearn.model_selection import train_test_split
from sklearn.metrics import roc_auc_score
from sklearn.metrics import accuracy_score

In [None]:
# Folder with the pickled data, and the file name of the tokenized training set.
data_dir = "./data"
train_data_source = "train_tokenized"

### Load dataset and sentence to sequence transform

In [None]:
# Load the tokenized articles from the pickle file named by train_data_source.
# NOTE: pickle.load can execute arbitrary code - only load files this notebook wrote itself.
with open(data_dir + "/" + train_data_source, 'rb') as file:
    article_df = pickle.load(file)
# Last expression of the cell: the notebook displays the first five rows.
article_df.head(5)

In [None]:
# Class balance of the tokenized set: "class" == 1 is a violating ad, 0 a legal ad.
print("違法廣告: %d則" % (article_df[article_df["class"] == 1].shape[0]))
print("合法廣告: %d則" % (article_df[article_df["class"] == 0].shape[0]))

### create word ID mapping and word vector

In [None]:
# Pre-trained vectors are loaded; the locally trained CBOW model is the alternative:
#w2v = word2vec.Word2Vec.load('word2vec_model/CBOW')
w2v = word2vec.Word2Vec.load('word2vec_model/zh.bin')

# word <-> integer id tables; ids follow the iteration order of the vocabulary.
word2id = dict((word, idx) for idx, word in enumerate(w2v.wv.vocab.keys()))
id2word = dict(zip(word2id.values(), word2id.keys()))
word2id_len = len(word2id) - 1  # largest id in use
print('word2id_len:', word2id_len)

#WORD2VEC_DIMENTION = 128
WORD2VEC_DIMENTION = 300
# One row per vocabulary id, plus one extra row (id word2id_len+1) that stays
# all zeros; that id is what padding / unknown words are mapped to later.
embedding = np.zeros((word2id_len+2, WORD2VEC_DIMENTION))
for k, v in word2id.items():
    embedding[v] = w2v.wv[k]
print(embedding)

### sentence to sequence transform

In [None]:
#
# Number of words taken from each ad as model input
#
PICK_WORDS = 40  # use the first 40 words as input

In [None]:
# Turn every tokenized ad into a fixed-length list of word ids: keep the first
# PICK_WORDS tokens, map words missing from the vocabulary to id word2id_len+1,
# and pad short ads on the right with that same id.
docs_id = []

for sentence in article_df["sentence"]:
    ids = [word2id.get(w, word2id_len+1) for w in sentence[:PICK_WORDS]]
    ids.extend([word2id_len+1] * (PICK_WORDS - len(ids)))
    print(ids)

    docs_id.append(ids)

# Attach the converted sequences to the DataFrame.
article_df["sentence_seq"] = docs_id
article_df.head()

In [None]:
# All id sequences as one NumPy array, printed for inspection.
a_docs_id = np.array(docs_id)
print(a_docs_id)

### Create data generator

In [None]:
# Hyper-parameters shared by the data generator, the graph and the training loop.
# Values after a trailing '#' are alternatives that were tried earlier.
#number_of_classes = len(category_list)
number_of_classes = 2  # legal and violating ads
number_of_classes_binary = 1  # width of the targets placeholder / output layer
sample_per_class  = 8

epochs            = 500  #100
batch_size        = number_of_classes * sample_per_class
update_per_epochs = 100  #100
hidden_layer_size = 32 #64 #256
number_of_layers  = 2  # NOTE(review): not referenced by the graph cell in this notebook
learning_rate     = 0.001  #0.001
#dropout           = False
dropout_rate      = 0.5  # the training loop feeds keep_prob = 1 - dropout_rate
wv                = embedding  # initial value of the embedding matrix
gradient_clip_margin = 4  # gradients are clipped to [-margin, +margin]

patience = 5  # early-stopping patience
n_patience = 0  # counter compared against patience in the training loop

In [None]:
def train_data_generator(df, bz):
    """Yield an endless stream of class-balanced, shuffled training batches.

    Args:
        df: DataFrame with a 'class' column and a 'sentence_seq' column whose
            cells are equal-length lists of word ids.
        bz: batch size; it is split evenly over the classes present in ``df``.

    Yields:
        (x, y): ``x`` is the array built from the sampled 'sentence_seq' lists,
        ``y`` is the matching 'class' column as an array of shape (rows, 1).

    Raises:
        ValueError: on the first ``next()`` when ``df`` contains no rows.
    """
    dfs = [sub_df for _, sub_df in df.groupby('class')]
    if not dfs:
        raise ValueError("train_data_generator needs at least one row to sample from")

    # Derive the per-class share from the classes actually present instead of
    # an unrelated module-level constant, so the batch really has ``bz`` rows.
    per_class = int(bz / len(dfs))

    while True:
        # Sample with replacement only when a class has fewer rows than its
        # share; otherwise DataFrame.sample would raise on small classes.
        picked = [
            sub_df.sample(per_class, replace=len(sub_df) < per_class)
            for sub_df in dfs
        ]
        # frac=1 reshuffles so the classes are interleaved inside the batch.
        selected = pd.concat(picked, axis=0).sample(frac=1)

        x = np.array(selected['sentence_seq'].tolist())
        y = selected[["class"]].values

        yield x, y
        
def test_data_generator(df, docs_id):
    #print(df)
    docs_id = np.array(docs_id)
    xx = docs_id[df.index]
    x = df['sentence_seq'].tolist()
    x = np.array(x)
    #print(df[["sentence","sentence_seq"]])
    #y = df.as_matrix(columns=['class']) # pandas for elder version 
    y = df[["class"]].values # pandas for new version
    #kvdbg-y = df['class'].as_matrix()
    return x, y

### 切割資料準備訓練

In [None]:
# Stratified 80/20 split on 'class', so both parts keep the same class ratio.
# NOTE(review): no random_state is passed, so the split differs from run to run.
train_df, validate_df = train_test_split(article_df, test_size=0.2, shuffle=True, stratify=article_df['class'])

# Batch generator for training; the validation part is converted to arrays once.
train_generate = train_data_generator(train_df, batch_size)
X_test, y_test = test_data_generator(validate_df, docs_id)

## Create LSTM

In [None]:
def opt_loss(logits, targets, learning_rate, grad_clip_margin):
    """Build the loss and the gradient-clipped Adam training op.

    Args:
        logits: raw (pre-sigmoid) model outputs.
        targets: labels with the same shape as ``logits``.
        learning_rate: learning rate handed to the Adam optimizer.
        grad_clip_margin: every gradient value is clipped to
            [-grad_clip_margin, +grad_clip_margin].

    Returns:
        (cross_entropy, train_optimizer, curr_learning_rate): the mean sigmoid
        cross-entropy, the op applying the clipped gradients, and the
        optimizer's private ``_lr_t`` attribute.
    """
    per_example_loss = tf.nn.sigmoid_cross_entropy_with_logits(labels=targets, logits=logits)
    cross_entropy = tf.reduce_mean(per_example_loss)

    optimizer = tf.train.AdamOptimizer(learning_rate=learning_rate)

    # Clip each gradient element-wise; variables without a gradient are skipped.
    capped_gradients = []
    for grad, var in optimizer.compute_gradients(cross_entropy):
        if grad is None:
            continue
        clipped = tf.clip_by_value(grad, (-1)*grad_clip_margin, grad_clip_margin)
        capped_gradients.append((clipped, var))
    train_optimizer = optimizer.apply_gradients(capped_gradients)

    # Read only after apply_gradients(), as in the statement order used so far.
    curr_learning_rate = optimizer._lr_t

    return cross_entropy, train_optimizer, curr_learning_rate

In [None]:
# Build the whole model (embedding -> bidirectional LSTM -> dense -> sigmoid)
# in its own graph so it stays separate from the process-wide default graph.
main_graph = tf.Graph()

with main_graph.as_default():
    ## placeholders ##
    with tf.name_scope('input_layer'):
        inputs = tf.placeholder(tf.int32, [None, PICK_WORDS], name='input_data')
        tf.add_to_collection("training_collection", inputs)  # keep a handle to this tensor

        targets = tf.placeholder(tf.float32, [None, number_of_classes_binary], name='targets')
        # Fixed: this used to register `inputs` a second time instead of `targets`.
        tf.add_to_collection("training_collection", targets)

        # Fed by the training loop but not consumed by any op in this graph.
        bz = tf.placeholder(tf.int32, [], name='batch_size')

        keep_prob = tf.placeholder(tf.float32, name='keep_prob')
        # Fixed: this used to register `inputs` again instead of `keep_prob`.
        tf.add_to_collection("training_collection", keep_prob)

    ## embedding lookup table
    with tf.variable_scope('embedding_layer'):
        # wv has one row per word id (vocabulary + 1 extra row) and
        # WORD2VEC_DIMENTION columns; it is fine-tuned during training.
        em_W = tf.Variable(wv.astype(np.float32), trainable=True)
        x = tf.nn.embedding_lookup(em_W, inputs)    #x.shape = (?, PICK_WORDS, word2vec_dimension)

    ##LSTM layer##
    ##Bi-directional LSTM
    with tf.variable_scope("Bidirectional_LSTM_layer"):
        lstm_cell = tf.nn.rnn_cell.BasicLSTMCell(hidden_layer_size)
        lstm_cell = tf.contrib.rnn.DropoutWrapper(lstm_cell, output_keep_prob=keep_prob)

        init_state_fw = lstm_cell.zero_state(tf.shape(inputs)[0], tf.float32)
        init_state_bw = lstm_cell.zero_state(tf.shape(inputs)[0], tf.float32)

        # NOTE(review): the same cell object is passed for both directions -
        # confirm whether sharing one cell between forward and backward passes
        # is intended. Left unchanged to keep existing checkpoints loadable.
        ((outputs_fw, outputs_bw), (outputs_state_fw, outputs_state_bw)) = \
        tf.nn.bidirectional_dynamic_rnn(lstm_cell, lstm_cell, x,
                                        initial_state_fw=init_state_fw,
                                        initial_state_bw=init_state_bw)

        outputs = tf.concat((outputs_fw, outputs_bw), 2)
        print(outputs)

    ##Output layer##
    with tf.variable_scope('output_layer'):
        # Last time step of the concatenated outputs.
        # NOTE(review): for the backward half this is the step at which the
        # backward pass has presumably seen only the final position - confirm
        # against the bidirectional_dynamic_rnn documentation.
        x = outputs[:, -1, :]
        # NOTE(review): the kernel_regularizer term is not added to the loss
        # built by opt_loss, so it looks like it has no effect - confirm.
        logits = tf.layers.dense(inputs=x, units=number_of_classes_binary, activation=None,
                                 kernel_regularizer=tf.contrib.layers.l2_regularizer(scale=0.005),)
        tf.add_to_collection("training_collection", logits)  # keep a handle to this tensor

        # The prediction and freezing cells look this tensor up by name.
        class_prob = tf.nn.sigmoid(logits, name='class_probability')
        tf.add_to_collection("training_collection", class_prob)  # keep a handle to this tensor

    ##loss and optimization##
    with tf.name_scope('loss_and_opt'):
        loss, opt, curr_lr = opt_loss(logits, targets, learning_rate, gradient_clip_margin)

    ##accuracy
    with tf.name_scope('evaluate'):
        predictions = tf.greater(class_prob, 0.5, name="predictions")
        # Fixed: this used to register `inputs` again instead of `predictions`.
        tf.add_to_collection("training_collection", predictions)

        correct_prediction = tf.equal(tf.round(class_prob), targets)
        accuracy = tf.reduce_mean(tf.cast(correct_prediction, tf.float32))

    # Saver used by the checkpoint cell.
    saver = tf.train.Saver()

    init = tf.global_variables_initializer()

### Train Neural Network

In [None]:
# Open the training session on main_graph and initialise its variables.
# No `with` block: the session must stay open because the training and
# checkpoint cells below reuse `sess`. (The earlier `with tf.Session()` only
# created, and then closed, an unrelated session on the default graph.)
sess = tf.Session(graph=main_graph)
sess.run(init)

In [None]:
# Per-epoch history, plotted by the next cell.
train_loss_list = []
train_acc_list = []
test_loss_list = []
test_acc_list = []

# Early-stopping state, reset here so that re-running this cell starts clean.
n_patience = 0
best_test_loss = np.inf

for i in range(epochs):
    traind_loss = []
    traind_scores = []
    test_scores = []

    # One "epoch" = update_per_epochs balanced mini-batches from the generator.
    for j in range(update_per_epochs):
        X_batch, y_batch = next(train_generate)

        train_logits, train_prob, train_loss, train_acc, optimizer = \
        sess.run([logits, class_prob, loss, accuracy, opt],
                 feed_dict={inputs:X_batch,
                            targets:y_batch,
                            bz:np.array(batch_size),
                            keep_prob:(1 - dropout_rate)})

        traind_loss.append(train_loss)
        traind_scores.append(train_acc)

    # Validation pass. keep_prob is 1.0 so dropout is switched OFF; evaluating
    # with dropout active made the validation loss/accuracy noisy and pessimistic.
    test_logits, test_prob, test_loss, test_acc = \
    sess.run([logits, class_prob, loss, accuracy],
             feed_dict={inputs:X_test,
                        targets:y_test,
                        bz:np.array(len(X_test)),
                        keep_prob:1.0})

    train_loss_list.append(np.mean(traind_loss))
    train_acc_list.append(np.mean(traind_scores))

    test_loss_list.append(test_loss)
    test_acc_list.append(test_acc)

    # (i % 1) is always 0; raise the divisor to print less often.
    if (i % 1) == 0:
        print('Epoch {}/{}'.format(i, epochs),
              ' Train loss: {:.3f}'.format(np.mean(traind_loss)),'Train acc: {:.3f}'.format(np.mean(traind_scores)),
              ' Test loss: {:.3f}'.format(test_loss), ' Test acc: {:.3f}'.format(test_acc))

    # Early stopping: compare with the best loss of the PREVIOUS epochs. The old
    # test (`test_loss >= min(test_loss_list)` after appending test_loss) was
    # always true, so training always stopped after patience + 1 epochs.
    if test_loss < best_test_loss:
        best_test_loss = test_loss
        n_patience = 0
    else:
        n_patience += 1

    if n_patience > patience:
        print("The model didn't improve for %i rounds, break it!" % patience)
        break

In [None]:
# Plot the training (blue) and validation (red) curves, first for the loss and
# then for the accuracy; the accuracy legend is pinned to location 4.
for title, train_curve, test_curve, legend_kwargs in (
        ('loss', train_loss_list, test_loss_list, {}),
        ('accuracy', train_acc_list, test_acc_list, {'loc': 4}),
):
    print(title)
    plt.plot(np.arange(len(train_curve)), train_curve, 'b', label='train')
    plt.plot(np.arange(len(test_curve)), test_curve, 'r', label='test')
    plt.legend(**legend_kwargs)
    plt.show()

### Save model

In [None]:
save_path = saver.save(sess, "./model/lstm_model") # save the model as a checkpoint with the prefix ./model/lstm_model
print(save_path)

### Freeze model

In [None]:
# Freeze the latest checkpoint into one constant-only GraphDef (frozen_model.pb).
#
# A dedicated graph and session are used on purpose:
#  * importing the meta graph into the process-wide default graph made every
#    re-run of this cell add a second, renamed copy of the model, after which
#    "output_layer/class_probability" no longer referred to the restored copy;
#  * rebinding `sess` here replaced the still-open training session.

# Most recent checkpoint in ./model/
latest_ckpt = tf.train.latest_checkpoint('./model/')
if latest_ckpt is None:
    raise ValueError("no checkpoint found in ./model/ - run the save cell first")

# Load the graph structure into a fresh graph.
freeze_graph = tf.Graph()
with freeze_graph.as_default():
    restore_saver = tf.train.import_meta_graph('./model/lstm_model.meta')

with tf.Session(graph=freeze_graph) as freeze_sess:
    # Restore the trained weights into the imported graph. No variable
    # initializer is run: restore() assigns every saved variable.
    restore_saver.restore(freeze_sess, latest_ckpt)

    # Turn the graph variables into constants, keeping only what the
    # probability output needs.
    output_graph_def = tf.graph_util.convert_variables_to_constants(
        freeze_sess, freeze_graph.as_graph_def(), ["output_layer/class_probability"] )

    # Write the frozen graph as a binary .pb file.
    tf.train.write_graph(output_graph_def, './model', 'frozen_model.pb',as_text=False)
    print ('{} ops in the final graph.'.format(len(output_graph_def.node)))

# Part 3. Run Predict

In [None]:
import tensorflow as tf
import matplotlib.pyplot as plt
import pickle
import numpy as np
import pandas as pd
from gensim.models import word2vec
from sklearn.metrics import confusion_matrix
from sklearn.metrics import accuracy_score

from OpenFabLibrary import JeibaCutWords
from OpenFabLibrary import AppendKeywordCheck

### create word ID mapping and word vector

In [None]:
data_dir = "./data"  # folder holding the CSV / pickled data files

In [None]:
# Pre-trained vectors are loaded; the locally trained CBOW model is the alternative:
#w2v = word2vec.Word2Vec.load('word2vec_model/CBOW')
w2v = word2vec.Word2Vec.load('word2vec_model/zh.bin')

# word <-> integer id tables; ids follow the iteration order of the vocabulary.
word2id = dict((word, idx) for idx, word in enumerate(w2v.wv.vocab.keys()))
id2word = dict(zip(word2id.values(), word2id.keys()))
word2id_len = len(word2id) - 1  # largest id in use
print('word2id_len:', word2id_len)

##  AI預測 + 關鍵字檢查

In [None]:
def jieba_validation(input_text, single_ad=True):
    """Classify advertisement text with the frozen model plus the keyword check.

    Args:
        input_text: description text of the ad to classify (single-ad mode).
        single_ad: when true (the default, and the only mode reachable before
            this parameter existed) ``input_text`` is classified as one ad.
            When false, ``input_text`` is ignored and the ads are read from
            ``data_dir + '/test_private.csv'``.

    Returns:
        Single-ad mode: ``(label, output_probability, keywords_list)`` where
        ``label`` is "合法" when the predicted class is 0 and "違法" otherwise;
        ``keywords_list`` is emptied for a legal ad.
        Batch mode: ``(y_pred_class, output_probability, keywords_list)``.
        ``output_probability`` and ``y_pred_class`` hold exactly one entry per
        ad (the sigmoid output and its rounded value).

    Relies on the module-level ``word2id``, ``word2id_len`` and ``data_dir``.
    NOTE(review): the frozen graph is re-read from disk on every call.
    """
    if single_ad:
        # Wrap the single ad in the same column layout as the CSV input;
        # ID, Name and Class are placeholders.
        test_data_df = pd.DataFrame({'ID': [0],
                                     'Name': ["測試產品"],
                                     'Description': [input_text],
                                     'Class': [0]})
    else:
        # Batch input. pandas opens and closes the file itself; the handle
        # created with open(...) before was never closed.
        test_data_source = "test_private.csv"
        test_data_df = pd.read_csv(data_dir + '/' + test_data_source, encoding='utf8', delimiter=',')

    # Word segmentation
    test_df = JeibaCutWords(test_data_df)

    # Keyword check
    test_df['keyword_flag'], keywords_list = AppendKeywordCheck(test_df)

    PICK_WORDS = 40  # first 40 words are the input; must match the length used for training
    batch_size = 16  # number of ads fed to the model per run

    # Fixed-length id sequences: unknown words and right-padding both use the
    # id just past the vocabulary (word2id_len + 1).
    unknown_id = word2id_len + 1
    docs_pred_id = []
    for doc in test_df['sentence']:
        ids = [word2id.get(w, unknown_id) for w in doc[:PICK_WORDS]]
        ids.extend([unknown_id] * (PICK_WORDS - len(ids)))
        docs_pred_id.append(ids)
    pred_input = np.array(docs_pred_id)

    #
    # Load trained model and feed data to predict
    #
    output_class = []
    output_probability = []

    with tf.gfile.GFile("./model/frozen_model.pb", "rb") as f:
        graph_def = tf.GraphDef()
        graph_def.ParseFromString(f.read())

    with tf.Graph().as_default() as graph:
        # Every imported op/node is prefixed with `name`; the tensor lookups
        # below rely on that prefix.
        tf.import_graph_def(graph_def, name="prefix")

    with tf.Session(graph=graph) as sess:
        inputs = graph.get_tensor_by_name('prefix/input_layer/input_data:0')
        keep_prob = graph.get_tensor_by_name('prefix/input_layer/keep_prob:0')
        class_prob = graph.get_tensor_by_name('prefix/output_layer/class_probability:0')

        for start in range(0, len(pred_input), batch_size):
            x_pred_batch = pred_input[start:start + batch_size]

            # Fetch the tensor itself (not a one-element list) and flatten it,
            # so the outputs are one value per ad instead of one nested array
            # per batch. keep_prob=1 switches dropout off.
            pred_prob = sess.run(class_prob, feed_dict={inputs: x_pred_batch, keep_prob: 1})
            pred_prob = np.ravel(pred_prob)

            # Rounded probability is the predicted class (1 = violating ad).
            output_class.extend(np.around(pred_prob).astype(int).tolist())
            output_probability.extend(pred_prob.tolist())

    # Predicted classes
    y_pred_class = output_class

    if single_ad:
        # Verdict for the single ad
        if y_pred_class[0] == 0:
            keywords_list = []  # a legal ad does not list violating keywords
            return "合法", output_probability, keywords_list
        else:
            return "違法", output_probability, keywords_list
    else:
        # Verdicts for the whole batch
        return y_pred_class, output_probability, keywords_list

### 載入測試資料集，並進行預測

In [None]:
# Single advertisement to classify
ad_text = "含500億活菌數及八種益生菌，排便不順，氣味難聞，當心健康拉警報\
服用本產品可達到體內環保、增強抵抗力並強化細胞功能，可改善體質、促進新陳代謝、幫助維持消化道機能、促進食慾、開胃，促進腸道蠕動改變細菌叢生態，使排便順暢。\
"

# Run the model prediction together with the keyword check.
result, probability, keywords = jieba_validation(ad_text)

# Report the verdict, the predicted probability and the flagged keywords.
print("辨識結果: ", result)
print("違規機率: ", probability)
print("違規字詞: ", keywords)