In [10]:
import sys
import numpy as np
import gensim
import os
import tensorflow as tf
from gensim.models.doc2vec import Doc2Vec,LabeledSentence
# from sklearn.cross_validation import train_test_split
LabeledSentence = gensim.models.doc2vec.LabeledSentence
import re

In [11]:
datasource_dir = r'/home/alexsun/ML/data_center/standford_large_movie_review_dataset/aclImdb/'

class LabeledLineSentence(object):
    def __init__(self, dir_name, tag='DATA'):
        self.dir_name = dir_name
        self.tag = tag
        
    @staticmethod
    def multiple_replace(text, adict):
        rx = re.compile('|'.join(map(re.escape, adict)))
        def one_xlat(match):
            return adict[match.group(0)]
        return rx.sub(one_xlat, text)
    
    @staticmethod
    def txt_process(txt):
        text_replace_dic = {'<br />':'', '\n':' ', '.':'', '?':'', '*':'', '$':''}
        return LabeledLineSentence.multiple_replace(txt, text_replace_dic)
    
    def __iter__(self):
        for uid, filename in enumerate(os.listdir(self.dir_name)):
            with open(os.path.join(self.dir_name, filename), encoding='utf-8') as f:
                new_line = LabeledLineSentence.txt_process(f.read())
                new_line = [word.lower() for word in new_line.split()]
                yield LabeledSentence(new_line ,[self.tag + str(uid)])

In [12]:

train_data_pos = LabeledLineSentence(os.path.join(datasource_dir, 'train/pos/'), 'TRAIN_POS')
print('1')
train_data_neg = LabeledLineSentence(os.path.join(datasource_dir, 'train/neg/'), 'TRAIN_NEG')
print("2")
test_data_pos = LabeledLineSentence(os.path.join(datasource_dir, 'test/pos/'), 'TEST_POS')
print("3")
test_data_neg = LabeledLineSentence(os.path.join(datasource_dir, 'test/neg/'), 'TEST_NEG')
print("4")
class All_corpus():
    def __init__(self):
        self.all_data = [train_data_neg, train_data_pos, test_data_neg, test_data_pos]
    def __iter__(self):
        for data in self.all_data:
            for item in data:
                yield item

def doc2vec_train(size=500):
    save_pickle = os.path.join(datasource_dir, 'pickles/doc2vec.pickle')
    if os.path.exists(save_pickle):
        print('doc2vec.pickle already there! we just ignore training again')
        model = Doc2Vec.load(save_pickle)
        return model

    model = Doc2Vec(min_count=5, window=10, size=size)
    total_examples = 12500
    model.build_vocab(All_corpus())
    print('vocab builded!')
    # 训练训练数据
    for x_train in [train_data_neg, train_data_pos]:
        model.train(x_train, total_examples=total_examples, epochs=5)
    print('train data all processed!')
    # 训练测试数据
    for x_test in [test_data_neg, test_data_pos]:
        model.train(x_test, total_examples=total_examples, epochs=5)
    print('test data all processed!')
    model.save(save_pickle)
    return model

doc2vec_model = doc2vec_train()
                  


1
2
3
4
doc2vec.pickle already there! we just ignore training again


In [13]:
##读取向量
def getVecs(model, corpus, size=500):
    vecs = [np.array(model.docvecs[z.tags[0]]).reshape((1, size)) for z in corpus]
    return np.concatenate(vecs)

# train_pos = getVecs(model, train_data_pos)

In [14]:
# print(train_pos.shape)

In [15]:

vocab_size = len(doc2vec_model.wv.vocab.keys())
keys = doc2vec_model.wv.vocab
print('total vocab size:', vocab_size)
print(type(keys))
print(keys['china'])
print(type(doc2vec_model.docvecs))

total vocab size: 61271
<class 'dict'>
Vocab(count:273, index:3148, sample_int:4294967296)
<class 'gensim.models.doc2vec.DocvecsArray'>


### RNN Model 

In [16]:
# 输入层
def build_inputs(num_seqs, vec_size=500):
    '''
    构建输入层
    
    num_seqs: 每个batch中的序列个数
    vec_size: 每个batch中向量的长度
    '''
    inputs = tf.placeholder(tf.float32, shape=(num_seqs, vec_size), name='inputs')
    targets = tf.placeholder(tf.int32, shape=(num_seqs), name='targets')
    
    # 加入keep_prob
    keep_prob = tf.placeholder(tf.float32, name='keep_prob')
    
    return inputs, targets, keep_prob

In [17]:
def build_lstm(lstm_size, num_layers, batch_size, keep_prob):
    # build a lstm cell
    lstm = tf.contrib.rnn.BasicLSTMCell(lstm_size)
    drop = tf.contrib.rnn.DropoutWrapper(cell=lstm, output_keep_prob=keep_prob)
    # stack the lstm layers
    cell = tf.contrib.rnn.MultiRNNCell([drop for _ in range(num_layers)])
    initial_state = cell.zero_state(batch_size, tf.float32)
    
    return cell, initial_state
# what's cell.zero_state().__doc__ ?
print( tf.contrib.rnn.BasicLSTMCell(10).zero_state.__doc__)

Return zero-filled state tensor(s).

    Args:
      batch_size: int, float, or unit Tensor representing the batch size.
      dtype: the data type to use for the state.

    Returns:
      If `state_size` is an int or TensorShape, then the return value is a
      `N-D` tensor of shape `[batch_size x state_size]` filled with zeros.

      If `state_size` is a nested list or tuple, then the return value is
      a nested list or tuple (of the same structure) of `2-D` tensors with
    the shapes `[batch_size x s]` for each s in `state_size`.
    


In [23]:
def build_output(lstm_output, in_size, out_size):
    ''' 
    构造输出层     
    lstm_output: lstm层的输出结果
    in_size: lstm输出层重塑后的size
    out_size: softmax层的size
    '''

    # 将lstm的输出按照列concate，例如[[1,2,3],[7,8,9]],
    # tf.concat的结果是[1,2,3,7,8,9]
    # seq_output = tf.concat(lstm_output, axis=1) # tf.concat(concat_dim, values)
    # reshape
    x = lstm_output
    print(x.shape)
    
    #  tf 变量共享问题总结
    #　注意力层
    # with tf.variable_scope('attention'):
    # why this shape?
    attention_w = tf.get_variable(name="attention", shape=[100, 1, 500],
                                      initializer=tf.random_normal_initializer())
    
    x = tf.matmul(attention_w, x)
    # 将注意力层输出与softmax层全连接
    # with tf.variable_scope('softmax'):
        # softmax_w = tf.Variable(tf.truncated_normal([in_size, out_size], stddev=0.1))
        # softmax_b = tf.Variable(tf.zeros(out_size))
    
    softmax_w = tf.get_variable(name="sw", shape=[in_size, out_size],
                                      initializer=tf.random_normal_initializer())
    softmax_b = tf.get_variable(name="sb", shape=(out_size),
                                      initializer=tf.constant_initializer(0))
    x = tf.reshape(x, [-1, 64])
    # 计算logits
    logits = tf.matmul(x, softmax_w) + softmax_b
    # softmax层返回概率分布
    out = tf.nn.softmax(logits, name='predictions')
    return out, logits


def build_loss(out, logits, targets, lstm_size, num_classes):
    '''
    根据logits和targets计算损失
    
    logits: 全连接层的输出结果（不经过softmax）
    targets: targets
    lstm_size
    num_classes: vocab_size
        
    '''
    # One-hot编码
    y_one_hot = tf.one_hot(targets, num_classes)
    y_reshaped = tf.reshape(y_one_hot, logits.get_shape())
    correct_prediction = tf.equal(tf.argmax(y_reshaped, 1), tf.argmax(out, 1))
    accuracy = tf.reduce_mean(tf.cast(correct_prediction, tf.float32))
    # y_reshaped = y_one_hot
    # Softmax cross entropy loss
    loss = tf.nn.softmax_cross_entropy_with_logits(logits=logits, labels=y_reshaped)
    loss = tf.reduce_mean(loss)
    return accuracy, loss


def build_optimizer(loss, learning_rate, grad_clip):
    ''' 
    构造Optimizer
   
    loss: 损失
    learning_rate: 学习率
    
    '''
    
    # 使用clipping gradients
    tvars = tf.trainable_variables()
    grads, _ = tf.clip_by_global_norm(tf.gradients(loss, tvars), grad_clip)
    train_op = tf.train.AdamOptimizer(learning_rate)
    optimizer = train_op.apply_gradients(zip(grads, tvars))
    
    return optimizer




In [19]:
# 执行训练准备
# 训练参数设置

batch_size = 100         # Sequences per batch
num_steps = 500          # Number of vector size
lstm_size = 64         # Size of hidden layers in LSTMs
num_layers = 1          # Number of LSTM layers
learning_rate = 0.001    # Learning rate
keep_prob = 0.5         # Dropout keep probability
num_classes = 2  # 做情感分析时，　之后正向和负像情感类型


In [20]:

# 训练数据准备：
def get_batches(b_size = batch_size, vec_size = num_steps):
    total_length = 12500
    cnt = 0
    batch_perclass = 50
    pos_label = 'TRAIN_POS'
    neg_label = 'TRAIN_NEG'
    for i in range(0, total_length, batch_perclass):
        x = np.zeros([b_size, vec_size])
        y = []
        for j in range(batch_perclass):
            x[j:j+1, :] = doc2vec_model.docvecs[pos_label + str(i+j)]
            y.append(1)
        for j in range(batch_perclass):
            x[j:j+1, :] = doc2vec_model.docvecs[neg_label + str(i+j)]
            y.append(0)
        y = np.array(y, dtype=np.int32)
        # y = y.reshape(-1, b_size)
        r = np.random.permutation(len(y))
        y = y[r]
        x = x[r, :]
        # y = y.reshape([-1, b_size])
        yield x, y

        
# 测试数据准备
def get_batches_test(b_size = batch_size, vec_size = num_steps):
    total_length = 12500
    cnt = 0
    batch_perclass = 50
    pos_label = 'TEST_POS'
    neg_label = 'TEST_NEG'
    for i in range(0, total_length, batch_perclass):
        x = np.zeros([b_size, vec_size])
        y = []
        for j in range(batch_perclass):
            x[j:j+1, :] = doc2vec_model.docvecs[pos_label + str(i+j)]
            y.append(1)
        for j in range(batch_perclass):
            x[j:j+1, :] = doc2vec_model.docvecs[neg_label + str(i+j)]
            y.append(0)
        y = np.array(y, dtype=np.int32)
        # y = y.reshape(-1, b_size)
        r = np.random.permutation(len(y))
        y = y[r]
        x = x[r, :]
        # y = y.reshape([-1, b_size])
        yield x, y


In [21]:
# 组合模型

class CharRNN:
    
    def __init__(self, num_classes, batch_size=64, num_steps=50, 
                       lstm_size=128, num_layers=2, learning_rate=0.001, 
                       grad_clip=5, sampling=False):
    
        # 如果sampling是True，则采用SGD
        if sampling == True:
            batch_size, num_steps = 1, 1
        else:
            batch_size, num_steps = batch_size, num_steps

        tf.reset_default_graph()
        
        # 输入层
        self.inputs, self.targets, self.keep_prob = build_inputs(batch_size, num_steps)

        # LSTM层
        cell, self.initial_state = build_lstm(lstm_size, num_layers, batch_size, self.keep_prob)

        ## 对输入进行one-hot编码
        # x_one_hot = tf.one_hot(self.inputs, num_classes)
        _input = tf.reshape(self.inputs, [batch_size, num_steps, -1])
        # 运行RNN
        outputs, state = tf.nn.dynamic_rnn(cell, _input, initial_state=self.initial_state,dtype=tf.float32)
        self.final_state = state
        
        # 预测结果
        self.prediction, self.logits = build_output(outputs, lstm_size, num_classes)
        
        # Loss 和 optimizer (with gradient clipping)
        self.accuracy, self.loss = build_loss(self.prediction, self.logits, self.targets, lstm_size, num_classes)
        self.optimizer = build_optimizer(self.loss, learning_rate, grad_clip)

In [26]:
# 执行训练
import time
epochs = 20
# 每n轮进行一次变量保存
save_every_n = 500
model = CharRNN(num_classes, batch_size=batch_size, num_steps=num_steps,
                lstm_size=lstm_size, num_layers=num_layers, 
                learning_rate=learning_rate)

saver = tf.train.Saver(max_to_keep=100)
with tf.Session() as sess:
    sess.run(tf.global_variables_initializer())
    
    counter = 0
    for e in range(epochs):
        # Train network
        new_state = sess.run(model.initial_state)
        loss = 0
        for x, y in get_batches(batch_size, num_steps):
            counter += 1
            start = time.time()
            feed = {model.inputs: x,
                    model.targets: y,
                    model.keep_prob: keep_prob,
                    model.initial_state: new_state}
            batch_loss, new_state, _, _acc= sess.run([model.loss, 
                                                 model.final_state, 
                                                 model.optimizer,
                                                 model.accuracy
                                                 ], 
                                                 feed_dict=feed)
            
            end = time.time()
            # control the print lines
            if counter % 100 == 0:
                print('轮数: {}/{}... '.format(e+1, epochs),
                      '训练步数: {}... '.format(counter),
                      '训练误差: {:.4f}... '.format(batch_loss),
                      '{:.4f} sec/batch'.format((end-start)),
                      '准确率:{}%'.format(_acc * 100.0)
                      )

            if (counter % save_every_n == 0):
                saver.save(sess, "./save/lstm-model", global_step=counter)
                print("saved! step:", save_every_n)

        saver.save(sess, "./save/final-model")
    # saver.save(sess, "checkpoints/i{}_l{}.ckpt".format(counter, lstm_size))

(100, 500, 64)
轮数: 1/20...  训练步数: 100...  训练误差: 0.7129...  0.6340 sec/batch 准确率:50.999999046325684%
轮数: 1/20...  训练步数: 200...  训练误差: 0.5983...  0.6418 sec/batch 准确率:66.00000262260437%
轮数: 2/20...  训练步数: 300...  训练误差: 0.3517...  0.6469 sec/batch 准确率:87.00000047683716%
轮数: 2/20...  训练步数: 400...  训练误差: 0.3572...  0.6283 sec/batch 准确率:87.99999952316284%
轮数: 2/20...  训练步数: 500...  训练误差: 0.1256...  0.6226 sec/batch 准确率:95.99999785423279%
saved! step: 500
轮数: 3/20...  训练步数: 600...  训练误差: 0.1345...  0.6298 sec/batch 准确率:95.99999785423279%
轮数: 3/20...  训练步数: 700...  训练误差: 0.1598...  0.6265 sec/batch 准确率:95.99999785423279%
轮数: 4/20...  训练步数: 800...  训练误差: 0.0285...  0.6778 sec/batch 准确率:99.00000095367432%
轮数: 4/20...  训练步数: 900...  训练误差: 0.1421...  0.6264 sec/batch 准确率:98.00000190734863%
轮数: 4/20...  训练步数: 1000...  训练误差: 0.1816...  0.6266 sec/batch 准确率:95.99999785423279%
saved! step: 500
轮数: 5/20...  训练步数: 1100...  训练误差: 0.0390...  0.6286 sec/batch 准确率:99.00000095367432%
轮数: 5/20...  训练步数: 1200.

In [29]:
def test():
    checkpoint_dir = "./save/"
    ckpt = tf.train.get_checkpoint_state(checkpoint_dir)
    
    with tf.Session() as sess:
        sess.run(tf.global_variables_initializer())
        if ckpt and ckpt.model_checkpoint_path:  
            saver.restore(sess, ckpt.model_checkpoint_path)  
        else:
            print("some error occured! drop test prcocess")
            return
            pass 
        loss = 0
        counter = 0
        for x, y in get_batches_test(batch_size, num_steps):
            counter += 1
            start = time.time()
            feed = {model.inputs: x,
                    model.targets: y,
                    model.keep_prob: keep_prob}
                    # model.initial_state: new_state}
            batch_loss, new_state,_acc= sess.run([model.loss, 
                                                 model.final_state, 
                                                 # model.optimizer,
                                                 model.accuracy
                                                 ], 
                                                 feed_dict=feed)
            end = time.time()
            if counter % 2 == 0:
                print(
                      '测试步数: {}... '.format(counter),
                      '测试误差: {:.4f}... '.format(batch_loss),
                      '{:.4f} sec/batch'.format((end-start)),
                      '测试准确率:{}%'.format(_acc * 100.0)
                      )

                
test()    
    

INFO:tensorflow:Restoring parameters from ./save/final-model
测试步数: 2...  测试误差: 0.0000...  0.2678 sec/batch 测试准确率:100.0%
测试步数: 4...  测试误差: 0.0000...  0.2690 sec/batch 测试准确率:100.0%
测试步数: 6...  测试误差: 0.0000...  0.2706 sec/batch 测试准确率:100.0%
测试步数: 8...  测试误差: 0.0000...  0.2676 sec/batch 测试准确率:100.0%
测试步数: 10...  测试误差: 0.0000...  0.2699 sec/batch 测试准确率:100.0%
测试步数: 12...  测试误差: 0.0000...  0.2716 sec/batch 测试准确率:100.0%
测试步数: 14...  测试误差: 0.0000...  0.2669 sec/batch 测试准确率:100.0%
测试步数: 16...  测试误差: 0.0000...  0.2758 sec/batch 测试准确率:100.0%
测试步数: 18...  测试误差: 0.0000...  0.2658 sec/batch 测试准确率:100.0%
测试步数: 20...  测试误差: 0.0000...  0.2707 sec/batch 测试准确率:100.0%
测试步数: 22...  测试误差: 0.0000...  0.2680 sec/batch 测试准确率:100.0%
测试步数: 24...  测试误差: 0.0000...  0.3396 sec/batch 测试准确率:100.0%
测试步数: 26...  测试误差: 0.0000...  0.2706 sec/batch 测试准确率:100.0%
测试步数: 28...  测试误差: 0.0000...  0.2675 sec/batch 测试准确率:100.0%
测试步数: 30...  测试误差: 0.0000...  0.2682 sec/batch 测试准确率:100.0%
测试步数: 32...  测试误差: 0.0000...  0.2700 sec/ba