In [16]:
import sys
import numpy as np
import gensim
import os
import tensorflow as tf
from gensim.models.doc2vec import Doc2Vec,LabeledSentence
# from sklearn.cross_validation import train_test_split
LabeledSentence = gensim.models.doc2vec.LabeledSentence
import re

In [17]:
# Root directory of the extracted aclImdb dataset. The default is the original
# author's machine-specific location; set the ACLIMDB_DIR environment variable
# to point the notebook at a different copy without editing the code.
datasource_dir = os.environ.get(
    'ACLIMDB_DIR',
    r'/home/alexsun/ML/data_center/standford_large_movie_review_dataset/aclImdb/')

class LabeledLineSentence(object):
    """Iterable over the text files of one directory, yielding tagged documents.

    Every file is read as UTF-8, cleaned with ``txt_process``, split on
    whitespace and lower-cased, then wrapped in a ``LabeledSentence`` whose
    single tag is ``tag`` followed by the file's position in ``os.listdir``
    order (e.g. ``TRAIN_POS0``, ``TRAIN_POS1``, ...).
    """

    def __init__(self, dir_name, tag='DATA'):
        self.tag = tag
        self.dir_name = dir_name

    @staticmethod
    def multiple_replace(text, adict):
        """Replace each key of ``adict`` found in ``text`` by its value, in one regex pass."""
        # Keys are escaped so they are matched literally, not as regex syntax.
        pattern = re.compile('|'.join(re.escape(key) for key in adict))
        return pattern.sub(lambda hit: adict[hit.group(0)], text)

    @staticmethod
    def txt_process(txt):
        """Strip ``<br />`` markup and a few punctuation marks; turn newlines into spaces."""
        replacements = {'<br />': '', '\n': ' ', '.': '', '?': '', '*': '', '$': ''}
        return LabeledLineSentence.multiple_replace(txt, replacements)

    def __iter__(self):
        position = 0
        for filename in os.listdir(self.dir_name):
            path = os.path.join(self.dir_name, filename)
            with open(path, encoding='utf-8') as handle:
                cleaned = LabeledLineSentence.txt_process(handle.read())
                tokens = list(map(str.lower, cleaned.split()))
                yield LabeledSentence(tokens, [self.tag + str(position)])
            position += 1

In [18]:

# Create one iterable corpus per dataset split/polarity. Constructing a
# LabeledLineSentence only stores the directory and tag; files are read when
# the object is iterated. The tag prefixes ('TRAIN_POS', ...) are the same ones
# later used to look document vectors up in doc2vec_model.docvecs.
# The numbered prints are simple progress markers.
train_data_pos = LabeledLineSentence(os.path.join(datasource_dir, 'train/pos/'), 'TRAIN_POS')
print('1')
train_data_neg = LabeledLineSentence(os.path.join(datasource_dir, 'train/neg/'), 'TRAIN_NEG')
print("2")
test_data_pos = LabeledLineSentence(os.path.join(datasource_dir, 'test/pos/'), 'TEST_POS')
print("3")
test_data_neg = LabeledLineSentence(os.path.join(datasource_dir, 'test/neg/'), 'TEST_NEG')
print("4")
class All_corpus():
    """Single iterable that walks the four module-level corpora back to back.

    The order is train-neg, train-pos, test-neg, test-pos. A fresh pass over
    every underlying corpus starts each time the object is iterated.
    """

    def __init__(self):
        self.all_data = [train_data_neg, train_data_pos, test_data_neg, test_data_pos]

    def __iter__(self):
        for corpus in self.all_data:
            yield from corpus

def doc2vec_train(size=500):
    """Return a Doc2Vec model, loading the on-disk cache when it exists.

    size: dimensionality passed to ``Doc2Vec`` when a new model is trained.
          It is ignored when a cached model is found, whatever size that
          cached model was trained with.

    Returns the loaded or newly trained ``Doc2Vec`` model. A newly trained
    model is saved to ``<datasource_dir>/pickles/doc2vec.pickle``.
    """
    save_pickle = os.path.join(datasource_dir, 'pickles/doc2vec.pickle')
    if os.path.exists(save_pickle):
        print('doc2vec.pickle already there! we just ignore training again')
        return Doc2Vec.load(save_pickle)

    # Create the cache directory before the (long) training run, so a missing
    # folder cannot make model.save() fail once all the work has been done.
    os.makedirs(os.path.dirname(save_pickle), exist_ok=True)

    model = Doc2Vec(min_count=5, window=10, size=size)
    # Number of documents handed to each train() call below.
    # NOTE(review): assumes every one of the four corpora holds 12500 files - confirm.
    total_examples = 12500
    model.build_vocab(All_corpus())
    print('vocab builded!')
    # Train on the training-set documents
    for x_train in [train_data_neg, train_data_pos]:
        model.train(x_train, total_examples=total_examples, epochs=5)
    print('train data all processed!')
    # Train on the test-set documents (their vectors are looked up later by tag)
    for x_test in [test_data_neg, test_data_pos]:
        model.train(x_test, total_examples=total_examples, epochs=5)
    print('test data all processed!')
    model.save(save_pickle)
    return model

# Load the cached Doc2Vec model if present, otherwise train and cache a new one.
doc2vec_model = doc2vec_train()
                  


1
2
3
4
doc2vec.pickle already there! we just ignore training again


In [19]:
##读取向量
def getVecs(model, corpus, size=500):
    """Stack the document vector of every item in ``corpus`` into one 2-D array.

    model:  object whose ``docvecs`` mapping is indexed by document tag
    corpus: iterable of items exposing a ``tags`` sequence; only ``tags[0]`` is used
    size:   length of each document vector (each one is reshaped to ``(1, size)``)

    Returns an array with one row per corpus item and ``size`` columns.
    """
    rows = []
    for doc in corpus:
        vector = np.array(model.docvecs[doc.tags[0]])
        rows.append(vector.reshape((1, size)))
    return np.concatenate(rows)

# train_pos = getVecs(model, train_data_pos)

In [20]:
# print(train_pos.shape)

In [21]:
# Quick sanity check of the loaded Doc2Vec model.
# Number of distinct words in the model's vocabulary mapping.
vocab_size = len(doc2vec_model.wv.vocab.keys())
keys = doc2vec_model.wv.vocab
print('total vocab size:', vocab_size)
print(type(keys))
# Show the vocabulary entry for one sample word; since the mapping is a dict,
# this raises KeyError if 'china' is not in the vocabulary.
print(keys['china'])
# Type of the container that holds the per-document vectors.
print(type(doc2vec_model.docvecs))

total vocab size: 61271
<class 'dict'>
Vocab(count:273, index:3148, sample_int:4294967296)
<class 'gensim.models.doc2vec.DocvecsArray'>


### RNN Model 

In [22]:
# 输入层
def build_inputs(num_seqs, vec_size=500):
    '''
    Create the placeholders that feed the graph.

    num_seqs: number of sequences (rows) in each batch
    vec_size: length of each input vector in the batch

    Returns (inputs, targets, keep_prob):
      inputs    - float32 placeholder of shape (num_seqs, vec_size)
      targets   - int32 placeholder of shape (num_seqs,)
      keep_prob - scalar-less float32 placeholder for the dropout keep probability
    '''
    input_shape = [num_seqs, vec_size]
    target_shape = [num_seqs]

    inputs = tf.placeholder(tf.float32, shape=input_shape, name='inputs')
    targets = tf.placeholder(tf.int32, shape=target_shape, name='targets')

    # Dropout keep probability is fed at run time, so no shape is declared
    keep_prob = tf.placeholder(tf.float32, name='keep_prob')

    return inputs, targets, keep_prob

In [23]:
def build_lstm(lstm_size, num_layers, batch_size, keep_prob):
    """Build a stack of dropout-wrapped LSTM layers and its zero initial state.

    lstm_size:  number of hidden units in every LSTM layer
    num_layers: number of stacked layers
    batch_size: batch size used to size the zero initial state
    keep_prob:  output keep probability for the dropout wrapper of each layer

    Returns (cell, initial_state).
    """
    def _make_layer():
        # One independent cell per layer. Reusing a single cell object for
        # several layers makes them share (or fail to create) their variables.
        lstm = tf.contrib.rnn.BasicLSTMCell(lstm_size)
        return tf.contrib.rnn.DropoutWrapper(cell=lstm, output_keep_prob=keep_prob)

    # stack the lstm layers
    cell = tf.contrib.rnn.MultiRNNCell([_make_layer() for _ in range(num_layers)])
    initial_state = cell.zero_state(batch_size, tf.float32)

    return cell, initial_state
# what's cell.zero_state().__doc__ ?
# print( tf.contrib.rnn.BasicLSTMCell(10).zero_state.__doc__)

In [57]:
def build_output(lstm_output, in_size, out_size, batch_size):
    '''
    Build the output layer: a learned weighting over time steps followed by
    a dense softmax layer.

    lstm_output: output tensor of the LSTM layer; it is used here as a
                 (batch_size, num_steps, in_size) tensor
    in_size:     size of the LSTM output once reshaped (the LSTM hidden size)
    out_size:    size of the softmax layer (number of classes)
    batch_size:  number of sequences per batch

    Returns (out, logits): the softmax probabilities and the raw logits.
    Raises ValueError when the time dimension of lstm_output is not statically known.
    '''
    x = lstm_output
    print(x.shape)

    # The weighting needs one coefficient per time step, so take that size
    # from the tensor itself instead of hard-coding 500.
    num_steps = x.get_shape().as_list()[1]
    if num_steps is None:
        raise ValueError('lstm_output must have a statically known time dimension')

    # Attention-style layer: one trainable weight per time step
    attention_w = tf.get_variable(name="attention", shape=[1, num_steps],
                                  initializer=tf.random_normal_initializer())

    softmax_w = tf.get_variable(name="sw", shape=[in_size, out_size],
                                initializer=tf.random_normal_initializer())

    softmax_b = tf.get_variable(name="sb", shape=[out_size],
                                initializer=tf.constant_initializer(0))

    # Copy attention_w (shape [1, num_steps]) to shape [batch_size, 1, num_steps]
    # so that every example in the batch is weighted by the same values.
    extend_attention = tf.tile(attention_w[tf.newaxis, :, :], [batch_size, 1, 1])
    # [batch, 1, num_steps] x [batch, num_steps, in_size] -> [batch, 1, in_size]
    x = tf.matmul(extend_attention, x)
    x = tf.reshape(x, [-1, in_size])
    # Compute the logits
    logits = tf.matmul(x, softmax_w) + softmax_b
    # The softmax layer returns the class probability distribution
    out = tf.nn.softmax(logits, name='predictions')
    print(logits.shape)
    return out, logits


def build_loss(out, logits, targets, lstm_size, num_classes):
    '''
    Compute accuracy and loss from the network outputs and the targets.

    out:         softmax probabilities, used for the predicted class
    logits:      output of the dense layer (before softmax)
    targets:     integer class labels
    lstm_size:   unused; kept for interface compatibility
    num_classes: number of classes used for the one-hot encoding

    Returns (accuracy, loss): the fraction of correct predictions and the
    mean softmax cross-entropy over the batch.
    '''
    # One-hot encode the labels
    one_hot_labels = tf.one_hot(targets, num_classes)

    # Accuracy: compare the labelled class with the most probable predicted class
    labels_like_logits = tf.reshape(one_hot_labels, logits.get_shape())
    hits = tf.equal(tf.argmax(labels_like_logits, 1), tf.argmax(out, 1))
    accuracy = tf.reduce_mean(tf.cast(hits, tf.float32))

    # Softmax cross entropy loss, averaged over the batch
    per_example_loss = tf.nn.softmax_cross_entropy_with_logits(logits=logits,
                                                               labels=one_hot_labels)
    loss = tf.reduce_mean(per_example_loss)
    return accuracy, loss


def build_optimizer(loss, learning_rate, grad_clip):
    '''
    Build the training op.

    loss:          scalar loss tensor to minimise
    learning_rate: learning rate handed to Adam
    grad_clip:     currently unused - gradients are NOT clipped; the parameter
                   is kept so existing callers keep working

    Returns the op produced by AdamOptimizer.minimize(loss).
    '''
    adam = tf.train.AdamOptimizer(learning_rate)
    optimizer = adam.minimize(loss)
    return optimizer




In [58]:
# Training preparation
# Hyper-parameter settings
batch_size = 100         # Sequences per batch
num_steps = 500          # Length of each input document vector (used as the number of RNN steps)
lstm_size = 128           # Size of hidden layers in LSTMs
num_layers = 1           # Number of LSTM layers
learning_rate = 0.001    # Learning rate
keep_prob = 0.5          # Dropout keep probability
num_classes = 2          # Sentiment analysis only has two classes: positive and negative


In [26]:

# 训练数据准备：
def _iter_docvec_batches(pos_label, neg_label, b_size, vec_size, total_length):
    """Yield shuffled, class-balanced (x, y) batches of document vectors.

    Each batch holds b_size // 2 vectors tagged ``pos_label + <index>``
    (label 1) and as many tagged ``neg_label + <index>`` (label 0), read from
    ``doc2vec_model.docvecs``. Rows are shuffled within the batch.

    Raises ValueError when b_size is not a positive even number, because a
    balanced batch of exactly b_size rows could not be built.
    """
    if b_size <= 0 or b_size % 2:
        raise ValueError('b_size must be a positive even number, got {}'.format(b_size))
    per_class = b_size // 2
    # Labels before shuffling: first half positive, second half negative
    base_labels = np.array([1] * per_class + [0] * per_class, dtype=np.int32)
    # Stop early enough that an incomplete trailing batch is dropped rather
    # than indexing past the last document.
    for start in range(0, total_length - per_class + 1, per_class):
        x = np.zeros([b_size, vec_size])
        for j in range(per_class):
            x[j, :] = doc2vec_model.docvecs[pos_label + str(start + j)]
            x[per_class + j, :] = doc2vec_model.docvecs[neg_label + str(start + j)]
        order = np.random.permutation(b_size)
        yield x[order, :], base_labels[order]


# Training data batches
def get_batches(b_size = batch_size, vec_size = num_steps, total_length = 12500):
    """Yield (x, y) training batches built from the TRAIN_POS / TRAIN_NEG vectors.

    b_size:       rows per batch (must be even; half positive, half negative)
    vec_size:     length of each document vector
    total_length: number of documents per class to walk through
    """
    yield from _iter_docvec_batches('TRAIN_POS', 'TRAIN_NEG', b_size, vec_size, total_length)


# Test data batches
def get_batches_test(b_size = batch_size, vec_size = num_steps, total_length = 12500):
    """Yield (x, y) test batches built from the TEST_POS / TEST_NEG vectors.

    b_size:       rows per batch (must be even; half positive, half negative)
    vec_size:     length of each document vector
    total_length: number of documents per class to walk through
    """
    yield from _iter_docvec_batches('TEST_POS', 'TEST_NEG', b_size, vec_size, total_length)


In [27]:
# 组合模型

class CharRNN:
    """Assembles the whole graph: inputs, LSTM stack, output layer, loss and optimizer.

    Constructing an instance resets the default TensorFlow graph and then
    builds a new one. The graph pieces are exposed as attributes: inputs,
    targets, keep_prob, initial_state, final_state, prediction, logits,
    accuracy, loss and optimizer.
    """

    def __init__(self, num_classes, batch_size=64, num_steps=50,
                       lstm_size=128, num_layers=2, learning_rate=0.001,
                       grad_clip=5, sampling=False):

        # Sampling mode processes a single sequence of 500 steps at a time;
        # otherwise the caller's batch_size / num_steps are used unchanged.
        if sampling == True:
            batch_size, num_steps = 1, 500

        tf.reset_default_graph()

        # Input layer
        self.inputs, self.targets, self.keep_prob = build_inputs(batch_size, num_steps)

        # LSTM layer
        cell, self.initial_state = build_lstm(lstm_size, num_layers, batch_size, self.keep_prob)

        # Present each input vector to the RNN as num_steps time steps
        sequence_input = tf.reshape(self.inputs, [batch_size, num_steps, -1])

        # Run the RNN
        outputs, final_state = tf.nn.dynamic_rnn(cell, sequence_input,
                                                 initial_state=self.initial_state,
                                                 dtype=tf.float32)
        self.final_state = final_state

        # Predictions
        self.prediction, self.logits = build_output(outputs, lstm_size, num_classes, batch_size)

        # Loss and optimizer
        self.accuracy, self.loss = build_loss(self.prediction, self.logits, self.targets,
                                              lstm_size, num_classes)
        self.optimizer = build_optimizer(self.loss, learning_rate, grad_clip)

In [59]:

# 执行训练
import time
epochs = 20
# Write a checkpoint every n training steps
save_every_n = 500
model = CharRNN(num_classes, batch_size=batch_size, num_steps=num_steps,
                lstm_size=lstm_size, num_layers=num_layers,
                learning_rate=learning_rate)

saver = tf.train.Saver(max_to_keep=100)
# test() restores from "./save/", so the directory must exist before saving.
os.makedirs("./save", exist_ok=True)
with tf.Session() as sess:
    sess.run(tf.global_variables_initializer())

    counter = 0
    for e in range(epochs):
        # Train network
        # NOTE(review): the final LSTM state of one batch is fed as the initial
        # state of the next, although batches hold unrelated reviews - confirm
        # this carry-over is intended.
        new_state = sess.run(model.initial_state)
        for x, y in get_batches(batch_size, num_steps):
            counter += 1
            start = time.time()
            feed = {model.inputs: x,
                    model.targets: y,
                    model.keep_prob: keep_prob,
                    model.initial_state: new_state}
            batch_loss, new_state, _, _acc = sess.run([model.loss,
                                                       model.final_state,
                                                       model.optimizer,
                                                       model.accuracy
                                                       ],
                                                      feed_dict=feed)

            end = time.time()
            # control the print lines
            if counter % 10 == 0:
                print('轮数: {}/{}... '.format(e+1, epochs),
                      '训练步数: {}... '.format(counter),
                       '训练误差: {:.4f}... '.format(batch_loss),
                      '{:.4f} sec/batch'.format((end-start)),
                      '准确率:{}%'.format(_acc * 100.0)
                      )

            if (counter % save_every_n == 0):
                # Actually write the checkpoint that test() later restores,
                # and report the step that was saved.
                saver.save(sess, "./save/lstm-model", global_step=counter)
                print("saved! step:", counter)


(100, 500, 128)
(100, 2)
轮数: 1/20...  训练步数: 10...  训练误差: 0.8154...  1.5080 sec/batch 准确率:55.000001192092896%
轮数: 1/20...  训练步数: 20...  训练误差: 0.7188...  1.4856 sec/batch 准确率:52.99999713897705%
轮数: 1/20...  训练步数: 30...  训练误差: 0.7302...  1.5170 sec/batch 准确率:55.000001192092896%
轮数: 1/20...  训练步数: 40...  训练误差: 0.6416...  1.5120 sec/batch 准确率:63.999998569488525%
轮数: 1/20...  训练步数: 50...  训练误差: 0.6543...  1.5232 sec/batch 准确率:60.00000238418579%
轮数: 1/20...  训练步数: 60...  训练误差: 0.6308...  1.5162 sec/batch 准确率:69.9999988079071%
轮数: 1/20...  训练步数: 70...  训练误差: 0.7593...  1.4853 sec/batch 准确率:66.00000262260437%
轮数: 1/20...  训练步数: 80...  训练误差: 0.6173...  1.4890 sec/batch 准确率:66.00000262260437%
轮数: 1/20...  训练步数: 90...  训练误差: 0.5973...  1.4993 sec/batch 准确率:68.99999976158142%
轮数: 1/20...  训练步数: 100...  训练误差: 0.6041...  1.5575 sec/batch 准确率:74.00000095367432%
轮数: 1/20...  训练步数: 110...  训练误差: 0.6962...  1.4997 sec/batch 准确率:66.00000262260437%
轮数: 1/20...  训练步数: 120...  训练误差: 0.5416...  1.5249 sec/bat

KeyboardInterrupt: 

In [None]:
def test():
    """Evaluate the latest checkpoint in ./save/ on the test batches.

    Uses the module-level ``model`` and ``saver`` created by the training
    cell. Prints loss/accuracy of every 100th batch and, at the end, the mean
    loss and accuracy over all test batches. Returns None; if no checkpoint
    is found, prints a message and returns without evaluating.
    """
    checkpoint_dir = "./save/"
    ckpt = tf.train.get_checkpoint_state(checkpoint_dir)
    if not (ckpt and ckpt.model_checkpoint_path):
        print("some error occured! drop test prcocess")
        return

    # model = CharRNN(num_classes,lstm_size=lstm_size, sampling=True)
    with tf.Session() as sess:
        saver.restore(sess, ckpt.model_checkpoint_path)
        new_state = sess.run(model.initial_state)

        total_loss = 0.0
        total_acc = 0.0
        counter = 0
        for x, y in get_batches_test(batch_size, num_steps):
            counter += 1
            start = time.time()
            feed = {model.inputs: x,
                    model.targets: y,
                    # Dropout must be switched off for evaluation: keep every unit.
                    model.keep_prob: 1.0,
                    model.initial_state: new_state}
            # The optimizer op is deliberately not run: no weights change here.
            batch_loss, new_state, _acc = sess.run([model.loss,
                                                    model.final_state,
                                                    model.accuracy
                                                    ],
                                                   feed_dict=feed)
            end = time.time()
            total_loss += batch_loss
            total_acc += _acc
            if counter % 100 == 0:
                print('测试步数: {}... '.format(counter),'测试误差: {:.4f}... '.format(batch_loss),
                     '{:.4f} sec/batch'.format((end-start)),
                     '测试准确率:{}%'.format(_acc * 100.0)
                     )

        # Per-batch numbers are noisy; also report the mean over the whole test set.
        if counter:
            print('test batches: {}  mean loss: {:.4f}  mean accuracy: {:.2f}%'.format(
                counter, total_loss / counter, total_acc / counter * 100.0))

                
# Run the evaluation; requires the training cell above to have defined `model` and `saver`.
test()