In [19]:
# 读取news20训练数据集
# http://scikit-learn.org/stable/datasets/twenty_newsgroups.html
    
from sklearn.datasets import fetch_20newsgroups
newsgroups_train = fetch_20newsgroups(subset='train', remove=('headers', 'footers', 'quotes'), data_home='./')

In [20]:
# 样本数量 & target分类结果为 1 - 20
print(len(newsgroups_train.data))
print(newsgroups_train.target[:10])

11314
[ 7  4  4  1 14 16 13  3  2  4]


In [21]:
# text 预处理
# 去除符号 字母小写
from nltk.tokenize import RegexpTokenizer
tokenizer = RegexpTokenizer(r'\w+')

data = []
for text in newsgroups_train.data:
    data.append(" ".join(tokenizer.tokenize(text.lower())))
    
print(data[0])

i was wondering if anyone out there could enlighten me on this car i saw the other day it was a 2 door sports car looked to be from the late 60s early 70s it was called a bricklin the doors were really small in addition the front bumper was separate from the rest of the body this is all i know if anyone can tellme a model name engine specs years of production where this car is made history or whatever info you have on this funky looking car please e mail


In [22]:
# text 预处理
# 构建字典 将词专为index
# 然后截取固定长度作为输入 x

from tensorflow.contrib import learn
import numpy as np
max_document_length = 1000
min_frequency = 2
vocab_processor = learn.preprocessing.VocabularyProcessor(max_document_length, min_frequency)
x = np.array(list(vocab_processor.fit_transform(newsgroups_train.data)))

print("vocabulary size: {:d}".format(len(vocab_processor.vocabulary_)))

vocabulary size: 39551


In [23]:
print((x[0]))

[    7    26  1354    42   177    62    51   108 13015    57    14    20
   330     7   610     1    75   301    78    26     5 32125  3220   330
   963     3    15    30     1  1256 15054   799 15061    78    26   296
     5     0    21  4103    67   182   428   109  1078     1   852  9115
    26  1649    30     1   680     4     1   673    93     8    44     7
    83    73   177    40     0     5   929   283  1157  3516   183     4
  2911   159    20   330     8   209   675    25   784   422    13    16
    14    20 25542   371   330   311   663     0     0     0     0     0
     0     0     0     0     0     0     0     0     0     0     0     0
     0     0     0     0     0     0     0     0     0     0     0     0
     0     0     0     0     0     0     0     0     0     0     0     0
     0     0     0     0     0     0     0     0     0     0     0     0
     0     0     0     0     0     0     0     0     0     0     0     0
     0     0     0     0     0     0     0     0   

In [24]:
# 分类结果 one-hot 处理

def one_hot_encode(x, length):
    encode_labels = np.zeros(shape=(len(x),length))
   
    for i in range(len(x)):
        label = x[i]
        a = np.zeros(length)
        a[label] = 1
        encode_labels[i] = a
    return encode_labels
y = one_hot_encode(newsgroups_train.target, len(newsgroups_train.target_names))
print(newsgroups_train.target[0:40])
print(y[0])

[ 7  4  4  1 14 16 13  3  2  4  8 19  4 14  6  0  1  7 12  5  0 10  6  2  4
  1 12  9 15  7  6 13 12 17 18 10  8 11  8 16]
[ 0.  0.  0.  0.  0.  0.  0.  1.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.
  0.  0.]


In [25]:
# data shuffle

np.random.seed(10)
shuffle_indices = np.random.permutation(np.arange(len(newsgroups_train.target)))
x_shuffled = x[shuffle_indices]
y_shuffled = y[shuffle_indices]

In [26]:
# 训练样本/测试样本 split

sample_rate = .1
dev_sample_index = -1 * int(sample_rate * float(len(newsgroups_train.target)))
x_train, x_dev = x_shuffled[:dev_sample_index], x_shuffled[dev_sample_index:]
y_train, y_dev = y_shuffled[:dev_sample_index], y_shuffled[dev_sample_index:]

print("train/test split: {:d}/{:d}".format(len(y_train), len(y_dev)))

train/test split: 10183/1131


In [27]:
# text CNN
# x = word2vec(x) 
# x[n] = maxpooling(CNN(x)), n = filter size (2,3,4)
# x = dropout(combine(x[n]))
# output = fullyconnect(x)

import tensorflow as tf
def TextCNN(input_x, dropout_keep_prob, sequence_length, num_classes, vocab_size, embedding_size, filter_sizes, num_filters):
    
    # Embedding layer
    with tf.device('/cpu:0'), tf.name_scope("embedding"):
        w0 = tf.Variable(tf.random_uniform([vocab_size, embedding_size], -1.0, 1.0), name="W")
        embedded_chars = tf.nn.embedding_lookup(w0, input_x)
        embedded_chars_expanded = tf.expand_dims(embedded_chars, -1)

        # Create a convolution + maxpool layer for each filter size
        pooled_outputs = []
        for i, filter_size in enumerate(filter_sizes):
            with tf.name_scope("conv-maxpool-%s" % filter_size):
                # Convolution Layer
                filter_shape = [filter_size, embedding_size, 1, num_filters]
                W = tf.Variable(tf.truncated_normal(filter_shape, stddev=0.1), name="W")
                b = tf.Variable(tf.constant(0.1, shape=[num_filters]), name="b")
                conv = tf.nn.conv2d(embedded_chars_expanded, W, strides=[1, 1, 1, 1], padding="VALID", name="conv")
                # Apply nonlinearity
                h = tf.nn.relu(tf.nn.bias_add(conv, b), name="relu")
                # Maxpooling over the outputs
                pooled = tf.nn.max_pool(h, ksize=[1, sequence_length - filter_size + 1, 1, 1], strides=[1, 1, 1, 1], padding='VALID', name="pool")
                pooled_outputs.append(pooled)

        # Combine all the pooled features
        num_filters_total = num_filters * len(filter_sizes)
        h_pool = tf.concat(pooled_outputs, 3)
        h_pool_flat = tf.reshape(h_pool, [-1, num_filters_total])

        # Add dropout
        h_drop = tf.nn.dropout(h_pool_flat, dropout_keep_prob)

        # Final (unnormalized) scores and predictions
        W = tf.Variable(tf.random_normal([num_filters_total, num_classes], mean=0.0, stddev = 0.2))
        b = tf.Variable(tf.constant(0.1, shape=[num_classes]), name="b")
        logits = tf.nn.xw_plus_b(h_drop, W, b, name="logits")
        return logits



In [41]:
# text RNN
# x = word2vec(x) 
# x = last output of LSTM(x), layer = 1
# x = dropout(fullyconnect(x)) 
# output = fullyconnect(x)

num_layers = 1
def TextRNN(input_x, dropout_keep_prob, sequence_length, num_classes, vocab_size, embedding_size, rnnsize, fcsize):
    # Embedding layers
    w0 = tf.Variable(tf.random_uniform([vocab_size, embedding_size], -1.0, 1.0), name="W")
    embedded_chars = tf.nn.embedding_lookup(w0, input_x)
    
     batch_size = tf.shape(outputs)[0]
        
#         lstm_cell = tf.contrib.rnn.DropoutWrapper(tf.contrib.rnn.MultiRNNCell([tf.contrib.rnn.BasicLSTMCell(rnnsize)] * num_layers), output_keep_prob=self.dropout_keep_prob)
    lstm_cell = tf.contrib.rnn.MultiRNNCell([tf.contrib.rnn.BasicLSTMCell(rnnsize)] * num_layers)
    initial_state = lstm_cell.zero_state(batch_size, tf.float32)
    outputs, final_state = tf.nn.dynamic_rnn(lstm_cell, embedded_chars, dtype=tf.float32, initial_state=initial_state)

    batch_size = tf.shape(outputs)[0]
    index = tf.range(0, batch_size) * sequence_length + (sequence_length - 1)
    # Indexing
    outputs = tf.gather(tf.reshape(outputs, [-1, rnnsize]), index)

    fc = tf.contrib.layers.fully_connected(outputs, fcsize, weights_initializer = tf.truncated_normal_initializer(stddev = 0.01), activation_fn=None)
    flat = tf.nn.dropout(fc, tf.to_float(dropout_keep_prob))
    # Final (unnormalized) scores and predictions
    w = tf.Variable(tf.random_normal([fcsize, num_classes], mean=0.0, stddev=0.1))
    b = tf.Variable(tf.random_normal([num_classes], mean=0.0, stddev=0.1))
    logits = tf.nn.xw_plus_b(flat, w, b, name="scores")
    return logits
        

In [28]:
# batch method

def batch_iter(data, batch_size, num_epochs, shuffle=True):
    """
    Generates a batch iterator for a dataset.
    """
    data = np.array(data)
    data_size = len(data)
    num_batches_per_epoch = int((len(data)-1)/batch_size) + 1
    for epoch in range(num_epochs):
        # Shuffle the data at each epoch
        if shuffle:
            shuffle_indices = np.random.permutation(np.arange(data_size))
            shuffled_data = data[shuffle_indices]
        else:
            shuffled_data = data
        for batch_num in range(num_batches_per_epoch):
            start_index = batch_num * batch_size
            end_index = min((batch_num + 1) * batch_size, data_size)
            yield shuffled_data[start_index:end_index]

In [36]:
# hyper parameters
embedding_dim = 80
filter_sizes = "3,4,5"
num_filters = 400
dropout_keep_prob = 0.8
rnnsize = 500
fcsize = 400
batch_size = 90
num_epochs = 5

learning_rate = 0.01

evaluate_every = 20

num_classes=y_train.shape[1]
sequence_length=x_train.shape[1]
vocabulary_len = len(vocab_processor.vocabulary_)

# build the network graph
train_graph = tf.Graph()
with train_graph.as_default():
    lr = tf.placeholder(tf.float32, name='LearingRate')
    dropout_keep = tf.placeholder(tf.float32, name='dropout')
    input_x = tf.placeholder(tf.int32, [None, sequence_length], name="input_x")
    input_y = tf.placeholder(tf.int32, [None, num_classes], name="input_y")
    

#     logits = TextRNN(
#         input_x,
#         dropout_keep,
#         sequence_length,
#         num_classes,
#         vocabulary_len,
#         embedding_dim,
#         rnnsize,
#         fcsize)
    
    logits = TextCNN(
        input_x,
        dropout_keep,
        sequence_length, 
        num_classes, 
        vocabulary_len, 
        embedding_dim, 
        [int(s) for s in filter_sizes.split(',')], 
        fcsize)
    
    logits = tf.identity(logits, name='logits')

    # Loss and Optimizer
    cost = tf.reduce_mean(tf.nn.softmax_cross_entropy_with_logits(logits=logits, labels=input_y))
    optimizer = tf.train.AdamOptimizer().minimize(cost)

    # Accuracy
    index_predict = tf.argmax(logits, 1)
    index_target = tf.argmax(input_y, 1)
    correct_pred = tf.equal(index_predict, index_target)
    accuracy = tf.reduce_mean(tf.cast(correct_pred, tf.float32), name='accuracy') 
    
    def train_step(x_batch, y_batch):
        """
        A single training step
        """
        feed_dict = {
          input_x: x_batch,
          input_y: y_batch,
          dropout_keep: dropout_keep_prob,
          lr: learning_rate
        }
        _, _loss, _accuracy = sess.run(
            [optimizer, cost, accuracy],
            feed_dict)
        time_str = datetime.datetime.now().isoformat()
        print("{}:, loss {:g}, acc {:g}".format(time_str, _loss, _accuracy))

    def dev_step(x_batch, y_batch):
        """
        Evaluates model on a dev set
        """
        feed_dict = {
          input_x: x_batch,
          input_y: y_batch,
          dropout_keep: 1.0
        }
        _loss, _accuracy = sess.run(
            [cost, accuracy],
            feed_dict)
        time_str = datetime.datetime.now().isoformat()
        print("loss {:g}, acc {:g}".format(_loss, _accuracy))

In [None]:
# train

import datetime
    
batches = batch_iter(list(zip(x_train, y_train)), batch_size, num_epochs)

with tf.Session(graph=train_graph) as sess:
    sess.run(tf.global_variables_initializer())
    # Generate batches
    # Training loop. For each batch...
    step = 1
    for batch in batches:
        x_batch, y_batch = zip(*batch)
        
        print("batch {}".format(step))
        train_step(x_batch, y_batch)

        if step % evaluate_every == 0:
            print("\nEvaluation:")
            dev_step(x_dev, y_dev)
            print("")
        step = step + 1
        
    saver = tf.train.Saver(tf.global_variables(), max_to_keep=3)
    save_model_path = './trained_model'
    path = saver.save(sess, save_model_path)
    print("Saved model checkpoint to {}\n".format(path))
        

batch 1
2017-07-20T16:47:34.630392:, loss 31.5443, acc 0.0444444
batch 2
2017-07-20T16:47:43.659439:, loss 32.6976, acc 0.0111111
batch 3
2017-07-20T16:47:53.944320:, loss 27.7159, acc 0.0777778
batch 4
2017-07-20T16:48:05.548959:, loss 24.3772, acc 0.0666667
batch 5
2017-07-20T16:48:14.578692:, loss 20.3478, acc 0.0444444
batch 6
2017-07-20T16:48:24.633118:, loss 19.7353, acc 0.0888889
batch 7
2017-07-20T16:48:36.352789:, loss 17.175, acc 0.1
batch 8
2017-07-20T16:48:46.419663:, loss 17.618, acc 0.0333333
batch 9
2017-07-20T16:48:57.665482:, loss 17.568, acc 0.0777778
batch 10
2017-07-20T16:49:07.255401:, loss 18.7636, acc 0.0333333
batch 11
2017-07-20T16:49:16.053134:, loss 18.8175, acc 0.0555556
batch 12
2017-07-20T16:49:25.193652:, loss 17.9458, acc 0.0888889
batch 13
2017-07-20T16:49:34.314766:, loss 19.3747, acc 0.0222222
batch 14
2017-07-20T16:49:44.812599:, loss 19.6929, acc 0.0333333
batch 15
2017-07-20T16:49:55.040357:, loss 17.2731, acc 0.0888889
batch 16
2017-07-20T16:50:04