In this notebook, we will build the CNN model for text classification.

In [3]:
import sys, os
sys.path.append(os.pardir)
import numpy as np
import data_helpers
from word2vec import train_word2vec

In [1]:
# preprocess 

positive_data_file = "../data/rt-polaritydata/rt-polarity.pos"
negtive_data_file = "../data/rt-polaritydata/rt-polarity.neg"

# Load data
print("Loading data...")
x_text, y = data_helpers.load_data_and_labels(positive_data_file, negtive_data_file)

# Pad sentence
print("Padding sentences...")
x_text = data_helpers.pad_sentences(x_text)
print("The sequence length is: ", len(x_text[0]))

# Build vocabulary
vocabulary, vocabulary_inv = data_helpers.build_vocab(x_text)

# Represent sentence with word index, using word index to represent a sentence
x = data_helpers.build_index_sentence(x_text, vocabulary)
y = y.argmax(axis=1) # y: [1, 1, 1, ...., 0, 0, 0]. 1 for positive, 0 for negative

# Shuffle data
np.random.seed(42)
shuffle_indices = np.random.permutation(np.arange(len(y)))
x_shuffled = x[shuffle_indices]
y_shuffled = y[shuffle_indices]

# Split train and test
training_rate = 0.9
train_len = int(len(y) * training_rate)
x_train = x_shuffled[:train_len]
y_train = y_shuffled[:train_len]
x_test = x_shuffled[train_len:]
y_test = y_shuffled[train_len:]

# Output shape
print('x_train shape: ', x_train.shape)
print('x_test shape:', x_test.shape)
print('Vocabulary Size: {:d}'.format(len(vocabulary_inv)))


Loading data...
Padding sentences...
The sequence length is:  56
x_train shape:  (9595, 56)
x_test shape: (1067, 56)
Vocabulary Size: 18765


In [4]:
# Word2Vec parameters (see train_word2vec)
embedding_dim = 50
min_word_count = 1
context = 10

#Prepare embedding layer weights for not-static model
embedding_weights = train_word2vec(np.vstack((x_train, x_test)), vocabulary_inv, num_features=embedding_dim,
                                   min_word_count=min_word_count, context=context)

print(embedding_weights[565]) # 565 is the index word rock

Saving Word2Vec model '50feature_1minwords_10context'
[-0.22072488  0.03841191  0.25244865 -0.19596232  0.5254891  -0.22822355
 -0.00765032 -0.21729529  0.32325193 -0.1354228   0.28161174 -0.14135455
  0.25298622  0.10028931  0.13398536 -0.05369192 -0.08600403 -0.25493133
 -0.15806714  0.28666434  0.19685866  0.14603579  0.04521525 -0.4055126
 -0.3777436   0.29809853 -0.3177484  -0.12307277  0.20872054 -0.09028962
  0.30230698  0.2604237   0.5757977   0.37168625 -0.56569725  0.30448192
 -0.08910146  0.2877515   0.3957461  -0.18291692 -0.4497671   0.38631678
  0.59380317 -0.16212505 -0.33610743  0.38453627 -0.0516694   0.319073
  0.02394754 -0.23467964]


In [49]:
from keras.models import Sequential, Model
from keras.layers import Dense, Dropout, Flatten, Input, MaxPooling1D, GlobalMaxPooling1D, Conv1D, Embedding
from keras.layers.merge import Concatenate
from keras.datasets import imdb
from keras.preprocessing import sequence
np.random.seed(0)

In [6]:
x_test.shape[1]

56

In [17]:
#=======================Build model=========================
filter_sizes = (3, 4)
num_filters = 10
dropout_prob = (0.5, 0.8)
hidden_dims = 50


# Input 
sequence_length = x_test.shape[1] # 56
input_shape = (sequence_length,)
input_layer = Input(shape=input_shape, name='input_layer')
print(model_input) # see the input shape. 

Tensor("model_input:0", shape=(?, 56), dtype=float32)


In [12]:
?Embedding

In [14]:
# The CNN-Non-Static has embedding layer
# Construct word embedding layer
embedding_layer = Embedding(input_dim=len(vocabulary_inv), output_dim=embedding_dim,
                      input_length=sequence_length, name='embedding_layer')(input_layer)
print(embedding_layer)


Tensor("embedding_layer/GatherV2:0", shape=(?, 56, 50), dtype=float32)


In [36]:
# Dropout
drop_layer = Dropout(dropout_prob[0], name='drop_layer1')(embedding_layer)
print(drop_layer)

Tensor("drop_layer1_1/cond/Merge:0", shape=(?, 56, 50), dtype=float32)


Stride in this context means the step of the convolution operation. For example, if you do valid convolution of two sequences of length 10 and 6, in general you get an output of length 5 (10 -6 +1). It means that sequence 2 moves “step by step” along sequence 1, using a step size of 1 when doing convolution. But if you set the stride of convolution 2, the output would be of length 3 ((10–6) / 2 + 1), meaning that sequence 2 moves “step by step” along sequence 1, using a step size of 2.

In the below code cell, we set the strides as 1, and the sequence length is 56. So after the conv, the new sequence length(new_step) should be 56-3+1=54.

As for the MaxPooling1D, here is a good [explanation](https://stackoverflow.com/questions/43728235/what-is-the-difference-between-keras-maxpooling1d-and-globalmaxpooling1d-functi). pool_size is like the kernel_szie in Conv1D, we will choose biggest number in two words vector. Because we set strides as 1, so the shape after MaxPooling1D is 54-2+1=53. If we set pool_size=2, strides=2, the shape after MaxPooling1D is 54/2=27. Because we see two words one times. 

In [47]:
# CNN, first we set filter_szies as 3, to see the output 
conv = Conv1D(filters=num_filters,
                  kernel_size=3, # 3 means 3 words
                  padding='valid', # valid means no padding
                  strides=1, # see explnation above
                  activation='relu',
                  use_bias=True)(drop_layer) 
print(conv) # output (batch_size, new_steps, filters)

# Max pooling 
conv = MaxPooling1D(pool_size=2)(conv)
print(conv) # (batch_size, downsampled_steps, features)

# Flatten 
conv = Flatten()(conv)
print(conv)

Tensor("conv1d_24/Relu:0", shape=(?, 54, 10), dtype=float32)
Tensor("max_pooling1d_23/Squeeze:0", shape=(?, 27, 10), dtype=float32)
Tensor("flatten_10/Reshape:0", shape=(?, ?), dtype=float32)


In [51]:
# CNN, iterate filter_size
conv_blocks = []
for fz in filter_sizes:
    conv = Conv1D(filters=num_filters,
                  kernel_size=fz, # 3 means 3 words
                  padding='valid', # valid means no padding
                  strides=1, # see explnation above
                  activation='relu',
                  use_bias=True)(drop_layer) 
    conv = MaxPooling1D(pool_size=2, strides=1)(conv)
#     conv = Flatten()(conv)
    conv_blocks.append(conv)

In [52]:
conv_blocks

[<tf.Tensor 'max_pooling1d_24/Squeeze:0' shape=(?, 53, 10) dtype=float32>,
 <tf.Tensor 'max_pooling1d_25/Squeeze:0' shape=(?, 52, 10) dtype=float32>]

MaxPooling1D is not the 1max pooling in the original paper, this might confuse the reader. So here we choose `GlobalMaxPooling1D()` to implement the 1-max pooling. We can see that after the conv, the output shape is (?, 54, 10), here 10 is the filter number, we also take it as the features. In the code cell below, I set `the num_filters=10`. We want to select one biggest number in each filers, so `GlobalMaxPooling1D ` will select the biggest number on the `axis=1`(column), then we get a result of column vector with size 10. This is the `#1max` in the below image. 

![](http://www.joshuakim.io/wp-content/uploads/2017/12/figure.jpg)

In [50]:
# GlobalMaxPooling1D, this is the 1-max pooling in the paper
# CNN, first we set filter_szies as 3, to see the output 
conv = Conv1D(filters=num_filters,
                  kernel_size=3, # 3 means 3 words
                  padding='valid', # valid means no padding
                  strides=1, # see explnation above
                  activation='relu',
                  use_bias=True)(drop_layer) 
print(conv) # output (batch_size, new_steps, filters)

# Max pooling 
conv = GlobalMaxPooling1D()(conv) # this is equal to the #1max
print(conv) # (batch_size, a max feature in a filter)

Tensor("conv1d_26/Relu:0", shape=(?, 54, 10), dtype=float32)
Tensor("global_max_pooling1d_1/Max:0", shape=(?, 10), dtype=float32)


In [58]:
# GlobalMaxPooling1D, this is the 1-max pooling in the paper
# CNN, iterate filter_size
conv_blocks = []
for fz in filter_sizes:
    conv = Conv1D(filters=num_filters,
                  kernel_size=fz, # 3 means 3 words
                  padding='valid', # valid means no padding
                  strides=1, # see explnation above
                  activation='relu',
                  use_bias=True)(drop_layer) 
    conv = GlobalMaxPooling1D()(conv) # 1-Max pooling 
    conv_blocks.append(conv)

print(conv_blocks)
concat1max = Concatenate()(conv_blocks)
print()
print(concat1max)

[<tf.Tensor 'global_max_pooling1d_8/Max:0' shape=(?, 10) dtype=float32>, <tf.Tensor 'global_max_pooling1d_9/Max:0' shape=(?, 10) dtype=float32>]

Tensor("concatenate_5/concat:0", shape=(?, 20), dtype=float32)


In [61]:
output_layer = Dense(1, activation='softmax', name='output_layer')(concat1max)
output_layer

<tf.Tensor 'output_layer/Softmax:0' shape=(?, 1) dtype=float32>

In [63]:
# write in one cell 

#=======================Build model=========================
filter_sizes = (3, 4)
num_filters = 10
dropout_prob = (0.5, 0.8)
hidden_dims = 50

# Input 
sequence_length = x_test.shape[1] # 56
input_shape = (sequence_length,)
input_layer = Input(shape=input_shape, name='input_layer') # (?, 56)

# Embedding layer, (?, 56, 50)
embedding_layer = Embedding(input_dim=len(vocabulary_inv), output_dim=embedding_dim,
                      input_length=sequence_length, name='embedding_layer')(input_layer)

# CNN, iterate filter_size
conv_blocks = []
for fz in filter_sizes:
    conv = Conv1D(filters=num_filters,
                  kernel_size=fz, # 3 means 3 words
                  padding='valid', # valid means no padding
                  strides=1, # see explnation above
                  activation='relu',
                  use_bias=True)(drop_layer) 
    conv = GlobalMaxPooling1D()(conv) # 1-Max pooling 
    conv_blocks.append(conv)


concat1max = Concatenate()(conv_blocks) # (? 20)

model = Model(inputs=input_layer, outputs=output_layer)
model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])


RuntimeError: Graph disconnected: cannot obtain value for tensor Tensor("input_layer:0", shape=(?, 56), dtype=float32) at layer "input_layer". The following previous layers were accessed without issue: []