In this notebook, we will build the CNN model for text classification.

In [18]:
import sys, os
sys.path.append(os.pardir)
import numpy as np
import data_helpers
from word2vec import train_word2vec

In [19]:
# preprocess 

positive_data_file = "../data/rt-polaritydata/rt-polarity.pos"
negtive_data_file = "../data/rt-polaritydata/rt-polarity.neg"

# Load data
print("Loading data...")
x_text, y = data_helpers.load_data_and_labels(positive_data_file, negtive_data_file)

# Pad sentence
print("Padding sentences...")
x_text = data_helpers.pad_sentences(x_text)
print("The sequence length is: ", len(x_text[0]))

# Build vocabulary
vocabulary, vocabulary_inv = data_helpers.build_vocab(x_text)

# Represent sentence with word index, using word index to represent a sentence
x = data_helpers.build_index_sentence(x_text, vocabulary)
y = y.argmax(axis=1) # y: [1, 1, 1, ...., 0, 0, 0]. 1 for positive, 0 for negative

# Shuffle data
np.random.seed(42)
shuffle_indices = np.random.permutation(np.arange(len(y)))
x_shuffled = x[shuffle_indices]
y_shuffled = y[shuffle_indices]

# Split train and test
training_rate = 0.9
train_len = int(len(y) * training_rate)
x_train = x_shuffled[:train_len]
y_train = y_shuffled[:train_len]
x_test = x_shuffled[train_len:]
y_test = y_shuffled[train_len:]

# Output shape
print('x_train shape: ', x_train.shape)
print('x_test shape:', x_test.shape)
print('Vocabulary Size: {:d}'.format(len(vocabulary_inv)))


Loading data...
Padding sentences...
The sequence length is:  56
x_train shape:  (9595, 56)
x_test shape: (1067, 56)
Vocabulary Size: 18765


In [21]:
# Word2Vec parameters (see train_word2vec)
embedding_dim = 50
min_word_count = 1
context = 10

#Prepare embedding layer weights for not-static model
embedding_weights = train_word2vec(np.vstack((x_train, x_test)), vocabulary_inv, num_features=embedding_dim,
                                   min_word_count=min_word_count, context=context)

print(embedding_weights[565]) # 565 is the index word rock

Load existing Word2Vec model '50feature_1minwords_10context'
[-0.22072488  0.03841191  0.25244865 -0.19596232  0.5254891  -0.22822355
 -0.00765032 -0.21729529  0.32325193 -0.1354228   0.28161174 -0.14135455
  0.25298622  0.10028931  0.13398536 -0.05369192 -0.08600403 -0.25493133
 -0.15806714  0.28666434  0.19685866  0.14603579  0.04521525 -0.4055126
 -0.3777436   0.29809853 -0.3177484  -0.12307277  0.20872054 -0.09028962
  0.30230698  0.2604237   0.5757977   0.37168625 -0.56569725  0.30448192
 -0.08910146  0.2877515   0.3957461  -0.18291692 -0.4497671   0.38631678
  0.59380317 -0.16212505 -0.33610743  0.38453627 -0.0516694   0.319073
  0.02394754 -0.23467964]


In [71]:
from keras.models import Sequential, Model
from keras.layers import Dense, Dropout, Flatten, Input, MaxPooling1D, GlobalMaxPooling1D, Conv1D, Embedding
from keras.layers.merge import Concatenate
from keras.datasets import imdb
from keras.preprocessing import sequence
from keras import regularizers
from keras.callbacks import EarlyStopping

np.random.seed(0)

In [6]:
x_test.shape[1]

56

In [17]:
#=======================Build model=========================
filter_sizes = (3, 4)
num_filters = 10
dropout_prob = (0.5, 0.8)
hidden_dims = 50


# Input 
sequence_length = x_test.shape[1] # 56
input_shape = (sequence_length,)
input_layer = Input(shape=input_shape, name='input_layer')
print(model_input) # see the input shape. 

Tensor("model_input:0", shape=(?, 56), dtype=float32)


In [12]:
?Embedding

In [14]:
# The CNN-Non-Static has embedding layer
# Construct word embedding layer
embedding_layer = Embedding(input_dim=len(vocabulary_inv), output_dim=embedding_dim,
                      input_length=sequence_length, name='embedding_layer')(input_layer)
print(embedding_layer)


Tensor("embedding_layer/GatherV2:0", shape=(?, 56, 50), dtype=float32)


In [36]:
# Dropout
drop_layer = Dropout(dropout_prob[0], name='drop_layer1')(embedding_layer)
print(drop_layer)

Tensor("drop_layer1_1/cond/Merge:0", shape=(?, 56, 50), dtype=float32)


Stride in this context means the step of the convolution operation. For example, if you do valid convolution of two sequences of length 10 and 6, in general you get an output of length 5 (10 -6 +1). It means that sequence 2 moves “step by step” along sequence 1, using a step size of 1 when doing convolution. But if you set the stride of convolution 2, the output would be of length 3 ((10–6) / 2 + 1), meaning that sequence 2 moves “step by step” along sequence 1, using a step size of 2.

In the below code cell, we set the strides as 1, and the sequence length is 56. So after the conv, the new sequence length(new_step) should be 56-3+1=54.

As for the MaxPooling1D, here is a good [explanation](https://stackoverflow.com/questions/43728235/what-is-the-difference-between-keras-maxpooling1d-and-globalmaxpooling1d-functi). pool_size is like the kernel_szie in Conv1D, we will choose biggest number in two words vector. Because we set strides as 1, so the shape after MaxPooling1D is 54-2+1=53. If we set pool_size=2, strides=2, the shape after MaxPooling1D is 54/2=27. Because we see two words one times. 

In [47]:
# CNN, first we set filter_szies as 3, to see the output 
conv = Conv1D(filters=num_filters,
                  kernel_size=3, # 3 means 3 words
                  padding='valid', # valid means no padding
                  strides=1, # see explnation above
                  activation='relu',
                  use_bias=True)(drop_layer) 
print(conv) # output (batch_size, new_steps, filters)

# Max pooling 
conv = MaxPooling1D(pool_size=2)(conv)
print(conv) # (batch_size, downsampled_steps, features)

# Flatten 
conv = Flatten()(conv)
print(conv)

Tensor("conv1d_24/Relu:0", shape=(?, 54, 10), dtype=float32)
Tensor("max_pooling1d_23/Squeeze:0", shape=(?, 27, 10), dtype=float32)
Tensor("flatten_10/Reshape:0", shape=(?, ?), dtype=float32)


In [51]:
# CNN, iterate filter_size
conv_blocks = []
for fz in filter_sizes:
    conv = Conv1D(filters=num_filters,
                  kernel_size=fz, # 3 means 3 words
                  padding='valid', # valid means no padding
                  strides=1, # see explnation above
                  activation='relu',
                  use_bias=True)(drop_layer) 
    conv = MaxPooling1D(pool_size=2, strides=1)(conv)
#     conv = Flatten()(conv)
    conv_blocks.append(conv)

In [52]:
conv_blocks

[<tf.Tensor 'max_pooling1d_24/Squeeze:0' shape=(?, 53, 10) dtype=float32>,
 <tf.Tensor 'max_pooling1d_25/Squeeze:0' shape=(?, 52, 10) dtype=float32>]

MaxPooling1D is not the 1max pooling in the original paper, this might confuse the reader. So here we choose `GlobalMaxPooling1D()` to implement the 1-max pooling. We can see that after the conv, the output shape is (?, 54, 10), here 10 is the filter number, we also take it as the features. In the code cell below, I set `the num_filters=10`. We want to select one biggest number in each filers, so `GlobalMaxPooling1D ` will select the biggest number on the `axis=1`(column), then we get a result of column vector with size 10. This is the `#1max` in the below image. 

![](http://www.joshuakim.io/wp-content/uploads/2017/12/figure.jpg)

In [50]:
# GlobalMaxPooling1D, this is the 1-max pooling in the paper
# CNN, first we set filter_szies as 3, to see the output 
conv = Conv1D(filters=num_filters,
                  kernel_size=3, # 3 means 3 words
                  padding='valid', # valid means no padding
                  strides=1, # see explnation above
                  activation='relu',
                  use_bias=True)(drop_layer) 
print(conv) # output (batch_size, new_steps, filters)

# Max pooling 
conv = GlobalMaxPooling1D()(conv) # this is equal to the #1max
print(conv) # (batch_size, a max feature in a filter)

Tensor("conv1d_26/Relu:0", shape=(?, 54, 10), dtype=float32)
Tensor("global_max_pooling1d_1/Max:0", shape=(?, 10), dtype=float32)


In [58]:
# GlobalMaxPooling1D, this is the 1-max pooling in the paper
# CNN, iterate filter_size
conv_blocks = []
for fz in filter_sizes:
    conv = Conv1D(filters=num_filters,
                  kernel_size=fz, # 3 means 3 words
                  padding='valid', # valid means no padding
                  strides=1, # see explnation above
                  activation='relu',
                  use_bias=True)(drop_layer) 
    conv = GlobalMaxPooling1D()(conv) # 1-Max pooling 
    conv_blocks.append(conv)

print(conv_blocks)
concat1max = Concatenate()(conv_blocks)
print()
print(concat1max)

[<tf.Tensor 'global_max_pooling1d_8/Max:0' shape=(?, 10) dtype=float32>, <tf.Tensor 'global_max_pooling1d_9/Max:0' shape=(?, 10) dtype=float32>]

Tensor("concatenate_5/concat:0", shape=(?, 20), dtype=float32)


In [61]:
output_layer = Dense(1, activation='softmax', name='output_layer')(concat1max)
output_layer

<tf.Tensor 'output_layer/Softmax:0' shape=(?, 1) dtype=float32>

In [11]:
for i, (key, value) in enumerate(embedding_weights.items()):
    if i < 3:
        print(key, value)

0 [-0.88650054 -2.160063    0.96392006 -0.26911142  2.4589605  -0.854224
 -0.10222454 -0.9868089   0.49081135 -1.3237724   0.4743087  -0.07651804
 -0.04362829  0.01618732 -0.5368535  -0.53789335 -0.4597094  -0.867225
 -0.07294422  1.3061765   1.454337    0.50052536  0.02490961 -1.0968294
 -0.90007854  0.8476956  -0.51648337 -0.4541734   0.5217278  -0.38841093
  1.1157492   1.6498995   2.7924497   2.4142728  -1.6991649   0.07605273
  0.12023915  0.389353    1.4011778  -0.36487138 -2.1056702   0.6316661
  2.2205415  -0.25847605 -0.6343487   0.84552944  0.05227763  1.5032666
  0.12233521 -0.42744634]
1 [-2.4797058  -0.65706545 -0.34030366 -0.88477564  1.9897318  -0.4754672
  0.5620251  -1.7522604   1.7267761  -0.04009154  0.9425473  -0.33500427
  1.0954219  -0.5345138   0.07658719 -1.1215198  -1.3558666  -1.5171361
 -0.54756194  3.0494883  -0.12976724  0.6956076   0.7036669  -2.0121005
 -0.6086946   2.0486102  -0.9583363  -1.5337933   2.692299   -0.12512176
  1.5774658   0.5596391   3.484

In [24]:
type(embedding_weights.values())

dict_values

In [25]:
weights = np.array([v for v in embedding_weights.values()])

In [26]:
weights.shape

(18765, 50)

In [14]:
weights[0]

array([-0.88650054, -2.160063  ,  0.96392006, -0.26911142,  2.4589605 ,
       -0.854224  , -0.10222454, -0.9868089 ,  0.49081135, -1.3237724 ,
        0.4743087 , -0.07651804, -0.04362829,  0.01618732, -0.5368535 ,
       -0.53789335, -0.4597094 , -0.867225  , -0.07294422,  1.3061765 ,
        1.454337  ,  0.50052536,  0.02490961, -1.0968294 , -0.90007854,
        0.8476956 , -0.51648337, -0.4541734 ,  0.5217278 , -0.38841093,
        1.1157492 ,  1.6498995 ,  2.7924497 ,  2.4142728 , -1.6991649 ,
        0.07605273,  0.12023915,  0.389353  ,  1.4011778 , -0.36487138,
       -2.1056702 ,  0.6316661 ,  2.2205415 , -0.25847605, -0.6343487 ,
        0.84552944,  0.05227763,  1.5032666 ,  0.12233521, -0.42744634],
      dtype=float32)

As for how to read set the pre-train weight for embedding layer, please see here: https://github.com/keras-team/keras/issues/853

In [43]:
# write in one cell 
# version 1, not converge
#=======================Build model=========================
# Model Hyperparameters
embedding_dim = 50
filter_sizes = (3, 8)
num_filters = 10
dropout_prob = (0.5, 0.8)
hidden_dims = 50

# Training parameters
batch_size = 64
num_epochs = 5

# Prepossessing parameters
sequence_length = 400
max_words = 5000

# Word2Vec parameters (see train_word2vec)
min_word_count = 1
context = 10


# Input 
sequence_length = x_test.shape[1] # 56
input_shape = (sequence_length,)
input_layer = Input(shape=input_shape, name='input_layer') # (?, 56)


# Embedding 
weights = np.array([v for v in embedding_weights.values()]) # assemble the embedding_weights in one numpy array
embedding_layer = Embedding(input_dim=len(vocabulary_inv), 
                            output_dim=embedding_dim,
                            input_length=sequence_length, 
                            trainable=True, # set True so embedding weight will be updated during training
                            name='embedding_layer')

embedding_layer.build((None,)) # if you don't do this, the next step won't work
embedding_layer.set_weights([weights]) # use pre-trained word vector as the weights

embedded = embedding_layer(input_layer)                  

# CNN, iterate filter_size
conv_blocks = []
for fz in filter_sizes:
    conv = Conv1D(filters=num_filters,
                  kernel_size=fz, # 3 means 3 words
                  padding='valid', # valid means no padding
                  strides=1, # see explnation above
                  activation='relu',
                  use_bias=True)(embedded) 
    conv = GlobalMaxPooling1D()(conv) # 1-Max pooling 
    conv_blocks.append(conv)

concat1max = Concatenate()(conv_blocks) # (?, 20)
concat1max = Dropout(dropout_prob[1])(concat1max)
output_layer = Dense(hidden_dims, activation='relu', 
                  kernel_regularizer=regularizers.l2(0.01),
                  bias_regularizer=regularizers.l1(0.01))(concat1max)
output_layer = Dense(1, activation='softmax')(output_layer)

model = Model(inputs=input_layer, outputs=output_layer)
model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])



In [44]:
# Train the model
model.fit(x_train, y_train, batch_size=batch_size, epochs=num_epochs,
          validation_data=(x_test, y_test), verbose=2)

Train on 9595 samples, validate on 1067 samples
Epoch 1/10
 - 5s - loss: 7.9554 - acc: 0.5010 - val_loss: 8.1131 - val_acc: 0.4911
Epoch 2/10
 - 4s - loss: 7.9554 - acc: 0.5010 - val_loss: 8.1131 - val_acc: 0.4911
Epoch 3/10
 - 5s - loss: 7.9554 - acc: 0.5010 - val_loss: 8.1131 - val_acc: 0.4911
Epoch 4/10
 - 5s - loss: 7.9554 - acc: 0.5010 - val_loss: 8.1131 - val_acc: 0.4911
Epoch 5/10
 - 5s - loss: 7.9554 - acc: 0.5010 - val_loss: 8.1131 - val_acc: 0.4911
Epoch 6/10
 - 5s - loss: 7.9554 - acc: 0.5010 - val_loss: 8.1131 - val_acc: 0.4911
Epoch 7/10
 - 6s - loss: 7.9554 - acc: 0.5010 - val_loss: 8.1131 - val_acc: 0.4911
Epoch 8/10
 - 6s - loss: 7.9554 - acc: 0.5010 - val_loss: 8.1131 - val_acc: 0.4911
Epoch 9/10
 - 6s - loss: 7.9554 - acc: 0.5010 - val_loss: 8.1131 - val_acc: 0.4911
Epoch 10/10
 - 6s - loss: 7.9554 - acc: 0.5010 - val_loss: 8.1131 - val_acc: 0.4911


<keras.callbacks.History at 0x134119e48>

In [46]:
# write in one cell 
# version 2， change embedding layer
#=======================Build model=========================
# Model Hyperparameters
embedding_dim = 50
filter_sizes = (3, 8)
num_filters = 10
dropout_prob = (0.5, 0.8)
hidden_dims = 50

# Training parameters
batch_size = 64
num_epochs = 5

# Prepossessing parameters
sequence_length = 400
max_words = 5000

# Word2Vec parameters (see train_word2vec)
min_word_count = 1
context = 10


# Input 
sequence_length = x_test.shape[1] # 56
input_shape = (sequence_length,)
input_layer = Input(shape=input_shape, name='input_layer') # (?, 56)


# Embedding 
embedded = Embedding(input_dim=len(vocabulary_inv), 
                            output_dim=embedding_dim,
                            input_length=sequence_length, 
                            name='embedding_layer')(input_layer)

# CNN, iterate filter_size
conv_blocks = []
for fz in filter_sizes:
    conv = Conv1D(filters=num_filters,
                  kernel_size=fz, # 3 means 3 words
                  padding='valid', # valid means no padding
                  strides=1, # see explnation above
                  activation='relu',
                  use_bias=True)(embedded) 
    conv = GlobalMaxPooling1D()(conv) # 1-Max pooling 
    conv_blocks.append(conv)

concat1max = Concatenate()(conv_blocks) # (?, 20)
concat1max = Dropout(dropout_prob[1])(concat1max)
output_layer = Dense(hidden_dims, activation='relu', 
                  kernel_regularizer=regularizers.l2(0.01),
                  bias_regularizer=regularizers.l1(0.01))(concat1max)
output_layer = Dense(1, activation='softmax')(output_layer)

model = Model(inputs=input_layer, outputs=output_layer)
model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])

# Initialize weights with word2vec
weights = np.array([v for v in embedding_weights.values()])
print("Initializing embedding layer with word2vec weights, shape", weights.shape)
embedding_layer = model.get_layer("embedding_layer")
embedding_layer.set_weights([weights])

# Train the model
model.fit(x_train, y_train, batch_size=batch_size, epochs=num_epochs,
          validation_data=(x_test, y_test), verbose=2)

Initializing embedding layer with word2vec weights, shape (18765, 50)
Train on 9595 samples, validate on 1067 samples
Epoch 1/5
 - 5s - loss: 8.1122 - acc: 0.5010 - val_loss: 8.1726 - val_acc: 0.4911
Epoch 2/5
 - 5s - loss: 7.9820 - acc: 0.5010 - val_loss: 8.1211 - val_acc: 0.4911
Epoch 3/5
 - 5s - loss: 7.9585 - acc: 0.5010 - val_loss: 8.1138 - val_acc: 0.4911
Epoch 4/5
 - 5s - loss: 7.9556 - acc: 0.5010 - val_loss: 8.1132 - val_acc: 0.4911
Epoch 5/5
 - 6s - loss: 7.9554 - acc: 0.5010 - val_loss: 8.1131 - val_acc: 0.4911


<keras.callbacks.History at 0x127309cc0>

In [52]:
# write in one cell 
# version 3， activation from softmax to sigmoid
#=======================Build model=========================
# Model Hyperparameters
embedding_dim = 50
filter_sizes = (3, 8)
num_filters = 10
dropout_prob = (0.5, 0.8)
hidden_dims = 50

# Training parameters
batch_size = 64
num_epochs = 10

# Prepossessing parameters
sequence_length = 400
max_words = 5000

# Word2Vec parameters (see train_word2vec)
min_word_count = 1
context = 10


# Input 
sequence_length = x_test.shape[1] # 56
input_shape = (sequence_length,)
input_layer = Input(shape=input_shape, name='input_layer') # (?, 56)


# Embedding 
embedded = Embedding(input_dim=len(vocabulary_inv), 
                            output_dim=embedding_dim,
                            input_length=sequence_length, 
                            name='embedding_layer')(input_layer)

# CNN, iterate filter_size
conv_blocks = []
for fz in filter_sizes:
    conv = Conv1D(filters=num_filters,
                  kernel_size=fz, # 3 means 3 words
                  padding='valid', # valid means no padding
                  strides=1, # see explnation above
                  activation='relu',
                  use_bias=True)(embedded) 
    conv = GlobalMaxPooling1D()(conv) # 1-Max pooling 
    conv_blocks.append(conv)

concat1max = Concatenate()(conv_blocks) # (?, 20)
concat1max = Dropout(dropout_prob[1])(concat1max)
output_layer = Dense(hidden_dims, activation='relu', 
                  kernel_regularizer=regularizers.l2(0.01),
                  bias_regularizer=regularizers.l1(0.01))(concat1max)
output_layer = Dense(1, activation='sigmoid')(output_layer)

model = Model(inputs=input_layer, outputs=output_layer)
model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])

# Initialize weights with word2vec
weights = np.array([v for v in embedding_weights.values()])
print("Initializing embedding layer with word2vec weights, shape", weights.shape)
embedding_layer = model.get_layer("embedding_layer")
embedding_layer.set_weights([weights])

# Train the model
model.fit(x_train, y_train, batch_size=batch_size, epochs=num_epochs,
          validation_data=(x_test, y_test), verbose=2)

Initializing embedding layer with word2vec weights, shape (18765, 50)
Train on 9595 samples, validate on 1067 samples
Epoch 1/10
 - 6s - loss: 0.9279 - acc: 0.5071 - val_loss: 0.8481 - val_acc: 0.5108
Epoch 2/10
 - 5s - loss: 0.8112 - acc: 0.5221 - val_loss: 0.7803 - val_acc: 0.5389
Epoch 3/10
 - 5s - loss: 0.7633 - acc: 0.5205 - val_loss: 0.7472 - val_acc: 0.5408
Epoch 4/10
 - 5s - loss: 0.7341 - acc: 0.5379 - val_loss: 0.7273 - val_acc: 0.5314
Epoch 5/10
 - 6s - loss: 0.7175 - acc: 0.5364 - val_loss: 0.7144 - val_acc: 0.5239
Epoch 6/10
 - 6s - loss: 0.7062 - acc: 0.5385 - val_loss: 0.7055 - val_acc: 0.5483
Epoch 7/10
 - 5s - loss: 0.6958 - acc: 0.5481 - val_loss: 0.6979 - val_acc: 0.5679
Epoch 8/10
 - 5s - loss: 0.6877 - acc: 0.5502 - val_loss: 0.6902 - val_acc: 0.5679
Epoch 9/10
 - 6s - loss: 0.6714 - acc: 0.5690 - val_loss: 0.6849 - val_acc: 0.6204
Epoch 10/10
 - 5s - loss: 0.6535 - acc: 0.5949 - val_loss: 0.6752 - val_acc: 0.6317


<keras.callbacks.History at 0x132e3f780>

In [54]:
# write in one cell 
# version 4, no l2 norm
#=======================Build model=========================
# Model Hyperparameters
embedding_dim = 50
filter_sizes = (3, 8)
num_filters = 10
dropout_prob = (0.5, 0.8)
hidden_dims = 50

# Training parameters
batch_size = 64
num_epochs = 10

# Prepossessing parameters
sequence_length = 400
max_words = 5000

# Word2Vec parameters (see train_word2vec)
min_word_count = 1
context = 10


# Input 
sequence_length = x_test.shape[1] # 56
input_shape = (sequence_length,)
input_layer = Input(shape=input_shape, name='input_layer') # (?, 56)


# Embedding 
embedded = Embedding(input_dim=len(vocabulary_inv), 
                            output_dim=embedding_dim,
                            input_length=sequence_length, 
                            name='embedding_layer')(input_layer)

# CNN, iterate filter_size
conv_blocks = []
for fz in filter_sizes:
    conv = Conv1D(filters=num_filters,
                  kernel_size=fz, # 3 means 3 words
                  padding='valid', # valid means no padding
                  strides=1, # see explnation above
                  activation='relu',
                  use_bias=True)(embedded) 
    conv = GlobalMaxPooling1D()(conv) # 1-Max pooling 
    conv_blocks.append(conv)

concat1max = Concatenate()(conv_blocks) # (?, 20)
concat1max = Dropout(dropout_prob[1])(concat1max)
concat1max = Dense(hidden_dims, activation='relu')(concat1max)
output_layer = Dense(1, activation='sigmoid')(concat1max)

# output_layer = Dense(hidden_dims, activation="relu")(concat1max)
# output_layer = Dense(1, activation="sigmoid")(output_layer)


model = Model(inputs=input_layer, outputs=output_layer)
model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])

# Initialize weights with word2vec
weights = np.array([v for v in embedding_weights.values()])
print("Initializing embedding layer with word2vec weights, shape", weights.shape)
embedding_layer = model.get_layer("embedding_layer")
embedding_layer.set_weights([weights])


# Train the model
model.fit(x_train, y_train, batch_size=batch_size, epochs=num_epochs,
          validation_data=(x_test, y_test), verbose=2)

Initializing embedding layer with word2vec weights, shape (18765, 50)
Train on 9595 samples, validate on 1067 samples
Epoch 1/10
 - 5s - loss: 0.7236 - acc: 0.5067 - val_loss: 0.6941 - val_acc: 0.4902
Epoch 2/10
 - 5s - loss: 0.6937 - acc: 0.5062 - val_loss: 0.6937 - val_acc: 0.4799
Epoch 3/10
 - 4s - loss: 0.6935 - acc: 0.5092 - val_loss: 0.6941 - val_acc: 0.4911
Epoch 4/10
 - 6s - loss: 0.6921 - acc: 0.5109 - val_loss: 0.6928 - val_acc: 0.5023
Epoch 5/10
 - 6s - loss: 0.6905 - acc: 0.5236 - val_loss: 0.6926 - val_acc: 0.5155
Epoch 6/10
 - 7s - loss: 0.6899 - acc: 0.5179 - val_loss: 0.6914 - val_acc: 0.5417
Epoch 7/10
 - 8s - loss: 0.6865 - acc: 0.5206 - val_loss: 0.6908 - val_acc: 0.5201
Epoch 8/10
 - 7s - loss: 0.6791 - acc: 0.5281 - val_loss: 0.6885 - val_acc: 0.5586
Epoch 9/10
 - 6s - loss: 0.6717 - acc: 0.5467 - val_loss: 0.6812 - val_acc: 0.5661
Epoch 10/10
 - 5s - loss: 0.6604 - acc: 0.5619 - val_loss: 0.6767 - val_acc: 0.5754


<keras.callbacks.History at 0x139e38860>

In [63]:
# Input 
sequence_length = x_test.shape[1] # 56
input_shape = (sequence_length,)
input_layer = Input(shape=input_shape, name='input_layer') # (?, 56)


# Embedding 
embedded = Embedding(input_dim=len(vocabulary_inv), 
                            output_dim=embedding_dim,
                            input_length=sequence_length, 
                            name='embedding_layer')(input_layer)

conv_blocks = []
for sz in filter_sizes:
    conv = Conv1D(filters=num_filters,
                         kernel_size=sz,
                         padding="valid",
                         activation="relu",
                         strides=1)(embedded)
    conv = MaxPooling1D(pool_size=2)(conv)
    print(conv)
    conv = Flatten()(conv)
    print(conv)
    conv_blocks.append(conv)
    
concat1max = Concatenate()(conv_blocks) # (?, 20)
print(concat1max)

Tensor("max_pooling1d_8/Squeeze:0", shape=(?, 27, 10), dtype=float32)
Tensor("flatten_8/Reshape:0", shape=(?, ?), dtype=float32)
Tensor("max_pooling1d_9/Squeeze:0", shape=(?, 24, 10), dtype=float32)
Tensor("flatten_9/Reshape:0", shape=(?, ?), dtype=float32)
Tensor("concatenate_22/concat:0", shape=(?, ?), dtype=float32)


When kernel_szie=3, the max_pooling size is `(?, 27, 10)`, means each filter extrac 27 biggest number. After flatten, the size is `(?, 270)`. 

When kernel_szie=8, the max_pooling size is `(?, 24, 10)`, means each filter extrac 24 biggest number. After flatten, the size is `(?, 240)`. 

Finally, after the concatenate, the shape is `(?, 510)`. 

In [64]:
# write in one cell 
# version 5，global max pooling to max 1d pooling, pooling size=2
#=======================Build model=========================
# Model Hyperparameters
embedding_dim = 50
filter_sizes = (3, 8)
num_filters = 10
dropout_prob = (0.5, 0.8)
hidden_dims = 50

# Training parameters
batch_size = 64
num_epochs = 10

# Prepossessing parameters
sequence_length = 400
max_words = 5000

# Word2Vec parameters (see train_word2vec)
min_word_count = 1
context = 10


# Input 
sequence_length = x_test.shape[1] # 56
input_shape = (sequence_length,)
input_layer = Input(shape=input_shape, name='input_layer') # (?, 56)


# Embedding 
embedded = Embedding(input_dim=len(vocabulary_inv), 
                            output_dim=embedding_dim,
                            input_length=sequence_length, 
                            name='embedding_layer')(input_layer)

# CNN, iterate filter_size
conv_blocks = []
for fz in filter_sizes:
    conv = Conv1D(filters=num_filters,
                  kernel_size=fz, # 3 means 3 words
                  padding='valid', # valid means no padding
                  strides=1, # see explnation above
                  activation='relu',
                  use_bias=True)(embedded) 
    conv = MaxPooling1D(pool_size=2)(conv)
    conv = Flatten()(conv)
    conv_blocks.append(conv)

concat1max = Concatenate()(conv_blocks) # (?, 510)
print(concat1max)
concat1max = Dropout(dropout_prob[1])(concat1max)
output_layer = Dense(hidden_dims, activation='relu', 
                  kernel_regularizer=regularizers.l2(0.01),
                  bias_regularizer=regularizers.l1(0.01))(concat1max)
output_layer = Dense(1, activation='sigmoid')(output_layer)

model = Model(inputs=input_layer, outputs=output_layer)
model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])

# Initialize weights with word2vec
weights = np.array([v for v in embedding_weights.values()])
print("Initializing embedding layer with word2vec weights, shape", weights.shape)
embedding_layer = model.get_layer("embedding_layer")
embedding_layer.set_weights([weights])

# Train the model
model.fit(x_train, y_train, batch_size=batch_size, epochs=num_epochs,
          validation_data=(x_test, y_test), verbose=2)

Tensor("concatenate_23/concat:0", shape=(?, ?), dtype=float32)
Initializing embedding layer with word2vec weights, shape (18765, 50)
Train on 9595 samples, validate on 1067 samples
Epoch 1/10
 - 6s - loss: 1.1724 - acc: 0.5052 - val_loss: 0.8976 - val_acc: 0.5220
Epoch 2/10
 - 5s - loss: 0.8191 - acc: 0.5177 - val_loss: 0.7665 - val_acc: 0.5098
Epoch 3/10
 - 5s - loss: 0.7398 - acc: 0.5317 - val_loss: 0.7245 - val_acc: 0.5426
Epoch 4/10
 - 6s - loss: 0.7110 - acc: 0.5379 - val_loss: 0.7047 - val_acc: 0.5633
Epoch 5/10
 - 5s - loss: 0.6969 - acc: 0.5556 - val_loss: 0.6979 - val_acc: 0.5754
Epoch 6/10
 - 5s - loss: 0.6910 - acc: 0.5659 - val_loss: 0.6849 - val_acc: 0.5886
Epoch 7/10
 - 5s - loss: 0.6730 - acc: 0.5985 - val_loss: 0.6689 - val_acc: 0.6186
Epoch 8/10
 - 5s - loss: 0.6515 - acc: 0.6246 - val_loss: 0.6285 - val_acc: 0.6954
Epoch 9/10
 - 5s - loss: 0.5951 - acc: 0.6847 - val_loss: 0.6014 - val_acc: 0.6992
Epoch 10/10
 - 5s - loss: 0.5307 - acc: 0.7535 - val_loss: 0.5790 - val_

<keras.callbacks.History at 0x13bf53ba8>

In [65]:
# write in one cell 
# version 6，no l2 reg
#=======================Build model=========================
# Model Hyperparameters
embedding_dim = 50
filter_sizes = (3, 8)
num_filters = 10
dropout_prob = (0.5, 0.8)
hidden_dims = 50

# Training parameters
batch_size = 64
num_epochs = 10

# Prepossessing parameters
sequence_length = 400
max_words = 5000

# Word2Vec parameters (see train_word2vec)
min_word_count = 1
context = 10


# Input 
sequence_length = x_test.shape[1] # 56
input_shape = (sequence_length,)
input_layer = Input(shape=input_shape, name='input_layer') # (?, 56)


# Embedding 
embedded = Embedding(input_dim=len(vocabulary_inv), 
                            output_dim=embedding_dim,
                            input_length=sequence_length, 
                            name='embedding_layer')(input_layer)

# CNN, iterate filter_size
conv_blocks = []
for fz in filter_sizes:
    conv = Conv1D(filters=num_filters,
                  kernel_size=fz, # 3 means 3 words
                  padding='valid', # valid means no padding
                  strides=1, # see explnation above
                  activation='relu',
                  use_bias=True)(embedded) 
    conv = MaxPooling1D(pool_size=2)(conv)
    conv = Flatten()(conv)
    conv_blocks.append(conv)

concat1max = Concatenate()(conv_blocks) # (?, 510)
print(concat1max)
concat1max = Dropout(dropout_prob[1])(concat1max)
output_layer = Dense(hidden_dims, activation='relu')(concat1max)
output_layer = Dense(1, activation='sigmoid')(output_layer)

model = Model(inputs=input_layer, outputs=output_layer)
model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])

# Initialize weights with word2vec
weights = np.array([v for v in embedding_weights.values()])
print("Initializing embedding layer with word2vec weights, shape", weights.shape)
embedding_layer = model.get_layer("embedding_layer")
embedding_layer.set_weights([weights])

# Train the model
model.fit(x_train, y_train, batch_size=batch_size, epochs=num_epochs,
          validation_data=(x_test, y_test), verbose=2)

Tensor("concatenate_24/concat:0", shape=(?, ?), dtype=float32)
Initializing embedding layer with word2vec weights, shape (18765, 50)
Train on 9595 samples, validate on 1067 samples
Epoch 1/10
 - 6s - loss: 0.7533 - acc: 0.5085 - val_loss: 0.6919 - val_acc: 0.5576
Epoch 2/10
 - 5s - loss: 0.6924 - acc: 0.5288 - val_loss: 0.6926 - val_acc: 0.5436
Epoch 3/10
 - 5s - loss: 0.6904 - acc: 0.5268 - val_loss: 0.6880 - val_acc: 0.5548
Epoch 4/10
 - 5s - loss: 0.6876 - acc: 0.5424 - val_loss: 0.6870 - val_acc: 0.5708
Epoch 5/10
 - 5s - loss: 0.6856 - acc: 0.5453 - val_loss: 0.6865 - val_acc: 0.5614
Epoch 6/10
 - 5s - loss: 0.6817 - acc: 0.5534 - val_loss: 0.6793 - val_acc: 0.5858
Epoch 7/10
 - 5s - loss: 0.6660 - acc: 0.5847 - val_loss: 0.6692 - val_acc: 0.6148
Epoch 8/10
 - 6s - loss: 0.6441 - acc: 0.6217 - val_loss: 0.6385 - val_acc: 0.6776
Epoch 9/10
 - 6s - loss: 0.6078 - acc: 0.6551 - val_loss: 0.6017 - val_acc: 0.6795
Epoch 10/10
 - 6s - loss: 0.5663 - acc: 0.6923 - val_loss: 0.5672 - val_

<keras.callbacks.History at 0x13ac5d588>

In [67]:
# write in one cell 
# version 7，add/delete dropout
#=======================Build model=========================
# Model Hyperparameters
embedding_dim = 50
filter_sizes = (3, 8)
num_filters = 10
dropout_prob = (0.5, 0.8)
hidden_dims = 50

# Training parameters
batch_size = 64
num_epochs = 10

# Prepossessing parameters
sequence_length = 400
max_words = 5000

# Word2Vec parameters (see train_word2vec)
min_word_count = 1
context = 10


# Input 
sequence_length = x_test.shape[1] # 56
input_shape = (sequence_length,)
input_layer = Input(shape=input_shape, name='input_layer') # (?, 56)


# Embedding 
embedded = Embedding(input_dim=len(vocabulary_inv), 
                            output_dim=embedding_dim,
                            input_length=sequence_length, 
                            name='embedding_layer')(input_layer)
# # Dropout
# embedded = Dropout(dropout_prob[0])(embedded)

# CNN, iterate filter_size
conv_blocks = []
for fz in filter_sizes:
    conv = Conv1D(filters=num_filters,
                  kernel_size=fz, # 3 means 3 words
                  padding='valid', # valid means no padding
                  strides=1, # see explnation above
                  activation='relu',
                  use_bias=True)(embedded) 
    conv = MaxPooling1D(pool_size=2)(conv)
    conv = Flatten()(conv)
    conv_blocks.append(conv)

concat1max = Concatenate()(conv_blocks) # (?, 510)
print(concat1max)
# concat1max = Dropout(dropout_prob[1])(concat1max)
output_layer = Dense(hidden_dims, activation='relu')(concat1max)
output_layer = Dense(1, activation='sigmoid')(output_layer)

model = Model(inputs=input_layer, outputs=output_layer)
model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])

# Initialize weights with word2vec
weights = np.array([v for v in embedding_weights.values()])
print("Initializing embedding layer with word2vec weights, shape", weights.shape)
embedding_layer = model.get_layer("embedding_layer")
embedding_layer.set_weights([weights])

# Train the model
model.fit(x_train, y_train, batch_size=batch_size, epochs=num_epochs,
          validation_data=(x_test, y_test), verbose=2)

Tensor("concatenate_26/concat:0", shape=(?, ?), dtype=float32)
Initializing embedding layer with word2vec weights, shape (18765, 50)
Train on 9595 samples, validate on 1067 samples
Epoch 1/10
 - 6s - loss: 0.6962 - acc: 0.5290 - val_loss: 0.6971 - val_acc: 0.5305
Epoch 2/10
 - 4s - loss: 0.6691 - acc: 0.5895 - val_loss: 0.6867 - val_acc: 0.5726
Epoch 3/10
 - 5s - loss: 0.6294 - acc: 0.6502 - val_loss: 0.6720 - val_acc: 0.5970
Epoch 4/10
 - 5s - loss: 0.4882 - acc: 0.7713 - val_loss: 0.6478 - val_acc: 0.6767
Epoch 5/10
 - 5s - loss: 0.2457 - acc: 0.9030 - val_loss: 0.5829 - val_acc: 0.7320
Epoch 6/10
 - 5s - loss: 0.1126 - acc: 0.9610 - val_loss: 0.6681 - val_acc: 0.7310
Epoch 7/10
 - 5s - loss: 0.0568 - acc: 0.9821 - val_loss: 0.7781 - val_acc: 0.7488
Epoch 8/10
 - 6s - loss: 0.0269 - acc: 0.9937 - val_loss: 0.8768 - val_acc: 0.7376
Epoch 9/10
 - 5s - loss: 0.0125 - acc: 0.9978 - val_loss: 0.9962 - val_acc: 0.7423
Epoch 10/10
 - 5s - loss: 0.0062 - acc: 0.9993 - val_loss: 1.1184 - val_

<keras.callbacks.History at 0x14028ce80>

In [68]:
# write in one cell 
# version 8，set l2 reg in the final sigmoid layer
#=======================Build model=========================
# Model Hyperparameters
embedding_dim = 50
filter_sizes = (3, 8)
num_filters = 10
dropout_prob = (0.5, 0.8)
hidden_dims = 50

# Training parameters
batch_size = 64
num_epochs = 10

# Prepossessing parameters
sequence_length = 400
max_words = 5000

# Word2Vec parameters (see train_word2vec)
min_word_count = 1
context = 10


# Input 
sequence_length = x_test.shape[1] # 56
input_shape = (sequence_length,)
input_layer = Input(shape=input_shape, name='input_layer') # (?, 56)


# Embedding 
embedded = Embedding(input_dim=len(vocabulary_inv), 
                            output_dim=embedding_dim,
                            input_length=sequence_length, 
                            name='embedding_layer')(input_layer)

# CNN, iterate filter_size
conv_blocks = []
for fz in filter_sizes:
    conv = Conv1D(filters=num_filters,
                  kernel_size=fz, # 3 means 3 words
                  padding='valid', # valid means no padding
                  strides=1, # see explnation above
                  activation='relu',
                  use_bias=True)(embedded) 
    conv = MaxPooling1D(pool_size=2)(conv)
    conv = Flatten()(conv)
    conv_blocks.append(conv)

concat1max = Concatenate()(conv_blocks) # (?, 510)
print(concat1max)
concat1max = Dropout(dropout_prob[1])(concat1max)
output_layer = Dense(hidden_dims, activation='relu')(concat1max)
output_layer = Dense(1, activation='sigmoid', 
                     kernel_regularizer=regularizers.l2(0.01),
                     bias_regularizer=regularizers.l1(0.01))(output_layer)

model = Model(inputs=input_layer, outputs=output_layer)
model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])

# Initialize weights with word2vec
weights = np.array([v for v in embedding_weights.values()])
print("Initializing embedding layer with word2vec weights, shape", weights.shape)
embedding_layer = model.get_layer("embedding_layer")
embedding_layer.set_weights([weights])

# Train the model
model.fit(x_train, y_train, batch_size=batch_size, epochs=num_epochs,
          validation_data=(x_test, y_test), verbose=2)

Tensor("concatenate_27/concat:0", shape=(?, ?), dtype=float32)
Initializing embedding layer with word2vec weights, shape (18765, 50)
Train on 9595 samples, validate on 1067 samples
Epoch 1/10
 - 6s - loss: 0.7376 - acc: 0.5087 - val_loss: 0.7040 - val_acc: 0.5258
Epoch 2/10
 - 5s - loss: 0.7024 - acc: 0.5201 - val_loss: 0.7023 - val_acc: 0.5080
Epoch 3/10
 - 5s - loss: 0.6989 - acc: 0.5263 - val_loss: 0.6971 - val_acc: 0.5380
Epoch 4/10
 - 5s - loss: 0.6946 - acc: 0.5423 - val_loss: 0.6945 - val_acc: 0.5576
Epoch 5/10
 - 5s - loss: 0.6901 - acc: 0.5432 - val_loss: 0.6899 - val_acc: 0.5567
Epoch 6/10
 - 6s - loss: 0.6858 - acc: 0.5585 - val_loss: 0.6872 - val_acc: 0.5783
Epoch 7/10
 - 5s - loss: 0.6792 - acc: 0.5772 - val_loss: 0.6820 - val_acc: 0.5839
Epoch 8/10
 - 5s - loss: 0.6668 - acc: 0.5991 - val_loss: 0.6723 - val_acc: 0.5923
Epoch 9/10
 - 5s - loss: 0.6419 - acc: 0.6312 - val_loss: 0.6452 - val_acc: 0.6354
Epoch 10/10
 - 5s - loss: 0.5720 - acc: 0.7004 - val_loss: 0.5748 - val_

<keras.callbacks.History at 0x13fbaeef0>

In [76]:
# write in one cell 
# version 9，filter_sizes = (3, 4, 5), num_filters = 100
#=======================Build model=========================
# Model Hyperparameters
embedding_dim = 50
filter_sizes = (3, 4, 5)
num_filters = 100
dropout_prob = (0.5, 0.8)
hidden_dims = 50

# Training parameters
batch_size = 64
num_epochs = 100

# Prepossessing parameters
sequence_length = 400
max_words = 5000

# Word2Vec parameters (see train_word2vec)
min_word_count = 1
context = 10


# Input 
sequence_length = x_test.shape[1] # 56
input_shape = (sequence_length,)
input_layer = Input(shape=input_shape, name='input_layer') # (?, 56)


# Embedding 
embedded = Embedding(input_dim=len(vocabulary_inv), 
                            output_dim=embedding_dim,
                            input_length=sequence_length, 
                            name='embedding_layer')(input_layer)

# CNN, iterate filter_size
conv_blocks = []
for fz in filter_sizes:
    conv = Conv1D(filters=num_filters,
                  kernel_size=fz, # 3 means 3 words
                  padding='valid', # valid means no padding
                  strides=1, # see explnation above
                  activation='relu',
                  use_bias=True)(embedded) 
    conv = MaxPooling1D(pool_size=2)(conv)
    conv = Flatten()(conv)
    conv_blocks.append(conv)

concat1max = Concatenate()(conv_blocks) # (?, 510)
print(concat1max)
concat1max = Dropout(dropout_prob[1])(concat1max)
output_layer = Dense(hidden_dims, activation='relu', 
                     kernel_regularizer=regularizers.l2(0.01),
                     bias_regularizer=regularizers.l1(0.01))(concat1max)
output_layer = Dense(1, activation='sigmoid')(output_layer)

model = Model(inputs=input_layer, outputs=output_layer)
model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])

# Initialize weights with word2vec
weights = np.array([v for v in embedding_weights.values()])
print("Initializing embedding layer with word2vec weights, shape", weights.shape)
embedding_layer = model.get_layer("embedding_layer")
embedding_layer.set_weights([weights])

# Train the model
earlystopper = EarlyStopping(monitor='val_loss', patience=5, verbose=1)

model.fit(x_train, y_train, batch_size=batch_size, epochs=num_epochs, callbacks=[earlystopper],
          validation_data=(x_test, y_test), verbose=2)

Tensor("concatenate_34/concat:0", shape=(?, ?), dtype=float32)
Initializing embedding layer with word2vec weights, shape (18765, 50)
Train on 9595 samples, validate on 1067 samples
Epoch 1/100
 - 20s - loss: 1.0627 - acc: 0.5007 - val_loss: 0.8224 - val_acc: 0.4911
Epoch 2/100
 - 17s - loss: 0.7809 - acc: 0.5019 - val_loss: 0.7574 - val_acc: 0.5108
Epoch 3/100
 - 16s - loss: 0.7579 - acc: 0.5349 - val_loss: 0.7626 - val_acc: 0.5351
Epoch 4/100
 - 20s - loss: 0.7507 - acc: 0.5472 - val_loss: 0.7591 - val_acc: 0.5633
Epoch 5/100
 - 17s - loss: 0.7356 - acc: 0.5851 - val_loss: 0.7399 - val_acc: 0.5951
Epoch 6/100
 - 17s - loss: 0.7168 - acc: 0.6222 - val_loss: 0.7294 - val_acc: 0.6317
Epoch 7/100
 - 17s - loss: 0.6553 - acc: 0.7149 - val_loss: 0.6599 - val_acc: 0.7142
Epoch 8/100
 - 19s - loss: 0.4602 - acc: 0.8400 - val_loss: 0.6213 - val_acc: 0.7413
Epoch 9/100
 - 16s - loss: 0.3230 - acc: 0.8996 - val_loss: 0.6494 - val_acc: 0.7432
Epoch 10/100
 - 17s - loss: 0.2418 - acc: 0.9302 - val

<keras.callbacks.History at 0x1154742e8>

In [79]:
# write in one cell 
# version 10，BatchNormalization
#=======================Build model=========================
from keras.layers import BatchNormalization, Activation

# Model Hyperparameters
embedding_dim = 50
filter_sizes = (3, 8)
num_filters = 10
dropout_prob = (0.5, 0.8)
hidden_dims = 50

# Training parameters
batch_size = 64
num_epochs = 100

# Prepossessing parameters
sequence_length = 400
max_words = 5000

# Word2Vec parameters (see train_word2vec)
min_word_count = 1
context = 10


# Input 
sequence_length = x_test.shape[1] # 56
input_shape = (sequence_length,)
input_layer = Input(shape=input_shape, name='input_layer') # (?, 56)


# Embedding 
embedded = Embedding(input_dim=len(vocabulary_inv), 
                            output_dim=embedding_dim,
                            input_length=sequence_length, 
                            name='embedding_layer')(input_layer)

# CNN, iterate filter_size
conv_blocks = []
for fz in filter_sizes:
    conv = Conv1D(filters=num_filters,
                  kernel_size=fz, # 3 means 3 words
                  padding='valid', # valid means no padding
                  strides=1, # see explnation above
                  use_bias=True)(embedded)
    conv = BatchNormalization()(conv)
    conv = Activation('relu')(conv)
    conv = MaxPooling1D(pool_size=2)(conv)
    conv = Flatten()(conv)
    conv_blocks.append(conv)

concat1max = Concatenate()(conv_blocks) # (?, 510)
print(concat1max)
concat1max = Dropout(dropout_prob[1])(concat1max)
output_layer = Dense(hidden_dims, activation='relu', 
                     kernel_regularizer=regularizers.l2(0.01),
                     bias_regularizer=regularizers.l1(0.01))(concat1max)
output_layer = Dense(1, activation='sigmoid')(output_layer)

model = Model(inputs=input_layer, outputs=output_layer)
model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])

# Initialize weights with word2vec
weights = np.array([v for v in embedding_weights.values()])
print("Initializing embedding layer with word2vec weights, shape", weights.shape)
embedding_layer = model.get_layer("embedding_layer")
embedding_layer.set_weights([weights])

# Train the model
earlystopper = EarlyStopping(monitor='val_loss', patience=5, verbose=1)

model.fit(x_train, y_train, batch_size=batch_size, epochs=num_epochs, callbacks=[earlystopper],
          validation_data=(x_test, y_test), verbose=2)

Tensor("concatenate_35/concat:0", shape=(?, ?), dtype=float32)
Initializing embedding layer with word2vec weights, shape (18765, 50)
Train on 9595 samples, validate on 1067 samples
Epoch 1/100
 - 9s - loss: 1.4122 - acc: 0.5034 - val_loss: 1.1174 - val_acc: 0.5098
Epoch 2/100
 - 7s - loss: 1.0165 - acc: 0.5086 - val_loss: 0.9147 - val_acc: 0.5248
Epoch 3/100
 - 7s - loss: 0.8606 - acc: 0.5229 - val_loss: 0.8109 - val_acc: 0.5342
Epoch 4/100
 - 7s - loss: 0.7822 - acc: 0.5355 - val_loss: 0.7583 - val_acc: 0.5314
Epoch 5/100
 - 7s - loss: 0.7412 - acc: 0.5472 - val_loss: 0.7238 - val_acc: 0.5820
Epoch 6/100
 - 7s - loss: 0.7181 - acc: 0.5646 - val_loss: 0.7121 - val_acc: 0.5736
Epoch 7/100
 - 8s - loss: 0.7061 - acc: 0.5696 - val_loss: 0.7109 - val_acc: 0.5361
Epoch 8/100
 - 8s - loss: 0.6915 - acc: 0.5919 - val_loss: 0.6864 - val_acc: 0.5989
Epoch 9/100
 - 7s - loss: 0.6786 - acc: 0.6097 - val_loss: 0.6799 - val_acc: 0.6373
Epoch 10/100
 - 7s - loss: 0.6344 - acc: 0.6628 - val_loss: 0.6

<keras.callbacks.History at 0x144d7dda0>