In [1]:
import sys, os
sys.path.append(os.pardir)
import numpy as np
import data_helpers
from word2vec import train_word2vec

In [2]:
# preprocess

# Paths of the positive and negative polarity corpora
positive_data_file = "../data/rt-polaritydata/rt-polarity.pos"
negtive_data_file = "../data/rt-polaritydata/rt-polarity.neg"

# Read raw sentences together with their label rows
print("Loading data...")
x_text, y = data_helpers.load_data_and_labels(positive_data_file, negtive_data_file)

# Pad the sentences; the length of the first padded sentence is reported
print("Padding sentences...")
x_text = data_helpers.pad_sentences(x_text)
print("The sequence length is: ", len(x_text[0]))

# Vocabulary lookups built from the padded sentences
vocabulary, vocabulary_inv = data_helpers.build_vocab(x_text)

# Encode every sentence as a row of word indices and reduce each label row
# to the position of its largest entry
x = data_helpers.build_index_sentence(x_text, vocabulary)
y = np.argmax(y, axis=1)  # y: [1, 1, 1, ...., 0, 0, 0]. 1 for positive, 0 for negative

# Shuffle samples and labels with one shared, seeded permutation
np.random.seed(42)
num_samples = len(y)
shuffle_indices = np.random.permutation(np.arange(num_samples))
x_shuffled, y_shuffled = x[shuffle_indices], y[shuffle_indices]

# The first 90% of the shuffled data is for training, the remainder for testing
training_rate = 0.9
train_len = int(num_samples * training_rate)
x_train, x_test = x_shuffled[:train_len], x_shuffled[train_len:]
y_train, y_test = y_shuffled[:train_len], y_shuffled[train_len:]

# Report the resulting shapes and the vocabulary size
print('x_train shape: ', x_train.shape)
print('x_test shape:', x_test.shape)
print('Vocabulary Size: {:d}'.format(len(vocabulary_inv)))


Loading data...
Padding sentences...
The sequence length is:  56
x_train shape:  (9595, 56)
x_test shape: (1067, 56)
Vocabulary Size: 18765


In [3]:
# Word2Vec parameters (see train_word2vec)
embedding_dim = 300
min_word_count = 1
context = 10

# Prepare embedding layer weights for the non-static model.
# The train and test index matrices are stacked row-wise, so word2vec is
# trained on every sentence in the data set.
all_sentences = np.concatenate((x_train, x_test), axis=0)
embedding_weights = train_word2vec(
    all_sentences,
    vocabulary_inv,
    num_features=embedding_dim,
    min_word_count=min_word_count,
    context=context,
)

rock_index = 565  # 565 is the index of the word "rock"
print(embedding_weights[rock_index])

Saving Word2Vec model '300feature_1minwords_10context'
[ 0.09885646 -0.09516437 -0.05571043 -0.14348853  0.00774352  0.06113464
 -0.07387891 -0.14808127 -0.10660333  0.08960743  0.09765676  0.10535081
  0.06238165 -0.16934377  0.00325949  0.17728482  0.03665325 -0.17558283
 -0.14631864  0.12002646 -0.26050505 -0.2420865   0.07576365  0.2746722
  0.09456128 -0.0821784  -0.07241417 -0.14664602 -0.06794547 -0.22714663
  0.17995358 -0.02002932 -0.14626063  0.07084303  0.09672233  0.16242114
 -0.15679023 -0.17761318  0.16524783  0.16408229 -0.1398159   0.08598263
  0.1166006  -0.00138887  0.01759337  0.08898795 -0.00109552  0.00884543
 -0.10420994 -0.01112607 -0.05976019 -0.16435646  0.05247753 -0.14771546
 -0.08459281  0.13468072  0.13662091  0.00973886 -0.02802863  0.04633231
  0.06382972  0.08765724 -0.05602548  0.04169456  0.10608517  0.12137013
  0.09476804  0.07249533 -0.10288695  0.0228219   0.01481075  0.1044543
 -0.09729025  0.09143579 -0.06881186 -0.07392979  0.04186718  0.1086405

In [6]:
from keras.models import Model
from keras.layers import Dense, Dropout, Flatten, Input, MaxPooling1D, GlobalMaxPooling1D, Conv1D, Embedding
from keras.layers.merge import Concatenate
from keras import regularizers
import numpy as np

# Seed NumPy's global random number generator before the model is defined.
# NOTE(review): this seeds NumPy only; whether it also makes Keras weight
# initialization reproducible depends on the backend in use — confirm.
np.random.seed(0)

def create_base_model(vocab_size, embedding_dim, filter_sizes, num_filters, dropout_prob, hidden_dims, sequence_length):
    """Build and compile a CNN for binary sentence classification.

    Architecture: input of word indices -> embedding -> one Conv1D +
    global max pooling branch per filter size -> concatenation -> dropout ->
    regularized dense layer -> single sigmoid unit.

    Args:
        vocab_size: Number of rows of the embedding matrix.
        embedding_dim: Size of each embedding vector.
        filter_sizes: Iterable of convolution kernel sizes; one branch is
            built per entry. A single entry is supported.
        num_filters: Number of filters in every convolution branch.
        dropout_prob: Indexable of dropout rates. Only ``dropout_prob[1]`` is
            applied (after the pooled features are merged).
        hidden_dims: Number of units of the hidden dense layer.
        sequence_length: Number of word indices per input sentence.

    Returns:
        A Keras ``Model`` compiled with binary cross-entropy loss, the Adam
        optimizer and the accuracy metric.

    Raises:
        ValueError: If ``filter_sizes`` is empty.
    """
    # Input
    input_shape = (sequence_length,)
    input_layer = Input(shape=input_shape, name='input_layer')  # (?, sequence_length)

    # Embedding. The layer is named so that callers can fetch it with
    # model.get_layer("embedding_layer") and overwrite its weights.
    embedded = Embedding(input_dim=vocab_size,
                         output_dim=embedding_dim,
                         input_length=sequence_length,
                         name='embedding_layer')(input_layer)  # (?, sequence_length, embedding_dim)

    # NOTE(review): dropout_prob[0] is never used; no dropout is applied to the
    # embedding output. Left unchanged to keep the trained model's behavior.

    # CNN, one branch per filter size
    conv_blocks = []
    for fz in filter_sizes:
        conv = Conv1D(filters=num_filters,
                      kernel_size=fz,
                      padding='valid',  # valid means no padding
                      strides=1,
                      activation='relu',
                      use_bias=True)(embedded)
        conv = GlobalMaxPooling1D()(conv)  # 1-max pooling -> (?, num_filters)
        conv_blocks.append(conv)

    if not conv_blocks:
        raise ValueError('filter_sizes must contain at least one kernel size')

    # Concatenate may reject a list with fewer than two tensors, so a single
    # branch is passed through unchanged.
    if len(conv_blocks) > 1:
        concat1max = Concatenate()(conv_blocks)  # (?, num_filters * len(filter_sizes))
    else:
        concat1max = conv_blocks[0]
    concat1max = Dropout(dropout_prob[1])(concat1max)
    output_layer = Dense(hidden_dims, activation='relu',
                         kernel_regularizer=regularizers.l2(0.01),
                         bias_regularizer=regularizers.l1(0.01))(concat1max)  # (?, hidden_dims)
    output_layer = Dense(1, activation='sigmoid')(output_layer)  # (?, 1)

    model = Model(inputs=input_layer, outputs=output_layer)
    model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])
    return model

In [8]:
# Model Hyperparameters
embedding_dim = 300
filter_sizes = (3, 4, 5)
num_filters = 100
dropout_prob = (0.5, 0.8)
hidden_dims = 50
vocab_size = len(vocabulary_inv)
batch_size = 64
num_epochs = 10

# Create model
sequence_length = x_test.shape[1]  # 56
model = create_base_model(vocab_size, embedding_dim, filter_sizes, num_filters, dropout_prob, hidden_dims, sequence_length)

# Initialize weights with word2vec.
# Row i of the embedding matrix must be the vector of word index i, so the
# matrix is built in ascending key order instead of relying on the iteration
# order of the mapping.
weights = np.array([embedding_weights[word_index] for word_index in sorted(embedding_weights)])
assert weights.shape == (vocab_size, embedding_dim), \
    "word2vec weights do not match the embedding layer: %s" % (weights.shape,)
print("Initializing embedding layer with word2vec weights, shape", weights.shape)
embedding_layer = model.get_layer("embedding_layer")
embedding_layer.set_weights([weights])

# Train model. No early-stopping callback is configured, so all num_epochs
# epochs are run; 10% of the training data is held out for validation.
model.fit(x_train, y_train, batch_size=batch_size, epochs=num_epochs,
          validation_split=0.1, verbose=2)


Initializing embedding layer with word2vec weights, shape (18765, 300)
Train on 8635 samples, validate on 960 samples
Epoch 1/10
 - 70s - loss: 1.2636 - acc: 0.5187 - val_loss: 0.9801 - val_acc: 0.5479
Epoch 2/10
 - 58s - loss: 0.9001 - acc: 0.5180 - val_loss: 0.8283 - val_acc: 0.5052
Epoch 3/10
 - 62s - loss: 0.7938 - acc: 0.5353 - val_loss: 0.7629 - val_acc: 0.5510
Epoch 4/10
 - 68s - loss: 0.7444 - acc: 0.5421 - val_loss: 0.7292 - val_acc: 0.5615
Epoch 5/10
 - 59s - loss: 0.7163 - acc: 0.5600 - val_loss: 0.7092 - val_acc: 0.5792
Epoch 6/10
 - 65s - loss: 0.6806 - acc: 0.5942 - val_loss: 0.6820 - val_acc: 0.5969
Epoch 7/10
 - 65s - loss: 0.5809 - acc: 0.7060 - val_loss: 0.5920 - val_acc: 0.7083
Epoch 8/10
 - 58s - loss: 0.4106 - acc: 0.8225 - val_loss: 0.6186 - val_acc: 0.7156
Epoch 9/10
 - 69s - loss: 0.3067 - acc: 0.8777 - val_loss: 0.6390 - val_acc: 0.7406
Epoch 10/10
 - 62s - loss: 0.2217 - acc: 0.9170 - val_loss: 0.7577 - val_acc: 0.7448


<keras.callbacks.History at 0x12728e978>

In [12]:
# Evaluate the trained model on the held-out test split.
# The model was compiled with one metric, so the result is [loss, accuracy].
score = model.evaluate(x=x_test, y=y_test)
print(score)

[0.8646195792883719, 0.7019681348461019]


In [32]:
#================Save and Load=================
from os.path import join, exists, split
from keras.models import model_from_json

# Save model
model_dir = 'models'
model_name = 'base_non_static_cnn.json'
model_name = join(model_dir, model_name)
model_weights = 'base_non_static_cnn.h5'
model_weights = join(model_dir, model_weights)

# Write the architecture (JSON) and the weights (HDF5) unless BOTH files are
# already on disk.
# NOTE(review): existing files are never overwritten, so a retrained `model`
# is not re-saved unless the old files are deleted first.
if not (exists(model_name) and exists(model_weights)):
    # makedirs(exist_ok=True) does not fail when the directory already exists
    os.makedirs(model_dir, exist_ok=True)

    print('Saving non static cnn model and its in \'%s\'' % split(model_name)[0])
    # Serialize model to JSON
    model_json = model.to_json()
    with open(model_name, 'w') as json_file:
        json_file.write(model_json)
    # Serialize weights to HDF5
    model.save_weights(model_weights)

# Always rebuild the model from disk, so `loaded_model` is defined both on a
# first run (files just written) and on later runs (files already present).
with open(model_name, 'r') as json_file:
    loaded_model_json = json_file.read()
loaded_model = model_from_json(loaded_model_json)
# Load weights into new model
loaded_model.load_weights(model_weights)
print('Loaded existing model from \'%s\'' % model_name)

# Evaluate
loaded_model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])
score = loaded_model.evaluate(x_test, y_test) # Must compile before evaluate
print(score)

Loaded existing model from 'models/base_non_static_cnn.json'
[0.8646195792883719, 0.7019681348461019]


After loading the model, we still have to compile it if we want to use the `evaluate` function. Here we just use the `predict` function and evaluate the result with sklearn metrics.

In [21]:
# Raw outputs of the in-memory model for the test sentences
prediction = model.predict(x=x_test)
prediction

array([[0.25923103],
       [0.15802522],
       [0.44756457],
       ...,
       [0.9516991 ],
       [0.9999974 ],
       [0.01045262]], dtype=float32)

In [26]:
# Collapse the prediction array into a 1-D copy (row-major order)
prediction = prediction.flatten(order='C')
prediction

array([0.25923103, 0.15802522, 0.44756457, ..., 0.9516991 , 0.9999974 ,
       0.01045262], dtype=float32)

In [27]:
# Turn the outputs into hard labels: 1 where the value exceeds 0.5, else 0
is_positive = prediction > 0.5
prediction = is_positive.astype(int)
prediction

array([0, 0, 0, ..., 1, 1, 0])

In [20]:
# Display the test labels for a visual comparison with `prediction`
y_test

array([1, 1, 1, ..., 0, 1, 0])

In [28]:
from sklearn.metrics import accuracy_score

# Prediction with the in-memory model: flatten the outputs, threshold them
# at 0.5 and score the resulting labels with sklearn
probabilities = model.predict(x_test).flatten()
prediction = np.where(probabilities > 0.5, 1, 0)
score = accuracy_score(y_test, prediction)
print(score)

0.7019681349578257


In [29]:
from sklearn.metrics import accuracy_score

# Prediction with the model restored from disk; `predict` is followed by the
# same flatten / 0.5-threshold steps and sklearn's accuracy_score
loaded_outputs = loaded_model.predict(x_test)
prediction = np.where(loaded_outputs.flatten() > 0.5, 1, 0)
score = accuracy_score(y_true=y_test, y_pred=prediction)
print(score)

0.7019681349578257
