In [38]:
from keras.models import Sequential
from keras.layers import Embedding, Conv1D, Dense, Flatten
from keras.datasets import reuters
from gensim.models import word2vec
import numpy
(x_train, y_train), (x_test, y_test) = reuters.load_data(path="reuters.npz", 
                                                         num_words=None, 
                                                         skip_top=0, 
                                                         maxlen=None, 
                                                         test_split=0.2, 
                                                         seed=113, 
                                                         start_char=1, 
                                                         oov_char=2, 
                                                         index_from=3)

In [11]:
offset = 3
reuters_map = dict((index + offset, word) for (word, index) in reuters.get_word_index().items())
reuters_map[0] = 'PADDING'
reuters_map[1] = 'START'
reuters_map[2] = 'UNKNOWN'

In [12]:
' '.join([reuters_map[word_index] for word_index in x_train[0]])

'START mcgrath rentcorp said as a result of its december acquisition of space co it expects earnings per share in 1987 of 1 15 to 1 30 dlrs per share up from 70 cts in 1986 the company said pretax net should rise to nine to 10 mln dlrs from six mln dlrs in 1986 and rental operation revenues to 19 to 22 mln dlrs from 12 5 mln dlrs it said cash flow per share this year should be 2 50 to three dlrs reuter 3'

In [21]:
train_sentences = [['PADDING'] + [reuters_map[word_index] for word_index in review] for review in x_train]
test_sentences = [['PADDING'] + [reuters_map[word_index] for word_index in review] for review in x_test]
# test_sentences

In [16]:
reuters_wv_model = word2vec.Word2Vec(train_sentences + test_sentences + ['UNKNOWN'], min_count=1)

In [19]:
reuters_wordvec = reuters_wv_model.wv
reuters_wv_model.wv.get_vector('snake')

reuters_wv_model.wv.vectors # list of word vectors

array([ 0.00047466, -0.01209502,  0.00511746, -0.02761546,  0.00877501,
        0.00882131, -0.00302224, -0.01023681,  0.00069265, -0.00940508,
        0.00669806,  0.01070337,  0.00157484,  0.00743781,  0.00440874,
        0.04119754, -0.01997974,  0.00022846, -0.01385045, -0.01347616,
        0.01381012,  0.02165364,  0.05029431,  0.00792547, -0.00144881,
        0.03332239,  0.02038399, -0.01507462,  0.02599304,  0.01550889,
        0.01179513,  0.00205972, -0.01331445, -0.00938397,  0.02161695,
       -0.0232148 ,  0.02559957,  0.02826326,  0.00771353,  0.02987359,
       -0.01721841,  0.03012494,  0.01141527, -0.01122399, -0.00490403,
        0.01749288,  0.01441664, -0.00345776, -0.03578991,  0.00395014,
       -0.02626677,  0.00398037,  0.00886727, -0.01310621,  0.00173385,
       -0.01089446,  0.04150049, -0.01038958,  0.01093073, -0.00141662,
       -0.00629575, -0.01702757,  0.00508912, -0.00068082,  0.02595225,
       -0.01739266, -0.0164832 , -0.00300453, -0.00601622,  0.01

(8982,)

In [28]:
# shorten and pad
lengths = [len(review) for review in x_train.tolist() + x_test.tolist()]
print('Longest review: {} Shortest review: {}'.format(max(lengths), min(lengths)))

Longest review: 2376 Shortest review: 2


In [29]:
cutoff = 500
print('{} reviews out of {} are over {}.'.format(
    sum([1 for length in lengths if length > cutoff]), 
    len(lengths), 
    cutoff))

450 reviews out of 11228 are over 500.


In [32]:
from keras.preprocessing import sequence
x_train_padded = sequence.pad_sequences(x_train, maxlen=cutoff)
x_test_padded = sequence.pad_sequences(x_test, maxlen=cutoff)

In [35]:
model = Sequential()
embedding_layer = reuters_wordvec.get_keras_embedding(train_embeddings=False)
embedding_layer.input_length = cutoff

In [36]:
model.add(embedding_layer)

In [39]:
# verify that embedding layer works the same as regular wordvec
model.predict(numpy.array([[reuters_wordvec.vocab["W"].index]]))[0][0] == reuters_wordvec["W"]

array([ True,  True,  True,  True,  True,  True,  True,  True,  True,
        True,  True,  True,  True,  True,  True,  True,  True,  True,
        True,  True,  True,  True,  True,  True,  True,  True,  True,
        True,  True,  True,  True,  True,  True,  True,  True,  True,
        True,  True,  True,  True,  True,  True,  True,  True,  True,
        True,  True,  True,  True,  True,  True,  True,  True,  True,
        True,  True,  True,  True,  True,  True,  True,  True,  True,
        True,  True,  True,  True,  True,  True,  True,  True,  True,
        True,  True,  True,  True,  True,  True,  True,  True,  True,
        True,  True,  True,  True,  True,  True,  True,  True,  True,
        True,  True,  True,  True,  True,  True,  True,  True,  True,
        True])