# Understanding Embeddings on Texts

In [2]:
# Based on
# https://github.com/fchollet/deep-learning-with-python-notebooks/blob/master/6.2-understanding-recurrent-neural-networks.ipynb

In [3]:
# Suppress every Python warning for the rest of the session, presumably to
# keep the cell outputs uncluttered.
# NOTE(review): this also hides deprecation warnings raised by the libraries used below.
import warnings
warnings.filterwarnings('ignore')

In [4]:
# Render matplotlib figures inline in the notebook.
# %pylab additionally pulls the numpy and matplotlib names into the
# interactive namespace (see the message printed by this cell).
# NOTE(review): none of the cells visible here use those names; explicit
# imports would make it clearer where a name comes from.
%matplotlib inline
%pylab inline

Populating the interactive namespace from numpy and matplotlib


In [5]:
import tensorflow as tf
# only show TensorFlow log messages of severity ERROR
tf.logging.set_verbosity(tf.logging.ERROR)
# record the TensorFlow version the outputs of this notebook were produced with
print(tf.__version__)

1.8.0


In [6]:
# https://keras.io/datasets/#imdb-movie-reviews-sentiment-classification
max_features = 1000  # vocabulary size: only the max_features most frequent words are kept
maxlen = 20  # number of words kept per review when the sequences are padded below (the last maxlen words of a longer review)

# each review is encoded as a sequence of word indexes
# indexed by overall frequency in the dataset
# output is 0 (negative) or 1 (positive)
# load_data returns two (inputs, labels) pairs: the training split and the test split
imdb = tf.keras.datasets.imdb.load_data(num_words=max_features)
(raw_input_train, y_train), (raw_input_test, y_test) = imdb

In [7]:
# tf.keras.datasets.imdb.load_data?

In [8]:
# smallest label value: 0 = negative review
y_train.min()

0

In [9]:
# largest label value: 1 = positive review
y_train.max()

1

In [10]:
# the training split holds 25000 reviews
len(raw_input_train)

25000

In [11]:
# the first review is 218 word indexes long
len(raw_input_train[0])

218

In [12]:
# Word indexes of the first training review; num_words=max_features keeps
# them below max_features. With the Keras load_data defaults, 1 marks the
# start of a review and 2 replaces every word outside the vocabulary.
# NOTE(review): this prints all 218 entries; a slice would keep the output short.
raw_input_train[0]

[1,
 14,
 22,
 16,
 43,
 530,
 973,
 2,
 2,
 65,
 458,
 2,
 66,
 2,
 4,
 173,
 36,
 256,
 5,
 25,
 100,
 43,
 838,
 112,
 50,
 670,
 2,
 9,
 35,
 480,
 284,
 5,
 150,
 4,
 172,
 112,
 167,
 2,
 336,
 385,
 39,
 4,
 172,
 2,
 2,
 17,
 546,
 38,
 13,
 447,
 4,
 192,
 50,
 16,
 6,
 147,
 2,
 19,
 14,
 22,
 4,
 2,
 2,
 469,
 4,
 22,
 71,
 87,
 12,
 16,
 43,
 530,
 38,
 76,
 15,
 13,
 2,
 4,
 22,
 17,
 515,
 17,
 12,
 16,
 626,
 18,
 2,
 5,
 62,
 386,
 12,
 8,
 316,
 8,
 106,
 5,
 4,
 2,
 2,
 16,
 480,
 66,
 2,
 33,
 4,
 130,
 12,
 16,
 38,
 619,
 5,
 25,
 124,
 51,
 36,
 135,
 48,
 25,
 2,
 33,
 6,
 22,
 12,
 215,
 28,
 77,
 52,
 5,
 14,
 407,
 16,
 82,
 2,
 8,
 4,
 107,
 117,
 2,
 15,
 256,
 4,
 2,
 7,
 2,
 5,
 723,
 36,
 71,
 43,
 530,
 476,
 26,
 400,
 317,
 46,
 7,
 4,
 2,
 2,
 13,
 104,
 88,
 4,
 381,
 15,
 297,
 98,
 32,
 2,
 56,
 26,
 141,
 6,
 194,
 2,
 18,
 4,
 226,
 22,
 21,
 134,
 476,
 26,
 480,
 5,
 144,
 30,
 2,
 18,
 51,
 36,
 28,
 224,
 92,
 25,
 104,
 4,
 226,
 65,
 16,
 3

In [13]:
# tf.keras.preprocessing.sequence.pad_sequences?

In [14]:
# https://www.tensorflow.org/api_docs/python/tf/keras/preprocessing/sequence/pad_sequences

# Bring every review to exactly maxlen word indexes. padding/truncating are
# the library defaults, written out for clarity: a longer review keeps only
# its last maxlen words, a shorter one gets zeros prepended.
input_train = tf.keras.preprocessing.sequence.pad_sequences(
    raw_input_train,
    maxlen=maxlen,
    padding='pre',
    truncating='pre',
)
input_test = tf.keras.preprocessing.sequence.pad_sequences(
    raw_input_test,
    maxlen=maxlen,
    padding='pre',
    truncating='pre',
)

In [15]:
# both splits: 25000 reviews of maxlen (20) word indexes each, one label per review
input_train.shape, input_test.shape, y_train.shape, y_test.shape

((25000, 20), (25000, 20), (25000,), (25000,))

In [16]:
# The first review has 218 words, so nothing is padded here: only its last
# maxlen (20) word indexes are kept.
# Reviews shorter than maxlen are left-padded with zeros; 0 is the padding
# value, not a word. With the Keras load_data defaults, words outside the
# vocabulary are encoded as 2.
input_train[0]

array([ 65,  16,  38,   2,  88,  12,  16, 283,   5,  16,   2, 113, 103,
        32,  15,  16,   2,  19, 178,  32])

In [17]:
# tf.keras.layers.Embedding?

In [18]:
embedding_dim = 3

model = tf.keras.Sequential([
    # lookup table with max_features * embedding_dim trainable weights;
    # each word index is replaced by its embedding_dim-sized vector
    tf.keras.layers.Embedding(
        name='embedding',
        input_dim=max_features,
        output_dim=embedding_dim,
        input_length=maxlen,
    ),
    # (maxlen, embedding_dim) per review -> one vector of maxlen * embedding_dim (60) values
    tf.keras.layers.Flatten(name='flatten'),
    # binary classifier on top of the flattened embeddings
    tf.keras.layers.Dense(name='fc', units=32, activation='relu'),
    tf.keras.layers.Dense(name='classifier', units=1, activation='sigmoid'),
])

model.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])

model.summary()

_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding (Embedding)        (None, 20, 3)             3000      
_________________________________________________________________
flatten (Flatten)            (None, 60)                0         
_________________________________________________________________
fc (Dense)                   (None, 32)                1952      
_________________________________________________________________
classifier (Dense)           (None, 1)                 33        
Total params: 4,985
Trainable params: 4,985
Non-trainable params: 0
_________________________________________________________________


In [20]:
batch_size = 128

# validation_split=0.2 holds back 5000 of the 25000 training reviews for
# validation; %time reports the wall time of the whole fit.
# NOTE(review): no random seed is set in the visible cells, so the metrics
# below may differ from run to run.
%time history = model.fit(input_train, y_train, epochs=10, batch_size=batch_size, validation_split=0.2)

Train on 20000 samples, validate on 5000 samples
Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10
Wall time: 8.93 s


In [21]:
# evaluate returns the loss followed by the metric requested in compile (accuracy)
# NOTE: input_train still contains the reviews held back for validation during fit
train_loss, train_accuracy = model.evaluate(input_train, y_train, batch_size=batch_size)
train_accuracy



0.765519999961853

In [22]:
# loss and accuracy on the test split, which was not passed to fit
test_loss, test_accuracy = model.evaluate(input_test, y_test, batch_size=batch_size)
test_accuracy



0.731880000038147

In [23]:
# prediction: sigmoid output for the first five test reviews
# (values close to 1 mean positive, values close to 0 mean negative)
model.predict(input_test[0:5])

array([[0.6665851 ],
       [0.67587227],
       [0.15638763],
       [0.4938291 ],
       [0.98998266]], dtype=float32)

In [24]:
# ground truth for the same five test reviews (0 = negative, 1 = positive)
y_test[0:5]

array([0, 1, 1, 0, 1], dtype=int64)

### What does the output of the embedding look like?

In [25]:
# look up the embedding layer by the name it was given when the model was built
embedding_layer = model.get_layer('embedding')

In [26]:
# truncated model: takes the same input as the classifier but stops at the
# embedding layer and returns that layer's output
model_stub = tf.keras.Model(
    inputs=model.input,
    outputs=embedding_layer.output,
)

In [27]:
# embedding-layer output for the first five test reviews
embedding_prediction = model_stub.predict(input_test[0:5])

In [28]:
# 5 sample reviews, maxlen (20) words per review, embedding_dim (3) dimensions per word
embedding_prediction.shape

(5, 20, 3)

In [29]:
# the embedding_dim (3) embedding values of the first word of the first sample review
embedding_prediction[0][0]

array([0.18585119, 0.7063439 , 0.39324802], dtype=float32)