In [1]:
import numpy as np
from tensorflow.keras.preprocessing.text import one_hot
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense
from tensorflow.keras.layers import Flatten
from tensorflow.keras.layers import Embedding

We have a small set of reviews and we will try to predict whether each review is positive or negative. We will encode every word as an integer index, learn a dense embedding vector for each index with a Keras Embedding layer, and feed the flattened embeddings to a single sigmoid unit (effectively logistic regression on the learned embeddings) to predict the sentiment of the review.

In [20]:
# Toy corpus: the first five reviews are positive, the last five are negative.
reviews = [
    'nice food',
    'amazing restaurant',
    'too good',
    'just loved it!',
    'will go again',
    'horrible food',
    'never go there',
    'poor service',
    'poor quality',
    'needs improvement',
]

# Labels aligned position-by-position with `reviews`: 1 = positive, 0 = negative.
sentiment = np.array([1] * 5 + [0] * 5)

In [34]:
# Collect the distinct whitespace-separated tokens across all reviews and
# report how many there are.
unique_words = {token for text in reviews for token in text.split()}
print(len(unique_words))

20


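Specify a vocabulary size comfortably larger than the number of unique words so that collisions are minimized. We will use a vocabulary size of 50 since we only have 20 unique words in our vocabulary. Note that collisions can still occur: in the run shown below, 'horrible' and 'food' are both encoded as 36.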

In [21]:
# Number of distinct integer indices available to the encoder.
vocab_size = 50

# Turn each review into a list of integer word indices, one per word.
encoded_reviews = [
    one_hot(review_text, vocab_size)
    for review_text in reviews
]
print(encoded_reviews)

[[47, 36], [29, 18], [45, 4], [37, 1, 31], [25, 12, 12], [36, 36], [15, 12, 45], [10, 49], [10, 38], [23, 34]]


The reviews have different lengths, so we need padding to make all of them the same length. We will use the Keras pad_sequences() function to do this.

In [22]:
# Pad every encoded review to the length of the longest one so that the batch
# forms a rectangular array. The length is derived from the data instead of
# being hard-coded, so adding a longer review cannot be silently truncated.
max_length = max(len(encoded_review) for encoded_review in encoded_reviews)

# padding='post' places the zeros after the word indices.
padded_reviews = pad_sequences(encoded_reviews, maxlen=max_length, padding='post')
padded_reviews

array([[47, 36,  0],
       [29, 18,  0],
       [45,  4,  0],
       [37,  1, 31],
       [25, 12, 12],
       [36, 36,  0],
       [15, 12, 45],
       [10, 49,  0],
       [10, 38,  0],
       [23, 34,  0]])

In [23]:
# Size of the trainable vector learned for each word index.
embedded_vector_length = 5

# Embedding -> Flatten -> one sigmoid unit: the per-word vectors of a review
# are concatenated and mapped to a single value between 0 and 1.
model = Sequential([
    Embedding(vocab_size, embedded_vector_length, input_length=max_length, name='embedding'),
    Flatten(),
    Dense(1, activation='sigmoid'),
])

In [24]:
# Conventional names for the model inputs (padded index sequences) and
# targets (0/1 sentiment labels).
X, y = padded_reviews, sentiment

In [25]:
# Binary cross-entropy matches the single sigmoid output; accuracy is tracked
# under the short metric name 'acc'.
model.compile(
    optimizer='adam',
    loss='binary_crossentropy',
    metrics=['acc'],
)
model.summary()

Model: "sequential_1"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 embedding (Embedding)       (None, 3, 5)              250       
                                                                 
 flatten_1 (Flatten)         (None, 15)                0         
                                                                 
 dense_1 (Dense)             (None, 1)                 16        
                                                                 
Total params: 266
Trainable params: 266
Non-trainable params: 0
_________________________________________________________________


In [29]:
# Train for 50 passes over the ten reviews without printing per-epoch logs.
model.fit(x=X, y=y, epochs=50, verbose=0)

<keras.callbacks.History at 0x23b34c39cd0>

In [30]:
# Score the model on the same ten reviews it was fitted on; there is no
# held-out set here, so this is training accuracy.
loss, accuracy = model.evaluate(x=X, y=y, verbose=0)
accuracy_percent = accuracy * 100
print('Accuracy: %f' % accuracy_percent)

Accuracy: 100.000000


We can get the embedding vector for each word in our vocabulary by reading the weights of the Keras Embedding layer.

In [28]:
# The embedding layer's first (and here only used) weight array holds one row
# per word index and one column per embedding dimension.
embedding_layer = model.get_layer('embedding')
embeddings = embedding_layer.get_weights()[0]
embeddings.shape

(50, 5)

In [31]:
# Embedding vector of the word 'nice'. The row index is looked up through the
# same one_hot() encoding used for the reviews rather than hard-coded, because
# the hashed index (47 in the run shown above) may differ between kernel sessions.
embeddings[one_hot('nice', vocab_size)[0]]

array([-0.0260548 ,  0.04886741,  0.09089474,  0.09366418, -0.04695602],
      dtype=float32)

In [32]:
# Embedding vector of the word 'amazing'. The row index is looked up through
# the same one_hot() encoding used for the reviews rather than hard-coded,
# because the hashed index (29 in the run shown above) may differ between
# kernel sessions.
embeddings[one_hot('amazing', vocab_size)[0]]

array([-0.00847477,  0.03921034,  0.01776346,  0.05123146, -0.01990897],
      dtype=float32)