In [12]:
import numpy as np
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense
from tensorflow.keras.layers import Flatten
from tensorflow.keras.layers import Embedding

In [13]:
# Toy corpus: five positive reviews followed by five negative ones.
positive_reviews = [
    'nice food',
    'amazing restaurant',
    'too good',
    'just loved it!',
    'will go again',
]
negative_reviews = [
    'horrible food',
    'never go there',
    'poor service',
    'poor quality',
    'needs improvement',
]
reviews = positive_reviews + negative_reviews

# Labels aligned with `reviews`: 1 = positive, 0 = negative.
sentiment = np.array([1] * len(positive_reviews) + [0] * len(negative_reviews))

In [14]:
# Count the distinct whitespace-separated tokens across all reviews.
unique_words = {token for text in reviews for token in text.split()}
print(len(unique_words))

20


In [15]:
# Fit a Keras tokenizer on the corpus to build the word -> integer index.
t = Tokenizer()
t.fit_on_texts(reviews)

# One slot more than the number of indexed words: the encoded output below
# uses indices starting at 1, leaving 0 free (it is used later for padding).
vocab_size = 1 + len(t.word_index)
print(vocab_size)

# Convert each review into its list of word indices.
encoded_reviews = t.texts_to_sequences(reviews)
print(encoded_reviews)

21
[[4, 1], [5, 6], [7, 8], [9, 10, 11], [12, 2, 13], [14, 1], [15, 2, 16], [3, 17], [3, 18], [19, 20]]


In [16]:
# Pad every review to the length of the longest encoded review.
# Deriving the length from the data (instead of hard-coding 3) keeps longer
# reviews from being silently truncated if the corpus changes.
max_length = max(len(sequence) for sequence in encoded_reviews)

# padding='post' appends zeros after the tokens rather than before them.
padded_reviews = pad_sequences(encoded_reviews, maxlen=max_length, padding='post')
print(padded_reviews)

[[ 4  1  0]
 [ 5  6  0]
 [ 7  8  0]
 [ 9 10 11]
 [12  2 13]
 [14  1  0]
 [15  2 16]
 [ 3 17  0]
 [ 3 18  0]
 [19 20  0]]


In [18]:
# Load the pretrained GloVe vectors into memory as {word: float32 vector}.
# Each line of the file is a token followed by its coefficients.
embeddings_index = {}

# The context manager guarantees the file is closed even if a line fails
# to parse part-way through the load.
with open('glove.6B/glove.6B.100d.txt', encoding='utf8') as f:
    for line in f:
        values = line.split()
        if not values:
            # Skip blank lines instead of failing on values[0].
            continue
        word = values[0]
        coefficients = np.asarray(values[1:], dtype='float32')
        embeddings_index[word] = coefficients

print('Loaded %s word vectors.' % len(embeddings_index))

Loaded 400001 word vectors.


In [21]:
# Build the (vocab_size, 100) weight matrix: row i holds the GloVe vector of
# the word the tokenizer mapped to index i. Rows for words missing from the
# GloVe index (and row 0) stay all-zero.
embedding_matrix = np.zeros((vocab_size, 100))
for token, row in t.word_index.items():
    vector = embeddings_index.get(token)
    if vector is None:
        continue
    embedding_matrix[row] = vector

In [22]:
# Model: frozen pretrained embedding -> flatten -> single sigmoid unit.
e = Embedding(
    input_dim=vocab_size,
    output_dim=100,
    weights=[embedding_matrix],
    input_length=max_length,
    # trainable=False keeps the embedding weights from being updated
    # during training.
    trainable=False,
)
model = Sequential([
    e,
    Flatten(),
    Dense(1, activation='sigmoid'),
])

In [23]:
# Compile for binary classification, tracking accuracy under the name 'acc'.
model.compile(optimizer='adam', loss='binary_crossentropy', metrics=['acc'])

# summary() prints the table itself and returns None, so wrapping it in
# print() emitted a stray "None" line after the table.
model.summary()

Model: "sequential"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 embedding (Embedding)       (None, 3, 100)            2100      
                                                                 
 flatten (Flatten)           (None, 300)               0         
                                                                 
 dense (Dense)               (None, 1)                 301       
                                                                 
Total params: 2,401
Trainable params: 301
Non-trainable params: 2,100
_________________________________________________________________
None


In [25]:
# Train silently for 50 epochs on the padded reviews.
model.fit(x=padded_reviews, y=sentiment, epochs=50, verbose=0)

# Score on the same ten reviews used for training, so this number reflects
# how well the model fits the training set, not how well it generalizes.
evaluation = model.evaluate(x=padded_reviews, y=sentiment, verbose=0)
loss, accuracy = evaluation
accuracy_percent = accuracy*100
print('Accuracy: %f' % accuracy_percent)

Accuracy: 100.000000
