In [37]:
import numpy as np
from tensorflow.keras import models, layers
from tensorflow.keras.datasets import imdb
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences

In [24]:
# Toy corpus used to demonstrate how Tokenizer turns text into integer sequences.
samples = [
    'The cat is really beautiful.',
    'The other side of the world',
    'Hello Boy!',
    'I hope you like me.',
    'I think it is too late to find the correct way.',
]

# Build the word index from the corpus; num_words=1000 is the vocabulary cap
# handed to the Tokenizer.
tokenizer = Tokenizer(num_words=1000)
tokenizer.fit_on_texts(samples)

# Encode each sentence as a list of integer word ids and show them one per line.
seq = tokenizer.texts_to_sequences(samples)
for s in seq:
    print(s)

# word_index holds one entry per distinct word seen by fit_on_texts.
print(f'Unique words: {len(tokenizer.word_index)}')

[1, 4, 2, 5, 6]
[1, 7, 8, 9, 1, 10]
[11, 12]
[3, 13, 14, 15, 16]
[3, 17, 18, 2, 19, 20, 21, 22, 1, 23, 24]
Unique worlds: 24


### Padding

In [36]:
# Bring every encoded sentence to exactly 10 ids, appending zeros at the end
# (padding='post') to the ones that are shorter.
input_seq = pad_sequences(
    seq,
    maxlen=10,
    padding='post',
)
input_seq

array([[ 1,  4,  2,  5,  6,  0,  0,  0,  0,  0],
       [ 1,  7,  8,  9,  1, 10,  0,  0,  0,  0],
       [11, 12,  0,  0,  0,  0,  0,  0,  0,  0],
       [ 3, 13, 14, 15, 16,  0,  0,  0,  0,  0],
       [17, 18,  2, 19, 20, 21, 22,  1, 23, 24]], dtype=int32)

In [33]:
# Same padding as before, but over-long sequences now lose ids from the end
# (truncating='post') rather than from the start.
pad_sequences(
    seq,
    maxlen=10,
    padding='post',
    truncating='post',
)

array([[ 1,  4,  2,  5,  6,  0,  0,  0,  0,  0],
       [ 1,  7,  8,  9,  1, 10,  0,  0,  0,  0],
       [11, 12,  0,  0,  0,  0,  0,  0,  0,  0],
       [ 3, 13, 14, 15, 16,  0,  0,  0,  0,  0],
       [ 3, 17, 18,  2, 19, 20, 21, 22,  1, 23]], dtype=int32)

### Embedding

In [41]:
# Vocabulary size limit, handed to imdb.load_data as num_words and reused as
# the Embedding input dimension.
max_feat = 10_000
# Sequence length (in tokens) used by the padding step and the model input.
maxlen = 200

# load_data returns a (train, test) pair, each an (inputs, labels) pair.
imdb_train, imdb_test = imdb.load_data(num_words=max_feat)
train_X, train_Y = imdb_train
test_X, test_Y = imdb_test

In [58]:
# Pad (or truncate) every review to exactly `maxlen` token ids so the batches
# form a rectangular array; zeros are appended at the end of short reviews.
train_X = pad_sequences(train_X, maxlen=maxlen, padding='post')
test_X = pad_sequences(test_X, maxlen=maxlen, padding='post')

# `shape` must be a tuple: `(maxlen)` is just the int `maxlen`, so the trailing
# comma is required to describe a 1-D input of length `maxlen`.
input_layer = layers.Input(shape=(maxlen,))
# Look up an 8-dimensional vector for each of the `max_feat` possible token ids.
emb_layer = layers.Embedding(max_feat, 8)(input_layer)
# Collapse the (maxlen, 8) embedding matrix into one flat feature vector.
emb_layer = layers.Flatten()(emb_layer)
# Single sigmoid unit for the binary label.
output = layers.Dense(1, activation='sigmoid')(emb_layer)
model = models.Model(input_layer, output)
model.summary()

Model: "model_2"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
input_4 (InputLayer)         [(None, 200)]             0         
_________________________________________________________________
embedding_5 (Embedding)      (None, 200, 8)            80000     
_________________________________________________________________
flatten_3 (Flatten)          (None, 1600)              0         
_________________________________________________________________
dense_2 (Dense)              (None, 1)                 1601      
Total params: 81,601
Trainable params: 81,601
Non-trainable params: 0
_________________________________________________________________


In [61]:
# Training setup: Adam optimizer, binary cross-entropy loss, accuracy as the
# only reported metric.
model.compile(
    optimizer='adam',
    loss='binary_crossentropy',
    metrics=['accuracy'],
)

# Train silently (verbose=0) for 10 epochs in batches of 32, holding out 20%
# of the training data for validation.
model.fit(
    train_X,
    train_Y,
    batch_size=32,
    epochs=10,
    validation_split=0.2,
    verbose=0,
)

# Index 1 of the evaluation result is reported as the accuracy, the single
# metric compiled above.
scores = model.evaluate(test_X, test_Y)
print(f'Test Accuracy: {scores[1]}')

Test Accuracy: 0.8492000102996826
