In [11]:
import os
import numpy as np
from sklearn.model_selection import train_test_split
from tensorflow.keras import models, layers
from tensorflow.keras.datasets import imdb
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences

In [2]:
# Toy corpus used to show how the Keras Tokenizer maps text to integer ids.
samples = [
    'The cat is really beautiful.',
    'The other side of the world',
    'Hello Boy!',
    'I hope you like me.',
    'I think it is too late to find the correct way.',
]

# Build the word index from the corpus, then encode every sentence with it.
tokenizer = Tokenizer(num_words=1000)
tokenizer.fit_on_texts(samples)
seq = tokenizer.texts_to_sequences(samples)

# One list of word ids per input sentence.
for encoded in seq:
    print(encoded)
print(f'Unique worlds: {len(tokenizer.word_index)}')

[1, 4, 2, 5, 6]
[1, 7, 8, 9, 1, 10]
[11, 12]
[3, 13, 14, 15, 16]
[3, 17, 18, 2, 19, 20, 21, 22, 1, 23, 24]
Unique worlds: 24


### Padding

In [3]:
# Zero-pad every sequence at the end up to a fixed length of 10.
# `truncating` is left at its default, so a sequence longer than 10 ids
# loses tokens from its start rather than its end.
input_seq = pad_sequences(seq, maxlen=10, padding='post')
input_seq

array([[ 1,  4,  2,  5,  6,  0,  0,  0,  0,  0],
       [ 1,  7,  8,  9,  1, 10,  0,  0,  0,  0],
       [11, 12,  0,  0,  0,  0,  0,  0,  0,  0],
       [ 3, 13, 14, 15, 16,  0,  0,  0,  0,  0],
       [17, 18,  2, 19, 20, 21, 22,  1, 23, 24]], dtype=int32)

In [4]:
# Same padding as before, but over-long sequences are now cut at the end.
pad_sequences(seq, maxlen=10, padding='post', truncating='post')

array([[ 1,  4,  2,  5,  6,  0,  0,  0,  0,  0],
       [ 1,  7,  8,  9,  1, 10,  0,  0,  0,  0],
       [11, 12,  0,  0,  0,  0,  0,  0,  0,  0],
       [ 3, 13, 14, 15, 16,  0,  0,  0,  0,  0],
       [ 3, 17, 18,  2, 19, 20, 21, 22,  1, 23]], dtype=int32)

### Embedding

In [5]:
# Vocabulary cap and fixed review length used by the first embedding model.
max_feat, maxlen = 10_000, 200

# Load the IMDB dataset bundled with Keras, limited to `max_feat` word ids.
imdb_splits = imdb.load_data(num_words=max_feat)
(train_X, train_Y), (test_X, test_Y) = imdb_splits

  x_train, y_train = np.array(xs[:idx]), np.array(labels[:idx])
  x_test, y_test = np.array(xs[idx:]), np.array(labels[idx:])


In [6]:
# Bring every review to exactly `maxlen` ids, zero-padded at the end.
train_X = pad_sequences(train_X, maxlen=maxlen, padding='post')
test_X = pad_sequences(test_X, maxlen=maxlen, padding='post')

# `shape` has to be a tuple: `(maxlen)` is just the int `maxlen`, which newer
# Keras releases refuse, so the trailing comma is required.
input_layer = layers.Input(shape=(maxlen,))
# Learn an 8-dimensional vector for each of the `max_feat` word ids.
emb_layer = layers.Embedding(max_feat, 8)(input_layer)
# Collapse the (maxlen, 8) embedding output into one flat vector per review.
emb_layer = layers.Flatten()(emb_layer)
# Single sigmoid unit for binary classification.
output = layers.Dense(1, activation='sigmoid')(emb_layer)
model = models.Model(input_layer, output)
model.summary()

Instructions for updating:
Call initializer instance with the dtype argument instead of passing it to the constructor
Instructions for updating:
Call initializer instance with the dtype argument instead of passing it to the constructor
Model: "model"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
input_1 (InputLayer)         [(None, 200)]             0         
_________________________________________________________________
embedding (Embedding)        (None, 200, 8)            80000     
_________________________________________________________________
flatten (Flatten)            (None, 1600)              0         
_________________________________________________________________
dense (Dense)                (None, 1)                 1601      
Total params: 81,601
Trainable params: 81,601
Non-trainable params: 0
_________________________________________________________________


In [7]:
# Binary classification setup: cross-entropy loss, accuracy as the metric.
model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])

# Train silently, holding out 20% of the training data for validation.
fit_options = dict(epochs=10, batch_size=32, verbose=0, validation_split=0.2)
model.fit(train_X, train_Y, **fit_options)

# With one compiled metric, evaluate() yields [loss, accuracy].
scores = model.evaluate(test_X, test_Y)
print(f'Test Accuracy: {scores[1]}')

Instructions for updating:
Use tf.where in 2.0, which has the same broadcast rule as np.where
Test Accuracy: 0.8580399751663208


### IMDB

In [8]:
# Read the raw IMDB training reviews from disk: one .txt file per review,
# stored in the `pos` and `neg` sub-directories of datasets/imdb/train.
imdb_train = os.path.join('datasets', 'imdb', 'train')
data = []
labels = []
for cat in ['pos', 'neg']:
    tar_dir = os.path.join(imdb_train, cat)
    # Positive reviews are labelled 1, negative reviews 0.
    label = 1 if cat == 'pos' else 0
    for file in os.listdir(tar_dir):
        # Skip anything that is not a review text file.
        if os.path.splitext(file)[1] != '.txt':
            continue
        file_path = os.path.join(tar_dir, file)
        # Decode explicitly as UTF-8 so loading does not depend on the
        # platform's default encoding (e.g. cp1252 on Windows).
        with open(file_path, encoding='utf-8') as f:
            data.append(f.read())
        labels.append(label)

In [9]:
# Peek at the first loaded review: its label, a blank line, then its text.
print('Type of review:', labels[0], end='\n\n')
print(data[0])

Type of review: 1

I was amazed at the improvements made in an animated film. If you sit close to the screen, you will see the detail in the grass and surface structures. The detail, colors, and shading are at least an order of magnitude better than Toy Story. How they were able to pull off the shading, I will never know. I do hope that PIXAR will provide a documentary on how the film was produced so I can find out how all this was accomplished. Based on this film, I think animated films of the future will be judged on the basis of this film.


In [12]:
# Hold out 20% of the raw reviews; the fixed seed makes the shuffle repeatable
# for a given ordering of `data`.
X_train, X_test, Y_train, Y_test = train_test_split(
    data,
    labels,
    test_size=0.2,
    random_state=42,
)

In [19]:
# Vocabulary size and fixed sequence length for the raw-text pipeline.
num_words, maxlen = 10_000, 200

# Fit the vocabulary on the training reviews only, so the held-out reviews
# do not influence the word index.
imdb_tokenizer = Tokenizer(num_words=num_words)
imdb_tokenizer.fit_on_texts(X_train)

# Encode each split as word ids, then zero-pad at the end to `maxlen`.
X_train_seq = pad_sequences(
    imdb_tokenizer.texts_to_sequences(X_train),
    maxlen=maxlen, padding='post')
X_test_seq = pad_sequences(
    imdb_tokenizer.texts_to_sequences(X_test),
    maxlen=maxlen, padding='post')

In [22]:
# Load the pre-trained GloVe vectors into a {word: vector} lookup table.
# Each row of the file is a token followed by its whitespace-separated
# vector components.
emb_index = {}
emb_path = os.path.join('embeddings', 'glove.6B.100d.txt')
# Decode explicitly as UTF-8 so loading does not depend on the platform's
# default encoding.
with open(emb_path, encoding='utf-8') as f:
    for row in f:
        line = row.split()
        if not line:
            # Tolerate blank lines (e.g. a trailing newline at end of file).
            continue
        word = line[0]
        vec = line[1:]
        # Parse the components as numbers; a plain np.array(vec) would keep
        # them as strings.
        emb_index[word] = np.asarray(vec, dtype='float32')
print('Number of words ', len(emb_index))

Number of words  400000


In [32]:
# Build the weight matrix for the Embedding layer: row i holds the GloVe
# vector of the word whose tokenizer index is i.
emb_len = len(emb_index['the'])
emb_matrix = np.zeros(shape=(num_words, emb_len))
for word, i in imdb_tokenizer.word_index.items():
    # The matrix has `num_words` rows, so that is the bound for valid indices.
    # (Comparing against `maxlen`, the sequence length, would leave every row
    # from index `maxlen` upwards as zeros.)
    if i >= num_words:
        continue
    emb_vec = emb_index.get(word)
    # Words without a GloVe vector keep their all-zero row.
    if emb_vec is not None:
        emb_matrix[i] = emb_vec

In [41]:
# Classifier on top of a (num_words x emb_len) embedding: flatten the
# per-token vectors, pass them through two small ReLU layers, and finish
# with a single sigmoid unit.
model = models.Sequential([
    layers.Embedding(num_words, emb_len, input_length=maxlen),
    layers.Flatten(),
    layers.Dense(32, activation='relu'),
    layers.Dense(32, activation='relu'),
    layers.Dense(1, activation='sigmoid'),
])
model.summary()

Model: "sequential_3"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_3 (Embedding)      (None, 200, 100)          1000000   
_________________________________________________________________
flatten_3 (Flatten)          (None, 20000)             0         
_________________________________________________________________
dense_5 (Dense)              (None, 32)                640032    
_________________________________________________________________
dense_6 (Dense)              (None, 32)                1056      
_________________________________________________________________
dense_7 (Dense)              (None, 1)                 33        
Total params: 1,641,121
Trainable params: 1,641,121
Non-trainable params: 0
_________________________________________________________________


In [42]:
# The first layer is the Embedding: load the pre-built matrix into it and
# freeze it so training does not modify those weights.
embedding_layer = model.layers[0]
embedding_layer.set_weights([emb_matrix])
embedding_layer.trainable = False

In [43]:
# Compile after freezing the embedding so the trainable flag takes effect.
model.compile(optimizer='adam',
              loss='binary_crossentropy',
              metrics=['accuracy'])

# train_test_split was given plain Python lists, so the label splits are
# lists too. Convert them to arrays: newer TensorFlow releases reject an
# ndarray input paired with a list of targets.
history = model.fit(X_train_seq, np.asarray(Y_train),
                    epochs=10, batch_size=32,
                    validation_data=(X_test_seq, np.asarray(Y_test)))

Train on 20000 samples, validate on 5000 samples
Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10
