In [2]:
import numpy as np
import tensorflow as tf
import tensorflow_datasets as tfds

from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences

# Report the TensorFlow version this notebook is running against.
print(tf.__version__)

2.7.0


In [3]:
# Fetch the IMDB reviews dataset as (text, label) pairs, together with its metadata.
imdb, info = tfds.load(
    name='imdb_reviews',
    as_supervised=True,
    with_info=True,
)

In [4]:
def _texts_and_labels(split):
    """Unpack a supervised split into parallel lists of texts and labels.

    Each element of `split` is a (text, label) pair of tensors: the text is
    decoded from UTF-8 bytes into a Python string and the label is converted
    to its NumPy value.
    """
    texts, labels = [], []
    for text_tensor, label_tensor in split:
        texts.append(text_tensor.numpy().decode('utf8'))
        labels.append(label_tensor.numpy())
    return texts, labels


# Pull the train and test splits out of the loaded dataset.
X_train, X_test = imdb['train'], imdb['test']

# Break each split into its sentences and the matching labels.
X_train_sentences, X_train_labels = _texts_and_labels(X_train)
X_test_sentences, X_test_labels = _texts_and_labels(X_test)

In [5]:
# Turn both label lists into NumPy arrays.
X_train_labels = np.array(X_train_labels)
X_test_labels = np.array(X_test_labels)

# Display the resulting shapes (test first, then train).
X_test_labels.shape, X_train_labels.shape

((25000,), (25000,))

In [6]:
# Global settings shared by the tokenizing, padding and model cells.
vocab_size = 10000     # vocabulary cap for the tokenizer and the embedding's input_dim
oov_tok = '<OOV>'      # placeholder token for out-of-vocabulary words
max_length = 120       # length every sequence is padded/truncated to
trunc_type = 'post'    # intended truncation side for over-long sequences
embedding_dim = 16     # width of each embedding vector

In [7]:
# Build the vocabulary from the training sentences only. `num_words` caps how
# many of the most frequent words are kept when encoding; everything else is
# replaced by the OOV token.
tokenizer = Tokenizer(num_words=vocab_size, oov_token=oov_tok)
tokenizer.fit_on_texts(X_train_sentences)
word_index = tokenizer.word_index

# Encode and pad the training data. Passing `truncating=trunc_type` makes the
# configured truncation side take effect; without it pad_sequences falls back
# to its default of 'pre' and cuts the *start* of long reviews.
train_sequences = tokenizer.texts_to_sequences(X_train_sentences)
train_sequnces = train_sequences  # original (misspelled) name kept as an alias for any cell still using it
train_padded = pad_sequences(train_sequences, maxlen=max_length, truncating=trunc_type)

# Encode and pad the test data with the same tokenizer and settings.
test_sequences = tokenizer.texts_to_sequences(X_test_sentences)
test_padded = pad_sequences(test_sequences, maxlen=max_length, truncating=trunc_type)

In [12]:
# Baseline classifier: embed each token, flatten the whole sequence, then
# classify with a small dense head ending in a single sigmoid unit.
model = tf.keras.Sequential()
model.add(tf.keras.layers.Embedding(input_dim=vocab_size,
                                    output_dim=embedding_dim,
                                    input_length=max_length))
model.add(tf.keras.layers.Flatten())
model.add(tf.keras.layers.Dense(units=6, activation='relu'))
model.add(tf.keras.layers.Dense(units=1, activation='sigmoid'))

In [13]:
# Configure training: Adam optimizer, binary cross-entropy loss, accuracy metric.
model.compile(
    loss='binary_crossentropy',
    optimizer='adam',
    metrics=['accuracy'],
)

In [14]:
# Show the layers, output shapes and parameter counts of `model`.
model.summary()

Model: "sequential_1"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 embedding_1 (Embedding)     (None, 120, 16)           160000    
                                                                 
 flatten (Flatten)           (None, 1920)              0         
                                                                 
 dense_2 (Dense)             (None, 6)                 11526     
                                                                 
 dense_3 (Dense)             (None, 1)                 7         
                                                                 
Total params: 171,533
Trainable params: 171,533
Non-trainable params: 0
_________________________________________________________________


In [15]:
# Number of passes over the training set (also used when training `model1`).
num_epochs = 10

# Train `model` on the padded training data, using the test split for validation.
model.fit(
    x=train_padded,
    y=X_train_labels,
    validation_data=(test_padded, X_test_labels),
    epochs=num_epochs,
)

Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


<keras.callbacks.History at 0x7f886f6308e0>

In [16]:
# Variant classifier: same embedding and dense head as `model`, but the
# sequence is averaged over time instead of being flattened.
model1 = tf.keras.Sequential(layers=[
    tf.keras.layers.Embedding(
        input_dim=vocab_size,
        output_dim=embedding_dim,
        input_length=max_length,
    ),
    tf.keras.layers.GlobalAveragePooling1D(),
    tf.keras.layers.Dense(units=6, activation='relu'),
    tf.keras.layers.Dense(units=1, activation='sigmoid'),
])

In [17]:
# Configure training for `model1`: Adam optimizer, binary cross-entropy loss, accuracy metric.
model1.compile(
    optimizer='adam',
    loss='binary_crossentropy',
    metrics=['accuracy'],
)

In [18]:
# Show the layers, output shapes and parameter counts of `model1`.
model1.summary()

Model: "sequential_2"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 embedding_2 (Embedding)     (None, 120, 16)           160000    
                                                                 
 global_average_pooling1d (G  (None, 16)               0         
 lobalAveragePooling1D)                                          
                                                                 
 dense_4 (Dense)             (None, 6)                 102       
                                                                 
 dense_5 (Dense)             (None, 1)                 7         
                                                                 
Total params: 160,109
Trainable params: 160,109
Non-trainable params: 0
_________________________________________________________________


In [19]:
# Train `model1` on the padded training data, using the test split for validation.
model1.fit(
    x=train_padded,
    y=X_train_labels,
    validation_data=(test_padded, X_test_labels),
    epochs=num_epochs,
)

Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


<keras.callbacks.History at 0x7f886d7d1220>

In [20]:
# Report [loss, accuracy] of `model` on the padded test set.
# NOTE(review): this evaluates `model`, not the `model1` trained in the cell just before — confirm that is intended.
model.evaluate(x=test_padded, y=X_test_labels)



[0.7294515371322632, 0.8453999757766724]

In [21]:
# Pull the learned embedding matrix out of the first layer of `model`.
e = model.layers[0]
# get_weights() returns a list of arrays; the first entry is the embedding matrix.
weights = e.get_weights()[0]
print(weights.shape) # shape: (vocab_size, embedding_dim)

(10000, 16)


In [24]:
# Invert the tokenizer vocabulary so token ids map back to their words.
reverse_word_index = {index: word for word, index in word_index.items()}

def decode_review(text):
    """Turn a sequence of token ids back into a space-separated string.

    Ids with no entry in `reverse_word_index` (such as the 0 used for
    padding) are rendered as '?'.
    """
    words = (reverse_word_index.get(token_id, '?') for token_id in text)
    return ' '.join(words)

# Compare a decoded, padded training example with its original text.
print(decode_review(train_padded[3]))
print(X_train_sentences[3])

? ? ? ? ? ? ? ? ? ? ? ? ? ? ? ? ? ? ? ? ? ? ? ? ? ? ? ? ? ? ? this is the kind of film for a snowy sunday afternoon when the rest of the world can go ahead with its own business as you <OOV> into a big arm chair and <OOV> for a couple of hours wonderful performances from cher and nicolas cage as always gently row the plot along there are no <OOV> to cross no dangerous waters just a warm and witty <OOV> through new york life at its best a family film in every sense and one that deserves the praise it received
This is the kind of film for a snowy Sunday afternoon when the rest of the world can go ahead with its own business as you descend into a big arm-chair and mellow for a couple of hours. Wonderful performances from Cher and Nicolas Cage (as always) gently row the plot along. There are no rapids to cross, no dangerous waters, just a warm and witty paddle through New York life at its best. A family film in every sense and one that deserves the praise it received.


In [25]:
import io

# Export the learned embeddings as two line-aligned TSV files:
#   vecs.tsv - one embedding vector per line, components separated by tabs
#   meta.tsv - the word that the vector on the same line belongs to
# Index 0 is skipped because it has no entry in `reverse_word_index`.
# The upper bound is clamped so a vocabulary smaller than `vocab_size`
# does not raise a KeyError.
last_index = min(vocab_size, len(reverse_word_index) + 1)

# `with` guarantees both files are closed even if the loop raises.
with io.open('vecs.tsv', 'w', encoding='utf-8') as out_v, \
        io.open('meta.tsv', 'w', encoding='utf-8') as out_m:
    for word_num in range(1, last_index):
        word = reverse_word_index[word_num]
        embeddings = weights[word_num]
        out_m.write(word + "\n")
        out_v.write('\t'.join([str(x) for x in embeddings]) + "\n")