In [1]:
import tensorflow as tf
import tensorflow_datasets as tfds
import numpy as np
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences

In [2]:
imdb, info = tfds.load("imdb_reviews", with_info=True, as_supervised=True)

[1mDownloading and preparing dataset imdb_reviews/plain_text/1.0.0 (download: 80.23 MiB, generated: Unknown size, total: 80.23 MiB) to /root/tensorflow_datasets/imdb_reviews/plain_text/1.0.0...[0m


HBox(children=(FloatProgress(value=1.0, bar_style='info', description='Dl Completed...', max=1.0, style=Progre…

HBox(children=(FloatProgress(value=1.0, bar_style='info', description='Dl Size...', max=1.0, style=ProgressSty…







HBox(children=(FloatProgress(value=1.0, bar_style='info', max=1.0), HTML(value='')))

Shuffling and writing examples to /root/tensorflow_datasets/imdb_reviews/plain_text/1.0.0.incompleteZRS8HO/imdb_reviews-train.tfrecord


HBox(children=(FloatProgress(value=0.0, max=25000.0), HTML(value='')))

HBox(children=(FloatProgress(value=1.0, bar_style='info', max=1.0), HTML(value='')))

Shuffling and writing examples to /root/tensorflow_datasets/imdb_reviews/plain_text/1.0.0.incompleteZRS8HO/imdb_reviews-test.tfrecord


HBox(children=(FloatProgress(value=0.0, max=25000.0), HTML(value='')))

HBox(children=(FloatProgress(value=1.0, bar_style='info', max=1.0), HTML(value='')))

Shuffling and writing examples to /root/tensorflow_datasets/imdb_reviews/plain_text/1.0.0.incompleteZRS8HO/imdb_reviews-unsupervised.tfrecord


HBox(children=(FloatProgress(value=0.0, max=50000.0), HTML(value='')))

[1mDataset imdb_reviews downloaded and prepared to /root/tensorflow_datasets/imdb_reviews/plain_text/1.0.0. Subsequent calls will reuse this data.[0m


In [3]:
train_data, test_data = imdb["train"], imdb["test"]

In [4]:
def load_sentences(data):
    
    sentences = []
    labels = []
    
    for s, l in data:
        sentences.append(str(s.numpy()))
        labels.append(l.numpy())
    return sentences, np.array(labels)
    

In [6]:
x_train, y_train = load_sentences(train_data)
x_test, y_test = load_sentences(test_data)

print(len(y_train), len(y_test))

25000 25000


In [14]:
y_test

array([1, 1, 0, ..., 0, 1, 1])

In [7]:
vocab_size = 10000
embedding_dim = 16
max_length = 120
trunc_type = "post"
oov_token = "<OOV>"

In [8]:
tokenizer = Tokenizer(num_words=vocab_size, oov_token=oov_token)
tokenizer.fit_on_texts(x_train)
word_index = tokenizer.word_index
sequences = tokenizer.texts_to_sequences(x_train)
padded = pad_sequences(sequences, maxlen=max_length, truncating=trunc_type)

test_sequences = tokenizer.texts_to_sequences(x_test)
test_padded = pad_sequences(test_sequences, maxlen=max_length, truncating=trunc_type)

In [32]:
reverse_word_index = {value:key for key, value in word_index.items()}

def decode(text):
    return " ".join([reverse_word_index.get(word, "?") for word in text])

print(decode(padded[3]))
print(x_train[3])

? ? ? ? ? ? ? ? ? ? ? ? ? ? ? ? ? ? ? ? ? ? ? ? ? ? ? ? ? ? b'this is the kind of film for a snowy sunday afternoon when the rest of the world can go ahead with its own business as you <OOV> into a big arm chair and <OOV> for a couple of hours wonderful performances from cher and nicolas cage as always gently row the plot along there are no <OOV> to cross no dangerous waters just a warm and witty <OOV> through new york life at its best a family film in every sense and one that deserves the praise it received '
b'This is the kind of film for a snowy Sunday afternoon when the rest of the world can go ahead with its own business as you descend into a big arm-chair and mellow for a couple of hours. Wonderful performances from Cher and Nicolas Cage (as always) gently row the plot along. There are no rapids to cross, no dangerous waters, just a warm and witty paddle through New York life at its best. A family film in every sense and one that deserves the praise it received.'


## Model with Embedding

In [11]:
model1 = tf.keras.models.Sequential([
    tf.keras.layers.Embedding(vocab_size, embedding_dim, input_length=max_length),
    tf.keras.layers.Flatten(),
    tf.keras.layers.Dense(6, activation="relu"),
    tf.keras.layers.Dense(1, activation="sigmoid")
])

In [12]:
model2 = tf.keras.models.Sequential([
    tf.keras.layers.Embedding(vocab_size, embedding_dim, input_length=max_length),
    tf.keras.layers.GlobalAveragePooling1D(),
    tf.keras.layers.Dense(6, activation="relu"),
    tf.keras.layers.Dense(1, activation="sigmoid")
])

In [13]:
model1.compile(optimizer="adam", loss="binary_crossentropy", metrics=["accuracy"])
model2.compile(optimizer="adam", loss="binary_crossentropy", metrics=["accuracy"])

## Fit the model

In [16]:
num_epochs = 10

In [18]:
model1.fit(padded, y_train,
          epochs=num_epochs,
          validation_data=(test_padded, y_test))

Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


<tensorflow.python.keras.callbacks.History at 0x7fa7fc5dddd0>

In [19]:
model2.fit(padded, y_train,
          epochs=num_epochs,
          validation_data=(test_padded, y_test))

Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


<tensorflow.python.keras.callbacks.History at 0x7fa7fc3e4450>

## Recover Embeddings

In [36]:
import io

In [37]:
for i, model in enumerate([model1, model2]):
    e = model.layers[0]
    weights = e.get_weights()[0]
    print(weights.shape)
    
    out_v = io.open('vecs_model{}.tsv'.format(i), 'w', encoding='utf-8')
    out_m = io.open('meta_model{}.tsv'.format(i), 'w', encoding='utf-8')
    for word_num in range(1, vocab_size):
        word = reverse_word_index[word_num]
        embeddings = weights[word_num]
        out_m.write(word+"\n")
        out_v.write("\t".join([str(x) for x in embeddings])+"\n")
    out_m.close()
    out_v.close()

    

(10000, 16)


AttributeError: 'str' object has no attribute 'fomat'