In [26]:
import tensorflow_datasets as tfds
imdb, info = tfds.load('imdb_reviews', with_info=True, as_supervised=True)

In [11]:
for example in imdb['train'].take(1):
    print(example.dtype)

AttributeError: 'tuple' object has no attribute 'dtype'

In [48]:
import numpy as np

train_sentences = []
train_labels = []

test_sentences = []
test_labels = []

for i in imdb['train']:
    train_sentences.append(i[0].numpy().decode('utf8'))
    train_labels.append(i[1].numpy())

for i in imdb['test']:
    test_sentences.append(i[0].numpy().decode('utf8'))
    test_labels.append(i[1].numpy())
    
train_labels = np.array(train_labels)
test_labels = np.array(test_labels)

In [69]:
print(len(train_sentences))

25000


In [38]:
# Parameters

vocab_size = 10000
max_length = 120
embedding_dim = 16
trunc_type='post'
oov_tok = "<OOV>"

In [39]:
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences

tokenizer = Tokenizer(num_words=vocab_size, oov_token=oov_tok)
tokenizer.fit_on_texts(train_sentences)

train_padded = pad_sequences(tokenizer.texts_to_sequences(train_sentences), maxlen=max_length, truncating=trunc_type)
test_padded = pad_sequences(tokenizer.texts_to_sequences(test_sentences), maxlen=max_length, truncating=trunc_type)

In [40]:
print(train_padded[0])

[   0    0    0   12   14   33  425  392   18   90   28    1    9   32
 1366 3585   40  486    1  197   24   85  154   19   12  213  329   28
   66  247  215    9  477   58   66   85  114   98   22 5675   12 1322
  643  767   12   18    7   33  400 8170  176 2455  416    2   89 1231
  137   69  146   52    2    1 7577   69  229   66 2933   16    1 2904
    1    1 1479 4940    3   39 3900  117 1584   17 3585   14  162   19
    4 1231  917 7917    9    4   18   13   14 4139    5   99  145 1214
   11  242  683   13   48   24  100   38   12 7181 5515   38 1366    1
   50  401   11   98 1197  867  141   10]


In [44]:
import tensorflow as tf

model = tf.keras.Sequential([
    tf.keras.layers.Embedding(vocab_size, embedding_dim, input_length=max_length),
    tf.keras.layers.Flatten(),
    tf.keras.layers.Dense(units=6, activation='relu'),
    tf.keras.layers.Dense(units=1, activation='sigmoid')
])

model.compile(loss=tf.keras.losses.binary_crossentropy, optimizer='adam', metrics=['accuracy'])

In [49]:
num_epochs=10

model.fit(train_padded, train_labels, epochs=num_epochs, validation_data=(test_padded, test_labels))

Train on 25000 samples, validate on 25000 samples
Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


<tensorflow.python.keras.callbacks.History at 0x7f2650981fd0>

In [63]:
model.predict(np.array([test_padded[0]]))

array([[0.40712348]], dtype=float32)

In [64]:
e = model.layers[0]
weights = e.get_weights()[0]
print(weights.shape)

(10000, 16)


In [66]:
reverse_word_index = tokenizer.index_word

In [68]:
import io

# Open writeable files
out_v = io.open('vecs.tsv', 'w', encoding='utf-8')
out_m = io.open('meta.tsv', 'w', encoding='utf-8')

# Initialize the loop. Start counting at `1` because `0` is just for the padding
for word_num in range(1, vocab_size):

  # Get the word associated at the current index
  word_name = reverse_word_index[word_num]

  # Get the embedding weights associated with the current index
  word_embedding = weights[word_num]

  # Write the word name
  out_m.write(word_name + "\n")

  # Write the word embedding
  out_v.write('\t'.join([str(x) for x in word_embedding]) + "\n")

# Close the files
out_v.close()
out_m.close()