In [1]:
#@title Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# https://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

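**Note:** This notebook can be run with TensorFlow 2.5.0. The version check in the saved run below printed 2.4.1, so the recorded outputs come from that version.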

In [2]:
# Optional: uncomment the next line to install TensorFlow 2.5.0, the version
# named in the note above, before running the rest of the notebook.
#!pip install tensorflow==2.5.0

In [3]:
import tensorflow as tf
# Print the TensorFlow version actually loaded by this kernel so it can be
# compared with the version the notebook names.
print(tf.__version__)

2.4.1


In [6]:
# If the import fails, run this
!pip install -q tensorflow-datasets

import tensorflow_datasets as tfds
imdb, info = tfds.load("imdb_reviews/subwords32k", with_info=True, as_supervised=True)




[1mDownloading and preparing dataset 80.23 MiB (download: 80.23 MiB, generated: Unknown size, total: 80.23 MiB) to /Users/alexeykvashchuk/tensorflow_datasets/imdb_reviews/subwords32k/1.0.0...[0m


Dl Completed...: |          | 0/0 [00:00<?, ? url/s]

Dl Size...: |          | 0/0 [00:00<?, ? MiB/s]

Generating splits...:   0%|          | 0/3 [00:00<?, ? splits/s]

Generating train examples...:   0%|          | 0/25000 [00:00<?, ? examples/s]

Shuffling /Users/alexeykvashchuk/tensorflow_datasets/imdb_reviews/subwords32k/1.0.0.incompleteQ01W6N/imdb_revi…

Generating test examples...:   0%|          | 0/25000 [00:00<?, ? examples/s]

Shuffling /Users/alexeykvashchuk/tensorflow_datasets/imdb_reviews/subwords32k/1.0.0.incompleteQ01W6N/imdb_revi…

Generating unsupervised examples...:   0%|          | 0/50000 [00:00<?, ? examples/s]

Shuffling /Users/alexeykvashchuk/tensorflow_datasets/imdb_reviews/subwords32k/1.0.0.incompleteQ01W6N/imdb_revi…



[1mDataset imdb_reviews downloaded and prepared to /Users/alexeykvashchuk/tensorflow_datasets/imdb_reviews/subwords32k/1.0.0. Subsequent calls will reuse this data.[0m


In [None]:
# Pick the 'train' and 'test' splits out of the loaded dataset dictionary.
train_data = imdb['train']
test_data = imdb['test']

In [None]:
# Display the training split object as this cell's output.
train_data

In [None]:
# Take the encoder attached to the dataset's 'text' feature; later cells use
# it as `tokenizer` to encode strings to ids and decode ids back to text.
tokenizer = info.features['text'].encoder

In [10]:
# Number of entries in the tokenizer's subword list. Note that the model
# below sizes its embedding with tokenizer.vocab_size, not with this length.
print(len(tokenizer.subwords))

32393


In [11]:
sample_string = 'TensorFlow, from basics to mastery'

# Round-trip the sample through the tokenizer: text -> ids -> text.
tokenized_string = tokenizer.encode(sample_string)
original_string = tokenizer.decode(tokenized_string)

# Report both ends of the round trip.
print('Tokenized string is {}'.format(tokenized_string))
print('The original string: {}'.format(original_string))


Tokenized string is [31789, 642, 9656, 32513, 2, 44, 29732, 7, 3851, 32515]
The original string: TensorFlow, from basics to mastery


In [12]:
# Decode each id on its own to show which piece of text it stands for.
for token_id in tokenized_string:
  piece = tokenizer.decode([token_id])
  print('{} ----> {}'.format(token_id, piece))

31789 ----> Tens
642 ----> or
9656 ----> Flo
32513 ----> w
2 ----> , 
44 ----> from 
29732 ----> basics 
7 ----> to 
3851 ----> master
32515 ----> y


In [13]:
BUFFER_SIZE = 10000  # size of the shuffle buffer used for the training split
BATCH_SIZE = 16      # number of reviews per batch

# Shuffle the training examples, then batch both splits.
# The reviews have different lengths, so padded_batch() is used instead of
# batch(). Called without explicit padded_shapes, it pads every component to
# the longest element of each batch -- the same result the original obtained by
# passing the dataset's own output shapes through the TF1 compatibility helper
# tf.compat.v1.data.get_output_shapes(), which is therefore no longer needed.
train_dataset = train_data.shuffle(BUFFER_SIZE).padded_batch(BATCH_SIZE)
test_dataset = test_data.padded_batch(BATCH_SIZE)

In [14]:
embedding_dim = 16

# Assemble the binary classifier one layer at a time.
model = tf.keras.Sequential()

# One trainable vector of size `embedding_dim` per token id.
model.add(tf.keras.layers.Embedding(tokenizer.vocab_size, embedding_dim))

# A single convolution filter of width 5; 'same' padding keeps the sequence length.
model.add(tf.keras.layers.Conv1D(1, 5, padding='same'))

# Bidirectional LSTM with 32 units per direction.
model.add(tf.keras.layers.Bidirectional(tf.keras.layers.LSTM(32)))

# Disabled alternatives carried over from the original cell:
#model.add(tf.keras.layers.GlobalAveragePooling1D())
#model.add(tf.keras.layers.GlobalMaxPooling1D())
#model.add(tf.keras.layers.Dense(64, activation='relu'))

# Single sigmoid unit producing the binary prediction.
model.add(tf.keras.layers.Dense(1, activation='sigmoid'))

model.summary()

Model: "sequential"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding (Embedding)        (None, None, 16)          522400    
_________________________________________________________________
conv1d (Conv1D)              (None, None, 1)           81        
_________________________________________________________________
bidirectional (Bidirectional (None, 64)                8704      
_________________________________________________________________
dense (Dense)                (None, 1)                 65        
Total params: 531,250
Trainable params: 531,250
Non-trainable params: 0
_________________________________________________________________


In [None]:
num_epochs = 10

# Configure training: Adam optimizer, binary cross-entropy loss, accuracy metric.
model.compile(
    optimizer='adam',
    loss='binary_crossentropy',
    metrics=['accuracy'],
)

# Train on the batched training set and evaluate on the test set after each
# epoch; the returned object is kept so the curves can be plotted afterwards.
history = model.fit(
    train_dataset,
    validation_data=test_dataset,
    epochs=num_epochs,
)

Epoch 1/10

In [None]:
import matplotlib.pyplot as plt


def plot_graphs(history, string):
  """Plot a training metric together with its validation counterpart.

  Args:
    history: object whose `.history` mapping holds one sequence per metric
      name (for example the value returned by `model.fit`).
    string: name of the metric to plot; the validation series is looked up
      under the same name prefixed with 'val_'.
  """
  series_keys = [string, 'val_' + string]
  # Draw the training curve first, then the validation curve.
  for key in series_keys:
    plt.plot(history.history[key])
  plt.xlabel("Epochs")
  plt.ylabel(string)
  # Legend entries are in the same order the curves were drawn.
  plt.legend(series_keys)
  plt.show()
  
# One figure per tracked metric: accuracy first, then loss.
for metric_name in ("accuracy", "loss"):
  plot_graphs(history, metric_name)

In [None]:
import io

# The embedding layer is the first layer of the model; the first entry of its
# weights is the embedding matrix, indexed by token id.
e = model.layers[0]
weights = e.get_weights()[0]
print(weights.shape) # shape: (vocab_size, embedding_dim)

# Export the embeddings as two line-aligned files:
#   vecs.tsv -- one line per token id, the vector components separated by tabs
#   meta.tsv -- one line per token id, the decoded text of that id
# Both files are opened in a single `with` statement so they are flushed and
# closed even if decoding or writing raises part-way through.
# The loop starts at id 1, so id 0 is not exported (presumably because it is
# the padding id -- confirm against the tokenizer).
with io.open('vecs.tsv', 'w', encoding='utf-8') as out_v, \
     io.open('meta.tsv', 'w', encoding='utf-8') as out_m:
  for word_num in range(1, tokenizer.vocab_size):
    word = tokenizer.decode([word_num])
    embeddings = weights[word_num]
    out_m.write(word + "\n")
    out_v.write('\t'.join([str(x) for x in embeddings]) + "\n")


# Offer the exported files for download when running inside Google Colab.
try:
  from google.colab import files
except ImportError:
  # google.colab is not importable here; the files simply stay on local disk.
  pass
else:
  for exported_name in ('vecs.tsv', 'meta.tsv'):
    files.download(exported_name)