In [1]:
import pandas as pd
import numpy as np
import tensorflow as tf
import tensorflow_datasets as tfds

from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences

print("TensorFlow version: {}".format(tf.__version__))
print("If tensorflow version is 1.x, use tf.enable_eager_execution()")

TensorFlow version: 2.2.0
If tensorflow version is 1.x, use tf.enable_eager_execution()


## IMDB Dataset Type1

Data Source: <a href="http://ai.stanford.edu/~amaas/data/sentiment/"> IMDB Dataset </a>

In [2]:
imdb, info = tfds.load("imdb_reviews", with_info=True, as_supervised=True)

In [3]:
train_data, test_data = imdb['train'], imdb['test']

The imdb reviews loaded from tensorflow datasets are not in the usable format. To convert them into rather simple and more easily readable formats, we have initialised empty lists.

In [4]:
training_sentences = []
training_labels = []

testing_sentences = []
testing_labels = []

#### Let's now move the contents from train_data and test_data into the lists

In [5]:
for sentences, labels in train_data:
    training_sentences.append(str(sentences.numpy()))
    training_labels.append(labels.numpy())

In [6]:
for sentences, labels in test_data:
    testing_sentences.append(str(sentences.numpy()))
    testing_labels.append(labels.numpy())

#### Create Numpy arrays for the labels

In [7]:
training_labels_final = np.array(training_labels)
testing_labels_final = np.array(testing_labels)

#### Tokenize the sentences

In [8]:
## Let's use this cell to introduce the hyperparameters. This way it would be easier to change them.
vocab_size=10000
embedding_dim=16
max_length=120
trunc_type='post'
oov_tok="<OOV>"

In [9]:
tokenizer = Tokenizer(num_words=vocab_size, oov_token=oov_tok)
tokenizer.fit_on_texts(training_sentences)
word_index = tokenizer.word_index
sequences = tokenizer.texts_to_sequences(training_sentences)
padded = pad_sequences(sequences,maxlen=max_length, truncating=trunc_type)

In [10]:
testing_sequences = tokenizer.texts_to_sequences(testing_sentences)
testing_padded = pad_sequences(testing_sequences, maxlen=max_length, truncating=trunc_type)

Initially, in the word index dictionary, the words are keys and the word tokens are the values. We are going to reverse that dictionary

In [11]:
reverse_word_index = dict([(value, key) for (key, value) in word_index.items()])

def decode_review(text):
    return ' '.join([reverse_word_index.get(i, '?') for i in text])

print(decode_review(padded[3]))
print(training_sentences[1])

? ? ? ? ? ? ? ? ? ? ? ? ? ? ? ? ? ? ? ? ? ? ? ? ? ? ? ? ? ? b'this is the kind of film for a snowy sunday afternoon when the rest of the world can go ahead with its own business as you <OOV> into a big arm chair and <OOV> for a couple of hours wonderful performances from cher and nicolas cage as always gently row the plot along there are no <OOV> to cross no dangerous waters just a warm and witty <OOV> through new york life at its best a family film in every sense and one that deserves the praise it received '
b'I have been known to fall asleep during films, but this is usually due to a combination of things including, really tired, being warm and comfortable on the sette and having just eaten a lot. However on this occasion I fell asleep because the film was rubbish. The plot development was constant. Constantly slow and boring. Things seemed to happen, but with no explanation of what was causing them or why. I admit, I may have missed part of the film, but i watched the majority of i

#### Let's build the neural network

In [12]:
model = tf.keras.Sequential([
  tf.keras.layers.Embedding(vocab_size, embedding_dim, input_length=max_length),
  tf.keras.layers.Flatten(),
  tf.keras.layers.Dense(6, activation='relu'),
  tf.keras.layers.Dense(1, activation='sigmoid')
])
model.compile(loss='binary_crossentropy',optimizer='adam',metrics=['accuracy'])
model.summary()

Model: "sequential"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding (Embedding)        (None, 120, 16)           160000    
_________________________________________________________________
flatten (Flatten)            (None, 1920)              0         
_________________________________________________________________
dense (Dense)                (None, 6)                 11526     
_________________________________________________________________
dense_1 (Dense)              (None, 1)                 7         
Total params: 171,533
Trainable params: 171,533
Non-trainable params: 0
_________________________________________________________________


#### Fit the model

In [13]:
num_epochs = 10
model.fit(padded, training_labels_final, epochs=num_epochs, validation_data=(testing_padded, testing_labels_final))

Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


<tensorflow.python.keras.callbacks.History at 0x13344f150>

In [14]:
e = model.layers[0]
weights = e.get_weights()[0]
print(weights.shape) # shape: (vocab_size, embedding_dim)

(10000, 16)


#### Get the word embeddings

In [15]:
import io

out_v = io.open('vecs.tsv', 'w', encoding='utf-8')
out_m = io.open('meta.tsv', 'w', encoding='utf-8')
for word_num in range(1, vocab_size):
  word = reverse_word_index[word_num]
  embeddings = weights[word_num]
  out_m.write(word + "\n")
  out_v.write('\t'.join([str(x) for x in embeddings]) + "\n")
out_v.close()
out_m.close()

In [16]:
try:
  from google.colab import files
except ImportError:
  pass
else:
  files.download('Data/vecs.tsv')
  files.download('Data/meta.tsv')

## IMDB Dataset Type2

In [19]:
imdb, info = tfds.load("imdb_reviews/subwords8k", with_info=True, as_supervised=True)



In [52]:
train_data, test_data = imdb['train'], imdb['test']

In [21]:
tokenizer = info.features['text'].encoder

#### Let's see how the subwords tokenizer works

In [34]:
sample_string = 'This notebook contains IMDB datasets! It is from a TensorFlow tutorial.'

tokenized_string = tokenizer.encode(sample_string)

print("The tokenized string is {}".format(tokenized_string))
print("\nThe original string is '{}'".format(tokenizer.decode(tokenized_string)))

The tokenized string is [62, 1893, 605, 3119, 5469, 7997, 2432, 5926, 1916, 987, 90, 69, 9, 48, 4, 6307, 2327, 4043, 4265, 999, 840, 2359, 7975]

The original string is 'This notebook contains IMDB datasets! It is from a TensorFlow tutorial.'


In [35]:
for ts in tokenized_string:
  print ('{} ----> {}'.format(ts, tokenizer.decode([ts])))

62 ----> This 
1893 ----> note
605 ----> book 
3119 ----> contains 
5469 ----> IM
7997 ----> D
2432 ----> B 
5926 ----> dat
1916 ----> ase
987 ----> ts
90 ----> ! 
69 ----> It 
9 ----> is 
48 ----> from 
4 ----> a 
6307 ----> Ten
2327 ----> sor
4043 ----> Fl
4265 ----> ow 
999 ----> tu
840 ----> tor
2359 ----> ial
7975 ----> .


#### Let's build a Classification Model

In [59]:
BUFFER_SIZE = 10000
BATCH_SIZE = 64

train_dataset = train_data.shuffle(BUFFER_SIZE)
train_dataset = train_dataset.padded_batch(BATCH_SIZE, tf.compat.v1.data.get_output_shapes(train_dataset))
test_dataset = test_data.padded_batch(BATCH_SIZE, tf.compat.v1.data.get_output_shapes(test_data))

In [41]:
embedding_dim = 64
model = tf.keras.Sequential([
    tf.keras.layers.Embedding(tokenizer.vocab_size, embedding_dim),
    tf.keras.layers.GlobalAveragePooling1D(),
    tf.keras.layers.Dense(6, activation='relu'),
    tf.keras.layers.Dense(1, activation='sigmoid')
])

model.summary()

Model: "sequential_2"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_2 (Embedding)      (None, None, 64)          523840    
_________________________________________________________________
global_average_pooling1d_1 ( (None, 64)                0         
_________________________________________________________________
dense_4 (Dense)              (None, 6)                 390       
_________________________________________________________________
dense_5 (Dense)              (None, 1)                 7         
Total params: 524,237
Trainable params: 524,237
Non-trainable params: 0
_________________________________________________________________


#### Fit the model

In [62]:
num_epochs = 10

model.compile(loss='binary_crossentropy',optimizer='adam',metrics=['accuracy'])

history = model.fit(train_dataset, epochs=num_epochs, validation_data=test_dataset)

Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10
