In [None]:
# Import necessary libraries
import matplotlib.pyplot as plt
import os
import re
import shutil
import string
import tensorflow as tf

from tensorflow.keras import layers
from tensorflow.keras import losses
# Print the TensorFlow version in use so it is visible in the notebook output
print(tf.__version__)

In [None]:
# Location of the IMDB review archive
url = "https://ai.stanford.edu/~amaas/data/sentiment/aclImdb_v1.tar.gz"

# Fetch the archive and unpack it; cache_dir='.' with an empty cache_subdir
# keeps the download in the current working directory
dataset = tf.keras.utils.get_file(
    fname="aclImdb_v1",
    origin=url,
    untar=True,
    cache_dir='.',
    cache_subdir='',
)

# The reviews are looked up in an 'aclImdb' folder beside the returned path
dataset_dir = os.path.join(os.path.dirname(dataset), 'aclImdb')

In [None]:
# Show the entries of the dataset directory (displayed as the cell's output)
os.listdir(dataset_dir)

In [None]:
# Path to the 'train' folder inside the dataset directory
train_dir = os.path.join(dataset_dir, 'train')
# Show the entries of the training directory (displayed as the cell's output)
os.listdir(train_dir)

In [None]:
# Path to one sample positive review; the components are joined separately so
# os.path.join picks the path separator instead of a hard-coded '/'
sample_file = os.path.join(train_dir, 'pos', '1181_9.txt')

# Read and print the sample review. The encoding is passed explicitly so the
# text is decoded the same way regardless of the platform's default encoding.
with open(sample_file, encoding='utf-8') as f:
  print(f.read())

In [None]:
# Path to the 'unsup' folder inside the training directory
remove_dir = os.path.join(train_dir, 'unsup')

# Delete the unsupervised reviews so that only the remaining folders under
# train_dir are left. The existence check keeps this cell idempotent: on a
# re-run (or a fresh kernel with the data already extracted) the folder is
# gone and an unguarded rmtree would raise FileNotFoundError.
if os.path.isdir(remove_dir):
  shutil.rmtree(remove_dir)

In [None]:
# Number of reviews per batch
batch_size = 32

# Seed for the train/validation split; the same value is passed when the
# validation subset is created
seed = 42

# Training subset of the labeled reviews (validation_split=0.2 holds out 20%)
raw_train_ds = tf.keras.utils.text_dataset_from_directory(
    # Reuse train_dir rather than a hard-coded relative path, so this cell
    # reads from the same directory the earlier cells inspected and cleaned
    train_dir,
    batch_size=batch_size,
    validation_split=0.2,
    subset='training',
    seed=seed)

In [None]:
# Print the first three reviews and labels from a single training batch
for text_batch, label_batch in raw_train_ds.take(1):
  # Convert each tensor to NumPy once instead of once per printed row
  reviews = text_batch.numpy()
  labels = label_batch.numpy()
  for i in range(3):
    print("Review", reviews[i])
    print("Label", labels[i])

In [None]:
# Show which class name each of the two integer labels stands for
class_names = raw_train_ds.class_names
print("Label 0 corresponds to", class_names[0])
print("Label 1 corresponds to", class_names[1])

In [None]:
# Validation subset: the held-out 20% of the labeled training reviews.
# validation_split and seed match the values used for the training subset.
raw_val_ds = tf.keras.utils.text_dataset_from_directory(
    # Reuse train_dir rather than a hard-coded relative path, so this cell
    # reads from the same directory the earlier cells inspected and cleaned
    train_dir,
    batch_size=batch_size,
    validation_split=0.2,
    subset='validation',
    seed=seed)

In [None]:
# Test dataset, read from the 'test' folder of the dataset directory.
# The path is derived from dataset_dir instead of a hard-coded relative path
# so it stays consistent with how train_dir is built.
raw_test_ds = tf.keras.utils.text_dataset_from_directory(
    os.path.join(dataset_dir, 'test'),
    batch_size=batch_size)

# Prepare the dataset for training

In [None]:
# Text standardization used by the TextVectorization layer
def custom_standardization(input_data):
  """Lowercase the text, replace each '<br />' with a space, strip punctuation.

  Only the literal '<br />' tag is replaced; other markup is left for the
  punctuation pass, which removes every character in string.punctuation.
  """
  # Character class that matches any single punctuation character
  punctuation_pattern = '[%s]' % re.escape(string.punctuation)
  text = tf.strings.lower(input_data)
  text = tf.strings.regex_replace(text, '<br />', ' ')
  return tf.strings.regex_replace(text, punctuation_pattern, '')

In [None]:
# Cap on the vocabulary size and the fixed length of each vectorized review
max_features = 10000
sequence_length = 250

# Layer that standardizes the text with custom_standardization and emits
# integer token ids in sequences of exactly sequence_length entries
vectorize_layer = layers.TextVectorization(
    standardize=custom_standardization,
    max_tokens=max_features,
    output_mode='int',
    output_sequence_length=sequence_length,
)

In [None]:
# adapt() is given text only, so drop the labels from the training dataset
train_text = raw_train_ds.map(lambda review, _label: review)
# Fit the vectorization layer on the training text
vectorize_layer.adapt(train_text)

In [None]:
# Dataset mapping function: raw text -> vectorized text, label unchanged
def vectorize_text(text, label):
  """Return (vectorize_layer applied to `text`, `label`)."""
  # Append a trailing axis before handing the text to the layer
  expanded = tf.expand_dims(text, axis=-1)
  return vectorize_layer(expanded), label

In [None]:
# Pull one batch of reviews and labels from the training dataset
text_batch, label_batch = next(iter(raw_train_ds))
# Inspect its first example, both raw and vectorized
first_review = text_batch[0]
first_label = label_batch[0]
print("Review", first_review)
print("Label", raw_train_ds.class_names[first_label])
print("Vectorized review", vectorize_text(first_review, first_label))

In [None]:
# Fetch the vocabulary once and reuse it; the original called
# get_vocabulary() separately for each of the three prints below
vocabulary = vectorize_layer.get_vocabulary()
print("86 ---> ", vocabulary[86])  # Word stored at vocabulary index 86
print(" 17 ---> ", vocabulary[17])  # Word stored at vocabulary index 17
print('Vocabulary size: {}'.format(len(vocabulary)))

In [None]:
# Apply vectorize_text to every split
train_ds, val_ds, test_ds = (
    split.map(vectorize_text)
    for split in (raw_train_ds, raw_val_ds, raw_test_ds)
)

# Configure the dataset for performance

In [None]:
# Buffer-size setting handed to prefetch() below
AUTOTUNE = tf.data.AUTOTUNE


def _cache_and_prefetch(ds):
  """Return `ds` with cache() followed by prefetch(buffer_size=AUTOTUNE)."""
  return ds.cache().prefetch(buffer_size=AUTOTUNE)


# Each name is rebound to its cached/prefetched version, so re-running this
# cell wraps the already-wrapped datasets again
train_ds = _cache_and_prefetch(train_ds)
val_ds = _cache_and_prefetch(val_ds)
test_ds = _cache_and_prefetch(test_ds)

# Create the model

In [None]:
# Output dimension passed to the Embedding layer of the model
embedding_dim = 16

In [None]:
# Classifier: embed token ids, average over the sequence, and emit a single
# sigmoid output, with dropout applied before and after the pooling step
model = tf.keras.Sequential([
  layers.Embedding(max_features, embedding_dim),  # token id -> dense vector
  layers.Dropout(0.2),
  layers.GlobalAveragePooling1D(),  # average over the sequence axis
  layers.Dropout(0.2),
  layers.Dense(1, activation='sigmoid'),  # single sigmoid output unit
])

model.summary()

# Loss function and optimizer

In [None]:
# Adam optimizer, binary cross-entropy loss, and binary accuracy measured
# with a 0.5 decision threshold
model.compile(
    optimizer='adam',
    loss=losses.BinaryCrossentropy(),
    metrics=[tf.metrics.BinaryAccuracy(threshold=0.5)],
)

# Train the model

In [None]:
# Number of passes over the training dataset
epochs = 10
# Train the model, evaluating on the validation dataset after each epoch;
# the returned History object is kept for the plots further down
history = model.fit(
    x=train_ds,
    validation_data=val_ds,
    epochs=epochs,
)

# Evaluate the model


Let's see how the model performs on the test set. `model.evaluate` returns two values: the loss (a number that represents the error; lower values are better) and the accuracy.

In [None]:
# Evaluate the trained model on the test dataset
loss, accuracy = model.evaluate(test_ds)

# Report both values, one per line
for metric_name, metric_value in (("Loss: ", loss), ("Accuracy: ", accuracy)):
  print(metric_name, metric_value)

# Create a plot of accuracy and loss over time

In [None]:
# Dictionary of per-epoch values recorded during model.fit()
history_dict = history.history
# Show which series were recorded (displayed as the cell's output)
history_dict.keys()

In [None]:
# Per-epoch series recorded during training, for both splits
acc = history_dict['binary_accuracy']
val_acc = history_dict['val_binary_accuracy']
loss = history_dict['loss']
val_loss = history_dict['val_loss']

# One x-axis position per recorded epoch, starting at 1
epochs = range(1, len(acc) + 1)

fig, ax = plt.subplots()
# 'bo' draws blue dots; 'b' draws a solid blue line
ax.plot(epochs, loss, 'bo', label='Training loss')
ax.plot(epochs, val_loss, 'b', label='Validation loss')
ax.set_title('Training and validation loss')
ax.set_xlabel('Epochs')
ax.set_ylabel('Loss')
ax.legend()

plt.show()

In [None]:
# Accuracy curves: blue dots for training, solid blue line for validation
fig, ax = plt.subplots()
ax.plot(epochs, acc, 'bo', label='Training acc')
ax.plot(epochs, val_acc, 'b', label='Validation acc')
ax.set_title('Training and validation accuracy')
ax.set_xlabel('Epochs')
ax.set_ylabel('Accuracy')
ax.legend(loc='lower right')

plt.show()

In these plots, the dots represent the training loss and accuracy, and the solid lines represent the validation loss and accuracy.

# Export the model

In [None]:
# Chain the vectorization layer and the trained classifier so the exported
# model accepts raw strings. `model` already ends in a sigmoid-activated
# Dense layer, so no further activation is appended: a second sigmoid would
# squash the probabilities into roughly (0.5, 0.73), pushing every prediction
# above the 0.5 decision threshold and making the accuracy meaningless.
export_model = tf.keras.Sequential([
  vectorize_layer,
  model,
])

# from_logits=False matches the probabilities that the classifier outputs
export_model.compile(
    loss=losses.BinaryCrossentropy(from_logits=False), optimizer="adam", metrics=['accuracy']
)

# Test it with `raw_test_ds`, which yields raw strings
loss, accuracy = export_model.evaluate(raw_test_ds)
print(accuracy)

# Inference on new data

To get predictions for new examples, call `predict()` on the exported model, which accepts raw strings.

In [None]:
# Three raw-text reviews to score with the exported model
examples = tf.constant([
    "The movie was great!",
    "The movie was okay",
    "The movie was terrible..."
])
# The predictions are displayed as the cell's output
export_model.predict(examples)