## IMDB with Datasets and Preprocessing Layers

Solution to exercise about unprocessed IMDB reviews.

In [None]:
import os
import shutil
import random

import matplotlib.pyplot as plt
import numpy as np
import tensorflow as tf


In [None]:
# Download the aclImdb_v1 archive and extract it. cache_dir="." keeps the
# download relative to the current working directory.
# NOTE(review): the cells below read from "datasets/aclImdb/..."; the exact
# extraction location can depend on the Keras version - confirm it matches.
file_path = tf.keras.utils.get_file(
  origin="https://ai.stanford.edu/~amaas/data/sentiment/aclImdb_v1.tar.gz",
  extract=True,
  cache_dir="."
)

### Step 2: Create validation directory

In [None]:
# Make sure both class folders of the validation split exist
# (exist_ok=True keeps the cell safe to re-run).
for val_dir in ("datasets/aclImdb/val/pos", "datasets/aclImdb/val/neg"):
  os.makedirs(val_dir, exist_ok=True)

In [None]:
# List and shuffle the test reviews of each class (negatives first).
neg_reviews = os.listdir("datasets/aclImdb/test/neg")
random.shuffle(neg_reviews)
pos_reviews = os.listdir("datasets/aclImdb/test/pos")
random.shuffle(pos_reviews)

# Both folders must still hold all 12,500 files; the assert fails on a second
# run, which prevents moving files twice.
assert len(neg_reviews) == 12500 and len(pos_reviews) == 12500

# Move the first 7,500 shuffled reviews of each class into the validation split.
for src_dir, dst_dir, shuffled_names in (
    ("datasets/aclImdb/test/neg/", "datasets/aclImdb/val/neg/", neg_reviews),
    ("datasets/aclImdb/test/pos/", "datasets/aclImdb/val/pos/", pos_reviews),
):
  for file_name in shuffled_names[:7500]:
    shutil.move(src_dir + file_name, dst_dir)

### Step 3: Create tf.data.Dataset objects

In [None]:
def create_dataset_1(base_dir):
  """Build a (review, sentiment) dataset by loading every review into memory.

  Approach 1: read all reviews into a list and use from_tensor_slices.

  Args:
    base_dir: directory name like "./datasets/aclImdb/train"; it must contain
      a "pos" and a "neg" sub-directory with one review per file.

  Returns:
    A tf.data.Dataset of (review, sentiment) pairs. `review` is the full text
    of one file as a scalar string; `sentiment` is 1.0 for "pos" and 0.0 for
    "neg". All positive reviews come before the negative ones.
  """
  reviews = []
  sentiments = []
  for sentiment in ["pos", "neg"]:
    directory = os.path.join(base_dir, sentiment)
    label = 1.0 if sentiment == "pos" else 0.0
    for file_name in os.listdir(directory):
      review_path = os.path.join(directory, file_name)
      # read() keeps each review as one string. readlines() would wrap it in a
      # list of lines, adding an extra dimension to every element and making
      # the list ragged (from_tensor_slices fails) if a file has several lines.
      # The encoding is explicit so decoding does not depend on the platform.
      with open(review_path, "r", encoding="utf-8") as file:
        reviews.append(file.read())
      sentiments.append(label)

  return tf.data.Dataset.from_tensor_slices((reviews, sentiments))

In [None]:
# Show the first three (review, label) pairs produced by approach 1,
# each followed by a separator line.
for X, y in create_dataset_1("./datasets/aclImdb/test").take(3):
    print(X, y, "*"*50, sep="\n")

In [None]:
def create_dataset_2(base_dir):
  """Build a (review, sentiment) dataset that streams reviews from disk.

  Approach 2: hand the file paths to TextLineDataset, which reads the files
  line by line (four files in parallel) instead of loading them up front.

  Args:
    base_dir: directory name like "./datasets/aclImdb/train" containing a
      "pos" and a "neg" sub-directory.

  Returns:
    A tf.data.Dataset of (line, sentiment) pairs: every line read from "pos"
    is labelled 1.0, every line from "neg" 0.0. The positive part is followed
    by the negative part.
  """
  def list_review_paths(sentiment):
    # Full paths of all files in the sub-directory of one class.
    directory = os.path.join(base_dir, sentiment)
    return [os.path.join(directory, name) for name in os.listdir(directory)]

  pos_file_paths = list_review_paths("pos")
  neg_file_paths = list_review_paths("neg")

  pos_lines = tf.data.TextLineDataset(pos_file_paths, num_parallel_reads=4)
  labelled_pos = pos_lines.map(lambda text: (text, 1.0))

  neg_lines = tf.data.TextLineDataset(neg_file_paths, num_parallel_reads=4)
  labelled_neg = neg_lines.map(lambda text: (text, 0.0))

  return labelled_pos.concatenate(labelled_neg)

In [None]:
# Show the first three (review, label) pairs produced by approach 2,
# each followed by a separator line.
for X, y in create_dataset_2("./datasets/aclImdb/test").take(3):
    print(X, y, "*"*50, sep="\n")

In [None]:
# Time building approach 1 and iterating once over the whole training set
# (-r1: a single repeat).
%timeit -r1 for X, y in create_dataset_1("./datasets/aclImdb/train"): pass

In [None]:
# Same measurement for approach 2, for comparison with the cell above
# (-r1: a single repeat).
%timeit -r1 for X, y in create_dataset_2("./datasets/aclImdb/train"): pass

In [None]:
BATCH_SIZE = 512

# create_dataset_2 yields all positive reviews before the negative ones, so
# the shuffle buffer has to hold the whole training split (25,000 reviews in
# aclImdb) for the classes to mix uniformly. With a smaller buffer the early
# batches of every epoch are skewed towards positives and the later ones
# towards negatives.
train_ds = (
    create_dataset_2("./datasets/aclImdb/train").
    shuffle(buffer_size=25_000, seed=42).
    batch(BATCH_SIZE).
    prefetch(1)
)
# Validation and test data are only evaluated, so they are batched without
# shuffling.
val_ds = (
    create_dataset_2("./datasets/aclImdb/val").
    batch(BATCH_SIZE).
    prefetch(1)
)
test_ds = (
    create_dataset_2("./datasets/aclImdb/test").
    batch(BATCH_SIZE).
    prefetch(1)
)



### Step 4: Create and train a model with multi-hot encoding

In [None]:
# Upper bound on the vocabulary size, shared by all vectorization layers below.
VOCAB_SIZE=10_000
# Vectorization layer configured for "multi_hot" output.
multi_hot_layer = tf.keras.layers.TextVectorization(
    max_tokens=VOCAB_SIZE,
    output_mode="multi_hot"
)
# Learn the vocabulary from the training reviews only; the map drops the labels.
multi_hot_layer.adapt(train_ds.map(lambda review, sentiment : review))


In [None]:
# Quick check: encode two sample sentences with the adapted layer.
multi_hot_layer(["This movie was great", "Terrible!"])

In [None]:
def get_model(conversion_layer):
    """Return an uncompiled binary classifier placed on top of `conversion_layer`.

    Args:
        conversion_layer: first layer of the model; the notebook passes an
            adapted TextVectorization layer here.

    Returns:
        A tf.keras.Sequential model: the conversion layer, two 16-unit ReLU
        layers (He-uniform initialised) and a single sigmoid output unit.
    """
    return tf.keras.Sequential([
        conversion_layer,
        tf.keras.layers.Dense(units=16, activation='relu',
                              kernel_initializer="he_uniform"),
        tf.keras.layers.Dense(units=16, activation='relu',
                              kernel_initializer="he_uniform"),
        tf.keras.layers.Dense(units=1, activation='sigmoid'),
    ])

In [None]:
# Stop training once val_accuracy has failed to improve by at least 0.001 for
# 5 consecutive epochs, then roll back to the weights of the best epoch.
# The same callback object is passed to every fit() call in this notebook.
early_stopping = tf.keras.callbacks.EarlyStopping(
    monitor="val_accuracy",
    min_delta=0.001,
    patience=5,
    restore_best_weights=True
)

In [None]:
# First model: the dense classifier fed by the multi-hot vectorization layer.
model = get_model(multi_hot_layer)

In [None]:
# Binary classification setup: binary cross-entropy on the sigmoid output,
# with accuracy reported as the metric.
model.compile(
    optimizer='rmsprop',
    loss='binary_crossentropy',
    metrics=['accuracy']
)

In [None]:
# Train for at most 100 epochs; the early-stopping callback may end the run
# sooner. The returned history is plotted below.
history = model.fit(
    train_ds,
    validation_data = val_ds,
    epochs=100,
    callbacks=[early_stopping],
)

In [None]:
def plot_learning_curves(history):
    """Plot every series recorded in a Keras History object against the epoch.

    The series in `history.history` are drawn in their stored order with the
    line styles "r-o", "r-*", "b-o", "b-*"; series beyond the fourth are not
    drawn. Epochs are numbered from 1 and the y-axis is fixed to [0, 1].

    Args:
        history: object with an `epoch` list and a `history` dict that
            contains at least the key 'loss' (e.g. the value returned by
            model.fit).
    """
    line_styles = ["r-o", "r-*", "b-o", "b-*"]

    plt.figure(figsize=(8, 5))
    # history.epoch is zero-based; shift so the x-axis starts at epoch 1.
    epoch_numbers = np.array(history.epoch) + 1
    for (series_name, values), line_style in zip(history.history.items(),
                                                 line_styles):
        plt.plot(epoch_numbers, values, line_style, label=series_name)

    n_epochs = len(history.history['loss'])
    plt.xlabel("Epoch")
    plt.axis([1, n_epochs, 0., 1])
    plt.legend(loc="lower left")
    plt.grid()

In [None]:
# Learning curves of the multi-hot model.
plot_learning_curves(history)

In [None]:
# Evaluate the multi-hot model on the validation set.
model.evaluate(val_ds)

### Step 5: Create and Train a Model with TF-IDF

In [None]:
# Same vocabulary limit as the multi-hot layer, but with "tf_idf" output.
tf_idf_layer = tf.keras.layers.TextVectorization(
    max_tokens=VOCAB_SIZE,
    output_mode="tf_idf"
)
# Adapt on the training reviews only; the map drops the labels.
tf_idf_layer.adapt(train_ds.map(lambda review, sentiment : review))

In [None]:
# Second model: same classifier and training setup as the first model, only
# the vectorization layer differs.
model2 = get_model(tf_idf_layer)
model2.compile(
    optimizer='rmsprop',
    loss='binary_crossentropy',
    metrics=['accuracy']
)
# NOTE: this overwrites the `history` of the multi-hot model.
history = model2.fit(
    train_ds,
    validation_data = val_ds,
    epochs=100,
    callbacks=[early_stopping],
)

In [None]:
# Evaluate the TF-IDF model on the validation set.
model2.evaluate(val_ds)

In [None]:
# Print the layer overview of the TF-IDF model.
model2.summary()

### Step 6: Create a Custom Embedding Layer

In [None]:
# Vectorization layer with 'int' output, i.e. integer token ids rather than a
# fixed-size bag-of-words vector.
int_layer = tf.keras.layers.TextVectorization(
    max_tokens = VOCAB_SIZE,
    output_mode = 'int'
)
# Adapt on the training reviews only; the map drops the labels.
int_layer.adapt(train_ds.map(lambda review, sentiment : review))
# Quick check: encode two sample sentences of different length.
int_layer(['It was a terrible movie', "Super!"])

In [None]:
class MeanEmbeddingLayer(tf.keras.layers.Layer):
  """Embed integer token ids and pool them into one vector per sequence.

  Id 0 is treated as padding and left out of the pooling. The result is the
  sum of the remaining token embeddings divided by the square root of the
  number of non-zero ids in the sequence.
  """

  def __init__(self, input_dim, output_dim, **kwargs):
    """Create the layer and its internal Embedding layer.

    Args:
      input_dim: `input_dim` passed to the Embedding layer (vocabulary size).
      output_dim: dimensionality of the embedding vectors.
      **kwargs: forwarded to tf.keras.layers.Layer.
    """
    super().__init__(**kwargs)
    self.input_dim = input_dim
    self.output_dim = output_dim
    self.embedding_layer = tf.keras.layers.Embedding(
        input_dim=self.input_dim,
        output_dim=self.output_dim
    )

  def call(self, ints):
    """Pool the embeddings of the non-zero ids of each sequence.

    Args:
      ints: integer tensor, expected shape (batch_size, max_sequence_length).

    Returns:
      Float tensor of shape (batch_size, output_dim). A sequence without any
      non-zero id yields a zero vector.
    """
    # multiplier.shape -> (batch_size, max_sequence_length, 1)
    # 1.0 where a real token is present, 0.0 at padding positions.
    multiplier = tf.expand_dims(tf.where(ints != 0, 1.0, 0.0), -1)

    # scale_factor.shape -> (batch_size, 1)
    scale_factor = tf.math.sqrt(
        tf.math.count_nonzero(
            ints, axis=-1, keepdims=True, dtype=tf.dtypes.float32)
        )

    # word_embeddings.shape -> (batch_size, max_sequence_length, output_dim)
    word_embeddings = self.embedding_layer(ints)

    # unscaled_sum.shape -> (batch_size, output_dim)
    # The multiplier zeroes out the embeddings of the padding positions.
    unscaled_sum = tf.reduce_sum(word_embeddings * multiplier, axis=-2)

    # A sequence that holds only zeros has scale_factor == 0; a plain division
    # would produce 0 / 0 = NaN there, so return 0 for those rows instead.
    return tf.math.divide_no_nan(unscaled_sum, scale_factor)

  def get_config(self):
    """Return the layer config including the constructor arguments."""
    base_config = super().get_config()
    return {**base_config, "input_dim": self.input_dim,
                           "output_dim": self.output_dim}

In [None]:
def get_model_with_embedding(vectorization_layer, output_dim):
  """Return an uncompiled classifier built on pooled word embeddings.

  Args:
    vectorization_layer: should be a TextVectorization layer; the length of
      its vocabulary sets the size of the embedding table.
    output_dim: the dimensionality of the embedding vectors.

  Returns:
    A tf.keras.Sequential model: the vectorization layer, a
    MeanEmbeddingLayer, two 16-unit ReLU layers (He-uniform initialised) and
    a single sigmoid output unit.
  """
  return tf.keras.Sequential([
      vectorization_layer,
      MeanEmbeddingLayer(len(vectorization_layer.get_vocabulary()),
                         output_dim),
      tf.keras.layers.Dense(units=16, activation='relu',
                            kernel_initializer="he_uniform"),
      tf.keras.layers.Dense(units=16, activation='relu',
                            kernel_initializer="he_uniform"),
      tf.keras.layers.Dense(units=1, activation='sigmoid'),
  ])


In [None]:
# Integer-id vectorization layer used by the embedding model.
# NOTE(review): this repeats the configuration and the adapt() call of the
# earlier int_layer cell, so adapt() makes another pass over train_ds.
int_layer = tf.keras.layers.TextVectorization(
    max_tokens = VOCAB_SIZE,
    output_mode = 'int'
)
int_layer.adapt(train_ds.map(lambda review, sentiment : review))

In [None]:
# Third model: 16-dimensional embeddings pooled by MeanEmbeddingLayer.
model3 = get_model_with_embedding(int_layer, output_dim=16)
# Print the layer overview of the embedding model.
model3.summary()

In [None]:
# Same compile and training setup as the two previous models.
model3.compile(
    optimizer='rmsprop',
    loss='binary_crossentropy',
    metrics=['accuracy']
)
# NOTE: this overwrites the `history` of the previous model.
history = model3.fit(
    train_ds,
    validation_data = val_ds,
    epochs=100,
    callbacks=[early_stopping],
)

In [None]:
# Evaluate the embedding model on the validation set.
model3.evaluate(val_ds)

In [None]:
# Evaluate the embedding model on the test set (the reviews left in
# datasets/aclImdb/test after the validation split was moved out).
model3.evaluate(test_ds)