Team Members:
Brinda Rao,
Janusz Feigel,
Bhavana Malla

In [None]:
import tensorflow as tf
import numpy as np
from matplotlib import pyplot as plt

In [None]:
# Experiment: stacked LSTM with length bucketing, within-batch padding and masking.

# prepare the data
max_words = 20000  # vocabulary size, passed to load_data as num_words
max_len = 200  # NOTE: not used in this cell -- sequences keep their full length here

(train_sequences, train_labels), (test_sequences, test_labels) = tf.keras.datasets.imdb.load_data(num_words=max_words)


def preprocess(sequences, labels):
    """Return the sequences unchanged and the labels cast to int32.

    The cast makes the labels match the int32 dtype declared in the
    dataset's output_signature below.
    """
    return sequences, labels.astype(np.int32)


train_sequences, train_labels = preprocess(train_sequences, train_labels)
test_sequences, test_labels = preprocess(test_sequences, test_labels)


def gen():
    """Yield (sequence, label) pairs from the training split."""
    yield from zip(train_sequences, train_labels)


def gen_test():
    """Yield (sequence, label) pairs from the test split."""
    yield from zip(test_sequences, test_labels)


# bucketing and within-batch padding
buckets = [50, 100, 150, 200, 250, 300, 350, 400, 450, 500]
bucket_batch_size = [32] * (len(buckets) + 1)  # one batch size per bucket (boundaries + 1)


def make_bucketed_dataset(generator, shuffle_buffer=None):
    """Build a length-bucketed dataset from a generator of (sequence, label) pairs.

    Args:
        generator: zero-argument callable yielding (sequence, label) pairs.
        shuffle_buffer: if given, shuffle the examples with a buffer of this
            size before bucketing; leave as None for evaluation data.

    Returns:
        A tf.data.Dataset grouped by sequence length using `buckets` and
        `bucket_batch_size`.
    """
    dataset = tf.data.Dataset.from_generator(
        generator,
        output_signature=(
            tf.TensorSpec(shape=(None,), dtype=tf.int32),
            tf.TensorSpec(shape=(), dtype=tf.int32)))
    if shuffle_buffer:
        # Dataset.shuffle reshuffles on every iteration by default, so the
        # batches are not composed identically in each epoch.
        dataset = dataset.shuffle(shuffle_buffer)
    return dataset.bucket_by_sequence_length(
        lambda sequence, label: tf.shape(sequence)[0],
        bucket_boundaries=buckets,
        bucket_batch_sizes=bucket_batch_size)


# Training data is shuffled over the whole split; test data keeps its order.
train_data = make_bucketed_dataset(gen, shuffle_buffer=len(train_sequences))
test_data = make_bucketed_dataset(gen_test)

#define the model
# embedding, masking and keras LSTM
model = tf.keras.Sequential([
    tf.keras.layers.Embedding(max_words, 128, mask_zero=True),
    tf.keras.layers.LSTM(32, return_sequences=True),
    tf.keras.layers.LSTM(32, return_sequences=True),
    tf.keras.layers.LSTM(32),
    tf.keras.layers.Dense(1, activation="sigmoid"),
])

model.compile(
    loss=tf.keras.losses.BinaryCrossentropy(),
    optimizer=tf.keras.optimizers.Adam(),
    metrics=["accuracy"],
)

#run the model
history = model.fit(train_data, epochs=5)

# evaluate() returns [loss, accuracy] given the metrics compiled above
test_scores = model.evaluate(test_data)
print("Test loss:", test_scores[0])
print("Test accuracy:", test_scores[1])

Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5
Test loss: 0.40771278738975525
Test accuracy: 0.8541200160980225


# Embedding and LSTM Experiments
* Embedding 256, LSTM 128, LSTM 64, 5 epochs: 0.95 train acc, 0.82 test acc
* Embedding 128, LSTM 32, LSTM 32, 5 epochs: 0.93 train acc, 0.86 test acc
* Embedding 128, LSTM 32, LSTM 32, LSTM 32, 5 epochs: 0.97 train acc, 0.86 test acc
* Embedding 256, LSTM 32, LSTM 32, LSTM 32, 5 epochs: 0.90 train acc, 0.86 test acc
* Embedding 128, LSTM 128, LSTM 64, LSTM 32, 5 epochs: 0.92 train acc, 0.86 test acc
* Embedding 128, LSTM 128, LSTM 128, LSTM 128, 5 epochs: 0.93 train acc, 0.86 test acc

# GRU

In [None]:
# Experiment: stacked GRU with length bucketing, within-batch padding and masking.

# prepare the data
max_words = 20000  # vocabulary size, passed to load_data as num_words
max_len = 200  # NOTE: not used in this cell -- sequences keep their full length here

(train_sequences, train_labels), (test_sequences, test_labels) = tf.keras.datasets.imdb.load_data(num_words=max_words)


def preprocess(sequences, labels):
    """Return the sequences unchanged and the labels cast to int32.

    The cast makes the labels match the int32 dtype declared in the
    dataset's output_signature below.
    """
    return sequences, labels.astype(np.int32)


train_sequences, train_labels = preprocess(train_sequences, train_labels)
test_sequences, test_labels = preprocess(test_sequences, test_labels)


def gen():
    """Yield (sequence, label) pairs from the training split."""
    yield from zip(train_sequences, train_labels)


def gen_test():
    """Yield (sequence, label) pairs from the test split."""
    yield from zip(test_sequences, test_labels)


# bucketing and within-batch padding
buckets = [50, 100, 150, 200, 250, 300, 350, 400, 450, 500]
bucket_batch_size = [32] * (len(buckets) + 1)  # one batch size per bucket (boundaries + 1)


def make_bucketed_dataset(generator, shuffle_buffer=None):
    """Build a length-bucketed dataset from a generator of (sequence, label) pairs.

    Args:
        generator: zero-argument callable yielding (sequence, label) pairs.
        shuffle_buffer: if given, shuffle the examples with a buffer of this
            size before bucketing; leave as None for evaluation data.

    Returns:
        A tf.data.Dataset grouped by sequence length using `buckets` and
        `bucket_batch_size`.
    """
    dataset = tf.data.Dataset.from_generator(
        generator,
        output_signature=(
            tf.TensorSpec(shape=(None,), dtype=tf.int32),
            tf.TensorSpec(shape=(), dtype=tf.int32)))
    if shuffle_buffer:
        # Dataset.shuffle reshuffles on every iteration by default, so the
        # batches are not composed identically in each epoch.
        dataset = dataset.shuffle(shuffle_buffer)
    return dataset.bucket_by_sequence_length(
        lambda sequence, label: tf.shape(sequence)[0],
        bucket_boundaries=buckets,
        bucket_batch_sizes=bucket_batch_size)


# Training data is shuffled over the whole split; test data keeps its order.
train_data = make_bucketed_dataset(gen, shuffle_buffer=len(train_sequences))
test_data = make_bucketed_dataset(gen_test)

#define the model
# embedding, masking and keras GRU
model = tf.keras.Sequential([
    tf.keras.layers.Embedding(max_words, 128, mask_zero=True),
    tf.keras.layers.GRU(32, return_sequences=True),
    tf.keras.layers.GRU(32, return_sequences=True),
    tf.keras.layers.GRU(32),
    tf.keras.layers.Dense(1, activation="sigmoid"),
])

model.compile(
    loss=tf.keras.losses.BinaryCrossentropy(),
    optimizer=tf.keras.optimizers.Adam(),
    metrics=["accuracy"],
)

#run the model
history = model.fit(train_data, epochs=5)

# evaluate() returns [loss, accuracy] given the metrics compiled above
test_scores = model.evaluate(test_data)
print("Test loss:", test_scores[0])
print("Test accuracy:", test_scores[1])

Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5
Test loss: 0.48423057794570923
Test accuracy: 0.8568800091743469


Compared with the LSTM model: same training speed, better train accuracy, same test accuracy.

# Bidirectional

In [None]:
# Experiment: stacked bidirectional LSTM with length bucketing, within-batch
# padding and masking.

# prepare the data
max_words = 20000  # vocabulary size, passed to load_data as num_words
max_len = 200  # NOTE: not used in this cell -- sequences keep their full length here

(train_sequences, train_labels), (test_sequences, test_labels) = tf.keras.datasets.imdb.load_data(num_words=max_words)


def preprocess(sequences, labels):
    """Return the sequences unchanged and the labels cast to int32.

    The cast makes the labels match the int32 dtype declared in the
    dataset's output_signature below.
    """
    return sequences, labels.astype(np.int32)


train_sequences, train_labels = preprocess(train_sequences, train_labels)
test_sequences, test_labels = preprocess(test_sequences, test_labels)


def gen():
    """Yield (sequence, label) pairs from the training split."""
    yield from zip(train_sequences, train_labels)


def gen_test():
    """Yield (sequence, label) pairs from the test split."""
    yield from zip(test_sequences, test_labels)


# bucketing and within-batch padding
buckets = [50, 100, 150, 200, 250, 300, 350, 400, 450, 500]
bucket_batch_size = [32] * (len(buckets) + 1)  # one batch size per bucket (boundaries + 1)


def make_bucketed_dataset(generator, shuffle_buffer=None):
    """Build a length-bucketed dataset from a generator of (sequence, label) pairs.

    Args:
        generator: zero-argument callable yielding (sequence, label) pairs.
        shuffle_buffer: if given, shuffle the examples with a buffer of this
            size before bucketing; leave as None for evaluation data.

    Returns:
        A tf.data.Dataset grouped by sequence length using `buckets` and
        `bucket_batch_size`.
    """
    dataset = tf.data.Dataset.from_generator(
        generator,
        output_signature=(
            tf.TensorSpec(shape=(None,), dtype=tf.int32),
            tf.TensorSpec(shape=(), dtype=tf.int32)))
    if shuffle_buffer:
        # Dataset.shuffle reshuffles on every iteration by default, so the
        # batches are not composed identically in each epoch.
        dataset = dataset.shuffle(shuffle_buffer)
    return dataset.bucket_by_sequence_length(
        lambda sequence, label: tf.shape(sequence)[0],
        bucket_boundaries=buckets,
        bucket_batch_sizes=bucket_batch_size)


# Training data is shuffled over the whole split; test data keeps its order.
train_data = make_bucketed_dataset(gen, shuffle_buffer=len(train_sequences))
test_data = make_bucketed_dataset(gen_test)

#define the model
# embedding, masking and bidirectional keras LSTM
model = tf.keras.Sequential([
    tf.keras.layers.Embedding(max_words, 128, mask_zero=True),
    tf.keras.layers.Bidirectional(tf.keras.layers.LSTM(32, return_sequences=True)),
    tf.keras.layers.Bidirectional(tf.keras.layers.LSTM(32, return_sequences=True)),
    tf.keras.layers.Bidirectional(tf.keras.layers.LSTM(32)),
    tf.keras.layers.Dense(1, activation="sigmoid"),
])

model.compile(
    loss=tf.keras.losses.BinaryCrossentropy(),
    optimizer=tf.keras.optimizers.Adam(),
    metrics=["accuracy"],
)

#run the model
history = model.fit(train_data, epochs=5)

# evaluate() returns [loss, accuracy] given the metrics compiled above
test_scores = model.evaluate(test_data)
print("Test loss:", test_scores[0])
print("Test accuracy:", test_scores[1])

Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5
Test loss: 0.5764022469520569
Test accuracy: 0.8508399724960327


Compared with the unidirectional LSTM model: about 2 times slower, better train accuracy, similar test accuracy.

# Without bucketing

In [None]:
# Experiment: stacked LSTM with within-batch padding and masking, but WITHOUT
# length bucketing.

# prepare the data
max_words = 20000  # vocabulary size, passed to load_data as num_words
max_len = 200  # NOTE: not used in this cell -- sequences keep their full length here
(train_sequences, train_labels), (test_sequences, test_labels) = tf.keras.datasets.imdb.load_data(num_words=max_words)


def preprocess(sequences, labels):
    """Return the sequences unchanged and the labels cast to int32.

    The cast makes the labels match the int32 dtype declared in the
    dataset's output_signature below.
    """
    return sequences, labels.astype(np.int32)


train_sequences, train_labels = preprocess(train_sequences, train_labels)
test_sequences, test_labels = preprocess(test_sequences, test_labels)


def gen():
    """Yield (sequence, label) pairs from the training split."""
    yield from zip(train_sequences, train_labels)


def gen_test():
    """Yield (sequence, label) pairs from the test split."""
    yield from zip(test_sequences, test_labels)


def make_padded_dataset(generator, shuffle_buffer=None):
    """Build a dataset of 32-example batches padded within each batch.

    Args:
        generator: zero-argument callable yielding (sequence, label) pairs.
        shuffle_buffer: if given, shuffle the examples with a buffer of this
            size before batching; leave as None for evaluation data.

    Returns:
        A tf.data.Dataset produced by padded_batch(32).
    """
    dataset = tf.data.Dataset.from_generator(
        generator,
        output_signature=(
            tf.TensorSpec(shape=(None,), dtype=tf.int32),
            tf.TensorSpec(shape=(), dtype=tf.int32)))
    if shuffle_buffer:
        # Dataset.shuffle reshuffles on every iteration by default, so the
        # batches are not composed identically in each epoch.
        dataset = dataset.shuffle(shuffle_buffer)
    # within-batch padding
    return dataset.padded_batch(32)


# Training data is shuffled over the whole split; test data keeps its order.
train_data = make_padded_dataset(gen, shuffle_buffer=len(train_sequences))
test_data = make_padded_dataset(gen_test)

#define the model
# embedding, masking and keras LSTM
model = tf.keras.Sequential([
    tf.keras.layers.Embedding(max_words, 128, mask_zero=True),
    tf.keras.layers.LSTM(32, return_sequences=True),
    tf.keras.layers.LSTM(32, return_sequences=True),
    tf.keras.layers.LSTM(32),
    tf.keras.layers.Dense(1, activation="sigmoid"),
])

model.compile(
    loss=tf.keras.losses.BinaryCrossentropy(),
    optimizer=tf.keras.optimizers.Adam(),
    metrics=["accuracy"],
)

#run the model
history = model.fit(train_data, epochs=5)

# evaluate() returns [loss, accuracy] given the metrics compiled above
test_scores = model.evaluate(test_data)
print("Test loss:", test_scores[0])
print("Test accuracy:", test_scores[1])

Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5
Test loss: 0.4819253981113434
Test accuracy: 0.8527600169181824


Similar results to the bucketed LSTM model, but about 2 times slower.

# Without bucketing and within-batch padding

In [None]:
# Experiment: stacked LSTM with masking, WITHOUT bucketing and WITHOUT
# within-batch padding -- every review is padded/truncated to max_len up front.

# prepare the data
max_words = 20000  # vocabulary size, passed to load_data as num_words
max_len = 200  # fixed length every sequence is padded/truncated to

(train_sequences, train_labels), (test_sequences, test_labels) = tf.keras.datasets.imdb.load_data(num_words=max_words)


def preprocess(sequences, labels):
    """Return the sequences unchanged and the labels cast to int32."""
    return sequences, labels.astype(np.int32)


train_sequences, train_labels = preprocess(train_sequences, train_labels)
test_sequences, test_labels = preprocess(test_sequences, test_labels)


# global padding to a fixed length (no bucketing, no within-batch padding)
# Pad at the END of each sequence (padding="post"). pad_sequences defaults to
# "pre", but with mask_zero=True the Keras LSTM layer only uses its fast cuDNN
# kernel when the inputs are strictly right-padded; left-padded input falls
# back to the much slower generic implementation and skews speed comparisons.
# Truncation keeps its default behaviour.
train_sequences_padded = tf.keras.preprocessing.sequence.pad_sequences(
    train_sequences, maxlen=max_len, padding="post")
train_data = tf.data.Dataset.from_tensor_slices((train_sequences_padded, train_labels))
# NOTE: batch size 64 here, while the generator-based experiments in this
# notebook use 32.
train_data = train_data.shuffle(25000).batch(64)

#prepare the test data
#without bucketing and within-batch padding
test_sequences_padded = tf.keras.preprocessing.sequence.pad_sequences(
    test_sequences, maxlen=max_len, padding="post")
test_data = tf.data.Dataset.from_tensor_slices((test_sequences_padded, test_labels))
# No shuffle for evaluation: it only adds work and does not help when
# averaging loss/accuracy over the whole test set.
test_data = test_data.batch(64)

#define the model
# embedding, masking and keras LSTM
model = tf.keras.Sequential([
    tf.keras.layers.Embedding(max_words, 128, mask_zero=True),
    tf.keras.layers.LSTM(32, return_sequences=True),
    tf.keras.layers.LSTM(32, return_sequences=True),
    tf.keras.layers.LSTM(32),
    tf.keras.layers.Dense(1, activation="sigmoid"),
])

model.compile(
    loss=tf.keras.losses.BinaryCrossentropy(),
    optimizer=tf.keras.optimizers.Adam(),
    metrics=["accuracy"],
)

#run the model
history = model.fit(train_data, epochs=5)

# evaluate() returns [loss, accuracy] given the metrics compiled above
test_scores = model.evaluate(test_data)
print("Test loss:", test_scores[0])
print("Test accuracy:", test_scores[1])

Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5
Test loss: 0.5163354277610779
Test accuracy: 0.8468800187110901


Similar results to the bucketed LSTM model, but about 8 times slower.

# Without Masking

In [None]:
# Experiment: stacked LSTM with length bucketing and within-batch padding,
# but WITHOUT masking.

# prepare the data
max_words = 20000  # vocabulary size, passed to load_data as num_words
max_len = 200  # NOTE: not used in this cell -- sequences keep their full length here

(train_sequences, train_labels), (test_sequences, test_labels) = tf.keras.datasets.imdb.load_data(num_words=max_words)


def preprocess(sequences, labels):
    """Return the sequences unchanged and the labels cast to int32.

    The cast makes the labels match the int32 dtype declared in the
    dataset's output_signature below.
    """
    return sequences, labels.astype(np.int32)


train_sequences, train_labels = preprocess(train_sequences, train_labels)
test_sequences, test_labels = preprocess(test_sequences, test_labels)


def gen():
    """Yield (sequence, label) pairs from the training split."""
    yield from zip(train_sequences, train_labels)


def gen_test():
    """Yield (sequence, label) pairs from the test split."""
    yield from zip(test_sequences, test_labels)


# bucketing and within-batch padding
buckets = [50, 100, 150, 200, 250, 300, 350, 400, 450, 500]
bucket_batch_size = [32] * (len(buckets) + 1)  # one batch size per bucket (boundaries + 1)


def make_bucketed_dataset(generator, shuffle_buffer=None):
    """Build a length-bucketed dataset from a generator of (sequence, label) pairs.

    Args:
        generator: zero-argument callable yielding (sequence, label) pairs.
        shuffle_buffer: if given, shuffle the examples with a buffer of this
            size before bucketing; leave as None for evaluation data.

    Returns:
        A tf.data.Dataset grouped by sequence length using `buckets` and
        `bucket_batch_size`.
    """
    dataset = tf.data.Dataset.from_generator(
        generator,
        output_signature=(
            tf.TensorSpec(shape=(None,), dtype=tf.int32),
            tf.TensorSpec(shape=(), dtype=tf.int32)))
    if shuffle_buffer:
        # Dataset.shuffle reshuffles on every iteration by default, so the
        # batches are not composed identically in each epoch.
        dataset = dataset.shuffle(shuffle_buffer)
    return dataset.bucket_by_sequence_length(
        lambda sequence, label: tf.shape(sequence)[0],
        bucket_boundaries=buckets,
        bucket_batch_sizes=bucket_batch_size)


# Training data is shuffled over the whole split; test data keeps its order.
train_data = make_bucketed_dataset(gen, shuffle_buffer=len(train_sequences))
test_data = make_bucketed_dataset(gen_test)

#define the model
# embedding WITHOUT masking (mask_zero=False) and keras LSTM
model = tf.keras.Sequential([
    tf.keras.layers.Embedding(max_words, 128, mask_zero=False),
    tf.keras.layers.LSTM(32, return_sequences=True),
    tf.keras.layers.LSTM(32, return_sequences=True),
    tf.keras.layers.LSTM(32),
    tf.keras.layers.Dense(1, activation="sigmoid"),
])

model.compile(
    loss=tf.keras.losses.BinaryCrossentropy(),
    optimizer=tf.keras.optimizers.Adam(),
    metrics=["accuracy"],
)

#run the model
history = model.fit(train_data, epochs=5)

# evaluate() returns [loss, accuracy] given the metrics compiled above
test_scores = model.evaluate(test_data)
print("Test loss:", test_scores[0])
print("Test accuracy:", test_scores[1])

Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5
Test loss: 0.6931464672088623
Test accuracy: 0.5


Similar speed to the masked LSTM model, but far worse results: test accuracy of 0.50 is chance level for this two-class task.