In [17]:
import io
import os
import re
import shutil
import string
import tensorflow as tf

from tensorflow.keras import Sequential
from tensorflow.keras.layers import Dense, Embedding, GlobalAveragePooling1D
from tensorflow.keras.layers import TextVectorization

In [2]:
url = "https://ai.stanford.edu/~amaas/data/sentiment/aclImdb_v1.tar.gz"

dataset = tf.keras.utils.get_file("aclImdb_v1.tar.gz", url,
                                  untar=True, cache_dir='.',
                                  cache_subdir='')

dataset_dir = os.path.join(os.path.dirname(dataset), 'aclImdb')
os.listdir(dataset_dir)

Downloading data from https://ai.stanford.edu/~amaas/data/sentiment/aclImdb_v1.tar.gz


['train', 'imdb.vocab', 'imdbEr.txt', 'README', 'test']

In [3]:
train_dir = os.path.join(dataset_dir, "train")

os.listdir(train_dir)

['urls_pos.txt',
 'urls_neg.txt',
 'neg',
 'urls_unsup.txt',
 'labeledBow.feat',
 'unsupBow.feat',
 'pos',
 'unsup']

In [4]:
import shutil
# removes = ["neg", "pos","unsup"]
# for rm_dir in removes:
#   remove_dir = os.path.join(train_dir,rm_dir)
#   shutil.rmtree(remove_dir)

remove_dir = os.path.join(train_dir,"unsup")
shutil.rmtree(remove_dir)

In [5]:
batch_size = 1024
seed = 123

train_ds = tf.keras.preprocessing.text_dataset_from_directory(
                                          'aclImdb/train', batch_size=batch_size, validation_split=0.2,
                                          subset='training', seed=seed)
val_ds = tf.keras.preprocessing.text_dataset_from_directory("aclImdb/train",
                                                             batch_size=batch_size,
                                                             validation_split=0.2,
                                                             subset="validation",
                                                             seed=seed)
train_ds, val_ds

Found 25000 files belonging to 2 classes.
Using 20000 files for training.
Found 25000 files belonging to 2 classes.
Using 5000 files for validation.


(<BatchDataset shapes: ((None,), (None,)), types: (tf.string, tf.int32)>,
 <BatchDataset shapes: ((None,), (None,)), types: (tf.string, tf.int32)>)

In [6]:
for text_batch, label_batch in train_ds.take(1):
  for i in range(5):
    print(label_batch[i].numpy(), text_batch.numpy()[i])

0 b"Oh My God! Please, for the love of all that is holy, Do Not Watch This Movie! It it 82 minutes of my life I will never get back. Sure, I could have stopped watching half way through. But I thought it might get better. It Didn't. Anyone who actually enjoyed this movie is one seriously sick and twisted individual. No wonder us Australians/New Zealanders have a terrible reputation when it comes to making movies. Everything about this movie is horrible, from the acting to the editing. I don't even normally write reviews on here, but in this case I'll make an exception. I only wish someone had of warned me before I hired this catastrophe"
1 b'This movie is SOOOO funny!!! The acting is WONDERFUL, the Ramones are sexy, the jokes are subtle, and the plot is just what every high schooler dreams of doing to his/her school. I absolutely loved the soundtrack as well as the carefully placed cynicism. If you like monty python, You will love this film. This movie is a tad bit "grease"esk (without

In [7]:
AUTOTUNE = tf.data.AUTOTUNE

train_ds = train_ds.cache().prefetch(buffer_size=AUTOTUNE)
val_ds = val_ds.cache().prefetch(buffer_size=AUTOTUNE)

In [8]:
# Embed a 1,000 word vocabulary into 5 dimensions.
embedding_layer = tf.keras.layers.Embedding(1000, 5)

In [33]:
# Create a custom standardization function to strip HTML break tags '<br />'
def custom_standardization(input_data):
  text = tf.strings.lower(input_data)
  stripped_html = tf.strings.regex_replace(text, "<br />", " ")
  return stripped_html
# Vocabulary size and number of words in a sequence.
vocab_size = 10000
sequence_length = 100

# Use the text vectorization layer to normalize, split, and map strings to
# integers. Note that the layer uses the custom standardization defined above.
# Set maximum_sequence length as all samples are not of the same length.
text_vectorizer = TextVectorization(
    standardize=custom_standardization,
    max_tokens=vocab_size,
    output_mode='int',
    output_sequence_length=sequence_length)

test_ds = train_ds.map(lambda x,y: x)
# text_vectorizer.adapt(test_ds)

In [34]:
from tensorflow.keras.layers import Embedding
text_embedding = Embedding(input_dim=vocab_size,
                           output_dim=16,
                           input_length=sequence_length)

## Dense Model

In [35]:
from tensorflow.keras import layers

In [36]:
# model_1 = tf.keras.Sequential([
#   text_vectorizer,
#   text_embedding,
#   layers.GlobalMaxPool1D(),
#   layers.Dense(16, activation="relu"),
#   layers.Dense(1, activation="sigmoid")
# ])

inputs = layers.Input(shape=(1,), dtype=tf.string, name="input_layer")

x = text_vectorizer(inputs)
x = text_embedding(x)

x = layers.GlobalMaxPool1D()(x)
x = layers.Dense(16, activation="relu")(x)
outputs = layers.Dense(1, activation="sigmoid")(x)
model_1 = tf.keras.Model(inputs, outputs, name="model_word_embedding")

model_1.compile(
    loss=tf.keras.losses.BinaryCrossentropy(),
    optimizer=tf.keras.optimizers.Adam(),
    metrics=["accuracy"]
)

# fit the model
model_1_history = model_1.fit(train_ds,
                              validation_data=val_ds,
                              epochs=20)

Epoch 1/20


FailedPreconditionError: ignored

In [31]:
test_ds = tf.keras.preprocessing.text_dataset_from_directory("aclImdb/test",
                                                             seed=seed)
test_ds = test_ds.cache().prefetch(tf.data.AUTOTUNE)
test_ds

Found 25000 files belonging to 2 classes.


<PrefetchDataset shapes: ((None,), (None,)), types: (tf.string, tf.int32)>

In [21]:
model_1.evaluate(test_ds)



[0.0, 0.0]

In [22]:
model_1.save("test.h5")

NotImplementedError: ignored

In [None]:
loaded_model = tf.keras.models.load_model("test.h5")

In [None]:
loaded_model.evaluate(test_ds)

## Model 2

In [23]:
import tensorflow_hub as hub
# Create a Keras layer using USE pretrained layer from TensorFlow hub
sentence_encoder_layer = hub.KerasLayer("https://tfhub.dev/google/universal-sentence-encoder/4",
                                        input_shape=[],
                                        dtype=tf.string,
                                        trainable=False,
                                        name="USE_layer")

In [25]:
import tensorflow as tf
from tensorflow.keras import layers

# create model using Sequential API
model_2 = tf.keras.Sequential([
  sentence_encoder_layer,
  layers.Dense(64, activation="relu", name="extra_dense_layer"), # only model_2 has this layer
  layers.Dense(1, activation="sigmoid")                               
], name="model_2_USE")

# Compile the model
model_2.compile(
    loss="binary_crossentropy",
    optimizer=tf.keras.optimizers.Adam(),
    metrics=["accuracy"]
)

# fit the model
model_2_history = model_2.fit(train_ds,
                              validation_data=val_ds,
                              epochs=2)

Epoch 1/2
Epoch 2/2


In [27]:
model_2.save("model_2.h5")

In [29]:
loaded_model = tf.keras.models.load_model("model_2.h5",
                                           custom_objects={"KerasLayer":hub.KerasLayer})

In [32]:
loaded_model.evaluate(test_ds)



[0.5166080594062805, 0.8110399842262268]