In [1]:
import tensorflow as tf
import shutil
import random
import os

In [2]:
# Download the IMDB review archive and unpack it below the current directory;
# the following cells read the extracted tree from ./datasets/aclImdb/.
url = "https://ai.stanford.edu/~amaas/data/sentiment/aclImdb_v1.tar.gz"
ds = tf.keras.utils.get_file(
    cache_dir=".",
    origin=url,
    extract=True,
    untar=True,
)

Downloading data from https://ai.stanford.edu/~amaas/data/sentiment/aclImdb_v1.tar.gz


# Step 2: Create a val directory

In [3]:
# Root of the extracted dataset; the validation split gets its own val/<label> folders.
path = "./datasets/aclImdb/"

for val_subdir in ("val/neg", "val/pos"):
  # exist_ok=True keeps this cell safe to re-run.
  os.makedirs(os.path.join(path, val_subdir), exist_ok=True)

In [4]:
# Carve the negative half of the validation split out of test/neg.
# Sorting the listing and shuffling with a fixed seed makes the split
# reproducible; moving only the files still missing from val/neg makes the cell
# safe to re-run (an unconditional move would drain test/neg on a second run).
test_neg_dir = os.path.join(path, "test/neg")
val_neg_dir = os.path.join(path, "val/neg")
os.makedirs(val_neg_dir, exist_ok=True)

test_neg_list = sorted(os.listdir(test_neg_dir))  # os.listdir order is arbitrary
random.Random(42).shuffle(test_neg_list)  # private RNG: global random state is untouched

# Target size of val/neg is 7500 files; top up only what is missing.
neg_still_needed = max(0, 7500 - len(os.listdir(val_neg_dir)))
for file in test_neg_list[:neg_still_needed]:
  shutil.move(os.path.join(test_neg_dir, file), os.path.join(val_neg_dir, file))

In [5]:
# Carve the positive half of the validation split out of test/pos.
# Sorting the listing and shuffling with a fixed seed makes the split
# reproducible; moving only the files still missing from val/pos makes the cell
# safe to re-run (an unconditional move would drain test/pos on a second run).
test_pos_dir = os.path.join(path, "test/pos")
val_pos_dir = os.path.join(path, "val/pos")
os.makedirs(val_pos_dir, exist_ok=True)

test_pos_list = sorted(os.listdir(test_pos_dir))  # os.listdir order is arbitrary
random.Random(42).shuffle(test_pos_list)  # private RNG: global random state is untouched

# Target size of val/pos is 7500 files; top up only what is missing.
pos_still_needed = max(0, 7500 - len(os.listdir(val_pos_dir)))
for file in test_pos_list[:pos_still_needed]:
  shutil.move(os.path.join(test_pos_dir, file), os.path.join(val_pos_dir, file))

# Step 3: Create tf.data.Dataset objects

## Option 1: In memory with from_tensor_slices

In [33]:
def create_dataset_1(base_dir):
  """Load every review under ``base_dir`` into memory and wrap it in a dataset.

  Args:
    base_dir: Directory containing ``pos`` and ``neg`` sub-directories; each
      file inside them is treated as one review.

  Returns:
    A ``tf.data.Dataset`` built with ``from_tensor_slices`` whose elements are
    ``(review, sentiment)`` pairs: ``review`` is a one-element list holding the
    whole text of a file, ``sentiment`` is ``1.0`` for ``pos`` and ``0.0`` for
    ``neg``.
  """
  reviews = []
  sentiments = []
  for sentiment in ["pos", "neg"]:
    label = 1.0 if sentiment == "pos" else 0.0
    sentiment_dir = os.path.join(base_dir, sentiment)  # not `dir`: that shadows the builtin
    for file_ in os.listdir(sentiment_dir):
      # Decode explicitly as UTF-8 instead of relying on the platform default.
      with open(os.path.join(sentiment_dir, file_), "r", encoding="utf-8") as f:
        # Read the file as a single string, wrapped in a list to keep the
        # one-element review shape. readlines() would produce a different
        # number of entries per file as soon as a review contains a line break
        # or a file is empty, and from_tensor_slices rejects ragged lists.
        reviews.append([f.read()])
      sentiments.append(label)

  return tf.data.Dataset.from_tensor_slices((reviews, sentiments))

In [7]:
# for X, y in create_dataset_1(os.path.join(path, "test/")).take(3):
#   print(X)
#   print(y)
#   print("*"*50)

tf.Tensor([b'as a \'physically challenged\' person (god, how i hate that phrase) i just happened to catch this on cable where there was absolutely nothing else to watch - overall, it was a fantastic movie. yes, i was a little disappointed upon finding out that neither actor is disabled, and yes, i was a little disappointed that more of the movie wasn\'t filmed from the \'true\' point of view of the disabled (can you imagine what it\'s like always being the tallest person in the room and then having to live the rest of your life with a view of nothing but other people\'s asses and crotches? having to always wait for the idiot to stop reading the newspaper in the only handicapped stall, enduring everyone else\'s rude bodily expulsions while you wait?). and the scene with him driving the car was absolutely me! been there, done that, literally. but the movie was true enough to matter - while i\'ve never lived in a home or assisted residence, there were plenty of times throughout the movie 

## Option 2: Use TextLineDataset

In [8]:
def create_dataset_2(base_dir) :
  """Stream labelled reviews from ``base_dir`` line by line.

  Files under ``base_dir/pos`` are read with ``tf.data.TextLineDataset`` and
  every line is paired with label 1; files under ``base_dir/neg`` are paired
  with label 0. The returned dataset yields all positive elements first,
  followed by the negative ones.
  """
  def list_files(label_dir):
    # Full paths of every entry inside base_dir/<label_dir>.
    folder = os.path.join(base_dir, label_dir)
    return [os.path.join(base_dir, label_dir, name) for name in os.listdir(folder)]

  positive_paths = list_files("pos")
  negative_paths = list_files("neg")

  positives = tf.data.TextLineDataset(positive_paths).map(lambda line: (line, 1))
  negatives = tf.data.TextLineDataset(negative_paths).map(lambda line: (line, 0))

  return positives.concatenate(negatives)

In [9]:
# for X, y in create_dataset_2(os.path.join(path, "test/")).take(3):
#   print(X)
#   print(y)
#   print("*"*50)

tf.Tensor(b'as a \'physically challenged\' person (god, how i hate that phrase) i just happened to catch this on cable where there was absolutely nothing else to watch - overall, it was a fantastic movie. yes, i was a little disappointed upon finding out that neither actor is disabled, and yes, i was a little disappointed that more of the movie wasn\'t filmed from the \'true\' point of view of the disabled (can you imagine what it\'s like always being the tallest person in the room and then having to live the rest of your life with a view of nothing but other people\'s asses and crotches? having to always wait for the idiot to stop reading the newspaper in the only handicapped stall, enduring everyone else\'s rude bodily expulsions while you wait?). and the scene with him driving the car was absolutely me! been there, done that, literally. but the movie was true enough to matter - while i\'ve never lived in a home or assisted residence, there were plenty of times throughout the movie w

In [10]:
# %timeit -r1 for X, y in create_dataset_1("./datasets/aclImdb/train").repeat(10):pass

1min 1s ± 0 ns per loop (mean ± std. dev. of 1 run, 1 loop each)


In [11]:
# %timeit -r1 for X, y in create_dataset_2("./datasets/aclImdb/train").repeat(10):pass

1min 30s ± 0 ns per loop (mean ± std. dev. of 1 run, 1 loop each)


In [42]:
BATCH = 512
# create_dataset_2 yields every positive review before the first negative one,
# so the shuffle buffer has to hold the whole training split: with a smaller
# buffer the early batches are dominated by one class. 25_000 is the assumed
# size of the labelled aclImdb train split - raise it if the split is larger.
SHUFFLE_BUFFER = 25_000

train_ds = (
    create_dataset_2("./datasets/aclImdb/train")
    .shuffle(SHUFFLE_BUFFER, seed=42)
    .batch(BATCH)
    .prefetch(1)
)
# Validation and test data are only evaluated, so they are not shuffled.
val_ds = create_dataset_2("./datasets/aclImdb/val").batch(BATCH).prefetch(1)
test_ds = create_dataset_2("./datasets/aclImdb/test").batch(BATCH).prefetch(1)

In [27]:
!ls ./datasets/aclImdb/train

labeledBow.feat  neg  pos  unsup  unsupBow.feat  urls_neg.txt  urls_pos.txt  urls_unsup.txt


# Step 4: Create and Train a Model with Multi-Hot Encoding

In [43]:
# Learn a vocabulary of at most 10,000 tokens from the training reviews only
# (the labels are dropped before adapting).
multi_hot_layer = tf.keras.layers.TextVectorization(
    output_mode="multi_hot",
    max_tokens=10_000,
)
train_reviews = train_ds.map(lambda review, _ : review)
multi_hot_layer.adapt(train_reviews)

In [44]:
# Peek at the first 20 entries of the adapted layer's vocabulary.
print(multi_hot_layer.get_vocabulary()[:20])

['[UNK]', 'the', 'and', 'a', 'of', 'to', 'is', 'in', 'it', 'i', 'this', 'that', 'br', 'was', 'as', 'for', 'with', 'movie', 'but', 'film']


In [45]:
def get_model(conversion_layer):
    """Build a small binary classifier on top of a text-conversion layer.

    Args:
        conversion_layer: Layer placed first in the model; the notebook passes
            adapted ``TextVectorization`` layers that turn raw review strings
            into fixed-width feature vectors.

    Returns:
        An uncompiled ``tf.keras.Sequential`` made of ``conversion_layer``, two
        16-unit ReLU ``Dense`` layers (He-uniform initialised) and a final
        single-unit sigmoid ``Dense`` layer.
    """
    model = tf.keras.Sequential()
    model.add(conversion_layer)

    # No input_shape on the Dense layers: Sequential only honours it on the
    # first layer, so the feature width is taken from conversion_layer's output.
    for _ in range(2):
        model.add(tf.keras.layers.Dense(
            units=16,
            activation="relu",
            kernel_initializer="he_uniform",
        ))

    model.add(tf.keras.layers.Dense(
        units=1,
        activation="sigmoid",
    ))

    return model

In [46]:
# Classifier whose first layer multi-hot encodes the raw review text.
model = get_model(multi_hot_layer)

In [47]:
# Show the layer stack with output shapes and parameter counts.
model.summary()

Model: "sequential_1"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 text_vectorization (TextVe  (None, 10000)             0         
 ctorization)                                                    
                                                                 
 dense (Dense)               (None, 16)                160016    
                                                                 
 dense_1 (Dense)             (None, 16)                272       
                                                                 
 dense_2 (Dense)             (None, 1)                 17        
                                                                 
Total params: 160305 (626.19 KB)
Trainable params: 160305 (626.19 KB)
Non-trainable params: 0 (0.00 Byte)
_________________________________________________________________


In [48]:
# Binary classification setup: the sigmoid output is scored with binary
# cross-entropy and accuracy is tracked as the reported metric.
model.compile(
    optimizer="rmsprop",
    loss="binary_crossentropy",
    metrics=["accuracy"],
)

In [49]:
# Stop once val_accuracy has gone 3 epochs without improving by at least 0.001,
# then restore the best weights seen during training.
early_stopping_cb = tf.keras.callbacks.EarlyStopping(
    restore_best_weights=True,
    patience=3,
    min_delta=0.001,
    monitor="val_accuracy",
)

# epochs=100 is only an upper bound; the callback normally ends training earlier.
history = model.fit(
    train_ds,
    validation_data=val_ds,
    callbacks=[early_stopping_cb],
    epochs=100,
)

Epoch 1/100
Epoch 2/100
Epoch 3/100
Epoch 4/100
Epoch 5/100
Epoch 6/100
Epoch 7/100


In [50]:
# Loss and accuracy of the trained model on the validation split.
model.evaluate(val_ds)



[0.29220980405807495, 0.881933331489563]

# Step 5: Create and Train a Model with TF-IDF Encoding

In [51]:
# Same pipeline as the multi-hot model, but the reviews are encoded as TF-IDF
# weighted vectors. NOTE: this cell rebinds `model`, `early_stopping_cb` and
# `history`, replacing the objects created for the multi-hot model.
tf_idf = tf.keras.layers.TextVectorization(
    output_mode="tf_idf",
    max_tokens=10_000,
)
train_reviews = train_ds.map(lambda review, _ : review)
tf_idf.adapt(train_reviews)

model = get_model(tf_idf)
model.compile(
    optimizer="rmsprop",
    loss="binary_crossentropy",
    metrics=["accuracy"],
)

# Stop once val_accuracy has gone 3 epochs without improving by at least 0.001,
# then restore the best weights seen during training.
early_stopping_cb = tf.keras.callbacks.EarlyStopping(
    restore_best_weights=True,
    patience=3,
    min_delta=0.001,
    monitor="val_accuracy",
)

history = model.fit(
    train_ds,
    validation_data=val_ds,
    callbacks=[early_stopping_cb],
    epochs=100,
)

# Last expression of the cell: displays loss and accuracy on the validation split.
model.evaluate(val_ds)

Epoch 1/100
Epoch 2/100
Epoch 3/100
Epoch 4/100
Epoch 5/100


[0.329736590385437, 0.8730000257492065]

# Step 6: Create a Custom Embedding Layer