In [111]:
import tensorflow as tf
import os
from pathlib import Path

In [112]:
train_path = Path() / 'aclImdb' / 'train'
test_path = Path() / 'aclImdb' / 'test'

In [113]:
unsupervised_path = train_path / 'unsup'
if os.path.exists(unsupervised_path):
    for file in os.listdir(unsupervised_path):
        file_path = unsupervised_path / file
        os.remove(file_path)
    os.rmdir(unsupervised_path)
    print('Unsupervised dataset removed')
else:
    print('Unsupervised dataset does not exist')    

Unsupervised dataset does not exist


In [114]:
train_ds = tf.keras.utils.text_dataset_from_directory(
    train_path,
    batch_size=32,
    seed=42,
)

Found 25000 files belonging to 2 classes.


In [115]:
for i, label in enumerate(train_ds.class_names):
    print("Label", i, "corresponds to", label)

Label 0 corresponds to neg
Label 1 corresponds to pos


In [116]:
(test_ds, val_ds) = tf.keras.utils.text_dataset_from_directory(
    test_path,
    batch_size=32,
    validation_split=0.6,
    subset='both',
    seed=42,
)

Found 25000 files belonging to 2 classes.
Using 10000 files for training.
Using 15000 files for validation.


In [117]:
for i, label in enumerate(test_ds.class_names):
    print("Label", i, "corresponds to", label)

Label 0 corresponds to neg
Label 1 corresponds to pos


In [118]:
vocab_size = 10_000
embedding_dim = 16

In [122]:
text_ds = train_ds.map(lambda x, y: x)

In [124]:
def split_text(text):
    return tf.strings.split(text)

In [125]:
num_words = text_ds.map(split_text).reduce(0, lambda x, y: x + tf.size(y)).numpy()

In [126]:
num_words

5844464

In [119]:
vectorize_layer = tf.keras.layers.TextVectorization(max_tokens=vocab_size)
text_ds = train_ds.map(lambda x, y: x)
vectorize_layer.adapt(text_ds)

In [128]:
model = tf.keras.Sequential([
    vectorize_layer,
    tf.keras.layers.Embedding(vocab_size, embedding_dim),
    tf.keras.layers.GlobalAveragePooling1D(), # this computes the mean embedding for each review :)
    tf.keras.layers.Lambda(lambda x: x * tf.sqrt(tf.cast(num_words, tf.float32))), # multiplies mean embedding the sqrt(num_words)
    tf.keras.layers.Dense(10, activation='relu'),
    tf.keras.layers.Dense(1)
])

In [129]:
model.compile(optimizer='adam',
              loss=tf.keras.losses.BinaryCrossentropy(from_logits=True),
              metrics=['accuracy'])

In [130]:
model.fit(
    train_ds,
    validation_data=val_ds,
    epochs=15)

Epoch 1/15
Epoch 2/15
Epoch 3/15
Epoch 4/15
Epoch 5/15
Epoch 6/15
Epoch 7/15
Epoch 8/15
Epoch 9/15
Epoch 10/15
Epoch 11/15
Epoch 12/15
Epoch 13/15
Epoch 14/15
Epoch 15/15


<keras.callbacks.History at 0x249b5bcfd30>

In [1]:
import tensorflow_datasets as tfds

In [4]:
imdb_reviews = tfds.load('imdb_reviews')

In [15]:
ds_train_new = imdb_reviews['train']

In [16]:
ds_test_new = imdb_reviews['test']

In [21]:
len(ds_train_new)

25000

In [22]:
len(ds_test_new)

25000

In [33]:
for x in ds_train_new:
    print(x)
    raise Exception('STOP')

{'label': <tf.Tensor: shape=(), dtype=int64, numpy=0>, 'text': <tf.Tensor: shape=(), dtype=string, numpy=b"This was an absolutely terrible movie. Don't be lured in by Christopher Walken or Michael Ironside. Both are great actors, but this must simply be their worst role in history. Even their great acting could not redeem this movie's ridiculous storyline. This movie is an early nineties US propaganda piece. The most pathetic scenes were those when the Columbian rebels were making their cases for revolutions. Maria Conchita Alonso appeared phony, and her pseudo-love affair with Walken was nothing but a pathetic emotional plug in a movie that was devoid of any real meaning. I am disappointed that there are movies like this, ruining actor's like Christopher Walken's good name. I could barely sit through it.">}


Exception: STOP