In [58]:
import tensorflow as tf
import os
from pathlib import Path

In [112]:
# Locations of the IMDB train/test folders, relative to the working directory.
train_path = Path().joinpath('aclImdb', 'train')
test_path = Path().joinpath('aclImdb', 'test')

In [113]:
# Remove the 'unsup' folder from the training directory, if it is present.
unsupervised_path = train_path.joinpath('unsup')
if not unsupervised_path.exists():
    print('Unsupervised dataset does not exist')
else:
    # rmdir() only works on an empty directory, so delete its entries first.
    for file_path in unsupervised_path.iterdir():
        file_path.unlink()
    unsupervised_path.rmdir()
    print('Unsupervised dataset removed')

Unsupervised dataset does not exist


In [114]:
# Labelled training set: batches of 32 reviews, with a fixed seed.
train_ds = tf.keras.utils.text_dataset_from_directory(
    directory=train_path,
    seed=42,
    batch_size=32,
)

Found 25000 files belonging to 2 classes.


In [115]:
# Show which integer label was assigned to each class folder.
for index, class_name in enumerate(train_ds.class_names):
    print(f"Label {index} corresponds to {class_name}")

Label 0 corresponds to neg
Label 1 corresponds to pos


In [116]:
# subset='both' returns two datasets. With validation_split=0.6 the first
# (10,000 files, per the printed output) is used as the test set and the
# second (15,000 files) as the validation set.
test_ds, val_ds = tf.keras.utils.text_dataset_from_directory(
    directory=test_path,
    seed=42,
    subset='both',
    validation_split=0.6,
    batch_size=32,
)

Found 25000 files belonging to 2 classes.
Using 10000 files for training.
Using 15000 files for validation.


In [117]:
# Confirm the held-out data uses the same label -> class mapping.
for index, class_name in enumerate(test_ds.class_names):
    print(f"Label {index} corresponds to {class_name}")

Label 0 corresponds to neg
Label 1 corresponds to pos


In [62]:
# Hyper-parameters shared by the text vectorizer and the embedding layer.
embedding_dim = 16    # width of each learned token vector
vocab_size = 10_000   # passed as max_tokens to TextVectorization

In [122]:
# Keep only the review text; the label is dropped.
text_ds = train_ds.map(lambda text, label: text)

In [56]:
def split_text(text):
    """Split `text` into tokens using `tf.strings.split` with its default separator."""
    tokens = tf.strings.split(text)
    return tokens

In [125]:
# Total number of tokens across every training review (summed over all batches).
num_words = (
    text_ds
    .map(split_text)
    .reduce(0, lambda running_total, tokens: running_total + tf.size(tokens))
    .numpy()
)

In [126]:
# Display the total token count of the training texts.
num_words

5844464

In [119]:
# Build the vocabulary (at most `vocab_size` tokens) from the training text only.
text_ds = train_ds.map(lambda text, label: text)
vectorize_layer = tf.keras.layers.TextVectorization(max_tokens=vocab_size)
vectorize_layer.adapt(text_ds)

In [68]:
class SqrtScaledMeanEmbedding(tf.keras.layers.Layer):
    """Embed token ids and pool each review to `mean_embedding * sqrt(n_words)`.

    `n_words` is the number of non-padding tokens in that review (id 0 is the
    padding id produced by `TextVectorization`), so the scale factor is
    computed per review rather than being one constant for the whole corpus.

    Args:
        vocab_size: number of rows in the embedding table.
        embedding_dim: width of each embedding vector.

    Call args:
        token_ids: integer tensor of shape (batch, seq) or (seq,).

    Returns:
        Float tensor of shape (batch, embedding_dim).
    """

    def __init__(self, vocab_size, embedding_dim, **kwargs):
        super().__init__(**kwargs)
        self.embedding = tf.keras.layers.Embedding(vocab_size, embedding_dim)

    def call(self, token_ids):
        # Treat a single un-batched review as a batch of one.
        if token_ids.shape.rank == 1:
            token_ids = token_ids[tf.newaxis, :]
        not_pad = tf.cast(tf.not_equal(token_ids, 0), tf.float32)
        # Clamp to 1 so an empty review never divides by zero.
        n_words = tf.maximum(tf.reduce_sum(not_pad, axis=1, keepdims=True), 1.0)
        # Zero out padding positions so they do not contribute to the sum.
        embedded = self.embedding(token_ids) * not_pad[..., tf.newaxis]
        # mean * sqrt(n) == sum / sqrt(n)
        return tf.reduce_sum(embedded, axis=1) / tf.sqrt(n_words)


# The previous Lambda scaled every review by sqrt(total tokens in the corpus),
# a single large constant; the pooling layer above scales by each review's own
# word count instead.
model = tf.keras.Sequential([
    vectorize_layer,
    SqrtScaledMeanEmbedding(vocab_size, embedding_dim),
    tf.keras.layers.Dense(10, activation='relu'),
    tf.keras.layers.Dense(1)  # single logit; the loss is configured with from_logits=True
])

In [69]:
# The final Dense(1) emits raw logits, hence from_logits=True.
model.compile(
    loss=tf.keras.losses.BinaryCrossentropy(from_logits=True),
    optimizer='adam',
    metrics=['accuracy'],
)

In [70]:
# Train for 15 epochs, evaluating on the validation split after each one.
model.fit(train_ds, epochs=15, validation_data=val_ds)

NameError: name 'train_ds' is not defined

In [137]:
import tensorflow_datasets as tfds

In [138]:
# as_supervised=True makes every element a (text, label) tuple.
imdb_reviews = tfds.load(name='imdb_reviews', as_supervised=True)

In [139]:
# Inspect the available splits and their element specs.
imdb_reviews

{'train': <_PrefetchDataset element_spec=(TensorSpec(shape=(), dtype=tf.string, name=None), TensorSpec(shape=(), dtype=tf.int64, name=None))>,
 'test': <_PrefetchDataset element_spec=(TensorSpec(shape=(), dtype=tf.string, name=None), TensorSpec(shape=(), dtype=tf.int64, name=None))>,
 'unsupervised': <_PrefetchDataset element_spec=(TensorSpec(shape=(), dtype=tf.string, name=None), TensorSpec(shape=(), dtype=tf.int64, name=None))>}

In [140]:
# Training split: un-batched (text, label) pairs.
ds_train_new = imdb_reviews['train']

In [141]:
# Number of elements in the training split.
len(ds_train_new)

25000

In [142]:
# Full TFDS test split; it is divided into validation and test parts afterwards.
ds_test_new = imdb_reviews['test']

In [143]:
# Split the TFDS test split: first 20,000 reviews -> validation, remainder -> test.
# Both slices are taken from `imdb_reviews['test']` rather than from
# `ds_test_new` itself, so re-running this cell yields the same split instead
# of skipping another 20,000 elements from an already-shortened dataset.
ds_val_new = imdb_reviews['test'].take(20_000)
ds_test_new = imdb_reviews['test'].skip(20_000)

In [144]:
# Keep only the review text; the label is dropped.
text_ds = ds_train_new.map(lambda text, label: text)

In [145]:
# Total number of tokens across every training review.
num_words = (
    text_ds
    .map(split_text)
    .reduce(0, lambda running_total, tokens: running_total + tf.size(tokens))
    .numpy()
)

In [146]:
# Display the total token count of the training texts.
num_words

5844464

In [147]:
# Build the vocabulary (at most `vocab_size` tokens) from the training text only.
vectorize_layer = tf.keras.layers.TextVectorization(max_tokens=vocab_size)
text_ds = ds_train_new.map(lambda x, y: x)
# `ds_train_new` yields one review per element. Batching before adapt() lets
# the layer process many reviews per step; the resulting vocabulary is the same.
vectorize_layer.adapt(text_ds.batch(1024))

In [148]:
class SqrtScaledMeanEmbedding(tf.keras.layers.Layer):
    """Embed token ids and pool each review to `mean_embedding * sqrt(n_words)`.

    `n_words` is the number of non-padding tokens in that review (id 0 is the
    padding id produced by `TextVectorization`), so the scale factor is
    computed per review rather than being one constant for the whole corpus.

    Args:
        vocab_size: number of rows in the embedding table.
        embedding_dim: width of each embedding vector.

    Call args:
        token_ids: integer tensor of shape (batch, seq) or (seq,).

    Returns:
        Float tensor of shape (batch, embedding_dim).
    """

    def __init__(self, vocab_size, embedding_dim, **kwargs):
        super().__init__(**kwargs)
        self.embedding = tf.keras.layers.Embedding(vocab_size, embedding_dim)

    def call(self, token_ids):
        # Treat a single un-batched review as a batch of one. This replaces the
        # former Reshape workaround, which turned every word into its own sample.
        if token_ids.shape.rank == 1:
            token_ids = token_ids[tf.newaxis, :]
        not_pad = tf.cast(tf.not_equal(token_ids, 0), tf.float32)
        # Clamp to 1 so an empty review never divides by zero.
        n_words = tf.maximum(tf.reduce_sum(not_pad, axis=1, keepdims=True), 1.0)
        # Zero out padding positions so they do not contribute to the sum.
        embedded = self.embedding(token_ids) * not_pad[..., tf.newaxis]
        # mean * sqrt(n) == sum / sqrt(n)
        return tf.reduce_sum(embedded, axis=1) / tf.sqrt(n_words)


# The previous Lambda scaled every review by sqrt(total tokens in the corpus),
# a single large constant; the pooling layer above scales by each review's own
# word count instead.
model = tf.keras.Sequential([
    vectorize_layer,
    SqrtScaledMeanEmbedding(vocab_size, embedding_dim),
    tf.keras.layers.Dense(10, activation='relu'),
    tf.keras.layers.Dense(1)  # single logit; the loss is configured with from_logits=True
])

In [149]:
# The final Dense(1) emits raw logits, hence from_logits=True.
model.compile(
    loss=tf.keras.losses.BinaryCrossentropy(from_logits=True),
    optimizer='adam',
    metrics=['accuracy'],
)

In [150]:
def _with_column_labels(ds, batch_size=32):
    """Return `ds` as batches of (text, label) with labels shaped (batch, 1).

    The TFDS splits yield one scalar (text, label) pair per element, so feeding
    them to `fit` directly trains on a single review per step. Each label gets
    a trailing axis and the elements are then grouped into batches.

    A dataset whose labels are no longer scalar is returned unchanged, which
    makes re-running the cell a no-op instead of stacking extra axes.
    """
    if ds.element_spec[1].shape.rank != 0:
        return ds
    return ds.map(lambda x, y: (x, tf.expand_dims(y, -1))).batch(batch_size)


ds_test_new = _with_column_labels(ds_test_new)
ds_train_new = _with_column_labels(ds_train_new)
ds_val_new = _with_column_labels(ds_val_new)

In [None]:
# Train for 15 epochs, evaluating on the validation split after each one.
model.fit(ds_train_new, epochs=15, validation_data=ds_val_new)

Epoch 1/15