In [1]:
import os
import numpy as np
import tensorflow as tf
import tensorflow_datasets as tfds
from tensorflow import keras
from tensorflow.keras import layers

tf.random.set_seed(42)
np.random.seed(42)

In [2]:
# load IMDB reviews
(train_data, test_data), info = tfds.load(
    "imdb_reviews",
    split=["train", "test"],
    shuffle_files=True,
    as_supervised=True,
    with_info=True,
)

#validation split from train
train_data = train_data.shuffle(25000, seed=42)
val_size = 5000
val_data = train_data.take(val_size)
train_data = train_data.skip(val_size)

print(info)
print("Train batches example:", next(iter(train_data.take(1))))



Downloading and preparing dataset Unknown size (download: Unknown size, generated: Unknown size, total: Unknown size) to /root/tensorflow_datasets/imdb_reviews/plain_text/1.0.0...


Dl Completed...: 0 url [00:00, ? url/s]

Dl Size...: 0 MiB [00:00, ? MiB/s]

Generating splits...:   0%|          | 0/3 [00:00<?, ? splits/s]

Generating train examples...: 0 examples [00:00, ? examples/s]

Shuffling /root/tensorflow_datasets/imdb_reviews/plain_text/incomplete.RB70WO_1.0.0/imdb_reviews-train.tfrecor…

Generating test examples...: 0 examples [00:00, ? examples/s]

Shuffling /root/tensorflow_datasets/imdb_reviews/plain_text/incomplete.RB70WO_1.0.0/imdb_reviews-test.tfrecord…

Generating unsupervised examples...: 0 examples [00:00, ? examples/s]

Shuffling /root/tensorflow_datasets/imdb_reviews/plain_text/incomplete.RB70WO_1.0.0/imdb_reviews-unsupervised.…

Dataset imdb_reviews downloaded and prepared to /root/tensorflow_datasets/imdb_reviews/plain_text/1.0.0. Subsequent calls will reuse this data.
tfds.core.DatasetInfo(
    name='imdb_reviews',
    full_name='imdb_reviews/plain_text/1.0.0',
    description="""
    Large Movie Review Dataset. This is a dataset for binary sentiment
    classification containing substantially more data than previous benchmark
    datasets. We provide a set of 25,000 highly polar movie reviews for training,
    and 25,000 for testing. There is additional unlabeled data for use as well.
    """,
    config_description="""
    Plain text
    """,
    homepage='http://ai.stanford.edu/~amaas/data/sentiment/',
    data_dir='/root/tensorflow_datasets/imdb_reviews/plain_text/1.0.0',
    file_format=tfrecord,
    download_size=80.23 MiB,
    dataset_size=129.83 MiB,
    features=FeaturesDict({
        'label': ClassLabel(shape=(), dtype=int64, num_classes=2),
        'text': Text(shape=(), dtype=string),
    }),
   

In [3]:
# Parameters
max_tokens = 20000   # vocabulary size
max_len = 200        # max sequence length

vectorize_layer = layers.TextVectorization(
    max_tokens=max_tokens,
    output_mode="int",
    output_sequence_length=max_len,
)


train_text = train_data.map(lambda x, y: x)
vectorize_layer.adapt(train_text.batch(256))

In [4]:
batch_size = 64

def prepare(ds, shuffle=False, batch_size=batch_size):
    ds = ds.map(lambda x, y: (vectorize_layer(x), y),
                num_parallel_calls=tf.data.AUTOTUNE)
    if shuffle:
        ds = ds.shuffle(10000)
    ds = ds.batch(batch_size).prefetch(tf.data.AUTOTUNE)
    return ds

train_ds = prepare(train_data, shuffle=True)
val_ds = prepare(val_data, shuffle=False)
test_ds = prepare(test_data, shuffle=False)

In [5]:
embedding_dim = 128

model = keras.Sequential([
    layers.Input(shape=(None,), dtype="int64"),
    layers.Embedding(input_dim=max_tokens, output_dim=embedding_dim, mask_zero=True),
    layers.GlobalAveragePooling1D(),
    layers.Dense(64, activation="relu"),
    layers.Dropout(0.5),
    layers.Dense(1, activation="sigmoid")
])

model.summary()

In [6]:
model.compile(
    optimizer=keras.optimizers.Adam(learning_rate=1e-3),
    loss=keras.losses.BinaryCrossentropy(from_logits=False),
    metrics=["accuracy"]
)

In [12]:
epochs = 10
history = model.fit(
    train_ds,
    validation_data=val_ds,
    epochs=epochs,
    verbose=1
)

Epoch 1/10
[1m313/313[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m18s[0m 47ms/step - accuracy: 0.9834 - loss: 0.0623 - val_accuracy: 0.9898 - val_loss: 0.0451
Epoch 2/10
[1m313/313[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m20s[0m 49ms/step - accuracy: 0.9853 - loss: 0.0538 - val_accuracy: 0.9926 - val_loss: 0.0333
Epoch 3/10
[1m313/313[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m19s[0m 50ms/step - accuracy: 0.9919 - loss: 0.0347 - val_accuracy: 0.9944 - val_loss: 0.0229
Epoch 4/10
[1m313/313[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m18s[0m 49ms/step - accuracy: 0.9912 - loss: 0.0322 - val_accuracy: 0.9950 - val_loss: 0.0168
Epoch 5/10
[1m313/313[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m21s[0m 49ms/step - accuracy: 0.9940 - loss: 0.0234 - val_accuracy: 0.9968 - val_loss: 0.0116
Epoch 6/10
[1m313/313[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m17s[0m 46ms/step - accuracy: 0.9943 - loss: 0.0203 - val_accuracy: 0.9960 - val_loss: 0.0127
Epoch 7/10
[1m3

In [13]:
results = model.evaluate(test_ds, verbose=2)
print("Test loss: {:.4f}, Test accuracy: {:.4f}".format(*results))

391/391 - 6s - 16ms/step - accuracy: 0.8198 - loss: 1.1244
Test loss: 1.1244, Test accuracy: 0.8198


In [14]:
model.save("text_classification_model.keras")
print("Saved model to ./text_classification_model.keras")


Saved model to ./text_classification_model.keras
