# Loading and Preprocessing Data with TensorFlow

### The tf.data API

In [1]:
import tensorflow as tf

# X = tf.range(10)
# dataset = tf.data.Dataset.from_tensor_slices(X)
# for item in dataset:
#     print(item)

# X_nested = {'a':([1,2,3],[4,5,6]), 'b':[7,8,9]}
# dataset = tf.data.Dataset.from_tensor_slices(X_nested)
# for item in dataset:
#     print(item)

# dataset = tf.data.Dataset.range(10)
# dataset = dataset.repeat(3).batch(7)
# for item in dataset:
#     print(item)

# dataset = dataset.map(lambda x: x * 2)
# dataset = dataset.filter(lambda x: tf.reduce_sum(x) > 50)
# for item in dataset.take(2):
#     print(item)

# dataset = tf.data.Dataset.range(10).repeat(2)
# dataset = dataset.shuffle(buffer_size=4, seed=42).batch(7)
# for item in dataset:
#     print(item)

dataset = tf.data.Dataset.range(10)
dataset = dataset.shuffle(buffer_size=4, seed=42, reshuffle_each_iteration=False).repeat(2).batch(7)
for item in dataset:
    print(item)


2025-03-20 14:24:29.504849: I external/local_tsl/tsl/cuda/cudart_stub.cc:31] Could not find cuda drivers on your machine, GPU will not be used.
2025-03-20 14:24:29.535800: E external/local_xla/xla/stream_executor/cuda/cuda_dnn.cc:9261] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
2025-03-20 14:24:29.535933: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:607] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
2025-03-20 14:24:29.537101: E external/local_xla/xla/stream_executor/cuda/cuda_blas.cc:1515] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered
2025-03-20 14:24:29.542987: I external/local_tsl/tsl/cuda/cudart_stub.cc:31] Could not find cuda drivers on your machine, GPU will not be used.
2025-03-20 14:24:29.544020: I tensorflow/core/platform/cpu_feature_guard.cc:1

### Reading data from multiple filepaths

In [2]:
import tensorflow as tf

n_inputs = 8
x_mean = -1
x_std = -1

def parse_csv_line(line):
    defs = [0.] * n_inputs + [tf.constant([], dtype=tf.float32)]
    fielfs = tf.io.decode_csv(line, record_defaults=defs)
    return tf.stack(fields[:-1]), tf.stack(fields[-1:])

def preprocess(line):
    x, y = parse_csv_line(line)
    return (x - x_mean) / x_std, y

def csv_reader_dataset(
    filepaths,
    n_readers=5, 
    n_read_threads=None, 
    shuffle_buffer_size=10_000, 
    n_parse_threads=5,
    seed=42,
    batch_size=32):
    dataset = tf.data.Dataset.list_files(filepaths)
    dataset = dataset.interleave(
        lambda filepath: tf.data.TextLineDataset(filepath).skip(1),
        cycle_length=n_readers,
        num_parallel_calls=n_read_threads
    )
    dataset = dataset.map(preprocess, num_parallel_calls=n_parse_threads)
    dataset = dataset.shuffle(shuffle_buffer_size, seed=seed)
    return dataset.batch(batch_size).prefetch(1)

### The TFRecord Format

In [3]:
import tensorflow as tf

with tf.io.TFRecordWriter("my_data.tfrecord") as f:
    f.write(b"This is the first record")
    f.write(b"And this is the second")
filePaths = ["my_data.tfrecord"]
dataset = tf.data.TFRecordDataset.list_files(filePaths)
dataset = dataset.interleave(
    lambda filepath: tf.data.TFRecordDataset(filepath),
    cycle_length=5,
    num_parallel_calls=None
)

for item in dataset:
    print(item)

tf.Tensor(b'This is the first record', shape=(), dtype=string)
tf.Tensor(b'And this is the second', shape=(), dtype=string)


### TensorFlow Protobufs

In [10]:
import tensorflow as tf
from tensorflow.train import BytesList, FloatList, Int64List
from tensorflow.train import Feature, Features, Example

# Prepare the Example protocol buffer
person_example = Example(
    features=Features(
        feature={
            "name": Feature(bytes_list=BytesList(value=[b"Alice"])),
            "id": Feature(int64_list=Int64List(value=[123])),
            "emails": Feature(bytes_list=BytesList(value=[
                b"a@b.com",
                b"c@d.com"
            ]))
        }
    )
)

# Write the Example to the TFRecord file
with tf.io.TFRecordWriter("my_contacts.tfrecord") as f:
    for _ in range(5):
        f.write(person_example.SerializeToString())

# Read the TFRecord file
feature_description = {
    "name": tf.io.FixedLenFeature([], tf.string),
    "id": tf.io.FixedLenFeature([], tf.int64),
    "emails": tf.io.VarLenFeature(tf.string),
}

def parse(serialized_example):
    return tf.io.parse_single_example(serialized_example, feature_description)

# dataset = tf.data.TFRecordDataset("my_contacts.tfrecord").map(parse)

# or you can batch process the dataset
dataset = tf.data.TFRecordDataset("my_contacts.tfrecord").batch(2).map(parse)
for item in dataset:
    print(item)


ValueError: in user code:

    File "/tmp/ipykernel_137/2837857720.py", line 32, in parse  *
        return tf.io.parse_single_example(serialized_example, feature_description)

    ValueError: Input serialized must be a scalar


### Keras Preprocessing Layers

In [33]:
import tensorflow as tf
import numpy as np

# age = tf.constant([[10.], [93.], [57.], [18.], [37.], [5.]])
# discretize_layer = tf.keras.layers.Discretization(bin_boundaries=[18., 50.])
# age_discretized = discretize_layer(age)
# age_discretized

# age = tf.constant([[10.], [93.], [57.], [18.], [37.], [5.]])
# discretize_layer = tf.keras.layers.Discretization(num_bins=3)
# discretize_layer.adapt(age)
# age_discretized = discretize_layer(age)
# age_discretized
# onehot_layer = tf.keras.layers.CategoryEncoding(num_tokens=3)
# onehot_layer(age_discretized)
# two_age_categories = np.array([[1,0], [2,2], [2,0]])
# onehot_layer(two_age_categories)
# onehot_layer = tf.keras.layers.CategoryEncoding(num_tokens=3 + 3)
# onehot_layer(two_age_categories + [0, 3])

# cities = ["Auckland", "Paris", "Paris", "San Francisco"]
# str_lookup_layer = tf.keras.layers.StringLookup()
# str_lookup_layer.adapt(cities)
# str_lookup_layer(cities)
# str_lookup_layer([["Paris"], ["Auckland"], ["Auckland"], ["Montreal"]])

# cities = ["Auckland", "Paris", "Paris", "San Francisco"]
# str_lookup_layer = tf.keras.layers.StringLookup(output_mode="one_hot")
# str_lookup_layer.adapt(cities)
# str_lookup_layer(cities)
# str_lookup_layer([["Paris"], ["Auckland"], ["Auckland"], ["Montreal"]])

# cities = ["Auckland", "Paris", "Paris", "San Francisco"]
# str_lookup_layer = tf.keras.layers.StringLookup(num_oov_indices=5)
# str_lookup_layer.adapt(cities)
# str_lookup_layer(cities)
# str_lookup_layer([["Paris"], ["Auckland"], ["Foo"], ["Bar"], ["Baz"]])

# hashing_layer = tf.keras.layers.Hashing(num_bins=10)
# hashing_layer([["Paris"], ["Tokyo"], ["Auckland"], ["Montreal"]])

# tf.random.set_seed(42)
# embedding_layer = tf.keras.layers.Embedding(input_dim=5, output_dim=2)
# embedding_layer(tf.constant([2,4,2]))

# tf.random.set_seed(42)
# ocean_prox = ["<1H OCEAN", "INLAND", "NEAR OCEAN", "NEAR BAY", "ISLAND"]
# str_lookup_layer = tf.keras.layers.StringLookup()
# str_lookup_layer.adapt(ocean_prox)
# lookup_and_embed = tf.keras.Sequential([
#     tf.keras.layers.Input(shape=(1,), dtype=tf.string),  # Specify string input
#     str_lookup_layer,
#     tf.keras.layers.Embedding(input_dim=str_lookup_layer.vocabulary_size(), output_dim=2),
# ])
# lookup_and_embed(tf.constant([["<1H OCEAN"], ["ISLAND"], ["<1H OCEAN"]]))

# train_data = ["To be", "!(to be)", "That's the question", "Be, be, be."] 
# text_vec_layer = tf.keras.layers.TextVectorization()
# text_vec_layer.adapt(train_data)
# text_vec_layer(["Be good!", "Question: be or be?"])

# train_data = ["To be", "!(to be)", "That's the question", "Be, be, be."] 
# text_vec_layer = tf.keras.layers.TextVectorization(output_mode="tf_idf")
# text_vec_layer.adapt(train_data)
# text_vec_layer(["Be good!", "Question: be or be?"])



tf.Tensor(
[[0.         0.         0.         0.         0.         0.91629076
  0.91629076 0.         0.91629076 0.         0.91629076 0.91629076
  0.         0.         0.         0.         0.        ]
 [0.         0.6931472  0.6931472  0.6931472  0.         0.
  0.         0.91629076 0.         0.91629076 0.         0.
  0.91629076 0.91629076 0.         0.         0.91629076]
 [0.         0.6931472  0.6931472  0.6931472  0.91629076 0.
  0.         0.         0.         0.         0.         0.
  0.         0.         0.91629076 0.91629076 0.        ]], shape=(3, 17), dtype=float32)


### Using Pretrained Language Model Components for Text Embedding

In [39]:
!pip install --upgrade pip
!pip install tensorflow_hub
import tensorflow_hub as hub

hub_layer = hub.KerasLayer("https://tfhub.dev/google/nnlm-en-dim50/2")
sentence_embeddings = hub_layer(tf.constant(["To be", "Not to be"]))
sentence_embeddings.numpy().round(2)

Collecting pip
  Downloading pip-25.0.1-py3-none-any.whl (1.8 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.8/1.8 MB[0m [31m85.6 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: pip
  Attempting uninstall: pip
    Found existing installation: pip 23.0.1
    Uninstalling pip-23.0.1:
      Successfully uninstalled pip-23.0.1
Successfully installed pip-25.0.1


array([[-0.25,  0.28,  0.01,  0.1 ,  0.14,  0.16,  0.25,  0.02,  0.07,
         0.13, -0.19,  0.06, -0.04, -0.07,  0.  , -0.08, -0.14, -0.16,
         0.02, -0.24,  0.16, -0.16, -0.03,  0.03, -0.14,  0.03, -0.09,
        -0.04, -0.14, -0.19,  0.07,  0.15,  0.18, -0.23, -0.07, -0.08,
         0.01, -0.01,  0.09,  0.14, -0.03,  0.03,  0.08,  0.1 , -0.01,
        -0.03, -0.07, -0.1 ,  0.05,  0.31],
       [-0.2 ,  0.2 , -0.08,  0.02,  0.19,  0.05,  0.22, -0.09,  0.02,
         0.19, -0.02, -0.14, -0.2 , -0.04,  0.01, -0.07, -0.22, -0.1 ,
         0.16, -0.44,  0.31, -0.1 ,  0.23,  0.15, -0.05,  0.15, -0.13,
        -0.04, -0.08, -0.16, -0.1 ,  0.13,  0.13, -0.18, -0.04,  0.03,
        -0.1 , -0.07,  0.07,  0.03, -0.08,  0.02,  0.05,  0.07, -0.14,
        -0.1 , -0.18, -0.13, -0.04,  0.15]], dtype=float32)

### The TensorFlow Datasets Project

In [7]:
!pip install --upgrade pip
!pip install tensorflow_datasets
import tensorflow as tf
import tensorflow_datasets as tfds

train_set, valid_set, test_set = tfds.load(
    name="mnist",
    split=["train[:90%]", "train[90%:]", "test"],
    as_supervised=True,
)

train_set = train_set.shuffle(buffer_size=10_000, seed=42).batch(32).prefetch(2)
valid_set = valid_set.batch(32).cache()
test_set = test_set.batch(32).cache()
tf.random.set_seed(42)

model = tf.keras.Sequential([
    tf.keras.layers.Flatten(input_shape=(28, 28)),
    tf.keras.layers.Dense(10, activation="softmax"),
])
model.compile(loss="sparse_categorical_crossentropy", optimizer="nadam", metrics=["accuracy"])
history = model.fit(train_set, epochs=5, validation_data=valid_set)

test_loss, test_accuracy = model.evaluate(test_set)

Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5


### Process MNIST dataset

In [44]:
!pip install --upgrade pip
!pip install tensorflow_datasets
import tensorflow as tf
import tensorflow_datasets as tfds
from tensorflow.train import BytesList, FloatList, Int64List
from tensorflow.train import Feature, Features, Example
from contextlib import ExitStack
import numpy as np

(X_train_full, y_train_full), (X_test_full, y_test_full) = tf.keras.datasets.fashion_mnist.load_data()
X_valid, X_train = X_train_full[:1000], X_train_full[-1000:]
y_valid, y_train = y_train_full[:1000], y_train_full[-1000:]
X_test = X_test_full[-1000:]
y_test = y_test_full[-1000:]

tf.random.set_seed(42)
train_set = tf.data.Dataset.from_tensor_slices((X_train, y_train))
train_set = train_set.shuffle(len(X_train), seed=42)
valid_set = tf.data.Dataset.from_tensor_slices((X_valid, y_valid))
test_set = tf.data.Dataset.from_tensor_slices((X_test, y_test))

def create_examples(dataset):
    res = []
    for image, label in dataset:
        # Serialize the image tensor properly as bytes
        image_example = Example(
            features=Features(
                feature={
                    "label": Feature(int64_list=Int64List(value=[label.numpy()])),  # Ensure label is converted to int
                    "image": Feature(bytes_list=BytesList(value=[tf.io.serialize_tensor(image).numpy()])),
                }
            )
        )
        res.append(image_example)
    return res

def write_tfrecords(name, examples, n_shards):
    paths = [
        f"{name}_{str(i).zfill(3)}.tfrecord"
        for i in range(n_shards)
    ]
    with ExitStack() as stack:
        writers = [
            stack.enter_context(tf.io.TFRecordWriter(path))
            for path in paths
        ]
        for index, example in enumerate(examples):
            shard = index % n_shards
            writers[shard].write(example.SerializeToString())
    return paths

train_paths = write_tfrecords("image_data_train", create_examples(train_set), 2)
valid_paths = write_tfrecords("image_data_valid", create_examples(valid_set), 2)
test_paths = write_tfrecords("image_data_test", create_examples(test_set), 2)

def preprocess(serialized_example):
    feature_description = {
        "label": tf.io.FixedLenFeature([], tf.int64),
        "image": tf.io.FixedLenFeature([], tf.string),
    }
    parsed_example = tf.io.parse_single_example(serialized_example, feature_description)
    image = tf.io.parse_tensor(parsed_example['image'], out_type=tf.uint8)
    image = tf.reshape(image, [28, 28])
    return image, parsed_example['label']

def create_dataset(
    paths,
    n_read_threads=5,
    shuffle_buffer_size=None,
    n_parse_threads=5,
    batch_size=2,
    cache=True
):

    dataset = tf.data.TFRecordDataset(paths, num_parallel_reads=n_read_threads)
    if cache:
        dataset = dataset.cache()
    if shuffle_buffer_size:
        dataset = dataset.shuffle(shuffle_buffer_size)
    dataset = dataset.map(preprocess, num_parallel_calls=n_parse_threads)
    dataset = dataset.batch(batch_size)
    dataset = dataset.prefetch(1)
    return dataset

train_set = create_dataset(train_paths, shuffle_buffer_size=60000)
valid_set = create_dataset(valid_paths)
test_set = create_dataset(test_paths)

tf.random.set_seed(42)
standardization = tf.keras.layers.Normalization(input_shape=[28, 28])

# Use sample_image_batches with corrected indexing to extract the first element explicitly
sample_image_batches = train_set.take(100).map(lambda image, label: image)  # Ensure proper extraction of images
sample_images = np.concatenate(list(sample_image_batches.as_numpy_iterator()),
                                axis=0).astype(np.float32)
standardization.adapt(sample_images)

model = tf.keras.Sequential([
    standardization,
    tf.keras.layers.Flatten(),
    tf.keras.layers.Dense(100, activation="relu"),
    tf.keras.layers.Dense(10, activation="softmax")
])
model.compile(loss="sparse_categorical_crossentropy",
              optimizer="nadam", metrics=["accuracy"])
model.fit(train_set, epochs=15, validation_data=valid_set)

Epoch 1/15
Epoch 2/15
Epoch 3/15
Epoch 4/15
Epoch 5/15
Epoch 6/15
Epoch 7/15
Epoch 8/15
Epoch 9/15
Epoch 10/15
Epoch 11/15
Epoch 12/15
Epoch 13/15
Epoch 14/15
Epoch 15/15


<keras.src.callbacks.History at 0x7f37bceac220>

### Process IMDB dataset

In [1]:
!pip install --upgrade pip
!pip install tensorflow_datasets
import tensorflow as tf
import tensorflow_datasets as tfds
from pathlib import Path
import numpy as np

# # Step 1: Download the dataset
# root = "https://ai.stanford.edu/~amaas/data/sentiment/"
# filename = "aclImdb_v1.tar.gz"
# filepath = tf.keras.utils.get_file(filename, root + filename, extract=True)

# if "_extracted" in filepath:
#     path = Path(filepath) / "aclImdb"
# else:
#     path = Path(filepath).with_name("aclImdb")

# # Step 2: Prepare the dataset
# def review_paths(dirpath):
#     return [
#         str(path)
#         for path in dirpath.glob("*.txt")
#     ]

# train_pos = review_paths(path / "train" / "pos")
# train_neg = review_paths(path / "train" / "neg")
# test_valid_pos = review_paths(path / "test" / "pos")
# test_valid_neg = review_paths(path / "test" / "neg")

# np.random.shuffle(test_valid_pos)

# test_pos = test_valid_pos[:5000]
# test_neg = test_valid_neg[:5000]
# valid_pos = test_valid_pos[5000:]
# valid_neg = test_valid_neg[5000:]

# # Step 3: Create the dataset
# def imdb_dataset(filepaths_positive, filepaths_negative):
#     reviews = []
#     labels = []
#     for filepaths, label in ((filepaths_negative, 0), (filepaths_positive, 1)):
#         for filepath in filepaths:
#             with open(filepath) as review:
#                 reviews.append(review.read())
#                 labels.append(label)
#     return tf.data.Dataset.from_tensor_slices((tf.constant(reviews), tf.constant(labels)))

# # For large datasets that don't fit in memory
# # def imdb_dataset(filepaths_positive, filepaths_negative, n_read_threads=5):
# #     dataset_neg = tf.data.TextLineDataset(filepaths_negative, num_parallel_reads=n_read_threads)
# #     dataset_pos = tf.data.TextLineDataset(filepaths_positive, num_parallel_reads=n_read_threads)
# #     dataset_neg = dataset_neg.map(lambda x: (x, 0))
# #     dataset_pos = dataset_pos.map(lambda x: (x, 1))
# #     return dataset_neg.concatenate(dataset_pos, dataset_neg)

# tf.random.set_seed(42)
# batch_size = 32
# train_set = imdb_dataset(train_pos, train_neg).shuffle(25000).batch(batch_size).prefetch(1)
# valid_set = imdb_dataset(valid_pos, valid_neg).batch(batch_size).prefetch(1)
# test_set = imdb_dataset(test_pos, test_neg).batch(batch_size).prefetch(1)

# # Step 4: Create and Train the model
max_tokens = 1000
# sample_review = train_set.map(lambda text, label: text)
text_vectorization = tf.keras.layers.TextVectorization(max_tokens=max_tokens, output_mode="tf_idf")
# text_vectorization.adapt(sample_review)
# text_vectorization.get_vocabulary()[:10]

# model = tf.keras.Sequential([
#     text_vectorization,
#     tf.keras.layers.Dense(100, activation="relu"),
#     tf.keras.layers.Dense(1, activation="sigmoid")
# ])
# model.compile(loss="binary_crossentropy", optimizer="nadam", metrics=["accuracy"])
# model.fit(train_set, epochs=5, validation_data=valid_set)

# Step 5: Compute mean embedding
def compute_mean_embedding(inputs):
    not_pad = tf.math.count_nonzero(inputs, axis=-1)
    n_words = tf.math.count_nonzero(not_pad, axis=-1, keepdims=True)
    sqrt_n_words = tf.math.sqrt(tf.cast(n_words, tf.float32))
    return tf.reduce_sum(inputs, axis=1) / sqrt_n_words

datasets = tfds.load(name="imdb_reviews")
train_set = datasets["train"]
valid_set = datasets["test"]
train_set = train_set.map(lambda x: (x["text"], x["label"]))
valid_set = valid_set.map(lambda x: (x["text"], x["label"]))
embedding_size = 20
batch_size = 32
text_vectorization = tf.keras.layers.TextVectorization(max_tokens=max_tokens, output_mode="int")
sample_review = train_set.map(lambda text, label: text)
text_vectorization.adapt(sample_review)
model = tf.keras.Sequential([
    text_vectorization,
    tf.keras.layers.Embedding(input_dim=max_tokens, output_dim=embedding_size, mask_zero=True),
    tf.keras.layers.Lambda(compute_mean_embedding),
    tf.keras.layers.Dense(100, activation="relu"),
    tf.keras.layers.Dense(1, activation="sigmoid")
])
train_set = train_set.shuffle(25000).batch(batch_size).prefetch(1)
valid_set = valid_set.batch(batch_size).prefetch(1)
model.compile(loss="binary_crossentropy", optimizer="nadam", metrics=["accuracy"])
model.fit(train_set, epochs=5, validation_data=valid_set)

Collecting pip
  Downloading pip-25.0.1-py3-none-any.whl (1.8 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.8/1.8 MB[0m [31m59.6 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: pip
  Attempting uninstall: pip
    Found existing installation: pip 23.0.1
    Uninstalling pip-23.0.1:
      Successfully uninstalled pip-23.0.1
Successfully installed pip-25.0.1
Collecting tensorflow_datasets
  Downloading tensorflow_datasets-4.9.8-py3-none-any.whl.metadata (11 kB)
Collecting array_record>=0.5.0 (from tensorflow_datasets)
  Downloading array_record-0.7.1-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (877 bytes)
Collecting dm-tree (from tensorflow_datasets)
  Downloading dm_tree-0.1.9-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (2.2 kB)
Collecting etils>=1.6.0 (from etils[edc,enp,epath,epy,etree]>=1.6.0; python_version < "3.11"->tensorflow_datasets)
  Downloading etils-1.12.2-py3-none-any.whl.metadata (6

<keras.src.callbacks.History at 0x7f2c4071d240>

<a style='text-decoration:none;line-height:16px;display:flex;color:#5B5B62;padding:10px;justify-content:end;' href='https://deepnote.com?utm_source=created-in-deepnote-cell&projectId=968d3c27-50e7-4d42-bdd9-442f6904c1c2' target="_blank">
 </img>
Created in <span style='font-weight:600;margin-left:4px;'>Deepnote</span></a>