# Loading and Preprocessing Data with TensorFlow

### The tf.data API

In [13]:
import tensorflow as tf

# X = tf.range(10)
# dataset = tf.data.Dataset.from_tensor_slices(X)
# for item in dataset:
#     print(item)

# X_nested = {'a':([1,2,3],[4,5,6]), 'b':[7,8,9]}
# dataset = tf.data.Dataset.from_tensor_slices(X_nested)
# for item in dataset:
#     print(item)

# dataset = tf.data.Dataset.range(10)
# dataset = dataset.repeat(3).batch(7)
# for item in dataset:
#     print(item)

# dataset = dataset.map(lambda x: x * 2)
# dataset = dataset.filter(lambda x: tf.reduce_sum(x) > 50)
# for item in dataset.take(2):
#     print(item)

# dataset = tf.data.Dataset.range(10).repeat(2)
# dataset = dataset.shuffle(buffer_size=4, seed=42).batch(7)
# for item in dataset:
#     print(item)

# Shuffle the ten integers with a fixed seed and without reshuffling between
# iterations, repeat the shuffled sequence twice, then emit batches of up to 7.
dataset = (
    tf.data.Dataset.range(10)
    .shuffle(buffer_size=4, seed=42, reshuffle_each_iteration=False)
    .repeat(2)
    .batch(7)
)
for item in dataset:
    print(item)


tf.Tensor([0 1 3 4 5 2 6], shape=(7,), dtype=int64)
tf.Tensor([9 7 8 0 1 3 4], shape=(7,), dtype=int64)
tf.Tensor([5 2 6 9 7 8], shape=(6,), dtype=int64)


### Reading data from multiple filepaths

In [None]:
import tensorflow as tf

# Number of feature columns in each CSV row; parse_csv_line expects one extra,
# final column after these for the target.
n_inputs = 8
# Statistics used by preprocess() to scale the features: (x - x_mean) / x_std.
# NOTE(review): -1 looks like a placeholder for both values — presumably they
# should be replaced with the training set's mean and standard deviation; confirm.
x_mean = -1
x_std = -1

def parse_csv_line(line):
    """Parse one CSV record into a feature tensor and a target tensor.

    The record must contain ``n_inputs`` feature columns followed by one
    target column. Empty feature columns default to 0.0; the target column
    has an empty default, which makes it a required field for
    ``tf.io.decode_csv``.

    Args:
        line: string tensor holding the CSV record (presumably a scalar, as
            produced by ``tf.data.TextLineDataset``).

    Returns:
        A ``(x, y)`` tuple where ``x`` stacks the ``n_inputs`` feature values
        and ``y`` stacks the single target value.
    """
    defs = [0.] * n_inputs + [tf.constant([], dtype=tf.float32)]
    # The decoded columns must be bound to `fields`: the original bound them
    # to a misspelled name, so the return statement raised NameError.
    fields = tf.io.decode_csv(line, record_defaults=defs)
    # Slice with [-1:] rather than [-1] so the target keeps a leading axis of 1.
    return tf.stack(fields[:-1]), tf.stack(fields[-1:])

def preprocess(line):
    """Parse a CSV record and scale its features.

    Args:
        line: the CSV record, passed unchanged to ``parse_csv_line``.

    Returns:
        A ``(features, target)`` tuple in which the features have been shifted
        by the module-level ``x_mean`` and divided by the module-level
        ``x_std``; the target is returned as parsed.
    """
    features, target = parse_csv_line(line)
    scaled_features = (features - x_mean) / x_std
    return scaled_features, target

def csv_reader_dataset(
    filepaths,
    n_readers=5,
    n_read_threads=None,
    shuffle_buffer_size=10_000,
    n_parse_threads=5,
    seed=42,
    batch_size=32):
    """Build a shuffled, batched tf.data pipeline over several CSV files.

    Args:
        filepaths: file path(s) or glob pattern(s) accepted by
            ``tf.data.Dataset.list_files``.
        n_readers: number of files read concurrently (interleave cycle length).
        n_read_threads: ``num_parallel_calls`` for the interleave step;
            ``None`` reads sequentially.
        shuffle_buffer_size: size of the buffer used to shuffle records.
        n_parse_threads: ``num_parallel_calls`` for the ``preprocess`` map.
        seed: random seed applied to both the file-order shuffle and the
            record shuffle, so that a given seed controls all randomness here.
        batch_size: number of records per batch.

    Returns:
        A ``tf.data.Dataset`` yielding batches of whatever ``preprocess``
        returns for each line, with one batch prefetched.
    """
    # list_files shuffles the file names by default; without the seed the file
    # order (and therefore the interleaved record order) changes on every run
    # even though the caller supplied a fixed seed.
    dataset = tf.data.Dataset.list_files(filepaths, seed=seed)
    dataset = dataset.interleave(
        # skip(1) drops the first line of every file (presumably a header row).
        lambda filepath: tf.data.TextLineDataset(filepath).skip(1),
        cycle_length=n_readers,
        num_parallel_calls=n_read_threads
    )
    dataset = dataset.map(preprocess, num_parallel_calls=n_parse_threads)
    dataset = dataset.shuffle(shuffle_buffer_size, seed=seed)
    return dataset.batch(batch_size).prefetch(1)

### The TFRecord Format

In [1]:
import tensorflow as tf

# Write two raw byte-string records to a TFRecord file.
records = (b"This is the first record", b"And this is the second")
with tf.io.TFRecordWriter("my_data.tfrecord") as f:
    for record in records:
        f.write(record)

# Read the records back: list the file(s), then interleave one
# TFRecordDataset per listed file.
filePaths = ["my_data.tfrecord"]
dataset = tf.data.TFRecordDataset.list_files(filePaths)
dataset = dataset.interleave(
    lambda path: tf.data.TFRecordDataset(path),
    cycle_length=5,
    num_parallel_calls=None,
)

for item in dataset:
    print(item)

2025-03-19 01:27:32.162354: I external/local_tsl/tsl/cuda/cudart_stub.cc:31] Could not find cuda drivers on your machine, GPU will not be used.
2025-03-19 01:27:32.192124: E external/local_xla/xla/stream_executor/cuda/cuda_dnn.cc:9261] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
2025-03-19 01:27:32.192242: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:607] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
2025-03-19 01:27:32.193620: E external/local_xla/xla/stream_executor/cuda/cuda_blas.cc:1515] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered
2025-03-19 01:27:32.199832: I external/local_tsl/tsl/cuda/cudart_stub.cc:31] Could not find cuda drivers on your machine, GPU will not be used.
2025-03-19 01:27:32.200832: I tensorflow/core/platform/cpu_feature_guard.cc:1

### TensorFlow Protobufs

In [1]:
import tensorflow as tf
from tensorflow.train import BytesList, FloatList, Int64List
from tensorflow.train import Feature, Features, Example

# Build one Feature per field, then assemble them into the Example protobuf.
name_feature = Feature(bytes_list=BytesList(value=[b"Alice"]))
id_feature = Feature(int64_list=Int64List(value=[123]))
emails_feature = Feature(bytes_list=BytesList(value=[b"a@b.com", b"c@d.com"]))

person_example = Example(
    features=Features(
        feature={
            "name": name_feature,
            "id": id_feature,
            "emails": emails_feature,
        }
    )
)

# Serialize the Example once and store five copies of it in the TFRecord file.
serialized_person = person_example.SerializeToString()
with tf.io.TFRecordWriter("my_contacts.tfrecord") as f:
    for _ in range(5):
        f.write(serialized_person)

# Parsing spec used when reading the TFRecord file: maps each feature key to
# the way it should be decoded.
feature_description = {
    # Fixed-length features with shape [] (one string / one int64 per example).
    "name": tf.io.FixedLenFeature([], tf.string),
    "id": tf.io.FixedLenFeature([], tf.int64),
    # Variable-length list of strings.
    "emails": tf.io.VarLenFeature(tf.string),
}

def parse(serialized_example):
    """Decode a single serialized Example proto.

    Args:
        serialized_example: one serialized ``Example`` (passed straight to
            ``tf.io.parse_single_example``).

    Returns:
        The parsed features, decoded according to the module-level
        ``feature_description``.
    """
    parsed = tf.io.parse_single_example(serialized_example, feature_description)
    return parsed

# Per-record alternative (parse() decodes one serialized Example at a time):
# dataset = tf.data.TFRecordDataset("my_contacts.tfrecord").map(parse)

# Batched parsing: after .batch(2) every element is a 1-D tensor of serialized
# Examples. parse() cannot be mapped over it because tf.io.parse_single_example
# requires a scalar input, so the batch-aware tf.io.parse_example is used here.
dataset = tf.data.TFRecordDataset("my_contacts.tfrecord").batch(2).map(
    lambda serialized_examples: tf.io.parse_example(
        serialized_examples, feature_description
    )
)
for item in dataset:
    print(item)


2025-03-19 02:54:47.830614: I external/local_tsl/tsl/cuda/cudart_stub.cc:31] Could not find cuda drivers on your machine, GPU will not be used.
2025-03-19 02:54:47.923833: E external/local_xla/xla/stream_executor/cuda/cuda_dnn.cc:9261] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
2025-03-19 02:54:47.923952: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:607] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
2025-03-19 02:54:47.964756: E external/local_xla/xla/stream_executor/cuda/cuda_blas.cc:1515] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered
2025-03-19 02:54:47.973395: I external/local_tsl/tsl/cuda/cudart_stub.cc:31] Could not find cuda drivers on your machine, GPU will not be used.
2025-03-19 02:54:47.977503: I tensorflow/core/platform/cpu_feature_guard.cc:1

### Keras Preprocessing Layers

In [33]:
import tensorflow as tf
import numpy as np

# age = tf.constant([[10.], [93.], [57.], [18.], [37.], [5.]])
# discretize_layer = tf.keras.layers.Discretization(bin_boundaries=[18., 50.])
# age_discretized = discretize_layer(age)
# age_discretized

# age = tf.constant([[10.], [93.], [57.], [18.], [37.], [5.]])
# discretize_layer = tf.keras.layers.Discretization(num_bins=3)
# discretize_layer.adapt(age)
# age_discretized = discretize_layer(age)
# age_discretized
# onehot_layer = tf.keras.layers.CategoryEncoding(num_tokens=3)
# onehot_layer(age_discretized)
# two_age_categories = np.array([[1,0], [2,2], [2,0]])
# onehot_layer(two_age_categories)
# onehot_layer = tf.keras.layers.CategoryEncoding(num_tokens=3 + 3)
# onehot_layer(two_age_categories + [0, 3])

# cities = ["Auckland", "Paris", "Paris", "San Francisco"]
# str_lookup_layer = tf.keras.layers.StringLookup()
# str_lookup_layer.adapt(cities)
# str_lookup_layer(cities)
# str_lookup_layer([["Paris"], ["Auckland"], ["Auckland"], ["Montreal"]])

# cities = ["Auckland", "Paris", "Paris", "San Francisco"]
# str_lookup_layer = tf.keras.layers.StringLookup(output_mode="one_hot")
# str_lookup_layer.adapt(cities)
# str_lookup_layer(cities)
# str_lookup_layer([["Paris"], ["Auckland"], ["Auckland"], ["Montreal"]])

# cities = ["Auckland", "Paris", "Paris", "San Francisco"]
# str_lookup_layer = tf.keras.layers.StringLookup(num_oov_indices=5)
# str_lookup_layer.adapt(cities)
# str_lookup_layer(cities)
# str_lookup_layer([["Paris"], ["Auckland"], ["Foo"], ["Bar"], ["Baz"]])

# hashing_layer = tf.keras.layers.Hashing(num_bins=10)
# hashing_layer([["Paris"], ["Tokyo"], ["Auckland"], ["Montreal"]])

# tf.random.set_seed(42)
# embedding_layer = tf.keras.layers.Embedding(input_dim=5, output_dim=2)
# embedding_layer(tf.constant([2,4,2]))

# tf.random.set_seed(42)
# ocean_prox = ["<1H OCEAN", "INLAND", "NEAR OCEAN", "NEAR BAY", "ISLAND"]
# str_lookup_layer = tf.keras.layers.StringLookup()
# str_lookup_layer.adapt(ocean_prox)
# lookup_and_embed = tf.keras.Sequential([
#     tf.keras.layers.Input(shape=(1,), dtype=tf.string),  # Specify string input
#     str_lookup_layer,
#     tf.keras.layers.Embedding(input_dim=str_lookup_layer.vocabulary_size(), output_dim=2),
# ])
# lookup_and_embed(tf.constant([["<1H OCEAN"], ["ISLAND"], ["<1H OCEAN"]]))

# train_data = ["To be", "!(to be)", "That's the question", "Be, be, be."] 
# text_vec_layer = tf.keras.layers.TextVectorization()
# text_vec_layer.adapt(train_data)
# text_vec_layer(["Be good!", "Question: be or be?"])

# train_data = ["To be", "!(to be)", "That's the question", "Be, be, be."] 
# text_vec_layer = tf.keras.layers.TextVectorization(output_mode="tf_idf")
# text_vec_layer.adapt(train_data)
# text_vec_layer(["Be good!", "Question: be or be?"])



tf.Tensor(
[[0.         0.         0.         0.         0.         0.91629076
  0.91629076 0.         0.91629076 0.         0.91629076 0.91629076
  0.         0.         0.         0.         0.        ]
 [0.         0.6931472  0.6931472  0.6931472  0.         0.
  0.         0.91629076 0.         0.91629076 0.         0.
  0.91629076 0.91629076 0.         0.         0.91629076]
 [0.         0.6931472  0.6931472  0.6931472  0.91629076 0.
  0.         0.         0.         0.         0.         0.
  0.         0.         0.91629076 0.91629076 0.        ]], shape=(3, 17), dtype=float32)


### Using Pretrained Language Model Components for Text Embedding

In [39]:
!pip install --upgrade pip
!pip install tensorflow_hub
import tensorflow_hub as hub

hub_layer = hub.KerasLayer("https://tfhub.dev/google/nnlm-en-dim50/2")
sentence_embeddings = hub_layer(tf.constant(["To be", "Not to be"]))
sentence_embeddings.numpy().round(2)

Collecting pip
  Downloading pip-25.0.1-py3-none-any.whl (1.8 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.8/1.8 MB[0m [31m85.6 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: pip
  Attempting uninstall: pip
    Found existing installation: pip 23.0.1
    Uninstalling pip-23.0.1:
      Successfully uninstalled pip-23.0.1
Successfully installed pip-25.0.1


array([[-0.25,  0.28,  0.01,  0.1 ,  0.14,  0.16,  0.25,  0.02,  0.07,
         0.13, -0.19,  0.06, -0.04, -0.07,  0.  , -0.08, -0.14, -0.16,
         0.02, -0.24,  0.16, -0.16, -0.03,  0.03, -0.14,  0.03, -0.09,
        -0.04, -0.14, -0.19,  0.07,  0.15,  0.18, -0.23, -0.07, -0.08,
         0.01, -0.01,  0.09,  0.14, -0.03,  0.03,  0.08,  0.1 , -0.01,
        -0.03, -0.07, -0.1 ,  0.05,  0.31],
       [-0.2 ,  0.2 , -0.08,  0.02,  0.19,  0.05,  0.22, -0.09,  0.02,
         0.19, -0.02, -0.14, -0.2 , -0.04,  0.01, -0.07, -0.22, -0.1 ,
         0.16, -0.44,  0.31, -0.1 ,  0.23,  0.15, -0.05,  0.15, -0.13,
        -0.04, -0.08, -0.16, -0.1 ,  0.13,  0.13, -0.18, -0.04,  0.03,
        -0.1 , -0.07,  0.07,  0.03, -0.08,  0.02,  0.05,  0.07, -0.14,
        -0.1 , -0.18, -0.13, -0.04,  0.15]], dtype=float32)

<a style='text-decoration:none;line-height:16px;display:flex;color:#5B5B62;padding:10px;justify-content:end;' href='https://deepnote.com?utm_source=created-in-deepnote-cell&projectId=968d3c27-50e7-4d42-bdd9-442f6904c1c2' target="_blank">
Created in <span style='font-weight:600;margin-left:4px;'>Deepnote</span></a>