Loading and Preprocessing Data with TensorFlow 

We cover Data API, TFRecord format and the Features API in detail 

In [1]:
import sklearn
import tensorflow as tf
from tensorflow import keras 
import numpy as np 


import matplotlib.pyplot as plt
import matplotlib as mpl 
%matplotlib inline
mpl.rc('axes', labelsize = 14)
mpl.rc('xtick', labelsize = 12)
mpl.rc('ytick', labelsize = 12)




Datasets 

In [2]:
X = tf.range(10)
dataset = tf.data.Dataset.from_tensor_slices(X) #OR just dataset = tf.data.Dataset.range(10) 
dataset 

<_TensorSliceDataset element_spec=TensorSpec(shape=(), dtype=tf.int32, name=None)>

In [3]:
for item in dataset:
    print(item)

tf.Tensor(0, shape=(), dtype=int32)
tf.Tensor(1, shape=(), dtype=int32)
tf.Tensor(2, shape=(), dtype=int32)
tf.Tensor(3, shape=(), dtype=int32)
tf.Tensor(4, shape=(), dtype=int32)
tf.Tensor(5, shape=(), dtype=int32)
tf.Tensor(6, shape=(), dtype=int32)
tf.Tensor(7, shape=(), dtype=int32)
tf.Tensor(8, shape=(), dtype=int32)
tf.Tensor(9, shape=(), dtype=int32)


Chaining transformations

 first call the repeat() method on the original dataset, and it
returns a new dataset that will repeat the items of the original dataset 3 times:

In [4]:
dataset = dataset.repeat(3).batch(7)
for item in dataset:
    print(item)

tf.Tensor([0 1 2 3 4 5 6], shape=(7,), dtype=int32)
tf.Tensor([7 8 9 0 1 2 3], shape=(7,), dtype=int32)
tf.Tensor([4 5 6 7 8 9 0], shape=(7,), dtype=int32)
tf.Tensor([1 2 3 4 5 6 7], shape=(7,), dtype=int32)
tf.Tensor([8 9], shape=(2,), dtype=int32)


In [5]:
#with map() method you can apply any preprocessing you want to the data.

dataset = dataset.map(lambda x: x*2)

In [6]:
#unbatch() function removes the batch dimension

dataset = dataset.unbatch()

In [7]:
#It is also possible to simply filter the dataset using the filter() method:

dataset = dataset.filter(lambda x: x<10) #returns items<10 only 


In [8]:
#take() method allows you to look at a few items from the dataset

for item in dataset.take(4):
    print(item)

tf.Tensor(0, shape=(), dtype=int32)
tf.Tensor(2, shape=(), dtype=int32)
tf.Tensor(4, shape=(), dtype=int32)
tf.Tensor(6, shape=(), dtype=int32)


Split the California dataset to multiple CSV files 

load, split into a training,validation set and a test set, then scale it 

In [9]:
from sklearn.datasets import fetch_california_housing
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler

housing = fetch_california_housing()
X_train_full, X_test, y_train_full, y_test = train_test_split(housing.data, housing.target.reshape(-1,1), random_state=42)

X_train,X_valid,y_train,y_valid = train_test_split(X_train_full, y_train_full, random_state=42)

scaler = StandardScaler()
scaler.fit(X_train)
X_std = scaler.scale_
X_mean = scaler.mean_

For very large datasets, you can split it into many files first, then have TensorFlow read these files in parallel. 

In [10]:
import os

def save_to_multiple_csv_files(data, name_prefix, header=None, n_parts=10):
    housing_dir = os.path.join("datasets", "housing") #./datasets/housing where csv files will be stored
    os.makedirs(housing_dir, exist_ok = True)
    path_format = os.path.join(housing_dir, "my_{}_{}_.csv") #format string for the filenames: my_<name_prefix>_<file_idx>_<part_number>.csv.

    filepaths = []
    m = len(data) #total number of rows in the dataset
    for file_idx, row_indices in enumerate(np.array_split(np.arange(m), n_parts)):
        part_csv = path_format.format(name_prefix, file_idx) #format the path for the current part's csv file 
        filepaths.append(part_csv)
        with open(part_csv, "wt", encoding="utf-8") as f:
            if header is not None:
                f.write(header)
                f.write("\n")
            for row_idx in row_indices:
                f.write(",".join([repr(col) for col in data[row_idx]]))
                f.write("\n")
    return filepaths

In [11]:
train_data = np.c_[X_train, y_train]
valid_data = np.c_[X_valid, y_valid]
test_data = np.c_[X_test, y_test]
header_cols = housing.feature_names + ["MedianHouseValue"]
header = ",".join(header_cols)

train_filepaths = save_to_multiple_csv_files(train_data, "train", header, n_parts=20)
valid_filepaths = save_to_multiple_csv_files(valid_data, "valid", header, n_parts=10)
test_filepaths = save_to_multiple_csv_files(test_data, "test", header, n_parts=10)

In [12]:
import pandas as pd
pd.read_csv(train_filepaths[0]).head()

Unnamed: 0,MedInc,HouseAge,AveRooms,AveBedrms,Population,AveOccup,Latitude,Longitude,MedianHouseValue
0,3.5214,15.0,3.049945,1.106548,1447.0,1.605993,37.63,-122.43,1.442
1,5.3275,5.0,6.49006,0.991054,3464.0,3.44334,33.69,-117.39,1.687
2,3.1,29.0,7.542373,1.591525,1328.0,2.250847,38.44,-122.98,1.621
3,7.1736,12.0,6.289003,0.997442,1054.0,2.695652,33.55,-117.7,2.621
4,2.0549,13.0,5.312457,1.085092,3297.0,2.244384,33.93,-116.93,0.956


Building an Input Pipeline 

In [13]:
filepath_dataset = tf.data.Dataset.list_files(train_filepaths, seed=42)

In [14]:
for filepath in filepath_dataset:
    print(filepath)

tf.Tensor(b'datasets\\housing\\my_train_14_.csv', shape=(), dtype=string)
tf.Tensor(b'datasets\\housing\\my_train_6_.csv', shape=(), dtype=string)
tf.Tensor(b'datasets\\housing\\my_train_10_.csv', shape=(), dtype=string)
tf.Tensor(b'datasets\\housing\\my_train_7_.csv', shape=(), dtype=string)
tf.Tensor(b'datasets\\housing\\my_train_0_.csv', shape=(), dtype=string)
tf.Tensor(b'datasets\\housing\\my_train_4_.csv', shape=(), dtype=string)
tf.Tensor(b'datasets\\housing\\my_train_19_.csv', shape=(), dtype=string)
tf.Tensor(b'datasets\\housing\\my_train_11_.csv', shape=(), dtype=string)
tf.Tensor(b'datasets\\housing\\my_train_2_.csv', shape=(), dtype=string)
tf.Tensor(b'datasets\\housing\\my_train_9_.csv', shape=(), dtype=string)
tf.Tensor(b'datasets\\housing\\my_train_16_.csv', shape=(), dtype=string)
tf.Tensor(b'datasets\\housing\\my_train_18_.csv', shape=(), dtype=string)
tf.Tensor(b'datasets\\housing\\my_train_3_.csv', shape=(), dtype=string)
tf.Tensor(b'datasets\\housing\\my_train_5_.cs

In [15]:
n_readers = 5
dataset = filepath_dataset.interleave(
    lambda filepath: tf.data.TextLineDataset(filepath).skip(1),
    cycle_length = n_readers
)

In [16]:
for line in dataset.take(5):
    print(line.numpy())

b'2.1856,41.0,3.7189873417721517,1.0658227848101265,803.0,2.0329113924050635,32.76,-117.12,1.205'
b'3.8456,35.0,5.461346633416459,0.9576059850374065,1154.0,2.8778054862842892,37.96,-122.05,1.598'
b'4.6477,38.0,5.03728813559322,0.911864406779661,745.0,2.5254237288135593,32.64,-117.07,1.504'
b'1.6571,34.0,4.454976303317536,1.0876777251184835,1358.0,3.2180094786729856,37.94,-122.35,1.052'
b'3.9688,41.0,5.259786476868327,0.9715302491103203,916.0,3.2597864768683276,33.98,-118.07,1.698'


the 4th field is read as a string 

In [17]:
record_defaults = [0, np.nan, tf.constant(np.nan), "Hello", tf.constant([])]
parsed_fields = tf.io.decode_csv("1,2,3,4,5", record_defaults)
parsed_fields

[<tf.Tensor: shape=(), dtype=int32, numpy=1>,
 <tf.Tensor: shape=(), dtype=float32, numpy=2.0>,
 <tf.Tensor: shape=(), dtype=float32, numpy=3.0>,
 <tf.Tensor: shape=(), dtype=string, numpy=b'4'>,
 <tf.Tensor: shape=(), dtype=float32, numpy=5.0>]

All missing fields are replaced with their default value, when provided, take a keen look above 

The decode_csv() function returns a list of scalar tensors

In [18]:
parsed_fields = tf.io.decode_csv(",,,,5", record_defaults)
parsed_fields

[<tf.Tensor: shape=(), dtype=int32, numpy=0>,
 <tf.Tensor: shape=(), dtype=float32, numpy=nan>,
 <tf.Tensor: shape=(), dtype=float32, numpy=nan>,
 <tf.Tensor: shape=(), dtype=string, numpy=b'Hello'>,
 <tf.Tensor: shape=(), dtype=float32, numpy=5.0>]

tf.constant([]) as the default value makes the 5th field compulsory otherwise we get an exception 

In [19]:
n_inputs = 8 # 8 input features + the label i.e X_train.shape[-1]

@tf.function
def preprocess(line):
    defs =[0.0] * n_inputs + [tf.constant([], dtype=tf.float32)]
    fields = tf.io.decode_csv(line, record_defaults=defs) #returns a list of scalar tensors (one per column)
    x = tf.stack(fields[:-1])  #stacks the tensors into a 1D array 
    y = tf.stack(fields[-1:])  #!D array with a single value 
    return (x - X_mean) / X_std, y  #subtract feature means, divide by feature standard deviations. This scales the input features 



In [20]:
preprocess(b'4.2083,44.0,5.3232,0.9171,846.0,2.3370,37.47,-122.2,2.782')

(<tf.Tensor: shape=(8,), dtype=float32, numpy=
 array([ 0.16579157,  1.216324  , -0.05204565, -0.39215982, -0.5277444 ,
        -0.26334876,  0.8543047 , -1.3072057 ], dtype=float32)>,
 <tf.Tensor: shape=(1,), dtype=float32, numpy=array([2.782], dtype=float32)>)

Putting everything into a small helper function that will that creates and returns a dataset which will efficiently load the california housing dataset from multiple CSV files  

In [21]:
def csv_reader_dataset(filepaths, repeat = None, n_readers=5, n_read_threads=None, shuffle_buffer_size=10000, n_parse_threads=5, batch_size=32):
    dataset = tf.data.Dataset.list_files(filepaths).repeat(repeat)
    dataset = dataset.interleave(
        lambda filepath: tf.data.TextLineDataset(filepath).skip(1),
        cycle_length = n_readers,
        num_parallel_calls = n_read_threads
    )
    dataset = dataset.shuffle(shuffle_buffer_size)
    dataset = dataset.map(preprocess, num_parallel_calls = n_parse_threads)
    dataset = dataset.batch(batch_size)
    return dataset.prefetch(1)

In [22]:
tf.random.set_seed(42)

In [23]:
train_set = csv_reader_dataset(train_filepaths, batch_size=3)

for X_batch, y_batch in train_set.take(2):
    print("X =", X_batch)   
    print("y =", y_batch)
    print()

X = tf.Tensor(
[[ 1.1397223   1.8491895   0.37818593 -0.07571168 -0.7568092  -0.29858744
  -0.70573175  0.5919061 ]
 [ 0.60438776  0.42524222  0.34224355 -0.00898078  0.14758591 -0.23538612
   0.8449348  -1.322202  ]
 [-0.13317037 -0.36583957 -0.20266704 -0.08610389  0.24888547 -0.25766754
  -1.3428637   1.2116159 ]], shape=(3, 8), dtype=float32)
y = tf.Tensor(
[[3.993]
 [3.951]
 [1.786]], shape=(3, 1), dtype=float32)

X = tf.Tensor(
[[-0.7923741  -0.36583957 -0.73364234 -0.01817797  0.23793417 -0.5609138
  -0.7197856   0.64688075]
 [ 0.77387625 -0.20762321  0.3036717  -0.36256418 -0.70296526  0.02327382
  -1.3053845   1.1866299 ]
 [ 1.9251521  -1.1569214   0.40059727  0.013611    0.07366463 -0.3017717
  -0.95402515  0.8667794 ]], shape=(3, 8), dtype=float32)
y = tf.Tensor(
[[2.667  ]
 [2.607  ]
 [5.00001]], shape=(3, 1), dtype=float32)



Create datasets and use them to train a keras model:-

In [24]:
train_set = csv_reader_dataset(train_filepaths, repeat= None)

valid_set = csv_reader_dataset(train_filepaths) #repeat = None is default

test_set = csv_reader_dataset(train_filepaths)  

In [25]:
model = keras.models.Sequential([
    keras.layers.Dense(30, activation = "relu", input_shape = [8]),
    keras.layers.Dense(1)
])

  super().__init__(activity_regularizer=activity_regularizer, **kwargs)


In [26]:
model.compile(loss= 'mse', optimizer = keras.optimizers.SGD(learning_rate = 0.01))

In [27]:
# batch_size = 32
# model.fit(train_set, steps_per_epoch = len(X_train) // batch_size, epochs = 10, validation_data = valid_set)

#commneted out, computation time took too long to run

In [28]:
# model.evaluate(test_set, steps = len(X_test) // batch_size)

TFRecord Format

    - for storing large datasets. It's a simple binary format that just contains a sequence of binary records of varying sizes 

In [29]:
#create one by using tf.io.TFRecordWriter class

with tf.io.TFRecordWriter("my_data.tfrecord") as f:
    f.write(b"This is the first record")
    f.write(b"And this is the second record")

In [30]:
#use a tf.data.TFRecordDataset to read one or more TFRecord files

filepaths = ['my_data.tfrecord']
dataset = tf.data.TFRecordDataset (filepaths)

for item in dataset:
    print(item)

tf.Tensor(b'This is the first record', shape=(), dtype=string)
tf.Tensor(b'And this is the second record', shape=(), dtype=string)


In [31]:
#you can read multiple tfrecord files simultaneously, by using num_parallel_reads and interleave methods. 

filepaths = ['test_{}.tfrecord'.format(i) for i in range(5)]
for i, filepath in enumerate(filepaths):
    with tf.io.TFRecordWriter(filepath) as f:
        for j in range(3):
            f.write("File {} record {}".format(i, j).encode("utf-8"))

dataset = tf.data.TFRecordDataset(filepaths, num_parallel_reads=3)

for item in dataset:
    print(item)

tf.Tensor(b'File 0 record 0', shape=(), dtype=string)
tf.Tensor(b'File 1 record 0', shape=(), dtype=string)
tf.Tensor(b'File 2 record 0', shape=(), dtype=string)
tf.Tensor(b'File 0 record 1', shape=(), dtype=string)
tf.Tensor(b'File 1 record 1', shape=(), dtype=string)
tf.Tensor(b'File 2 record 1', shape=(), dtype=string)
tf.Tensor(b'File 0 record 2', shape=(), dtype=string)
tf.Tensor(b'File 1 record 2', shape=(), dtype=string)
tf.Tensor(b'File 2 record 2', shape=(), dtype=string)
tf.Tensor(b'File 3 record 0', shape=(), dtype=string)
tf.Tensor(b'File 4 record 0', shape=(), dtype=string)
tf.Tensor(b'File 3 record 1', shape=(), dtype=string)
tf.Tensor(b'File 4 record 1', shape=(), dtype=string)
tf.Tensor(b'File 3 record 2', shape=(), dtype=string)
tf.Tensor(b'File 4 record 2', shape=(), dtype=string)


Compressed TFRecord Files 

    -If your files need to be loaded via a network connection, it might be useful to compress them, by setting the options argument:

In [32]:
options = tf.io.TFRecordOptions(compression_type="GZIP")
with tf.io.TFRecordWriter('compressed.tfrecord', options) as f:
    f.write(b'first record')
    f.write(b'second record')

In [33]:
dataset = tf.data.TFRecordDataset('compressed.tfrecord', compression_type='GZIP')

for item in dataset:
    print(item)

tf.Tensor(b'first record', shape=(), dtype=string)
tf.Tensor(b'second record', shape=(), dtype=string)


Intro to Protocol Buffers  
    -TFRecord files usually contain serialized Protocol Buffers (also called protobufs). This is a portable, extensible and efficient binary format developed at Google.

In [34]:
#write a simple protobuff definition

#syntax = "proto3";
#message Person {
 ###repeated string email = 3;
#}

In [35]:
#compile it 

!protoc person.proto --python_out=. --descriptor_set_out  person.desc --include_imports 

In [36]:
!ls -l person*

'ls' is not recognized as an internal or external command,
operable program or batch file.


In [37]:
from person_pb2 import Person #import the access class
person = Person(name='Al', id = 123, email = ['a@b.com']) #creates a person
print(person)

name: "Al"
id: 123
email: "a@b.com"



In [38]:
#read a fiel 
person.name 

'Al'

In [39]:
#modify a field

person.name = "Alice"

In [40]:
person.email[0]   #access repeated fields, like arrays 

'a@b.com'

In [41]:
#add an email address 

person.email.append('c@d.com')

In [42]:
s = person.SerializeToString() #serialize the object to a byte string
s

b'\n\x05Alice\x10{\x1a\x07a@b.com\x1a\x07c@d.com'

In [43]:
person2 = Person() #create a new person 

person2.ParseFromString(s) #parse the byte string and put it in person2


27

In [44]:
#confirm if equal 
person2 == person

True

 When reading or receiving this binary data taht underwwent serialization, we can parse it using the ParseFromString() method, and we get a copy of the object that was serialized.

 However, SerializeToString() and ParseFromString() are not TF operations so they cannot be included in a TF function 

TensorFlow Protobufs 

    -: The main protobuf used in a TFRecord file is the Example protobuf, which represents one instance in a dataset. It contains a list of named features, where each feature can either be a list of byte strings, a list of floats or a list of integers

In [45]:
from tensorflow.train import BytesList, FloatList, Int64List
from tensorflow.train import Example, Features, Feature

person_example = Example(
    features = Features(
        feature = {
            "name": Feature(bytes_list = BytesList(value = [b"Alice"])),
            "id": Feature(int64_list = Int64List(value = [123])),
            "emails": Feature(bytes_list = BytesList(value = [b"a@b.com", b"c@d.com"]))
        }
    )
)



In [46]:
#serilalize the Example to a byte string 

with tf.io.TFRecordWriter('contacts.tfrecord') as f:
    f.write(person_example.SerializeToString())

Loading and Parsing Examples 

In [47]:
feature_description = {
    "name": tf.io.FixedLenFeature([], tf.string, default_value=""),
    "id": tf.io.FixedLenFeature([], tf.int64, default_value=0),
    "emails": tf.io.VarLenFeature(tf.string),
}
for serialized_example in tf.data.TFRecordDataset(["contacts.tfrecord"]):
    parsed_example = tf.io.parse_single_example(serialized_example, feature_description)

The fixed length features are parsed as regular tensors, but the variable length features are parsed as sparse tensors. You can convert a sparse tensor to a dense tensor using tf.sparse.to_dense(), but in this case it is simpler to just access its values

In [48]:
tf.sparse.to_dense(parsed_example['emails'], default_value=b'')  #conversion

<tf.Tensor: shape=(2,), dtype=string, numpy=array([b'a@b.com', b'c@d.com'], dtype=object)>

In [49]:
parsed_example['emails'].values #accessing instead of converting to dense

<tf.Tensor: shape=(2,), dtype=string, numpy=array([b'a@b.com', b'c@d.com'], dtype=object)>

A bytelist can contain an binary data, including serialized objects. We can use tf.io.encode_jpeg() to encode an image using JPEG format and put it in a bytelist. Later, when reading TFRecord, after parsing the Example, then you can call tf.io.decode_jpeg() or tf.io.decode_image() which can decode BMP,GIF,JPEG, PNG images. 

Instead of parsing examples one by one using tf.io.parse_single_example(), we can parse them batch by batch using tf.io.parse_example()

In [50]:
dataset = tf.data.TFRecordDataset(["contacts.tfrecord"]).batch(10)
for serialized_exampls in dataset:
    parsed_examples = tf.io.parse_example(serialized_example, feature_description)

In [51]:
parsed_examples

{'emails': SparseTensor(indices=tf.Tensor(
 [[0]
  [1]], shape=(2, 1), dtype=int64), values=tf.Tensor([b'a@b.com' b'c@d.com'], shape=(2,), dtype=string), dense_shape=tf.Tensor([2], shape=(1,), dtype=int64)),
 'id': <tf.Tensor: shape=(), dtype=int64, numpy=123>,
 'name': <tf.Tensor: shape=(), dtype=string, numpy=b'Alice'>}

Handling Sequential Data(Lists of Lists) Using SequentialExample Protobuf 

A SequenceExample contains a Features object for the contextual data and a FeatureLists object which contains one or more named FeatureList objects (e.g., a FeatureList named "content" and another named "comments")

Building a SequenceExample, serializing it and parsing it is very similar to building, serializing and parsing an Example, but you must use tf.io.parse_single_sequence_example() to parse a single SequenceExample or tf.io.parse_sequence_example() to
parse a batch, and both functions return a tuple containing the context features (as a dictionary) and the feature lists (also as a dictionary)

In [52]:
FeatureList = tf.train.FeatureList
FeatureLists = tf.train.FeatureLists
SequenceExample = tf.train.SequenceExample 

context = Features(feature={
    'author_id': Feature(int64_list=Int64List(value=[123])),
    'title': Feature(bytes_list=BytesList(value=[b'A', b'New', b'Book'])),
    'pub_date': Feature(int64_list=Int64List(value=[1627, 11, 1]))
})

content = [['When', 'shall', 'we', 'three', 'meet', 'again', '!'],
           ['In','thunder', ',','lightning', ',' ,'or','in', 'rain', '?']]

comments = [["When", "the", "hurlyburly", "'s", "done", "."],
            ["When", "the", "battle", "'s", "lost", "and", "won", "."]]

def words_to_feature(words):
    return Feature(bytes_list=BytesList(value=[word.encode('utf-8') for word in words]))

content_features = [words_to_feature(sentence) for sentence in content]
comments_features = [words_to_feature(comment) for comment in comments]

sequence_example = SequenceExample(
    context= context,
    feature_lists=FeatureLists(
        feature_list={
            'content': FeatureList(feature=content_features),
            'comments': FeatureList(feature=comments_features)
        }
    )
)
    


In [53]:
sequence_example 

context {
  feature {
    key: "title"
    value {
      bytes_list {
        value: "A"
        value: "New"
        value: "Book"
      }
    }
  }
  feature {
    key: "pub_date"
    value {
      int64_list {
        value: 1627
        value: 11
        value: 1
      }
    }
  }
  feature {
    key: "author_id"
    value {
      int64_list {
        value: 123
      }
    }
  }
}
feature_lists {
  feature_list {
    key: "content"
    value {
      feature {
        bytes_list {
          value: "When"
          value: "shall"
          value: "we"
          value: "three"
          value: "meet"
          value: "again"
          value: "!"
        }
      }
      feature {
        bytes_list {
          value: "In"
          value: "thunder"
          value: ","
          value: "lightning"
          value: ","
          value: "or"
          value: "in"
          value: "rain"
          value: "?"
        }
      }
    }
  }
  feature_list {
    key: "comments"
    value {
   

In [54]:
serialized_sequence_example =sequence_example.SerializeToString()

In [55]:
context_feature_descriptions = {
    'author_id': tf.io.FixedLenFeature([], tf.int64, default_value=0),
    'title': tf.io.VarLenFeature(tf.string),
    'pub_date': tf.io.FixedLenFeature([3], tf.int64, default_value=[0, 0 ,0]),
}

sequence_feature_descriptions = {
    'content': tf.io.VarLenFeature(tf.string),
    'comments': tf.io.VarLenFeature(tf.string),
}

parsed_context, parsed_feature_lists = tf.io.parse_single_sequence_example(
    serialized_sequence_example, context_feature_descriptions, sequence_feature_descriptions    
)

In [56]:
parsed_context

{'title': SparseTensor(indices=tf.Tensor(
 [[0]
  [1]
  [2]], shape=(3, 1), dtype=int64), values=tf.Tensor([b'A' b'New' b'Book'], shape=(3,), dtype=string), dense_shape=tf.Tensor([3], shape=(1,), dtype=int64)),
 'author_id': <tf.Tensor: shape=(), dtype=int64, numpy=123>,
 'pub_date': <tf.Tensor: shape=(3,), dtype=int64, numpy=array([1627,   11,    1], dtype=int64)>}

In [57]:
parsed_context['title'].values

<tf.Tensor: shape=(3,), dtype=string, numpy=array([b'A', b'New', b'Book'], dtype=object)>

 If the feature lists contain sequences of varying sizes (as in the example above), you may want to convert them
to ragged tensors using tf.RaggedTensor.from_sparse()

In [58]:
print(tf.RaggedTensor.from_sparse(parsed_feature_lists['content']))

<tf.RaggedTensor [[b'When', b'shall', b'we', b'three', b'meet', b'again', b'!'],
 [b'In', b'thunder', b',', b'lightning', b',', b'or', b'in', b'rain', b'?']]>


The Features API 

Using a variant of California housing dataset :

In [59]:
import os
import tarfile
import urllib.request

DOWNLOAD_ROOT = "https://raw.githubusercontent.com/ageron/handson-ml2/master/"
HOUSING_PATH = os.path.join("datasets", "housing")
HOUSING_URL = DOWNLOAD_ROOT + "datasets/housing/housing.tgz"

def fetch_housing_data(housing_url=HOUSING_URL, housing_path=HOUSING_PATH):
    os.makedirs(housing_path, exist_ok=True)
    tgz_path = os.path.join(housing_path, "housing.tgz")
    urllib.request.urlretrieve(housing_url, tgz_path)
    housing_tgz = tarfile.open(tgz_path)
    housing_tgz.extractall(path=housing_path)
    housing_tgz.close()

In [60]:
fetch_housing_data()

In [61]:
import pandas as pd

def load_housing_data(housing_path=HOUSING_PATH):
    csv_path = os.path.join(housing_path, "housing.csv")
    return pd.read_csv(csv_path)


In [62]:
housing = load_housing_data()
housing.head()

Unnamed: 0,longitude,latitude,housing_median_age,total_rooms,total_bedrooms,population,households,median_income,median_house_value,ocean_proximity
0,-122.23,37.88,41.0,880.0,129.0,322.0,126.0,8.3252,452600.0,NEAR BAY
1,-122.22,37.86,21.0,7099.0,1106.0,2401.0,1138.0,8.3014,358500.0,NEAR BAY
2,-122.24,37.85,52.0,1467.0,190.0,496.0,177.0,7.2574,352100.0,NEAR BAY
3,-122.25,37.85,52.0,1274.0,235.0,558.0,219.0,5.6431,341300.0,NEAR BAY
4,-122.25,37.85,52.0,1627.0,280.0,565.0,259.0,3.8462,342200.0,NEAR BAY


In [63]:
housing_median_age = tf.feature_column.numeric_column('housing_median_age')

Instructions for updating:
Use Keras preprocessing layers instead, either directly or via the `tf.keras.utils.FeatureSpace` utility. Each of `tf.feature_column.*` has a functional equivalent in `tf.keras.layers` for feature preprocessing when training a Keras model.


In [64]:
#tweak the "housing_median_age" column to define how it should be scaled

age_mean, age_std = X_mean[1], X_std[1] #median age is column in 1 
housing_median_age = tf.feature_column.numeric_column(
    'housing_median_age', normalizer_fn = lambda x: (x-age_mean)/age_std)


In [65]:
#bucketize some numerical features to improve performance -transforming numerical features to categorical 

median_income = tf.feature_column.numeric_column('median_income')
bucketized_income = tf.feature_column.bucketized_column(
    median_income, boundaries = [1.5, 3.,4.5, 6.])


Instructions for updating:
Use Keras preprocessing layers instead, either directly or via the `tf.keras.utils.FeatureSpace` utility. Each of `tf.feature_column.*` has a functional equivalent in `tf.keras.layers` for feature preprocessing when training a Keras model.


In [66]:
bucketized_income

BucketizedColumn(source_column=NumericColumn(key='median_income', shape=(1,), default_value=None, dtype=tf.float32, normalizer_fn=None), boundaries=(1.5, 3.0, 4.5, 6.0))

If the median_income feature is equal to, say, 3.2, then the bucketized_income feature will automatically be equal to 2 (i.e., the index of the corresponding income bucket)

Choosing the right boundaries can be somewhat of an art, but one approach is to just use percentiles of the data (e.g., the 10th percentile, the 20th percentile, and so on)

If a feature is multimodal, define a bucket for each mode, placing the boundaries between the peaks 

Categorical Features 

In [67]:
ocean_prox_vocab = ["<1H OCEAN", "INLAND", "ISLAND", "NEAR BAY", "NEAR OCEAN"]
ocean_proximity = tf.feature_column.categorical_column_with_vocabulary_list(
    "ocean_proximity", ocean_prox_vocab)


Instructions for updating:
Use Keras preprocessing layers instead, either directly or via the `tf.keras.utils.FeatureSpace` utility. Each of `tf.feature_column.*` has a functional equivalent in `tf.keras.layers` for feature preprocessing when training a Keras model.


In [68]:
ocean_proximity 

VocabularyListCategoricalColumn(key='ocean_proximity', vocabulary_list=('<1H OCEAN', 'INLAND', 'ISLAND', 'NEAR BAY', 'NEAR OCEAN'), dtype=tf.string, default_value=-1, num_oov_buckets=0)

In [69]:
#for cat columns with large vocabulary e.g zips, codes etc you can use hash bucket

city_hash = tf.feature_column.categorical_column_with_hash_bucket(
    "city", hash_bucket_size = 1000
)

city_hash

Instructions for updating:
Use Keras preprocessing layers instead, either directly or via the `tf.keras.utils.FeatureSpace` utility. Each of `tf.feature_column.*` has a functional equivalent in `tf.keras.layers` for feature preprocessing when training a Keras model.


HashedCategoricalColumn(key='city', hash_bucket_size=1000, dtype=tf.string)

Crossed Categorical Features 


    -:Used when two (or more) categorical features are more meaningful when used jointly, by creating a crossed column

In [70]:
bucketized_age = tf.feature_column.bucketized_column(
    housing_median_age, boundaries = [-1., -0.5, 0. , 0.5, 1.])

age_and_ocean_proximity = tf.feature_column.crossed_column(
    [bucketized_age, ocean_proximity], hash_bucket_size = 100
)

Instructions for updating:
Use `tf.keras.layers.experimental.preprocessing.HashedCrossing` instead for feature crossing when preprocessing data to train a Keras model.


In [71]:
#another use case is to cross latitude and longitude into a siingle categorical feature 
#bucketize he lat and longitude then cross the bucketized features into a location column 

latitude = tf.feature_column.numeric_column("latitude")
longitude = tf.feature_column.numeric_column("longitude")
bucketized_latitude = tf.feature_column.bucketized_column(
    latitude, boundaries = np.arange(32., 42.,20 - 1).tolist())
bucketized_longitude = tf.feature_column.bucketized_column(
    longitude, boundaries = np.arange(-125., -114.,20 - 1).tolist())
location = tf.feature_column.crossed_column(
    [bucketized_latitude, bucketized_longitude], 1000)

Encoding Categorical Features Using One-Hot Vectors 

In [72]:
ocean_proximity_one_hot = tf.feature_column.indicator_column(ocean_proximity)

Instructions for updating:
Use Keras preprocessing layers instead, either directly or via the `tf.keras.utils.FeatureSpace` utility. Each of `tf.feature_column.*` has a functional equivalent in `tf.keras.layers` for feature preprocessing when training a Keras model.


A one-hot vector encoding has the size of the vocabulary length, which is fine if there are just a few possible categories, but if the vocabulary is large, you will end up with too many inputs fed to your neural network: it will have too many weights to learn and it will probably not perform very well.

In particular, this will typically be the case when you use hash buckets. In this case, you should probably encode them using
embeddings instead. 

Encoding Categorical Features Using Embeddings 

    -: An embedding is a trainable dense vector that represents a category, initialized randomly by default. They improve gradually during training.  

In [73]:
ocean_proximity_embed  = tf.feature_column.embedding_column(ocean_proximity, dimension = 2)

#each of the 5 categories will be represented by a 2D vector

Instructions for updating:
Use Keras preprocessing layers instead, either directly or via the `tf.keras.utils.FeatureSpace` utility. Each of `tf.feature_column.*` has a functional equivalent in `tf.keras.layers` for feature preprocessing when training a Keras model.


If the emedding is too large due to large vocabulary;-

1. Try lowering the dimension of the hyperparameter a bit, not too much

2. Reduce the vocab size e.g. by dropping rare words and replace them with a token like "<Uknown>"

3. If you are using hash buckets, try reducing the hash_bucket_size a bit, too much reduction begets you collisions. 

Using Feature Columns for Parsing 

In [74]:
median_house_value = tf.feature_column.numeric_column('median_house_value')

In [75]:
columns = [housing_median_age, median_house_value]
feature_descriptions = tf.feature_column.make_parse_example_spec(columns)
feature_descriptions 

Instructions for updating:
Use Keras preprocessing layers instead, either directly or via the `tf.keras.utils.FeatureSpace` utility. Each of `tf.feature_column.*` has a functional equivalent in `tf.keras.layers` for feature preprocessing when training a Keras model.


{'housing_median_age': FixedLenFeature(shape=(1,), dtype=tf.float32, default_value=None),
 'median_house_value': FixedLenFeature(shape=(1,), dtype=tf.float32, default_value=None)}

In [76]:
#create a function tha parses serialized examples using feature descriptions and separates the target column from the input features
def parse_examples(serialized_examples):
    examples = tf.io.parse_example(serialized_examples, feature_descriptions)
    targets = examples.pop("median_house_value") # separate the targets
    return examples, targets

#create a TFRecordDataset that reads batches of serialized examples from a TFRecord file 

batch_size = 32
dataset = tf.data.TFRecordDataset(["my_data_with_features.tfrecords"])
dataset = dataset.repeat().shuffle(10000).batch(batch_size).map(parse_examples)


Using Feature Columns in the Models 

 module 'tensorflow.keras.layers' has no attribute 'DenseFeatures': truncated in this TF version

In [77]:
#add a keras.layers.DenseFeatures layer as the first layer of the model and pass it the feature columns

# columns_without_target = columns[:-1]
# model = keras.models.Sequential([
#   keras.layers.DenseFeatures(feature_columns=columns_without_target),
#   keras.layers.Dense(1)
# model.compile(loss="mse",
  #             optimizer=keras.optimizers.SGD(learning_rate=1e-3),
#          metrics=["accuracy"])
## model.fit(dataset, steps_per_epoch=len(X_train) // batch_size, epochs=5)

TF Transform 

In [78]:
try:
    import tensorflow_transform as tft

    def preprocess(inputs):  # inputs is a batch of input features
        median_age = inputs["housing_median_age"]
        ocean_proximity = inputs["ocean_proximity"]
        standardized_age = tft.scale_to_z_score(median_age - tft.mean(median_age))
        ocean_proximity_id = tft.compute_and_apply_vocabulary(ocean_proximity)
        return {
            "standardized_median_age": standardized_age,
            "ocean_proximity_id": ocean_proximity_id
        }
except ImportError:
    print("TF Transform is not installed. Try running: pip3 install -U tensorflow-transform")

TF Transform is not installed. Try running: pip3 install -U tensorflow-transform


TF Transform is not compatible with python version 3.11 by the time of writing this code 

Tensorflow Datasets 

In [79]:
import tensorflow_datasets as tfds

In [80]:
datasets = tfds.load(name="mnist")
mnist_train, mnist_test = datasets['train'], datasets['test']

[1mDownloading and preparing dataset Unknown size (download: Unknown size, generated: Unknown size, total: Unknown size) to C:\Users\user\tensorflow_datasets\mnist\3.0.1...[0m


Dl Completed...: 0 url [00:00, ? url/s]

Dl Size...: 0 MiB [00:00, ? MiB/s]

Extraction completed...: 0 file [00:00, ? file/s]

Generating splits...:   0%|          | 0/2 [00:00<?, ? splits/s]

Generating train examples...: 0 examples [00:00, ? examples/s]

Shuffling C:\Users\user\tensorflow_datasets\mnist\incomplete.0WXUAT_3.0.1\mnist-train.tfrecord*...:   0%|     …

Generating test examples...: 0 examples [00:00, ? examples/s]

Shuffling C:\Users\user\tensorflow_datasets\mnist\incomplete.0WXUAT_3.0.1\mnist-test.tfrecord*...:   0%|      …

[1mDataset mnist downloaded and prepared to C:\Users\user\tensorflow_datasets\mnist\3.0.1. Subsequent calls will reuse this data.[0m


In [81]:
#apply any transformation thereafter and then train your model on the transformed data

datasets = tfds.load(name='mnist')
mnist_train, mnist_test = datasets['train'], datasets['test']   
mnist_train = mnist_train.repeat(5).batch(32)
mnist_train = mnist_train.map(lambda items: (items['image'], items['label'])) #transforms the data to a tuple with 2 elements as keras expects 
mnist_train = mnist_train.prefetch(1)

In [82]:
for images, labels in mnist_train.take(1):
    print(images.shape)
    print(labels.numpy())

(32, 28, 28, 1)
[4 1 0 7 8 1 2 7 1 6 6 4 7 7 3 3 7 9 9 1 0 6 6 9 9 4 8 9 4 7 3 3]


Alternatively, just let the load() function do the above for you by setting supervised as True:

In [85]:
datasets = tfds.load(name="mnist", batch_size = 32, as_supervised = True)
mnist_train = datasets["train"].repeat().prefetch(1)

model = keras.models.Sequential([
    keras.layers.Flatten(input_shape=[28,28,1]),
    keras.layers.Lambda(lambda images: tf.cast(images, tf.float32)),
    keras.layers.Dense (10, activation = 'softmax')
])

model.compile(loss='sparse_categorical_crossentropy',optimizer = keras.optimizers.SGD(learning_rate = 0.01), metrics = ['accuracy'])

model.fit(mnist_train, steps_per_epoch = 60000//32, epochs = 5)




  super().__init__(**kwargs)



Epoch 1/5
[1m1875/1875[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m3s[0m 1ms/step - accuracy: 0.8051 - loss: 443.9318
Epoch 2/5
[1m1875/1875[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 1ms/step - accuracy: 0.8684 - loss: 247.9619
Epoch 3/5
[1m1875/1875[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 1ms/step - accuracy: 0.8713 - loss: 243.1887
Epoch 4/5
[1m1875/1875[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 1ms/step - accuracy: 0.8769 - loss: 241.5450
Epoch 5/5
[1m1875/1875[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 1ms/step - accuracy: 0.8776 - loss: 236.1479


<keras.src.callbacks.history.History at 0x1f30e5aa710>