# Section 1: Data API

## a dataset: a sequence of data items

In [1]:
# import the lib

import tensorflow as tf

2021-12-12 14:30:59.327525: W tensorflow/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libcudart.so.11.0'; dlerror: libcudart.so.11.0: cannot open shared object file: No such file or directory
2021-12-12 14:30:59.327627: I tensorflow/stream_executor/cuda/cudart_stub.cc:29] Ignore above cudart dlerror if you do not have a GPU set up on your machine.


In [37]:
# create a tf.data.Dataset

## create a (10, ) type tensor 
X = tf.range(10)

## create a tf.data.Dataset
dataset = tf.data.Dataset.from_tensor_slices(X)

## iterate the dataset to have a look at the generated data items
for item in dataset:
    print(item)

tf.Tensor(0, shape=(), dtype=int32)
tf.Tensor(1, shape=(), dtype=int32)
tf.Tensor(2, shape=(), dtype=int32)
tf.Tensor(3, shape=(), dtype=int32)
tf.Tensor(4, shape=(), dtype=int32)
tf.Tensor(5, shape=(), dtype=int32)
tf.Tensor(6, shape=(), dtype=int32)
tf.Tensor(7, shape=(), dtype=int32)
tf.Tensor(8, shape=(), dtype=int32)
tf.Tensor(9, shape=(), dtype=int32)


## Section 1.1: Chaining Transformations

### repeat() and batch() method

In [33]:
# create the dataset

dataset = dataset.repeat(3).batch(7, drop_remainder=True)

In [34]:
# show the items

for e in dataset:
    print(e)

tf.Tensor([0 1 2 3 4 5 6], shape=(7,), dtype=int32)
tf.Tensor([7 8 9 0 1 2 3], shape=(7,), dtype=int32)
tf.Tensor([4 5 6 7 8 9 0], shape=(7,), dtype=int32)
tf.Tensor([1 2 3 4 5 6 7], shape=(7,), dtype=int32)


### map() method

In [35]:
dataset = dataset.map(lambda x: x * 2)

for item in dataset:
    print(item)

tf.Tensor([ 0  2  4  6  8 10 12], shape=(7,), dtype=int32)
tf.Tensor([14 16 18  0  2  4  6], shape=(7,), dtype=int32)
tf.Tensor([ 8 10 12 14 16 18  0], shape=(7,), dtype=int32)
tf.Tensor([ 2  4  6  8 10 12 14], shape=(7,), dtype=int32)


### apply(tf.data.experimental.function()) method

In [36]:
dataset = dataset.apply(tf.data.experimental.unbatch())

for item in dataset:
    print(item)

tf.Tensor(0, shape=(), dtype=int32)
tf.Tensor(2, shape=(), dtype=int32)
tf.Tensor(4, shape=(), dtype=int32)
tf.Tensor(6, shape=(), dtype=int32)
tf.Tensor(8, shape=(), dtype=int32)
tf.Tensor(10, shape=(), dtype=int32)
tf.Tensor(12, shape=(), dtype=int32)
tf.Tensor(14, shape=(), dtype=int32)
tf.Tensor(16, shape=(), dtype=int32)
tf.Tensor(18, shape=(), dtype=int32)
tf.Tensor(0, shape=(), dtype=int32)
tf.Tensor(2, shape=(), dtype=int32)
tf.Tensor(4, shape=(), dtype=int32)
tf.Tensor(6, shape=(), dtype=int32)
tf.Tensor(8, shape=(), dtype=int32)
tf.Tensor(10, shape=(), dtype=int32)
tf.Tensor(12, shape=(), dtype=int32)
tf.Tensor(14, shape=(), dtype=int32)
tf.Tensor(16, shape=(), dtype=int32)
tf.Tensor(18, shape=(), dtype=int32)
tf.Tensor(0, shape=(), dtype=int32)
tf.Tensor(2, shape=(), dtype=int32)
tf.Tensor(4, shape=(), dtype=int32)
tf.Tensor(6, shape=(), dtype=int32)
tf.Tensor(8, shape=(), dtype=int32)
tf.Tensor(10, shape=(), dtype=int32)
tf.Tensor(12, shape=(), dtype=int32)
tf.Tensor(14, sh

### filter() method

In [38]:
X = tf.range(10)

dataset = tf.data.Dataset.from_tensor_slices(X)

dataset = dataset.map(lambda x: x * 2)

In [39]:
dataset = dataset.filter(lambda x: x < 10)

for item in dataset:
    print(item)

tf.Tensor(0, shape=(), dtype=int32)
tf.Tensor(2, shape=(), dtype=int32)
tf.Tensor(4, shape=(), dtype=int32)
tf.Tensor(6, shape=(), dtype=int32)
tf.Tensor(8, shape=(), dtype=int32)


In [40]:
# just look at a few items 

for item in dataset.take(3):
    print(item)

tf.Tensor(0, shape=(), dtype=int32)
tf.Tensor(2, shape=(), dtype=int32)
tf.Tensor(4, shape=(), dtype=int32)


## Section 1.2 Shuffling the Data

In [1]:
# import the lib

import tensorflow as tf

2021-12-16 17:34:31.280870: W tensorflow/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libcudart.so.11.0'; dlerror: libcudart.so.11.0: cannot open shared object file: No such file or directory
2021-12-16 17:34:31.281266: I tensorflow/stream_executor/cuda/cudart_stub.cc:29] Ignore above cudart dlerror if you do not have a GPU set up on your machine.


In [2]:
# create the dataset

dataset = tf.data.Dataset.range(10).repeat(3)

dataset = dataset.shuffle(buffer_size=5, seed=42).batch(7, drop_remainder=True)

2021-12-16 17:34:42.451154: W tensorflow/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libcuda.so.1'; dlerror: libcuda.so.1: cannot open shared object file: No such file or directory
2021-12-16 17:34:42.452182: W tensorflow/stream_executor/cuda/cuda_driver.cc:269] failed call to cuInit: UNKNOWN ERROR (303)
2021-12-16 17:34:42.452296: I tensorflow/stream_executor/cuda/cuda_diagnostics.cc:156] kernel driver does not appear to be running on this host (DESKTOP-8E5U3B3): /proc/driver/nvidia/version does not exist
2021-12-16 17:34:42.462851: I tensorflow/core/platform/cpu_feature_guard.cc:151] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations:  AVX2 FMA
To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags.


In [3]:
# show the items

for item in dataset:
    print(item)

tf.Tensor([0 2 3 6 7 9 4], shape=(7,), dtype=int64)
tf.Tensor([5 0 1 1 8 6 5], shape=(7,), dtype=int64)
tf.Tensor([4 8 7 1 2 3 0], shape=(7,), dtype=int64)
tf.Tensor([5 4 2 7 8 9 9], shape=(7,), dtype=int64)


### split the California dataset to multiple CSV files

In [4]:
# import the libs

from sklearn.datasets import fetch_california_housing
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
import os
import numpy as np

In [5]:
# load the dataset 

housing = fetch_california_housing()

In [6]:
# extract the data and labels

X_train_full, X_test, y_train_full, y_test = train_test_split(housing.data, housing.target.reshape(-1, 1),
                                                             random_state=42)

In [7]:
# create training, validation and test sets

X_train, X_valid, y_train, y_valid = train_test_split(X_train_full, y_train_full, random_state=42)

In [8]:
# obtain the mean and std of training data

scaler = StandardScaler()
scaler.fit(X_train)
X_mean, X_std = scaler.mean_, scaler.scale_

In [9]:
# define a function for splitting the data into multiple files (20 files in this example)

def save_to_multiple_csv_files(data, name_prefix, header=None, n_parts=10):
    
    # initialise the path of the directory
    housing_dir = os.path.join("datasets", "housing")
    # create the directory
    os.makedirs(housing_dir, exist_ok=True)
    # initialise the name format of the path of the particular file
    path_format = os.path.join(housing_dir, "my_{}_{:02d}.csv")
    
    # initialise the path data structure with an empty list
    filepaths = []
    
    # obtain the number of data instances
    m = len(data)
    # iterate the file index and rows in each file
    for file_idx, row_indices in enumerate(np.array_split(np.arange(m), n_parts)):
        # assign the particular path for each file
        part_csv = path_format.format(name_prefix, file_idx)
        filepaths.append(part_csv)
        # write file
        with open(part_csv, "wt", encoding="utf-8") as f:
            if header is not None:
                f.write(header)
                f.write("\n")
            for row_idx in row_indices:
                f.write(",".join([repr(col) for col in data[row_idx]]))
                f.write("\n")
    
    return filepaths       

In [10]:
# obtain the generated files by running the function

# create the training, validation and test data np.array 
train_data = np.c_[X_train, y_train]
valid_data = np.c_[X_valid, y_valid]
test_data = np.c_[X_test, y_test]

# obtain the headers
header_cols = housing.feature_names + ["MedianHouseValue"]
# obtain the path name of the header
header = ",".join(header_cols)

# run the defined function 
train_filepaths = save_to_multiple_csv_files(train_data, "train", header, n_parts=20)
valid_filepaths = save_to_multiple_csv_files(valid_data, "valid", header, n_parts=10)
test_filepaths = save_to_multiple_csv_files(test_data, "test", header, n_parts=10)

In [11]:
# show the train file paths

train_filepaths

['datasets/housing/my_train_00.csv',
 'datasets/housing/my_train_01.csv',
 'datasets/housing/my_train_02.csv',
 'datasets/housing/my_train_03.csv',
 'datasets/housing/my_train_04.csv',
 'datasets/housing/my_train_05.csv',
 'datasets/housing/my_train_06.csv',
 'datasets/housing/my_train_07.csv',
 'datasets/housing/my_train_08.csv',
 'datasets/housing/my_train_09.csv',
 'datasets/housing/my_train_10.csv',
 'datasets/housing/my_train_11.csv',
 'datasets/housing/my_train_12.csv',
 'datasets/housing/my_train_13.csv',
 'datasets/housing/my_train_14.csv',
 'datasets/housing/my_train_15.csv',
 'datasets/housing/my_train_16.csv',
 'datasets/housing/my_train_17.csv',
 'datasets/housing/my_train_18.csv',
 'datasets/housing/my_train_19.csv']

In [13]:
# show the content of one particular csv file

with open("datasets/housing/my_train_00.csv", "r") as file:
    for i in range(5):
        print(file.readline())

MedInc,HouseAge,AveRooms,AveBedrms,Population,AveOccup,Latitude,Longitude,MedianHouseValue

3.5214,15.0,3.0499445061043287,1.106548279689234,1447.0,1.6059933407325193,37.63,-122.43,1.442

5.3275,5.0,6.490059642147117,0.9910536779324056,3464.0,3.4433399602385686,33.69,-117.39,1.687

3.1,29.0,7.5423728813559325,1.5915254237288134,1328.0,2.2508474576271187,38.44,-122.98,1.621

7.1736,12.0,6.289002557544757,0.9974424552429667,1054.0,2.6956521739130435,33.55,-117.7,2.621



###  Build the Input Pipeline

In [15]:
# create a dataset of shuffled file paths  

filepaths_dataset = tf.data.Dataset.list_files(train_filepaths, seed=42)

In [16]:
# create a new string dataset consisting of five data instances where each is extracted from a filepath

n_readers = 5
dataset = filepaths_dataset.interleave(
    lambda filepath: tf.data.TextLineDataset(filepath).skip(1),
    cycle_length=n_readers)

In [17]:
# take a look at the dataset content

for line in dataset.take(5):
    print(line.numpy())

b'4.2083,44.0,5.323204419889502,0.9171270718232044,846.0,2.3370165745856353,37.47,-122.2,2.782'
b'4.1812,52.0,5.701388888888889,0.9965277777777778,692.0,2.4027777777777777,33.73,-118.31,3.215'
b'3.6875,44.0,4.524475524475524,0.993006993006993,457.0,3.195804195804196,34.04,-118.15,1.625'
b'3.3456,37.0,4.514084507042254,0.9084507042253521,458.0,3.2253521126760565,36.67,-121.7,2.526'
b'3.5214,15.0,3.0499445061043287,1.106548279689234,1447.0,1.6059933407325193,37.63,-122.43,1.442'


## Section 1.3 Preprocessing the data

In [19]:
# define the data preprocessing function

## assign the number of inputs
n_inputs = X_train.shape[-1]

## function definition
@tf.function
def preprocess(line):
    
    # assign the defaults of input instance type 
    defs = [0.0] * n_inputs + [tf.constant([], dtype=tf.float32)]
    
    # assign the field with decoding the csv file 
    fields = tf.io.decode_csv(line, record_defaults=defs)
    
    # assign the training data
    X = tf.stack(fields[: -1])
    y = tf.stack(fields[-1: ])
    
    return (X - X_mean) / X_std, y

In [20]:
# show the preprocessing result for 1 instance

preprocess(b'4.2083,44.0,5.3232,0.9171,846.0,2.3370,37.47,-122.2,2.782')

(<tf.Tensor: shape=(8,), dtype=float32, numpy=
 array([ 0.16579157,  1.216324  , -0.05204565, -0.39215982, -0.5277444 ,
        -0.2633488 ,  0.8543046 , -1.3072058 ], dtype=float32)>,
 <tf.Tensor: shape=(1,), dtype=float32, numpy=array([2.782], dtype=float32)>)

## Section 1.4 Putting Everything together

#### load the dataset

In [1]:
# import the libs 
from sklearn.datasets import fetch_california_housing
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
import os
import numpy as np

# load the dataset
housing = fetch_california_housing()

#### training, validation and test sets

In [2]:
# full training and test sets

X_train_full, X_test, y_train_full, y_test = train_test_split(housing.data, housing.target.reshape(-1, 1),
                                                              random_state=42)

In [3]:
# split full training set into training and validation sets

X_train, X_valid, y_train, y_valid = train_test_split(X_train_full, y_train_full, random_state=42)

#### multiple CSV files

In [4]:
# define a function for splitting the data into multiple files (20 files in this example)

def save_to_multiple_csv_files(data, name_prefix, header=None, n_parts=10):
    
    # initialise the path of the directory
    housing_dir = os.path.join("datasets", "housing")
    # create the directory
    os.makedirs(housing_dir, exist_ok=True)
    # initialise the name format of the path of the particular file
    path_format = os.path.join(housing_dir, "my_{}_{:02d}.csv")
    
    # initialise the path data structure with an empty list
    filepaths = []
    
    # obtain the number of data instances
    m = len(data)
    # iterate the file index and rows in each file
    for file_idx, row_indices in enumerate(np.array_split(np.arange(m), n_parts)):
        # assign the particular path for each file
        part_csv = path_format.format(name_prefix, file_idx)
        filepaths.append(part_csv)
        # write file
        with open(part_csv, "wt", encoding="utf-8") as f:
            if header is not None:
                f.write(header)
                f.write("\n")
            for row_idx in row_indices:
                f.write(",".join([repr(col) for col in data[row_idx]]))
                f.write("\n")
    
    return filepaths       

#### preprocessing function

In [5]:
# define the data preprocessing function

## import the lib
import tensorflow as tf

## obtain the mean and standard deviation of traning data
scaler = StandardScaler()
scaler.fit(X_train)
X_mean, X_std = scaler.mean_, scaler.scale_

## assign the number of inputs
n_inputs = X_train.shape[-1]

## function definition
@tf.function
def preprocess(line):
    
    # assign the defaults of input instance type 
    defs = [0.0] * n_inputs + [tf.constant([], dtype=tf.float32)]
    
    # assign the field with decoding the csv file 
    fields = tf.io.decode_csv(line, record_defaults=defs)
    
    # assign the training data
    X = tf.stack(fields[: -1])
    y = tf.stack(fields[-1: ])
    
    return (X - X_mean) / X_std, y

2021-12-18 09:20:45.665650: W tensorflow/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libcudart.so.11.0'; dlerror: libcudart.so.11.0: cannot open shared object file: No such file or directory
2021-12-18 09:20:45.665769: I tensorflow/stream_executor/cuda/cudart_stub.cc:29] Ignore above cudart dlerror if you do not have a GPU set up on your machine.


#### shuffled dataset generation

In [6]:
# obtain the splitted file paths by running the defined function

train_data = np.c_[X_train, y_train]
valid_data = np.c_[X_valid, y_valid]
test_data = np.c_[X_test, y_test]
header_cols = housing.feature_names + ["MedianHouseValue"]
header = ",".join(header_cols)

train_filepaths = save_to_multiple_csv_files(train_data, "train", header, n_parts=20)
valid_filepaths = save_to_multiple_csv_files(valid_data, "valid", header, n_parts=10)
test_filepaths = save_to_multiple_csv_files(test_data, "test", header, n_parts=10)

In [7]:
def csv_reader_dataset(filepaths, n_repeats=1, n_readers=5,
                      n_read_threads=None, shuffle_buffer_size=10000,
                      n_parse_threads=5, batch_size=32):
    
    # obtain the file list dataset
    dataset = tf.data.Dataset.list_files(filepaths).repeat(n_repeats)
    
    # obtain the extracted text string dataset
    dataset = dataset.interleave(
        lambda filepath: tf.data.TextLineDataset(filepath).skip(1),
        cycle_length=n_readers, num_parallel_calls=n_read_threads)
    
    # shuffle the dataset
    dataset = dataset.shuffle(shuffle_buffer_size)
    
    # map the dataset with preprocess function
    dataset = dataset.map(preprocess, num_parallel_calls=n_parse_threads)
    
    # batch the dataset
    dataset = dataset.batch(batch_size)
    
    return dataset.prefetch(1)

In [8]:
train_set = csv_reader_dataset(train_filepaths, n_repeats=None)
valid_set = csv_reader_dataset(valid_filepaths)
test_set = csv_reader_dataset(test_filepaths)

2021-12-18 09:21:00.231493: W tensorflow/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libcuda.so.1'; dlerror: libcuda.so.1: cannot open shared object file: No such file or directory
2021-12-18 09:21:00.232114: W tensorflow/stream_executor/cuda/cuda_driver.cc:269] failed call to cuInit: UNKNOWN ERROR (303)
2021-12-18 09:21:00.232351: I tensorflow/stream_executor/cuda/cuda_diagnostics.cc:156] kernel driver does not appear to be running on this host (DESKTOP-8E5U3B3): /proc/driver/nvidia/version does not exist
2021-12-18 09:21:00.237298: I tensorflow/core/platform/cpu_feature_guard.cc:151] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations:  AVX2 FMA
To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags.


### 1.5 Using the Dataset with tf.keras

In [9]:
# import the lib
from tensorflow import keras

keras.backend.clear_session()
np.random.seed(42)
tf.random.set_seed(42)

model = keras.models.Sequential([
    keras.layers.Dense(30, activation="relu", input_shape=X_train.shape[1:]),
    keras.layers.Dense(1),
])

# compile the model
model.compile(loss="mse", optimizer=keras.optimizers.SGD(learning_rate=1e-3))

# specify the batch size
batch_size = 32

# fit the model
model.fit(train_set, steps_per_epoch=len(X_train) // batch_size, epochs=10,
          validation_data=valid_set)

Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


<keras.callbacks.History at 0x7f08c0208e20>

In [10]:
# evaluate the model

model.evaluate(test_set, steps=len(X_test)//batch_size)



0.4775973856449127

# Section 2 TFRecord Format

### create a TFRecord file

In [11]:
# import the lib
import tensorflow as tf

with tf.io.TFRecordWriter("my_data.tfrecord") as file:
    file.write(b"This is the first record")
    file.write(b"And this is the second record")

In [12]:
# read the file

filepaths = ["my_data.tfrecord"]

dataset = tf.data.TFRecordDataset(filepaths)

for item in dataset:
    print(item)

tf.Tensor(b'This is the first record', shape=(), dtype=string)
tf.Tensor(b'And this is the second record', shape=(), dtype=string)


## 2.1 Compressed TFRecord Files

In [13]:
# create a compressed TFRecord file

## set the option
options = tf.io.TFRecordOptions(compression_type="GZIP")

## write the file
with tf.io.TFRecordWriter("my_compressed.tfrecord", options) as file:
    file.write(b"This is the first record")
    file.write(b"And this is the second record")

In [14]:
# read the compressed file

## dataset assignment
dataset = tf.data.TFRecordDataset(["my_compressed.tfrecord"], 
                                  compression_type="GZIP")

## read 
for item in dataset:
    print(item)

tf.Tensor(b'This is the first record', shape=(), dtype=string)
tf.Tensor(b'And this is the second record', shape=(), dtype=string)


# Section 3: The Feature API

### load the dataset

In [1]:
# import the libs 
from sklearn.datasets import fetch_california_housing
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
import os
import numpy as np

# load the dataset
housing = fetch_california_housing()

# full training and test sets

X_train_full, X_test, y_train_full, y_test = train_test_split(housing.data, housing.target.reshape(-1, 1),
                                                              random_state=42)
# split full training set into training and validation sets

X_train, X_valid, y_train, y_valid = train_test_split(X_train_full, y_train_full, random_state=42)


### mean and standard deviation of the training data

In [2]:
## obtain the mean and standard deviation of traning data
scaler = StandardScaler()
scaler.fit(X_train)
X_mean, X_std = scaler.mean_, scaler.scale_

### tf.feature_column package

In [3]:
# import the lib
import tensorflow as tf

# obtain the mean and standard deviation 
age_mean, age_std = X_mean[1], X_std[1]

# define a categorical feature
housing_median_age = tf.feature_column.numeric_column("housing_median_age",
                                                     normalizer_fn=lambda x: (x - age_mean) / age_std)

2022-02-03 10:48:45.197824: W tensorflow/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libcudart.so.11.0'; dlerror: libcudart.so.11.0: cannot open shared object file: No such file or directory
2022-02-03 10:48:45.198717: I tensorflow/stream_executor/cuda/cudart_stub.cc:29] Ignore above cudart dlerror if you do not have a GPU set up on your machine.


### bucketize income feature into categorical feature

In [4]:
# assign the numeric column of median_income
median_income = tf.feature_column.numeric_column("median_income")

# bucketize the feature
bucketized_income = tf.feature_column.bucketized_column(median_income,
                                                       boundaries=[1.5, 3.0, 4.5, 6.0])

## 3.1 Categorical Features

### example: ocean_proximity 

In [5]:
# assign the vocabulary list
ocean_prox_vocab = ['<1H OCEAN', 'INLAND', 'ISLAND', 'NEAR BAY', 'NEAR OCEAN']

# define categorical feature with vocabulary list 
ocean_proximity = tf.feature_column.categorical_column_with_vocabulary_list("ocean_proximity", 
                                                                            ocean_prox_vocab)

In [6]:
ocean_proximity

VocabularyListCategoricalColumn(key='ocean_proximity', vocabulary_list=('<1H OCEAN', 'INLAND', 'ISLAND', 'NEAR BAY', 'NEAR OCEAN'), dtype=tf.string, default_value=-1, num_oov_buckets=0)

### hash bucket for large vocabulary

In [7]:
city_hash = tf.feature_column.categorical_column_with_hash_bucket("city", hash_bucket_size=1000)

## 3.2 Crossed Categorical Features

### example 1: cross bucketized feature with another feature

In [8]:
# bucketize the housing_median_age feature
bucketized_age = tf.feature_column.bucketized_column(housing_median_age,
                                                    boundaries=[-1.0, -0.5, 0.0, 0.5, 1.0])

# cross the bucketized feature with ocean_proximity
age_and_ocean_proximity = tf.feature_column.crossed_column([bucketized_age, ocean_proximity],
                                                          hash_bucket_size=100)

### example 2: cross latitude and longitude into a single categorical feature

In [9]:
# obtain the latitude and longitude numeric features
latitude = tf.feature_column.numeric_column("latitude")
longitude = tf.feature_column.numeric_column("longitude")

# bucketize the features
bucketized_latitude = tf.feature_column.bucketized_column(latitude, 
                                                         boundaries=list(np.linspace(32.0, 42.0, 20 - 1)))
bucketized_longitude = tf.feature_column.bucketized_column(longitude, 
                                                           boundaries=list(np.linspace(-125.0, -114.0, 20 - 1)))

# cross the bucketized features together
location = tf.feature_column.crossed_column([bucketized_latitude, bucketized_longitude], 
                                           hash_bucket_size=1000)

## 3.3 Encoding Catergorical Features: One-Hot Vectors

### tf.feature_column.indicator_column() method

In [10]:
# create the categorical feature for filling into the method

## assign the vocabulary list
ocean_prox_vocab = ['<1H OCEAN', 'INLAND', 'ISLAND', 'NEAR BAY', 'NEAR OCEAN']

## define categorical feature with vocabulary list 
ocean_proximity = tf.feature_column.categorical_column_with_vocabulary_list("ocean_proximity", 
                                                                            ocean_prox_vocab)

In [11]:
# one-hot vector for ocean_proximity

ocean_proximity_one_hot = tf.feature_column.indicator_column(ocean_proximity)

## 3.4 Encoding Categorical Features: Using Embeddings

### tf.feature_column.embedding_column() method

In [12]:
# create the categorical feature for filling into the method

## assign the vocabulary list
ocean_prox_vocab = ['<1H OCEAN', 'INLAND', 'ISLAND', 'NEAR BAY', 'NEAR OCEAN']

## define categorical feature with vocabulary list
ocean_proximity = tf.feature_column.categorical_column_with_vocabulary_list("ocean_proximity", 
                                                                           ocean_prox_vocab)

In [13]:
# 2D embedding

ocean_proximity_embed = tf.feature_column.embedding_column(ocean_proximity,
                                                          dimension=2)

In [14]:
# display the embeded feature
print(ocean_proximity_embed)

EmbeddingColumn(categorical_column=VocabularyListCategoricalColumn(key='ocean_proximity', vocabulary_list=('<1H OCEAN', 'INLAND', 'ISLAND', 'NEAR BAY', 'NEAR OCEAN'), dtype=tf.string, default_value=-1, num_oov_buckets=0), dimension=2, combiner='mean', initializer=<tensorflow.python.ops.init_ops.TruncatedNormal object at 0x7fc610b15280>, ckpt_to_load_from=None, tensor_name_in_ckpt=None, max_norm=None, trainable=True, use_safe_embedding_lookup=True)


## 3.5 Parsing: Using Feature Columns

### parse all the feature columns to generate feature descriptions

In [21]:
from tensorflow.train import Feature, Features, Example
from tensorflow.train import BytesList, FloatList, Int64List

In [15]:
# create numeric feature column for the target "median_house_value"

median_house_value = tf.feature_column.numeric_column("median_house_value") 

In [16]:
# create feature columns with input features and target 
columns = [housing_median_age, median_house_value]

# generate feature descriptions
feature_descriptions = tf.feature_column.make_parse_example_spec(columns)

In [17]:
# display the feature description

feature_descriptions

{'housing_median_age': FixedLenFeature(shape=(1,), dtype=tf.float32, default_value=None),
 'median_house_value': FixedLenFeature(shape=(1,), dtype=tf.float32, default_value=None)}

In [22]:
# create a TFRecord file

with tf.io.TFRecordWriter("my_data_with_features.tfrecords") as f:
    for x, y in zip(X_train[:, 1:2], y_train):
        example = Example(features=Features(feature={
            "housing_median_age": Feature(float_list=FloatList(value=[x])),
            "median_house_value": Feature(float_list=FloatList(value=[y]))
        }))
        f.write(example.SerializeToString())

In [23]:
# define a function that parses serialized examples and separates the target column from the input features

def parse_examples(serialized_examples):
    
    examples = tf.io.parse_example(serialized_examples, feature_descriptions)
    targets = examples.pop("median_house_value")
    
    return examples, targets

In [24]:
# load the dataset from a TFRecord file

batch_size = 32
dataset = tf.data.TFRecordDataset(["my_data_with_features.tfrecords"])
dataset = dataset.repeat().shuffle(10000).batch(batch_size).map(parse_examples)

2022-02-03 13:59:10.936001: W tensorflow/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libcuda.so.1'; dlerror: libcuda.so.1: cannot open shared object file: No such file or directory
2022-02-03 13:59:10.937612: W tensorflow/stream_executor/cuda/cuda_driver.cc:269] failed call to cuInit: UNKNOWN ERROR (303)
2022-02-03 13:59:10.937871: I tensorflow/stream_executor/cuda/cuda_diagnostics.cc:156] kernel driver does not appear to be running on this host (DESKTOP-8E5U3B3): /proc/driver/nvidia/version does not exist
2022-02-03 13:59:10.948051: I tensorflow/core/platform/cpu_feature_guard.cc:151] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations:  AVX2 FMA
To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags.


In [25]:
# extract all the input feature column to be applied as the feature columns for layer initialization

columns_without_target = columns[:-1]

In [29]:
# model creation, fitting and compiling
from tensorflow import keras

model = keras.models.Sequential([
    keras.layers.DenseFeatures(feature_columns=columns_without_target),
    keras.layers.Dense(1)
])

model.compile(loss="mse",
             optimizer=keras.optimizers.SGD(learning_rate=1e-3),
             metrics=["accuracy"])

model.fit(dataset, steps_per_epoch=len(X_train)//batch_size, epochs=5)

Epoch 1/5
Consider rewriting this model with the Functional API.
Consider rewriting this model with the Functional API.
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5


<keras.callbacks.History at 0x7fc60e8e21f0>

# Section 4 TF Transform