In [0]:
%tensorflow_version 2.x

In [0]:
import tensorflow as tf
import tensorflow.keras as keras
import numpy as np

In [33]:
X= tf.range(10)
dataset = tf.data.Dataset.from_tensor_slices(X)
dataset

<TensorSliceDataset shapes: (), types: tf.int32>

In [34]:
for item in dataset:
  print(item)

tf.Tensor(0, shape=(), dtype=int32)
tf.Tensor(1, shape=(), dtype=int32)
tf.Tensor(2, shape=(), dtype=int32)
tf.Tensor(3, shape=(), dtype=int32)
tf.Tensor(4, shape=(), dtype=int32)
tf.Tensor(5, shape=(), dtype=int32)
tf.Tensor(6, shape=(), dtype=int32)
tf.Tensor(7, shape=(), dtype=int32)
tf.Tensor(8, shape=(), dtype=int32)
tf.Tensor(9, shape=(), dtype=int32)


### Chaining Transformations

In [35]:
dataset = dataset.repeat(3).batch(7)
for item in dataset:
  print(item)

tf.Tensor([0 1 2 3 4 5 6], shape=(7,), dtype=int32)
tf.Tensor([7 8 9 0 1 2 3], shape=(7,), dtype=int32)
tf.Tensor([4 5 6 7 8 9 0], shape=(7,), dtype=int32)
tf.Tensor([1 2 3 4 5 6 7], shape=(7,), dtype=int32)
tf.Tensor([8 9], shape=(2,), dtype=int32)


### Map
- Individual item in a dataset

In [36]:
dataset.map(lambda x : x*2)

<MapDataset shapes: (None,), types: tf.int32>

In [37]:
for item in dataset:
  print(item)

tf.Tensor([0 1 2 3 4 5 6], shape=(7,), dtype=int32)
tf.Tensor([7 8 9 0 1 2 3], shape=(7,), dtype=int32)
tf.Tensor([4 5 6 7 8 9 0], shape=(7,), dtype=int32)
tf.Tensor([1 2 3 4 5 6 7], shape=(7,), dtype=int32)
tf.Tensor([8 9], shape=(2,), dtype=int32)


### Apply
- Dataset as whole

In [0]:
dataset = dataset.apply(tf.data.experimental.unbatch())

In [39]:
for item in dataset:
  print(item)

tf.Tensor(0, shape=(), dtype=int32)
tf.Tensor(1, shape=(), dtype=int32)
tf.Tensor(2, shape=(), dtype=int32)
tf.Tensor(3, shape=(), dtype=int32)
tf.Tensor(4, shape=(), dtype=int32)
tf.Tensor(5, shape=(), dtype=int32)
tf.Tensor(6, shape=(), dtype=int32)
tf.Tensor(7, shape=(), dtype=int32)
tf.Tensor(8, shape=(), dtype=int32)
tf.Tensor(9, shape=(), dtype=int32)
tf.Tensor(0, shape=(), dtype=int32)
tf.Tensor(1, shape=(), dtype=int32)
tf.Tensor(2, shape=(), dtype=int32)
tf.Tensor(3, shape=(), dtype=int32)
tf.Tensor(4, shape=(), dtype=int32)
tf.Tensor(5, shape=(), dtype=int32)
tf.Tensor(6, shape=(), dtype=int32)
tf.Tensor(7, shape=(), dtype=int32)
tf.Tensor(8, shape=(), dtype=int32)
tf.Tensor(9, shape=(), dtype=int32)
tf.Tensor(0, shape=(), dtype=int32)
tf.Tensor(1, shape=(), dtype=int32)
tf.Tensor(2, shape=(), dtype=int32)
tf.Tensor(3, shape=(), dtype=int32)
tf.Tensor(4, shape=(), dtype=int32)
tf.Tensor(5, shape=(), dtype=int32)
tf.Tensor(6, shape=(), dtype=int32)
tf.Tensor(7, shape=(), dtype

### Filter

In [0]:
dataset = dataset.filter(lambda x : x < 5)

In [41]:
for item in dataset:
  print(item)

tf.Tensor(0, shape=(), dtype=int32)
tf.Tensor(1, shape=(), dtype=int32)
tf.Tensor(2, shape=(), dtype=int32)
tf.Tensor(3, shape=(), dtype=int32)
tf.Tensor(4, shape=(), dtype=int32)
tf.Tensor(0, shape=(), dtype=int32)
tf.Tensor(1, shape=(), dtype=int32)
tf.Tensor(2, shape=(), dtype=int32)
tf.Tensor(3, shape=(), dtype=int32)
tf.Tensor(4, shape=(), dtype=int32)
tf.Tensor(0, shape=(), dtype=int32)
tf.Tensor(1, shape=(), dtype=int32)
tf.Tensor(2, shape=(), dtype=int32)
tf.Tensor(3, shape=(), dtype=int32)
tf.Tensor(4, shape=(), dtype=int32)


### Take

In [42]:
for item in dataset.take(3):
  print(item)

tf.Tensor(0, shape=(), dtype=int32)
tf.Tensor(1, shape=(), dtype=int32)
tf.Tensor(2, shape=(), dtype=int32)


### Shuffling The Data

In [43]:
dataset = tf.data.Dataset.range(10).repeat(3)
dataset = dataset.shuffle(buffer_size=5,seed=17).batch(7)
for item in dataset:
  print(item)

tf.Tensor([0 4 6 1 2 7 8], shape=(7,), dtype=int64)
tf.Tensor([0 5 1 3 4 6 3], shape=(7,), dtype=int64)
tf.Tensor([8 5 7 0 1 2 9], shape=(7,), dtype=int64)
tf.Tensor([5 2 4 8 6 9 9], shape=(7,), dtype=int64)
tf.Tensor([3 7], shape=(2,), dtype=int64)


### Preprocessing the Data from multiple files

In [0]:
X_mean, X_std = [1,2]
n_inputs = 8

In [0]:
def preprocess(line):
  defs = [0.] *n_inputs + [tf.constant([],dtype=tf.float32)]
  fields = tf.io.decode_csv(line,record_defaults=defs)
  x = tf.stack(fields[:-1])
  y = tf.stack(fileds[-1:])
  return (x-X_mean)/X_std,y

In [0]:
def csv_reader_dataset(filepaths,repeat=1,n_readers=5,n_read_threads=None,shuffle_buffer_size=10000,n_parse_threads=5,batch_size=32):
  dataset = tf.data.Dataset.list_files(filepaths)
  dataset = dataset.interleave(
      lambda filepath : tf.data.TextLineDataset(filepath).skip(1),
      cycle_length = n_readers,
      num_parallel_calls = n_read_threads
  )
  dataset = dataset.map(preprocess,num_parallel_calls=n_parse_threads)
  dataset = dataset.shuffle(shuffle_buffer_size).repeat(repeat)
  return dataset.batch(batch_size).prefetch(3)

###### Alternative metod with tf.keras

In [48]:
train_paths = "aaa"
train_set = keras.Dataset.csv_reader_dataset(train_paths)

AttributeError: ignored

# The TFRecord Format

In [0]:
with tf.io.TFRecordWriter("my_data.tfrecord") as f:
  f.write(b"This is first record")
  f.write(b"This is second record")

In [50]:
filepath = ["my_data.tfrecord"]
dataset = tf.data.TFRecordDataset(filepath)
for item in dataset:
  print(item)

tf.Tensor(b'This is first record', shape=(), dtype=string)
tf.Tensor(b'This is second record', shape=(), dtype=string)


# Preprocessing Input Features

##### Scaling as part of the model

In [51]:
means = np.mean(X_train,axis=0,keepdims = True)
stds = np.std(X_train,axis=0,keepdims = True)
eps = keras.backend.epsilon()

NameError: ignored

In [52]:
model = keras.models.Sequential(
    [
     keras.layers.Lambda(lambda inputs: (inputs-means)/(stds+eps)),
     ...............
    ]
)

SyntaxError: ignored

In [0]:
class Standardization(keras.layers.Layer):
  def adapt(self,data_sample):
    self.means_ = np.mean(data_sample,axis=0,keepdims=True)
    self.stds_ = np.std(data_sample,axis=0,keepdims=True)
  
  def call(self,inputs):
    return (inputs-self.means_)/(self.stds_+keras.backend.epsilon())

In [54]:
std_layer = Standardization()
std_layer.adapt(data_sample)

NameError: ignored

In [0]:
model = keras.models.Sequential()
model.add(std_layer)

#### Categorical Encoding using One Hot Vectors

In [0]:
vocab = ["OCEAN","INLAND","NEAR OCEAN","NEAR BAY","ISLAND"]
indicies = tf.range(len(vocab),dtype=tf.int64)

In [57]:
table_init = tf.lookup.KeyValueTensorInitializer(vocab,indicies)
table_init

<tensorflow.python.ops.lookup_ops.KeyValueTensorInitializer at 0x7faa85f70f60>

In [58]:
num_oov_buckets = 2
table = tf.lookup.StaticVocabularyTable(table_init,num_oov_buckets)
table

<tensorflow.python.ops.lookup_ops.StaticVocabularyTable at 0x7faa86050c50>

In [59]:
categories = tf.constant(["NEAR BAY","DESERT","INLAND","INLAND"])
cat_indicies = table.lookup(categories)
cat_indicies

<tf.Tensor: shape=(4,), dtype=int64, numpy=array([3, 5, 1, 1])>

In [60]:
cat_one_hot = tf.one_hot(cat_indicies,depth=len(vocab)+num_oov_buckets)
cat_one_hot

<tf.Tensor: shape=(4, 7), dtype=float32, numpy=
array([[0., 0., 0., 1., 0., 0., 0.],
       [0., 0., 0., 0., 0., 1., 0.],
       [0., 1., 0., 0., 0., 0., 0.],
       [0., 1., 0., 0., 0., 0., 0.]], dtype=float32)>

#### Categorical Varibles Embedding

In [61]:
embedding_dim = 2
embed_init = tf.random.uniform([len(vocab)+num_oov_buckets,embedding_dim])
print(embed_init)
embedding_matrix = tf.Variable(embed_init)
print(embedding_matrix)

tf.Tensor(
[[0.4669267  0.35764956]
 [0.8009105  0.24726784]
 [0.26736963 0.3426937 ]
 [0.2615825  0.13520384]
 [0.4496367  0.86572397]
 [0.6415409  0.4085518 ]
 [0.01123655 0.8232403 ]], shape=(7, 2), dtype=float32)
<tf.Variable 'Variable:0' shape=(7, 2) dtype=float32, numpy=
array([[0.4669267 , 0.35764956],
       [0.8009105 , 0.24726784],
       [0.26736963, 0.3426937 ],
       [0.2615825 , 0.13520384],
       [0.4496367 , 0.86572397],
       [0.6415409 , 0.4085518 ],
       [0.01123655, 0.8232403 ]], dtype=float32)>


In [63]:
categories = tf.constant(["NEAR BAY","DESERT","INLAND","INLAND"])
cat_indicies = table.lookup(categories)
cat_indicies

<tf.Tensor: shape=(4,), dtype=int64, numpy=array([3, 5, 1, 1])>

In [64]:
tf.nn.embedding_lookup(embedding_matrix,cat_indicies)

<tf.Tensor: shape=(4, 2), dtype=float32, numpy=
array([[0.2615825 , 0.13520384],
       [0.6415409 , 0.4085518 ],
       [0.8009105 , 0.24726784],
       [0.8009105 , 0.24726784]], dtype=float32)>

In [0]:
embedding = keras.layers.Embedding(input_dim=len(vocab)+num_oov_buckets, output_dim=embedding_dim)

In [66]:
embedding(cat_indicies)

<tf.Tensor: shape=(4, 2), dtype=float32, numpy=
array([[-0.02827909,  0.02463795],
       [-0.00284346, -0.01209123],
       [-0.01804191,  0.0401202 ],
       [-0.01804191,  0.0401202 ]], dtype=float32)>

###### As Part of the model

In [0]:
regular_inputs = keras.layers.Input(shape=[8])
categories = keras.layers.Input(shape=[],dtype=tf.string)
cat_indicies = keras.layers.Lambda(lambda cats: table.lookup(cats))(categories)
cat_embed = keras.layers.Embedding(input_dim=6,output_dim=2)(cat_indicies)
encoded_inputs = keras.layers.concatenate([regular_inputs,cat_embed])
outputs = keras.layers.Dense(1)(encoded_inputs)
model = keras.models.Model(inputs=[regular_inputs,categories],outputs=[outputs])

# Keras Preprocessing Layers

In [68]:
normalization = keras.layers.normalization()
discretization = keras.layers.discretization([])
pipeline = keras.layers.PreprocessingStage([normalization,discretization])
pipeline.adapt(data_sample)

AttributeError: ignored