In [1]:
# set up

import sklearn
import tensorflow as tf
from tensorflow import keras

import numpy as np
np.random.seed(42)
import os

%matplotlib inline
import matplotlib as mpl
import matplotlib.pyplot as plt
# Larger default font sizes for axis labels and tick labels.
mpl.rc('axes', labelsize=14)
mpl.rc('xtick', labelsize=12)
mpl.rc('ytick', labelsize=12)
# Matplotlib 3.6 renamed the bundled seaborn styles to 'seaborn-v0_8-*' and later
# releases dropped the old names, so prefer the new name whenever it is available
# and fall back to the legacy one on older installations.
_darkgrid_style = 'seaborn-v0_8-darkgrid'
if _darkgrid_style not in plt.style.available:
    _darkgrid_style = 'seaborn-darkgrid'
plt.style.use(_darkgrid_style)

# DATA API

In [2]:
# Toy input: a 1-D tensor holding the integers 0..9.
X = tf.range(10)
# Build a dataset whose elements are the individual slices (here: scalars) of X.
dataset = tf.data.Dataset.from_tensor_slices(X)
dataset

# Alternative: tf.data.Dataset.range(10) produces the same values, but as int64
# rather than int32 (compare the dtype printed by the shuffle example further down).

<TensorSliceDataset shapes: (), types: tf.int32>

In [3]:
# Iterating over the dataset yields one tensor per element (ten scalars here).
for item in dataset:
    print(item)

tf.Tensor(0, shape=(), dtype=int32)
tf.Tensor(1, shape=(), dtype=int32)
tf.Tensor(2, shape=(), dtype=int32)
tf.Tensor(3, shape=(), dtype=int32)
tf.Tensor(4, shape=(), dtype=int32)
tf.Tensor(5, shape=(), dtype=int32)
tf.Tensor(6, shape=(), dtype=int32)
tf.Tensor(7, shape=(), dtype=int32)
tf.Tensor(8, shape=(), dtype=int32)
tf.Tensor(9, shape=(), dtype=int32)


In [4]:
# Repeat the 10 source elements 3 times (30 items), then group them into batches
# of 7; the final batch only holds the 2 leftover items.
# NOTE: this reassigns `dataset`, so re-running the cell stacks another repeat/batch
# on top of the previous result.
dataset = dataset.repeat(3).batch(7)
for item in dataset:
    print(item)

tf.Tensor([0 1 2 3 4 5 6], shape=(7,), dtype=int32)
tf.Tensor([7 8 9 0 1 2 3], shape=(7,), dtype=int32)
tf.Tensor([4 5 6 7 8 9 0], shape=(7,), dtype=int32)
tf.Tensor([1 2 3 4 5 6 7], shape=(7,), dtype=int32)
tf.Tensor([8 9], shape=(2,), dtype=int32)


In [5]:
# map() applies the function to every element; the dataset is still batched at this
# point, so `x` is a whole batch and the doubling is done element-wise on it.
dataset = dataset.map(lambda x: x * 2)
for item in dataset:
    print(item)

tf.Tensor([ 0  2  4  6  8 10 12], shape=(7,), dtype=int32)
tf.Tensor([14 16 18  0  2  4  6], shape=(7,), dtype=int32)
tf.Tensor([ 8 10 12 14 16 18  0], shape=(7,), dtype=int32)
tf.Tensor([ 2  4  6  8 10 12 14], shape=(7,), dtype=int32)
tf.Tensor([16 18], shape=(2,), dtype=int32)


In [6]:
# Undo the batching so that each element is a single scalar again.
dataset = dataset.unbatch()

In [8]:
# Keep only the elements for which the predicate is true (values below 10).
dataset = dataset.filter(lambda x: x < 10)
for item in dataset:
    print(item)

tf.Tensor(0, shape=(), dtype=int32)
tf.Tensor(2, shape=(), dtype=int32)
tf.Tensor(4, shape=(), dtype=int32)
tf.Tensor(6, shape=(), dtype=int32)
tf.Tensor(8, shape=(), dtype=int32)
tf.Tensor(0, shape=(), dtype=int32)
tf.Tensor(2, shape=(), dtype=int32)
tf.Tensor(4, shape=(), dtype=int32)
tf.Tensor(6, shape=(), dtype=int32)
tf.Tensor(8, shape=(), dtype=int32)
tf.Tensor(0, shape=(), dtype=int32)
tf.Tensor(2, shape=(), dtype=int32)
tf.Tensor(4, shape=(), dtype=int32)
tf.Tensor(6, shape=(), dtype=int32)
tf.Tensor(8, shape=(), dtype=int32)


In [9]:
# Fix TensorFlow's global seed (the shuffle below also gets its own op-level seed).
tf.random.set_seed(42)

# 0..9 repeated three times, shuffled through a 3-element buffer, then batched by 7.
dataset = (
    tf.data.Dataset.range(10)
    .repeat(3)
    .shuffle(buffer_size=3, seed=42)
    .batch(7)
)
for item in dataset:
    print(item)

tf.Tensor([1 3 0 4 2 5 6], shape=(7,), dtype=int64)
tf.Tensor([8 7 1 0 3 2 5], shape=(7,), dtype=int64)
tf.Tensor([4 6 9 8 9 7 0], shape=(7,), dtype=int64)
tf.Tensor([3 1 4 5 2 8 7], shape=(7,), dtype=int64)
tf.Tensor([6 9], shape=(2,), dtype=int64)


# input features preprocessing

In [None]:
# method 1: standardize inside the model with a Lambda layer
# NOTE(review): `X_train` is not defined in the cells shown here — presumably the
# training feature matrix; it must exist before this cell is run.
means = np.mean(X_train, axis = 0, keepdims = True)
stds = np.std(X_train, axis = 0, keepdims = True)
# epsilon is added to the denominator so a zero standard deviation cannot divide by zero
eps = keras.backend.epsilon()
model = keras.models.Sequential([
    keras.layers.Lambda(lambda inputs: (inputs - means) / (stds + eps)),
    # ... other layers ...
])

In [None]:
# method 2 - more useful
class Standardization(keras.layers.Layer):
    """Layer that standardizes its inputs with statistics learned by `adapt`.

    `adapt` must be called once, before the layer is used, so that `means_`
    and `stds_` exist when `call` runs.
    """

    def adapt(self, data_sample):
        """Compute and store the mean and standard deviation of `data_sample` along axis 0."""
        self.means_ = np.mean(data_sample, axis = 0, keepdims = True)
        self.stds_ = np.std(data_sample, axis = 0, keepdims = True)

    def call(self, inputs):
        """Return `(inputs - means_) / (stds_ + epsilon)`.

        The Keras epsilon is added to the denominator so that a zero standard
        deviation does not cause a division by zero.
        """
        return (inputs - self.means_) / (self.stds_ + keras.backend.epsilon())
    

# Create the layer and compute its statistics before it is added to a model.
# NOTE(review): `data_sample` is not defined in the cells shown here — presumably a
# representative sample of the training features; confirm before running.
std_layer = Standardization()
std_layer.adapt(data_sample)

model = keras.Sequential()
model.add(std_layer)
# The string literals below are placeholders for the remaining layers and for the
# real compile()/fit() arguments; this cell is a sketch and is not runnable as written.
'''build model'''
model.compile(''' ''')
model.fit(''' ''')

### categorical features encoding with 1-hot vector

In [4]:
# Lookup table that maps each category string to an integer id.
vocab = ["<1H OCEAN", "INLAND", "NEAR OCEAN", "NEAR BAY", "ISLAND"]
num_oov_buckets = 2  # extra ids reserved for categories missing from vocab

# One int64 id per vocabulary entry: 0 .. len(vocab) - 1.
indices = tf.range(len(vocab), dtype=tf.int64)
table_init = tf.lookup.KeyValueTensorInitializer(keys=vocab, values=indices)
table = tf.lookup.StaticVocabularyTable(
    initializer=table_init,
    num_oov_buckets=num_oov_buckets,
)

데이터셋이 크거나, 범주 개수가 많거나, 범주가 자주 바뀐다면 전체 범주 리스트를 확보하기 어려울 수 있음  
OOV(out-of-vocabulary) 버킷을 사용하면 훈련 세트에서 보지 못한 범주도 OOV 버킷 중 하나에 할당되도록 대처할 수 있음

In [12]:
# "DESERT" is not in vocab, so the table sends it to one of the OOV buckets
# (index 5 in the recorded output); known categories map to their vocab position.
categories = tf.constant(["NEAR BAY", "DESERT", "INLAND", "INLAND"])
print(categories)
cat_indices = table.lookup(categories)
print(cat_indices)
# depth covers every index the table can emit: vocab entries plus OOV buckets.
cat_one_hot = tf.one_hot(cat_indices, depth = len(vocab) + num_oov_buckets)
print(cat_one_hot)

tf.Tensor([b'NEAR BAY' b'DESERT' b'INLAND' b'INLAND'], shape=(4,), dtype=string)
tf.Tensor([3 5 1 1], shape=(4,), dtype=int64)
tf.Tensor(
[[0. 0. 0. 1. 0. 0. 0.]
 [0. 0. 0. 0. 0. 1. 0.]
 [0. 1. 0. 0. 0. 0. 0.]
 [0. 1. 0. 0. 0. 0. 0.]], shape=(4, 7), dtype=float32)


### embedding

In [7]:
# Size of each embedding vector.
embedding_dim = 2
# Randomly initialized embedding matrix: one row per possible index
# (vocab entries plus OOV buckets), one column per embedding dimension.
embed_init = tf.random.uniform([len(vocab) + num_oov_buckets, embedding_dim])
embedding_matrix = tf.Variable(embed_init)
print(embedding_matrix,'\n')

# Convert the category strings to integer indices with the lookup table.
categories = tf.constant(["NEAR BAY", "DESERT", "INLAND", "INLAND"])
cat_indices = table.lookup(categories)
print(cat_indices,'\n')

# Fetch, for each index, the corresponding row of the embedding matrix.
print(tf.nn.embedding_lookup(embedding_matrix, cat_indices))

<tf.Variable 'Variable:0' shape=(7, 2) dtype=float32, numpy=
array([[0.22760808, 0.6313267 ],
       [0.98946416, 0.02466059],
       [0.7273104 , 0.667506  ],
       [0.40607095, 0.28994656],
       [0.4632535 , 0.4906999 ],
       [0.61814964, 0.78535223],
       [0.3465054 , 0.47939825]], dtype=float32)> 

tf.Tensor([3 5 1 1], shape=(4,), dtype=int64) 

tf.Tensor(
[[0.40607095 0.28994656]
 [0.61814964 0.78535223]
 [0.98946416 0.02466059]
 [0.98946416 0.02466059]], shape=(4, 2), dtype=float32)


In [8]:
# The Keras Embedding layer creates and owns its (randomly initialized) embedding
# matrix; calling it with integer indices returns the matching rows.
# input_dim is the number of distinct indices: vocab entries plus OOV buckets.
embedding = keras.layers.Embedding(input_dim = len(vocab) + num_oov_buckets,
                                   output_dim = embedding_dim)
embedding(cat_indices)

<tf.Tensor: shape=(4, 2), dtype=float32, numpy=
array([[-0.03676935, -0.03128712],
       [-0.01234759, -0.00988318],
       [ 0.04939741,  0.04505488],
       [ 0.04939741,  0.04505488]], dtype=float32)>

In [None]:
## functional API model with the categorical encoding built in
# inputs: 8 numeric features per instance plus one string category per instance
regular_inputs = keras.layers.Input(shape = [8])
categories = keras.layers.Input(shape = [], dtype = tf.string)

# mapping: category string -> integer index -> trainable embedding vector
cat_indices = keras.layers.Lambda(lambda cats: table.lookup(cats))(categories)
# input_dim must cover every index the table can emit (vocab entries + OOV buckets).
# A hard-coded 6 is too small for the 5-entry vocab with 2 OOV buckets defined above:
# the second OOV bucket (index 6) would fall outside the embedding matrix.
cat_embed = keras.layers.Embedding(input_dim = len(vocab) + num_oov_buckets,
                                   output_dim = 2)(cat_indices)

# encoding: numeric features and the category embedding side by side
encoded_inputs = keras.layers.concatenate([regular_inputs, cat_embed])

# body
outputs = keras.layers.Dense(1)(encoded_inputs)
model = keras.models.Model(inputs = [regular_inputs, categories],
                           outputs = [outputs])

# compiling & fitting ...

In [None]:
# preprocessing pipeline: chain several preprocessing layers and adapt them together
# NOTE(review): `data_sample` is not defined in the cells shown here and `[...]` is a
# placeholder for the discretization arguments — fill both in before running.
# NOTE(review): confirm that the installed Keras version exposes PreprocessingStage
# under keras.layers.
normalization = keras.layers.Normalization()
discretization = keras.layers.Discretization([...])
pipeline = keras.layers.PreprocessingStage([normalization, discretization])
pipeline.adapt(data_sample)

# TensorFlow Datasets (TFDS)

* 널리 사용되는 데이터셋을 쉽게 다운로드할 수 있다.
* `tensorflow-datasets` 라이브러리를 설치해야 한다.

In [9]:
!pip install tensorflow_datasets

Collecting tensorflow_datasets
  Downloading tensorflow_datasets-4.2.0-py3-none-any.whl (3.7 MB)
Collecting future
  Downloading future-0.18.2.tar.gz (829 kB)
Collecting promise
  Downloading promise-2.3.tar.gz (19 kB)
Collecting dill
  Downloading dill-0.3.3-py2.py3-none-any.whl (81 kB)
Collecting tensorflow-metadata
  Downloading tensorflow_metadata-0.26.0-py3-none-any.whl (47 kB)
Collecting tqdm
  Downloading tqdm-4.56.0-py2.py3-none-any.whl (72 kB)
Collecting importlib-resources; python_version < "3.9"
  Downloading importlib_resources-5.0.0-py3-none-any.whl (22 kB)
Collecting typing-extensions; python_version < "3.8"
  Downloading typing_extensions-3.7.4.3-py3-none-any.whl (22 kB)
Collecting googleapis-common-protos<2,>=1.52.0
  Downloading googleapis_common_protos-1.52.0-py2.py3-none-any.whl (100 kB)
Building wheels for collected packages: future, promise
  Building wheel for future (setup.py): started
  Building wheel for future (setup.py): finished with status 'done'
  Created 

In [11]:
import tensorflow_datasets as tfds

# Load MNIST (downloaded and prepared on first use); the result is indexed by split name.
dataset = tfds.load(name="mnist")
mnist_train = dataset["train"]
mnist_test = dataset["test"]

[1mDownloading and preparing dataset Unknown size (download: Unknown size, generated: Unknown size, total: Unknown size) to C:\Users\user\tensorflow_datasets\mnist\3.0.1...[0m


Dl Completed...: 0 url [00:00, ? url/s]

Dl Size...: 0 MiB [00:00, ? MiB/s]

Extraction completed...: 0 file [00:00, ? file/s]






Generating splits...:   0%|          | 0/2 [00:00<?, ? splits/s]

Generating train examples...: 0 examples [00:00, ? examples/s]

Shuffling mnist-train.tfrecord...:   0%|          | 0/60000 [00:00<?, ? examples/s]

Generating test examples...: 0 examples [00:00, ? examples/s]

Shuffling mnist-test.tfrecord...:   0%|          | 0/10000 [00:00<?, ? examples/s]

[1mDataset mnist downloaded and prepared to C:\Users\user\tensorflow_datasets\mnist\3.0.1. Subsequent calls will reuse this data.[0m


In [16]:
# Input pipeline: shuffle with a 10000-element buffer, batch by 32, convert each
# feature dict into an (image, label) pair, and prefetch one batch ahead.
mnist_train = (
    mnist_train
    .shuffle(10000)
    .batch(32)
    .map(lambda items: (items["image"], items["label"]))
    .prefetch(1)
)

In [None]:
# easier option for labeled dataset: as_supervised=True yields (image, label) tuples
# instead of feature dicts, and batch_size batches the data while loading
dataset = tfds.load(name = "mnist", batch_size = 32, as_supervised = True)
mnist_train = dataset["train"].prefetch(1)

# `[...]` is a placeholder for the model's layers.
model = keras.models.Sequential([...])
model.compile(loss = "sparse_categorical_crossentropy", optimizer = "sgd")
# fit() takes `epochs`; `epoch` is not a valid keyword argument and raises a TypeError.
model.fit(mnist_train, epochs = 5)