In [1]:
# 使用预处理层，可更方便的创建端到端的模型
import pandas as pd
from tensorflow import keras
import tensorflow as tf
import numpy as np

In [None]:
# 可用的预处理层

# 核心的预处理层
keras.layers.experimental.preprocessing.TextVectorization
keras.layers.experimental.preprocessing.Normalization

In [None]:
# 结构化数据预处理层

# 将整数特征转化为one-hot，multi—hot，tf-idf来表示
keras.layers.experimental.preprocessing.CategoryEncoding

# 执行分类特征hash，也称为“hash trick”
keras.layers.experimental.preprocessing.Hashing

# 将连续的数字特征转化为整数分类特征
keras.layers.experimental.preprocessing.Discretization

# 将将字符值值转化为整数索引
keras.layers.experimental.preprocessing.StringLookup

# 将整数分类值转化为整数索引
keras.layers.experimental.preprocessing.IntegerLookup

# 特征交叉，将分类特征合并为为共想现特征，例如有特征值a，b，则可以提供组合特征“ab同时存在”
keras.layers.experimental.preprocessing.CategoryCrossing


In [None]:
# 图像预处理层

# 将一批图像调整为目标尺寸
keras.layers.experimental.preprocessing.Resizing

# 重新缩放和偏移一批图像的均值，例如[0, 255]->[0, 1]
keras.layers.experimental.preprocessing.Rescaling

# 将一批图像进行中心剪裁
keras.layers.experimental.preprocessing.CenterCrop

In [None]:
# 将图像数据进行数据增强
keras.layers.experimental.preprocessing.RandomCrop
keras.layers.experimental.preprocessing.RandomFlip
keras.layers.experimental.preprocessing.RandomTranslation
keras.layers.experimental.preprocessing.RandomRotation
keras.layers.experimental.preprocessing.RandomZoom
keras.layers.experimental.preprocessing.RandomHeight
keras.layers.experimental.preprocessing.RandomWidth

In [None]:
# adapt()方法
# 某些预处理层具有内部状态，必须根据训练数据的样本进行计算
from tensorflow.keras.layers.experimental import preprocessing
preprocessing.TextVectorization
preprocessing.Normalization  # 保存特征的均值和方差
preprocessing.StringLookup, preprocessing.IntegerLookup  # 保存输入和输出索引之间的映射
preprocessing.CategoryEncoding  # 保存输入值的索引
preprocessing.Discretization  # 保存值区间边界

# 关键在于 这些层是不可训练的，在训练前必须adapt

In [5]:
# 例如Normalization
data = np.random.randint(low=-10, high=10, size=(2, 3))
prep = keras.layers.experimental.preprocessing.Normalization()
prep.adapt(data)
nor_data = prep(data)
print(keras.backend.std(nor_data))
print(keras.backend.mean(nor_data))


tf.Tensor(1.0, shape=(), dtype=float32)
tf.Tensor(0.0, shape=(), dtype=float32)


In [9]:
# 再例如，对StringLookUp和TextVectorization，可以传入一个字符串列表
data = [
    "ξεῖν᾽, ἦ τοι μὲν ὄνειροι ἀμήχανοι ἀκριτόμυθοι",
    "γίγνοντ᾽, οὐδέ τι πάντα τελείεται ἀνθρώποισι.",
    "δοιαὶ γάρ τε πύλαι ἀμενηνῶν εἰσὶν ὀνείρων:",
    "αἱ μὲν γὰρ κεράεσσι τετεύχαται, αἱ δ᾽ ἐλέφαντι:",
    "τῶν οἳ μέν κ᾽ ἔλθωσι διὰ πριστοῦ ἐλέφαντος,",
    "οἵ ῥ᾽ ἐλεφαίρονται, ἔπε᾽ ἀκράαντα φέροντες:",
    "οἱ δὲ διὰ ξεστῶν κεράων ἔλθωσι θύραζε,",
    "οἵ ῥ᾽ ἔτυμα κραίνουσι, βροτῶν ὅτε κέν τις ἴδηται.",
]
layer = keras.layers.experimental.preprocessing.TextVectorization(max_tokens=100, ngrams=(1, 2, 3))
layer.adapt(data)
print(layer.get_vocabulary())
print(len(layer.get_vocabulary()))
vec_data = layer(data)
print(vec_data)


['', '[UNK]', 'ῥ᾽', 'ἔλθωσι', 'οἵ ῥ᾽', 'οἵ', 'μὲν', 'διὰ', 'αἱ', 'ῥ᾽ ἔτυμα κραίνουσι', 'ῥ᾽ ἔτυμα', 'ῥ᾽ ἐλεφαίρονται ἔπε᾽', 'ῥ᾽ ἐλεφαίρονται', 'ὅτε κέν τις', 'ὅτε κέν', 'ὅτε', 'ὄνειροι ἀμήχανοι ἀκριτόμυθοι', 'ὄνειροι ἀμήχανοι', 'ὄνειροι', 'ὀνείρων', 'ἴδηται', 'ἦ τοι μὲν', 'ἦ τοι', 'ἦ', 'ἔτυμα κραίνουσι βροτῶν', 'ἔτυμα κραίνουσι', 'ἔτυμα', 'ἔπε᾽ ἀκράαντα φέροντες', 'ἔπε᾽ ἀκράαντα', 'ἔπε᾽', 'ἔλθωσι θύραζε', 'ἔλθωσι διὰ πριστοῦ', 'ἔλθωσι διὰ', 'ἐλεφαίρονται ἔπε᾽ ἀκράαντα', 'ἐλεφαίρονται ἔπε᾽', 'ἐλεφαίρονται', 'ἐλέφαντος', 'ἐλέφαντι', 'ἀνθρώποισι', 'ἀμενηνῶν εἰσὶν ὀνείρων', 'ἀμενηνῶν εἰσὶν', 'ἀμενηνῶν', 'ἀμήχανοι ἀκριτόμυθοι', 'ἀμήχανοι', 'ἀκριτόμυθοι', 'ἀκράαντα φέροντες', 'ἀκράαντα', 'φέροντες', 'τῶν οἳ μέν', 'τῶν οἳ', 'τῶν', 'τοι μὲν ὄνειροι', 'τοι μὲν', 'τοι', 'τις ἴδηται', 'τις', 'τι πάντα τελείεται', 'τι πάντα', 'τι', 'τετεύχαται αἱ δ᾽', 'τετεύχαται αἱ', 'τετεύχαται', 'τελείεται ἀνθρώποισι', 'τελείεται', 'τε πύλαι ἀμενηνῶν', 'τε πύλαι', 'τε', 'πύλαι ἀμενηνῶν εἰσὶν', 'πύλαι ἀμενηνῶν', 

In [12]:
# 如果手上已有图层状态设置，则可以不通过adapt而直接设置，最简单的比如vocabulary
vocab = ["a", "b", "c", "d"]
data = tf.constant([["a", "c", "d", "p"], ["d", "z", "b", "o"]])
layer = keras.layers.experimental.preprocessing.StringLookup(vocabulary=vocab)
vec_data = layer(data)
print(vec_data)
print(layer.get_vocabulary())

tf.Tensor(
[[2 4 5 1]
 [5 1 3 1]], shape=(2, 4), dtype=int64)
['', '[UNK]', 'a', 'b', 'c', 'd']


In [None]:

# 一些用于快速熟悉的例子


In [18]:
# 图像数据增强
DataAugmentation = keras.models.Sequential(layers=[keras.layers.experimental.preprocessing.RandomFlip(mode='horizontal',
                                                                                                      name='flip_layer'),
                                                   keras.layers.experimental.preprocessing.RandomRotation(factor=0.1,
                                                                                                          name='rotation_layer'),
                                                   keras.layers.experimental.preprocessing.RandomZoom(height_factor=0.1,
                                                                                                      width_factor=0.15,
                                                                                                      name='zoom_layer')])
model_input = keras.Input(shape=(32, 32, 3),
                          dtype='int32',
                          name='input')
print(model_input.shape[1: ])
x = DataAugmentation(model_input)
x = keras.layers.experimental.preprocessing.Rescaling(scale=1 / 255.0, offset=0.0)(x)
base_model = keras.applications.ResNet50(weights=None,
                                         input_shape=model_input.shape[1: ],
                                         include_top=False)
model_output = base_model(x)
model = keras.models.Model(inputs=[model_input], outputs=[model_output])
print(model.summary())

(32, 32, 3)
Model: "functional_5"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
input (InputLayer)           [(None, 32, 32, 3)]       0         
_________________________________________________________________
sequential_5 (Sequential)    (None, 32, 32, 3)         0         
_________________________________________________________________
rescaling_5 (Rescaling)      (None, 32, 32, 3)         0         
_________________________________________________________________
resnet50 (Functional)        (None, 1, 1, 2048)        23587712  
Total params: 23,587,712
Trainable params: 23,534,592
Non-trainable params: 53,120
_________________________________________________________________
None


In [28]:
# 对字符串特征进行one-hot编码
all_data = tf.constant(['1', '2', '3', '4', '5'])
data = tf.constant([[['1', '2', ''], ['', '', '']],
                     [['1', '2', ''], ['3', '4', '5']]])
indexer = keras.layers.experimental.preprocessing.StringLookup(mask_token='')
indexer.adapt(all_data)  # 按照词频统计方法将每个字符给予索引，形成词库字典(类似token）
print(indexer(data))
print(indexer.get_vocabulary())

encoder = keras.layers.experimental.preprocessing.CategoryEncoding(output_mode='binary')
encoder.adapt(indexer(data))  # 由于字符索引数值在某些时候是无意义的，故将其转化为one-hot编码
print(encoder(data))


tf.Tensor(
[[[6 5 0]
  [0 0 0]]

 [[6 5 0]
  [4 3 2]]], shape=(2, 2, 3), dtype=int64)
['', '[UNK]', '5', '4', '3', '2', '1']
False


TypeError: '>' not supported between instances of 'list' and 'int'

In [None]:
import tensorflow as tf

In [7]:
# 对整数特征进行one-hot编码
data = tf.constant([[10, 20, 20, 50, 30, 0],
                    [10, 20, 10, 0, 20, -1],
                    [10, -1, -1, -1, -1, -1]], dtype='int32')
indexer = keras.layers.experimental.preprocessing.IntegerLookup(mask_value=-1,
                                                                oov_value=-2)
indexer.adapt(data)
print(indexer.get_vocabulary())
print(len(indexer.get_vocabulary()))
data = tf.constant([[10, 20, 20, 50, 30, 0],
                    [10, 20, 10, 0, 20, -1],
                    [100, -1, -1, -1, -1, -1]], dtype='int32')
print(indexer(data))

data = tf.constant([[[1], [2], [2], [5], [3], [0]],
                    [[1], [2], [4], [0], [0], [0]]], dtype='int32')
data = tf.constant([[1, 2, 2, 5, 3, 0],
                    [1, 2, 4, 0, 0, 0]], dtype='int32')
data_2 = tf.constant([[1, 2, 2, 5, 3, 0],
                    [1, 1, 5, 0, 0, 0]], dtype='int32')

emb = keras.layers.Embedding(input_dim=len(indexer.get_vocabulary()), output_dim=4, mask_zero=True)
emb_2 = keras.layers.Embedding(input_dim=len(indexer.get_vocabulary()), output_dim=2, mask_zero=True)

# data_2 = emb_2(data_2)
# data = emb(data)
# concat = keras.layers.Concatenate()
# new_data = concat([data, data_2])
# print(new_data._keras_mask)
#
# data_3 = tf.constant(np.random.normal(size=(data.shape[:-1] + (2, ))))
# new_data_3 = concat([data, data_3])
# print(new_data_3._keras_mask)

print(emb(data)._keras_mask)
emb_input = keras.Input(shape=(None, ), dtype='float32')
emb_output = emb(emb_input)
emb_model = keras.models.Model(inputs=[emb_input], outputs=[emb_output])
print(emb_model(data)._keras_mask)

# add = keras.layers.Add()
# temp = keras.backend.ones_like(data)
# result = add([data, temp])
# print(result)
# print(result._keras_mask)
# encoder = keras.layers.experimental.preprocessing.CategoryEncoding(output_mode='binary')
# encoder.adapt(indexer(data))
# print(encoder(indexer(data)))
#
# test_data = np.array([10, 10, 20, 50, 60, 0])
# print(indexer(test_data))
# print(encoder(indexer(test_data)))

[-1, -2, 20, 10, 0, 50, 30]
7
tf.Tensor(
[[3 2 2 5 6 4]
 [3 2 3 4 2 0]
 [1 0 0 0 0 0]], shape=(3, 6), dtype=int64)
tf.Tensor(
[[ True  True  True  True  True False]
 [ True  True  True False False False]], shape=(2, 6), dtype=bool)
tf.Tensor(
[[ True  True  True  True  True False]
 [ True  True  True False False False]], shape=(2, 6), dtype=bool)


In [35]:
soft_data = tf.constant([[[1], [1], [1], [1]],
                         [[1], [0], [0], [0]]], dtype='float32')
print(keras.layers.Reshape(target_shape=(4, ))(soft_data))
soft_data_1 = tf.constant([[1, 1, 1, 1],
                         [1, 0, 0, 0]], dtype='float32')
soft_data = keras.layers.Masking(mask_value=0)(soft_data)
print(soft_data._keras_mask)
soft_data = tf.where(soft_data._keras_mask, soft_data_1, tf.ones_like(soft_data_1) * (-2e32))
print(tf.nn.softmax(soft_data))

tf.Tensor(
[[1. 1. 1. 1.]
 [1. 0. 0. 0.]], shape=(2, 4), dtype=float32)
tf.Tensor(
[[ True  True  True  True]
 [ True False False False]], shape=(2, 4), dtype=bool)
tf.Tensor(
[[0.25 0.25 0.25 0.25]
 [1.   0.   0.   0.  ]], shape=(2, 4), dtype=float32)


In [40]:
data = tf.ones(shape=(2, 1, 10))
print(keras.backend.tile(data, (1, 3, 1)))
print(keras.layers.Input(shape=(None, ), dtype='float32').supports_masking)

tf.Tensor(
[[[1. 1. 1. 1. 1. 1. 1. 1. 1. 1.]
  [1. 1. 1. 1. 1. 1. 1. 1. 1. 1.]
  [1. 1. 1. 1. 1. 1. 1. 1. 1. 1.]]

 [[1. 1. 1. 1. 1. 1. 1. 1. 1. 1.]
  [1. 1. 1. 1. 1. 1. 1. 1. 1. 1.]
  [1. 1. 1. 1. 1. 1. 1. 1. 1. 1.]]], shape=(2, 3, 10), dtype=float32)


AttributeError: 'Tensor' object has no attribute 'supports_masking'

In [1]:
temp = tf.constant([[1, 1, 1],
                    [2, 2, 0]])
print(tf.sequence_mask(keras.layers.Masking(mask_value=0)(temp)))
# temp_1 = tf.constant([[1, 1, 1]])
# print(temp - temp_1)
# print(tf.expand_dims(temp, 1))

NameError: name 'tf' is not defined

In [35]:
# 使用hash技巧应用于整数类特征
# 如果特征值中出现了许多不同的值，而每个值仅仅在数据中出现几次，则索引编制法不太好
# 所以将值hash到固定大小的向量，这样使得特征空间大小易于管理，且无需显示索引
# 简单来说，在实际应用中可能会遇到特征太多的情况，不好降维，与使用这样的方法强行降维，虽然出来的结果不好解释，并且会出现很多特征碰撞的可能性，但事实证明这并不影响解决问题
data = np.random.randint(low=0, high=100000, size=(10000, 1))
hasher = keras.layers.experimental.preprocessing.Hashing(num_bins=64, salt=1337)

encoder = keras.layers.experimental.preprocessing.CategoryEncoding(max_tokens=64, output_mode='binary')
print(hasher(data))
print(encoder(hasher(data)))

tf.Tensor(
[[51]
 [61]
 [26]
 ...
 [62]
 [45]
 [ 2]], shape=(10000, 1), dtype=int64)
tf.Tensor(
[[0. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 1. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]
 ...
 [0. 0. 0. ... 0. 1. 0.]
 [0. 0. 0. ... 0. 0. 0.]
 [0. 0. 1. ... 0. 0. 0.]], shape=(10000, 64), dtype=float32)


In [14]:
# 将文本数据编码为数字索引向量，该方法主要适用于将数据扔进embedding层之前
data =  tf.constant(['10 20 50', '20 30', '',
                     '10', '', ''])

encoder = keras.layers.experimental.preprocessing.TextVectorization(output_mode='binary')
encoder.adapt(data)

data =  tf.constant(['10', 'no', ''])
cope_data = encoder(data)
print(cope_data)
# embeddinger = keras.layers.Embedding(input_dim=len(text_vectorizer.get_vocabulary()), output_dim=10)
#
# print(embeddinger(cope_data))

tf.Tensor(
[[0. 0. 1. 0. 0.]
 [1. 0. 0. 0. 0.]
 [0. 0. 0. 0. 0.]], shape=(3, 5), dtype=float32)


In [5]:
# 使用multi——hot将文本编码为密集的ngram矩阵
# 多热编码就是某个特征可以含有多个值，即某条向量中可以有多个1，是独热的进化
data = tf.constant(
    [
        "The Brain is wider than the Sky",
        "For put them side by side",
        "The one the other will contain",
        "With ease and You beside",
        ''
    ]
)

text_vectorizer = keras.layers.experimental.preprocessing.TextVectorization(output_mode='binary')
text_vectorizer.adapt(data)
print(text_vectorizer.get_vocabulary())
print(text_vectorizer(data))

['[UNK]', 'the', 'side', 'you', 'with', 'will', 'wider', 'them', 'than', 'sky', 'put', 'other', 'one', 'is', 'for', 'ease', 'contain', 'by', 'brain', 'beside', 'and']
tf.Tensor(
[[0. 1. 0. 0. 0. 0. 1. 0. 1. 1. 0. 0. 0. 1. 0. 0. 0. 0. 1. 0. 0.]
 [0. 0. 1. 0. 0. 0. 0. 1. 0. 0. 1. 0. 0. 0. 1. 0. 0. 1. 0. 0. 0.]
 [0. 1. 0. 0. 0. 1. 0. 0. 0. 0. 0. 1. 1. 0. 0. 0. 1. 0. 0. 0. 0.]
 [0. 0. 0. 1. 1. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 1. 0. 0. 0. 1. 1.]
 [0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.]], shape=(5, 21), dtype=float32)


In [53]:
# 使用tf-idf加权将文本编码到ngram矩阵
# 本质是先使用ngram，然后将每条使用multi-hot表示，然后再计算tf-idf值，将其中的1替换成对应的tf-idf值
data = tf.constant(
    [
        "The Brain is wider than the Sky",
        "For put them side by side",
        "The one the other will contain",
        "With ease and You beside",
    ]
)

text_vectorizer = keras.layers.experimental.preprocessing.TextVectorization(output_mode='tf-idf', ngrams=(1, 2))
text_vectorizer.adapt(data)
print(text_vectorizer.get_vocabulary())
print(text_vectorizer(data))
print(text_vectorizer(data).shape)

['[UNK]', 'the', 'side', 'you beside', 'you', 'with ease', 'with', 'will contain', 'will', 'wider than', 'wider', 'them side', 'them', 'the sky', 'the other', 'the one', 'the brain', 'than the', 'than', 'sky', 'side by', 'put them', 'put', 'other will', 'other', 'one the', 'one', 'is wider', 'is', 'for put', 'for', 'ease and', 'ease', 'contain', 'by side', 'by', 'brain is', 'brain', 'beside', 'and you', 'and']
tf.Tensor(
[[0.        1.6945957 0.        0.        0.        0.        0.
  0.        0.        1.0986123 1.0986123 0.        0.        1.0986123
  0.        0.        1.0986123 1.0986123 1.0986123 1.0986123 0.
  0.        0.        0.        0.        0.        0.        1.0986123
  1.0986123 0.        0.        0.        0.        0.        0.
  0.        1.0986123 1.0986123 0.        0.        0.       ]
 [0.        0.        2.1972246 0.        0.        0.        0.
  0.        0.        0.        0.        1.0986123 1.0986123 0.
  0.        0.        0.        0.        0