## 对结构化数据进行分类

Ref: [对结构化数据进行分类](https://www.tensorflow.org/tutorials/structured_data/feature_columns)

In [2]:
import pandas as pd
import numpy as np

import tensorflow as tf
from sklearn.model_selection import train_test_split

# Print the TensorFlow version this notebook is running against.
print(tf.__version__)

2.0.0


### 使用 Pandas 创建一个 Dataframe

In [3]:
# Remote location of the heart.csv file used throughout this notebook.
URL = 'https://storage.googleapis.com/applied-dl/heart.csv'

# pandas downloads the file and parses it into a DataFrame.
dataframe = pd.read_csv(filepath_or_buffer=URL)

# Preview the first rows as the cell output.
dataframe.head()

Unnamed: 0,age,sex,cp,trestbps,chol,fbs,restecg,thalach,exang,oldpeak,slope,ca,thal,target
0,63,1,1,145,233,1,2,150,0,2.3,3,0,fixed,0
1,67,1,4,160,286,0,2,108,1,1.5,2,3,normal,1
2,67,1,4,120,229,0,2,129,1,2.6,2,2,reversible,0
3,37,1,3,130,250,0,0,187,0,3.5,3,0,normal,0
4,41,0,2,130,204,0,2,172,0,1.4,1,0,normal,0


In [4]:
# Fixed seed so the train/validation/test partition (and therefore every
# metric reported further down) is the same on every run.
SPLIT_SEED = 42

# Hold out 20% of all rows as the test set, then 20% of the remainder as the
# validation set. The intermediate `train_val` name keeps the 80% remainder
# distinct from the final training set instead of rebinding `train` twice.
train_val, test = train_test_split(dataframe, test_size=0.2, random_state=SPLIT_SEED)
train, val = train_test_split(train_val, test_size=0.2, random_state=SPLIT_SEED)

# Sanity check: sizes of the three partitions.
print(len(train), len(val), len(test))

193 49 61


### 用 `tf.data` 创建输入 Pipeline (流水线)

In [11]:
def df_to_dataset(dataframe, shuffle=True, batch_size=32):
    """Convert a pandas DataFrame into a batched `tf.data.Dataset`.

    The input frame is copied, so the caller's DataFrame is left untouched.
    The 'target' column is removed from the copy and used as the labels; the
    remaining columns become a dict of features keyed by column name.

    Args:
        dataframe: DataFrame that contains a 'target' column.
        shuffle: When true, shuffle the elements with a buffer twice as large
            as the number of rows before batching.
        batch_size: Number of rows per emitted batch.

    Returns:
        A dataset yielding `(features_dict, labels)` batches.
    """
    features = dataframe.copy()
    labels = features.pop('target')
    dataset = tf.data.Dataset.from_tensor_slices((dict(features), labels))
    if not shuffle:
        return dataset.batch(batch_size)
    return dataset.shuffle(buffer_size=len(features)*2).batch(batch_size)

In [12]:
# Small batch size (presumably to keep the demonstration output short).
batch_size = 5

# Only the training split is shuffled; df_to_dataset shuffles by default.
train_ds = df_to_dataset(dataframe=train, batch_size=batch_size)
val_ds = df_to_dataset(dataframe=val, shuffle=False, batch_size=batch_size)
test_ds = df_to_dataset(dataframe=test, shuffle=False, batch_size=batch_size)

#### 理解输入流水线

In [17]:
# Pull a single batch from the training dataset and inspect its contents.
for feature_batch, label_batch in train_ds.take(1):
    # Iterating the feature dict yields its keys, i.e. the column names.
    print("Every feature:", list(feature_batch))
    print("A batch of ages:", feature_batch['age'])
    print("A batch of targets:", label_batch)

Every feature: ['age', 'sex', 'cp', 'trestbps', 'chol', 'fbs', 'restecg', 'thalach', 'exang', 'oldpeak', 'slope', 'ca', 'thal']
A batch of ages: tf.Tensor([42 64 59 57 41], shape=(5,), dtype=int32)
A batch of targets: tf.Tensor([0 0 1 1 0], shape=(5,), dtype=int32)


#### 演示几种特征列

In [25]:
# One batch of features (index 0 of the (features, labels) pair) that every
# feature-column demonstration below is applied to.
example_batch = next(iter(train_ds))[0]


def demo(feature_column):
    """Apply `feature_column` to `example_batch` and print the dense result.

    A float64 `DenseFeatures` layer is built from the given column, called on
    the module-level `example_batch`, and its output is printed as a NumPy
    array. Nothing is returned.
    """
    dense_layer = tf.keras.layers.DenseFeatures(feature_column, dtype=tf.float64)
    transformed = dense_layer(example_batch)
    print(transformed.numpy())

##### 1. 数值列 numeric column

In [24]:
# Numeric column over the 'age' feature; demo() prints its dense form.
age = tf.feature_column.numeric_column(key='age')
demo(feature_column=age)

[[62.]
 [40.]
 [61.]
 [57.]
 [65.]]


##### 2. 分桶列（bucketized column）

通常，您不希望将数字直接输入模型，而是根据数值范围将其值分成不同的类别。考虑代表一个人年龄的原始数据。我们可以用 分桶列（bucketized column）将年龄分成几个分桶（buckets），而不是将年龄表示成数值列。请注意下面的 one-hot 数值表示每行匹配的年龄范围。

In [39]:
# Split 'age' into ranges delimited by the listed boundaries.
age_buckets = tf.feature_column.bucketized_column(
    source_column=age,
    boundaries=[18, 25, 30, 35, 40, 45, 50, 55, 60, 65],
)
demo(feature_column=age_buckets)

[[0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 1.]
 [0. 0. 0. 0. 0. 1. 0. 0. 0. 0. 0.]
 [0. 0. 0. 0. 0. 0. 0. 0. 0. 1. 0.]
 [0. 0. 0. 0. 0. 0. 0. 0. 1. 0. 0.]
 [0. 0. 0. 0. 0. 0. 0. 0. 1. 0. 0.]]


##### 3. 分类列

In [40]:
# Categorical column over 'thal' with an explicit three-word vocabulary.
thal = tf.feature_column.categorical_column_with_vocabulary_list(
    key='thal',
    vocabulary_list=['fixed', 'normal', 'reversible'],
)

# Wrap it in an indicator column so it can be shown as a dense one-hot vector.
thal_one_hot = tf.feature_column.indicator_column(categorical_column=thal)
demo(feature_column=thal_one_hot)

[[0. 1. 0.]
 [1. 0. 0.]
 [0. 0. 1.]
 [0. 0. 1.]
 [0. 0. 1.]]


##### 4. 嵌入列 （embedding column)

当分类列具有很多可能的取值时，最好使用嵌入列（embedding column）。

In [28]:
# Represent each 'thal' category as an 8-dimensional embedding vector.
thal_embedding = tf.feature_column.embedding_column(
    categorical_column=thal,
    dimension=8,
)
demo(feature_column=thal_embedding)

[[-0.05097565  0.01670606  0.39922914  0.32218978  0.2309885   0.04691038
  -0.16491722  0.03112343]
 [-0.18626773 -0.23271208 -0.11012377  0.07380028 -0.0948895  -0.07689485
  -0.46436727 -0.05768437]
 [ 0.14724754  0.3042568   0.10003802 -0.15386629  0.32359654 -0.16440938
   0.5203031   0.5650476 ]
 [ 0.14724754  0.3042568   0.10003802 -0.15386629  0.32359654 -0.16440938
   0.5203031   0.5650476 ]
 [ 0.14724754  0.3042568   0.10003802 -0.15386629  0.32359654 -0.16440938
   0.5203031   0.5650476 ]]


##### 5. 经过哈希处理的特征列
表示具有大量取值的分类列的另一种方法是使用 `categorical_column_with_hash_bucket`。该特征列先计算输入的哈希值，然后从 `hash_bucket_size` 个分桶中选择一个来编码字符串。使用此列时，您不需要提供词汇表，并且可以让分桶的数量远小于实际类别的数量以节省空间。

关键点：该技术的一个重要缺点是可能存在冲突，即不同的字符串被映射到同一个分桶。不过在实践中，尽管存在冲突，经过哈希处理的特征列在某些数据集上仍然效果很好。

In [35]:
# Hash each 'thal' string into one of 20 buckets; no vocabulary is supplied.
thal_hashed = tf.feature_column.categorical_column_with_hash_bucket(
    key='thal',
    hash_bucket_size=20,
)
demo(feature_column=tf.feature_column.indicator_column(categorical_column=thal_hashed))

[[0. 0. 0. 1. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.]
 [1. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.]
 [0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 1. 0. 0. 0. 0. 0.]
 [0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 1. 0. 0. 0. 0. 0.]
 [0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 1. 0. 0. 0. 0. 0.]]


##### 6. 组合的特征列（feature crosses）

In [43]:
# Cross the age buckets with 'thal', hashing each combination into 20 buckets.
crossed_feature = tf.feature_column.crossed_column(
    keys=[age_buckets, thal],
    hash_bucket_size=20,
)
demo(feature_column=tf.feature_column.indicator_column(categorical_column=crossed_feature))

[[0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 1. 0. 0. 0. 0. 0. 0. 0.]
 [0. 0. 0. 0. 1. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.]
 [0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 1. 0. 0. 0. 0. 0. 0.]
 [0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 1. 0. 0. 0. 0. 0. 0. 0.]
 [0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 1. 0. 0. 0. 0. 0. 0. 0.]]


### 选择要使用的列

In [59]:
# Numeric columns: one numeric_column per listed key.
num_feature_keys = ['age', 'trestbps',
                    'chol', 'thalach', 'oldpeak', 'slope', 'ca']
feature_columns = [
    tf.feature_column.numeric_column(key) for key in num_feature_keys
]

# Bucketized column. The 'age' source column is created here so this cell does
# not depend on the `age` variable left behind by the demonstration cell.
age = tf.feature_column.numeric_column('age')
age_buckets = tf.feature_column.bucketized_column(
    age,
    boundaries=[18, 25, 30, 35, 40, 45, 50, 55, 60, 65],
)
feature_columns.append(age_buckets)

# Categorical column, exposed to the model as a one-hot indicator column.
thal = tf.feature_column.categorical_column_with_vocabulary_list(
    'thal', ['fixed', 'normal', 'reversible'])
thal_one_hot = tf.feature_column.indicator_column(thal)
feature_columns.append(thal_one_hot)

# Embedding column over the same categorical column.
thal_embedding = tf.feature_column.embedding_column(thal, dimension=8)
feature_columns.append(thal_embedding)

# Crossed column (age bucket x thal). The raw cross gets its own name so that
# `crossed_feature` is bound only once, to the indicator column that is
# actually fed to the model.
age_thal_cross = tf.feature_column.crossed_column(
    [age_buckets, thal], hash_bucket_size=1000)
crossed_feature = tf.feature_column.indicator_column(age_thal_cross)
feature_columns.append(crossed_feature)

##### 建立一个新的特征层

现在我们已经定义了我们的特征列，我们将使用 `DenseFeatures` 层将特征输入到我们的 Keras 模型中。

In [60]:
# Bundle every selected feature column into one Keras layer.
feature_layer = tf.keras.layers.DenseFeatures(feature_columns=feature_columns)

In [61]:
# Raise batch_size to 32 and rebuild the three datasets for training.
batch_size = 32

# Only the training split is shuffled; df_to_dataset shuffles by default.
train_ds = df_to_dataset(dataframe=train, batch_size=batch_size)
val_ds = df_to_dataset(dataframe=val, batch_size=batch_size, shuffle=False)
test_ds = df_to_dataset(dataframe=test, batch_size=batch_size, shuffle=False)

#### 创建、编译和训练模型


In [63]:
# Assemble the network layer by layer: the feature layer, two 128-unit ReLU
# hidden layers, and a single sigmoid output unit.
# NOTE(review): the Dense layers request float64 while `feature_layer` is
# created without an explicit dtype -- confirm the mixed dtypes are intended.
model = tf.keras.Sequential()
model.add(feature_layer)
model.add(tf.keras.layers.Dense(128, activation='relu', dtype=tf.float64))
model.add(tf.keras.layers.Dense(128, activation='relu', dtype=tf.float64))
model.add(tf.keras.layers.Dense(1, activation='sigmoid', dtype=tf.float64))

# Binary classification set-up: Adam optimizer, binary cross-entropy loss,
# accuracy metric, executed eagerly.
model.compile(
    optimizer='adam',
    loss='binary_crossentropy',
    metrics=['accuracy'],
    run_eagerly=True,
)

# Train for five epochs, validating on val_ds after each one.
model.fit(x=train_ds, validation_data=val_ds, epochs=5)

W1126 15:41:45.201194 4354246080 base_layer.py:1814] Layer sequential_4 is casting an input tensor from dtype float64 to the layer's dtype of float32, which is new behavior in TensorFlow 2.  The layer has dtype float32 because it's dtype defaults to floatx.


To change all layers to have dtype float64 by default, call `tf.keras.backend.set_floatx('float64')`. To change just this layer, pass dtype='float64' to the layer constructor. If you are the author of this layer, you can disable autocasting by passing autocast=False to the base Layer constructor.



Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5


<tensorflow.python.keras.callbacks.History at 0x145f4bed0>

In [64]:
# Score the model on the test dataset and report the accuracy metric.
loss, accuracy = model.evaluate(x=test_ds)
print("Accuracy", accuracy)

Accuracy 0.8032787


#### 下一步
了解有关分类结构化数据的更多信息的最佳方法是亲自尝试。我们建议寻找另一个可以使用的数据集，并使用和上面相似的代码，训练一个模型，对其分类。要提高准确率，请仔细考虑模型中包含哪些特征，以及如何表示这些特征。