In [1]:
import tensorflow as tf
sess = tf.Session() 
import warnings
warnings.filterwarnings('ignore')

In [110]:
sess.run(tf.global_variables_initializer())

# dataset 读取csv 

In [5]:
def parse_example(line):
    columns = tf.io.decode_csv(line, record_defaults= CSV_RECORD_DEFAULTS)

    features = dict(zip(FEATURE_NAME, columns))

    target = tf.reshape(tf.cast(tf.equal(features.pop(TARGET),'>50K'),  tf.int32),[-1,1])

    return features, target

In [93]:
input_path = DATA_DIR.format('train')
dataset = tf.data.TextLineDataset(input_path) \
    .skip(1) \
    .batch(5) \
    .map( parse_example, num_parallel_calls=8 )

In [94]:
iterator = dataset.make_one_shot_iterator()
features, label = sess.run(iterator.get_next())
features, label

({'age': array([50, 38, 53, 28, 37], dtype=int32),
  'workclass': array([b'Self-emp-not-inc', b'Private', b'Private', b'Private',
         b'Private'], dtype=object),
  'fnlwgt': array([ 83311, 215646, 234721, 338409, 284582], dtype=int32),
  'education': array([b'Bachelors', b'HS-grad', b'11th', b'Bachelors', b'Masters'],
        dtype=object),
  'education_num': array([13,  9,  7, 13, 14], dtype=int32),
  'marital_status': array([b'Married-civ-spouse', b'Divorced', b'Married-civ-spouse',
         b'Married-civ-spouse', b'Married-civ-spouse'], dtype=object),
  'occupation': array([b'Exec-managerial', b'Handlers-cleaners', b'Handlers-cleaners',
         b'Prof-specialty', b'Exec-managerial'], dtype=object),
  'relationship': array([b'Husband', b'Not-in-family', b'Husband', b'Wife', b'Wife'],
        dtype=object),
  'race': array([b'White', b'White', b'Black', b'Black', b'White'], dtype=object),
  'gender': array([b'Male', b'Male', b'Male', b'Female', b'Female'], dtype=object),
  'capi

# feature_column示例

- feature_column的输入支持：列名，feature_column
- feature_column的输出：dense, sparse

## 连续型输入

### 1. numeric_column: 输入时原始连续特征，输出是numeric_column

In [111]:
age = tf.feature_column.numeric_column('age')
sess.run(tf.feature_column.input_layer(features, age))

array([[50.],
       [38.],
       [53.],
       [28.],
       [37.]], dtype=float32)

### 2. bucketized_column: 输入是numeric_column, 输出是one-hot格式

- 注意bucketized不能对原始输入做操作必须先转成numeric_column
- boundaries=[0., 1., 2.] 分桶是 (-inf, 0.), [0., 1.), [1., 2.), and [2., +inf).

In [112]:
age_bin = tf.feature_column.bucketized_column(age, boundaries = [18, 25, 30, 35, 40, 45, 50, 55, 60, 65])
sess.run(tf.feature_column.input_layer(features, age_bin))

array([[0., 0., 0., 0., 0., 0., 0., 1., 0., 0., 0.],
       [0., 0., 0., 0., 1., 0., 0., 0., 0., 0., 0.],
       [0., 0., 0., 0., 0., 0., 0., 1., 0., 0., 0.],
       [0., 0., 1., 0., 0., 0., 0., 0., 0., 0., 0.],
       [0., 0., 0., 0., 1., 0., 0., 0., 0., 0., 0.]], dtype=float32)

## 离散型输入：可以是数值/字符串型离散值

### 1. categorical_column_with_vocabulary_list/file: 输入是数量较少的离散值(str/int)，输出categorical_column

In [120]:
cat_gender = tf.feature_column.categorical_column_with_vocabulary_list('gender', ['Male','Female'])
cat_gender

VocabularyListCategoricalColumn(key='gender', vocabulary_list=('Male', 'Female'), dtype=tf.string, default_value=-1, num_oov_buckets=0)

### 2. categorical_column_with_hash_bucket: 输入是数量较多的离散值(str/int), 输出categorical_column

- 往往用于vocabulary_list未知或者太多不好枚举，这时hash_bucket_size < vocabulary_list，部分单词会有相同的hash值
- 但有时懒得去写上面的vocabulary_list，这时只要hash_bucket_size > vocabulary_list那和上面效果是一样的

In [115]:
cat_workclass = tf.feature_column.categorical_column_with_hash_bucket('workclass', hash_bucket_size = 100)
cat_workclass

HashedCategoricalColumn(key='workclass', hash_bucket_size=100, dtype=tf.string)

### 3. categorical_column_with_identity：输入是int类型的离散值, 输出categorical_column

- 离散值的取值范围在[0, num_buckets],每个int会被当作一个categorical
- 取值范围外的会用default_value，未指定default_value会报错， default_value必须在上述范围内。

In [116]:
cat_education_num = tf.feature_column.categorical_column_with_identity('education_num', num_buckets= 20, default_value=0)
cat_education_num

IdentityCategoricalColumn(key='education_num', number_buckets=20, default_value=0)

## 特征交叉/映射

### 1. crossed_column：输入是原始离散特征/bucketized_column/categorical_column, 输出sparse vector

In [89]:
# 输入原始离散特征
edu_occu= tf.feature_column.crossed_column(['education','occupation'], hash_bucket_size =10) 
# 输入categorical_column
workclass_gender = tf.feature_column.crossed_column([cat_gender,cat_education_num], hash_bucket_size =10)

### 2. embedding_column: 输入是categorical_column, 输出是dense vector

In [121]:
emb_gender = tf.feature_column.embedding_column(cat_gender, dimension=3)

In [1]:
sess.run(tf.tables_initializer())
sess.run(tf.ddddglobal_variables_initializer())
sess.run(tf.feature_column.input_layer(features, emb_gender))

NameError: name 'sess' is not defined

### 3. indicator_column: 输入是categorical_column/crossed_column, 输出是one-hot vector 

In [2]:
cat_gender

NameError: name 'cat_gender' is not defined

In [3]:
ind_gender = tf.feature_column.indicator_column(cat_gender)
inputs = tf.feature_column.input_layer(features, ind_gender)
sess.run(inputs)

NameError: name 'tf' is not defined

## Categorical_column 输入

# feature_column to pre-defined tf.estimator