In [1]:
import pandas as pd
import numpy as np
import tensorflow as tf
from sklearn.preprocessing import StandardScaler, PolynomialFeatures, LabelEncoder


In [2]:
COLUMNS = ["age", "workclass", "fnlwgt", "education", "education_num", "marital_status", 
    "occupation", "relationship", "race", "gender", "capital_gain", "capital_loss", 
    "hours_per_week", "native_country", "income_bracket"] ##数据所有的列

CATEGORICAL_COLUMNS = [
    "workclass", "education", "marital_status", "occupation", "relationship", 
    "race", "gender", "native_country"
] ##类别数据

NUMBER_COLUMNS = [
    "age", "education_num", "capital_gain", "capital_loss", "hours_per_week"
]##连续数据列

LABEL_COLUMN = "label"

In [11]:
##数据处理，特征工程
def featrue_preprocessing():
    train_data = pd.read_csv('./data/adult.data', names=COLUMNS)
    ##这里直接删除缺失的数据吧
    train_data.dropna(how='any',axis=0)
    train_data['train'] = 1
    test_data = pd.read_csv('./data/adult.test', names=COLUMNS)
    ##这里直接删除缺失的数据吧
    test_data.dropna(how='any',axis=0)
    test_data['train'] = 0
    
    data = pd.concat([train_data, test_data], ignore_index=True)
    data = data.replace(' ?', ' Never-worked')
    print(data['workclass'].unique())
#     return data
    data = data.dropna(axis=0,how='any')
    print(data['workclass'].unique())
    ##将收入档次income_bracket离散化
    data[LABEL_COLUMN] = data['income_bracket'].apply(lambda x: '>50' in x).astype(int)
    y = data[LABEL_COLUMN].values
    
    data = data.drop(['income_bracket'], axis=1)
    for c in CATEGORICAL_COLUMNS:
        le = LabelEncoder()
        data[c] = le.fit_transform(data[c])
    train_size = len(train_data)
    train_X = data[data['train'] == 1].drop(['label'], axis=1)
    train_y = data[data['train'] == 1]['label'].values
    test_X = data[data['train'] == 0].drop(['label'], axis=1)
    test_y = data[data['train'] == 0]['label'].values
    train_X_cate = train_X[CATEGORICAL_COLUMNS].to_numpy()
    test_X_cate = test_X[CATEGORICAL_COLUMNS].to_numpy()
    train_X_num = train_X[NUMBER_COLUMNS].to_numpy()
    test_X_num = test_X[NUMBER_COLUMNS].to_numpy()
    
    ##进行标准化处理(归一化，转换成均值为0标准差为1的正态分布)
    scaler = StandardScaler()
    train_X_num = scaler.fit_transform(train_X_num)
    test_X_num = scaler.fit_transform(test_X_num),
    
    poly = PolynomialFeatures(degree=2, interaction_only=True)
    train_cate_poly = poly.fit_transform(train_X_cate)
    test_cate_poly = poly.fit_transform(test_X_cate)
#     data = pd.concat([data, pd.DataFrame(np.vstack([train_cate_poly, test_cate_poly]), columns=['poly_featrue_{}'.format(i) for i in range(37)] )] )
#     data['poly_featrue'] = np.vstack([train_cate_poly, test_cate_poly])
#     data['test_cate_poly'] = test_cate_poly
    return {
        'train_X': train_X,
        'test_X': test_X,
        'train_y': train_y,
        'test_y': test_y,
        'train_X_cate': train_X_cate,
        'test_X_cate': test_X_cate,
        'train_X_num': train_X_num,
        'test_X_num': test_X_num,
        'data': data,
        'train_cate_poly': train_cate_poly,
        'test_cate_poly': test_cate_poly
    }
    
    
    

In [14]:
##定义输入值的占位符
# embed_dim = 16 ##选取特征维度的的四分之一
#定义交叉积的维度
cross_dim = featrue_preprocessing().get('train_cate_poly').shape[1]

def variable_init(data):
    embed_dict = {}
    for cate in CATEGORICAL_COLUMNS:
        embed_shape = int(data[cate].nunique())
        if cate not in embed_dict:
            embed_dict[cate] = {}
        embed_dict[cate]['holder'] = tf.placeholder(tf.int32, [None, 1], name='{}_inputs'.format(cate))
        embed_dict[cate]['dims'] = 16
        embed_dict[cate]['shape'] = embed_shape
    
#     for numr in NUMBER_COLUMNS:
#         pass
    numerical_inputs = tf.placeholder(tf.float32, [None, len(NUMBER_COLUMNS)], name='numerical_inputs')
#     cross_featrue_inputs = tf.placeholder(tf.float32, [None, cross_dim], name='cross_featrue_inputs')
    label = tf.placeholder(tf.float32, [None, 1], name='label')
#     categorical_inputs = tf.placeholder(tf.int32, [None, len(CATEGORICAL_COLUMNS)], name='categorical_inputs')
    
    return embed_dict, numerical_inputs, label

[' State-gov' ' Self-emp-not-inc' ' Private' ' Federal-gov' ' Local-gov'
 ' Never-worked' ' Self-emp-inc' ' Without-pay' nan]
[' State-gov' ' Self-emp-not-inc' ' Private' ' Federal-gov' ' Local-gov'
 ' Never-worked' ' Self-emp-inc' ' Without-pay']


In [5]:
#定义超参数
learning_rate = 0.001

batch_size = 128
max_steps = 30000
epoch = 5
##其他的正则化率、学习率衰减率等....暂不使用


In [6]:
##初始化类别数据的embedding层

###这里初始化有错误

def init_embedding(embed_dict):
    cate_embed_list = []
    with tf.name_scope('categorical_embedding'):
        for key_name, item in embed_dict.items():
            embed_matrix = tf.Variable(tf.random_normal([item['shape'], item['dims']], stddev=1.0, mean=0.0), name='{}_embed_matrix'.format(key_name))
            ##tf.nn.embedding_lookup 查找矩阵的序号对应的向量， 所以embedding_lookup的维度不会发生改变
            ##这里的tf.nn.embedding_lookup不是矩阵乘法，所以维度不会发生变化...
            embed_layer = tf.nn.embedding_lookup(embed_matrix, item['holder'], name='{}_embed_layer'.format(key_name))  ## shape:item.shape[0], embed_dim
            embed_layer = tf.reduce_sum(embed_layer, axis=1, keep_dims=True) ## shape: 1, embed_dim
            embed_layer = tf.squeeze(embed_layer,axis=1) ## shape:embed_dim,
            cate_embed_list.append(embed_layer)
#         embed_matrix = tf.Variable(tf.random_normal([42, 16], stddev=1.0, mean=0.0), name='cate_embed_matrix')
#         embed_layer = tf.nn.embedding_lookup(embed_matrix, categorical_inputs, name='cate_embed_layer') ##shape=1x16
    return cate_embed_list

In [7]:
##将所有特征组合起来
def deep_net(categorical_featrue, numerical_featrue):
#     numerical_featrue = tf.reshape(numerical_featrue, [-1, 8])
    print(numerical_featrue)
#     numerical_featrue = tf.layers.dense(numerical_featrue, 8, activation=tf.nn.relu, name='numerical_featrue_fc')
#     numerical_featrue = tf.reshape(numerical_featrue, [-1, 8, 16])
    print(categorical_featrue)
#     categorical_featrue.append(numerical_featrue)
    all_featrue_layer = tf.concat(categorical_featrue + [numerical_featrue], axis=-1, name='all_featrue_combine')
    print(all_featrue_layer)
#     all_featrue_layer_relu = tf.nn.relu(all_featrue_layer)
    
    ##三层全连接
    fc1 = tf.layers.dense(all_featrue_layer, 64, activation=tf.nn.relu, name='all_featrue_fc1')
    fc2 = tf.layers.dense(fc1, 16, activation=tf.nn.relu, name='all_featrue_fc2')
#     fc3 = tf.layers.dense(fc2, 256, activation=tf.nn.relu, name='all_featrue_fc3')
    return fc2

In [8]:
def combine_log_out(deep_fc, wide):
    wide_and_deep_combine = tf.concat([deep_fc, wide], -1, name='wide_and_deep_combine')
    wide_and_deep_out = tf.layers.dense(wide_and_deep_combine, 1, activation=tf.nn.sigmoid, name='wide_and_deep_out')
    
    return wide_and_deep_out
    

In [80]:
##构建计算图
tf.reset_default_graph()
train_graph = tf.Graph()
with tf.Session(graph=train_graph) as sess:
    data_dict = featrue_preprocessing()
    data = data_dict.get('data')
    embed_dict, numerical_inputs, label = variable_init(data)
    
    cate_embed_layer = init_embedding(embed_dict)
    
    deep_fc = deep_net(categorical_featrue=cate_embed_layer, numerical_featrue=numerical_inputs)
    print(deep_fc)
#     print(cross_featrue_inputs)
    with tf.name_scope('logloss'):
#         cross_featrue_layer = tf.layers.dense(cross_featrue_inputs, 16, activation=tf.nn.relu, name='cross_featrue_fc')
#         print(cross_featrue_layer)
#         cross_featrue_layer = tf.squeeze(cross_featrue_layer, axis=1)
#         wide_and_deep_combine = tf.concat([deep_fc, cross_featrue_layer], -1, name='wide_and_deep_combine')
        wide_and_deep_out = tf.layers.dense(deep_fc, 1, name='wide_and_deep_out')
#         out_label = tf.reduce_mean(wide_and_deep_out, axis=1)
#         out_label = tf.cast(out_label, tf.int32)
#         out_label = tf.squeeze(out_label)
        print(wide_and_deep_out)
        print(label)
        
    cost = tf.nn.sigmoid_cross_entropy_with_logits(labels=label, logits=wide_and_deep_out)
    loss = tf.reduce_mean(cost)
    
        ##优化器
    global_step = tf.Variable(0, name='global_step', trainable=False)
    train_op = tf.train.AdamOptimizer(learning_rate).minimize(loss, global_step=global_step)
#     gradients = optimizer.compute_gradients(loss)
#     train_op = optimizer.apply_gradients(gradients, global_step=global_step)
    y_ = tf.nn.sigmoid(wide_and_deep_out, name='sigmoid_out')
    label_acc = tf.squeeze(label, 1)
    print(label_acc)
    y_acc = tf.squeeze(y_, 1)
    print(y_acc)
    crorent_prediction = tf.equal(label_acc, y_acc)

    accuracy = tf.reduce_mean(tf.cast(crorent_prediction, tf.float32))

[' State-gov' ' Self-emp-not-inc' ' Private' ' Federal-gov' ' Local-gov'
 ' Never-worked' ' Self-emp-inc' ' Without-pay' nan]
[' State-gov' ' Self-emp-not-inc' ' Private' ' Federal-gov' ' Local-gov'
 ' Never-worked' ' Self-emp-inc' ' Without-pay']
Tensor("numerical_inputs:0", shape=(?, 5), dtype=float32)
[<tf.Tensor 'categorical_embedding/Squeeze:0' shape=(?, 16) dtype=float32>, <tf.Tensor 'categorical_embedding/Squeeze_1:0' shape=(?, 16) dtype=float32>, <tf.Tensor 'categorical_embedding/Squeeze_2:0' shape=(?, 16) dtype=float32>, <tf.Tensor 'categorical_embedding/Squeeze_3:0' shape=(?, 16) dtype=float32>, <tf.Tensor 'categorical_embedding/Squeeze_4:0' shape=(?, 16) dtype=float32>, <tf.Tensor 'categorical_embedding/Squeeze_5:0' shape=(?, 16) dtype=float32>, <tf.Tensor 'categorical_embedding/Squeeze_6:0' shape=(?, 16) dtype=float32>, <tf.Tensor 'categorical_embedding/Squeeze_7:0' shape=(?, 16) dtype=float32>]
Tensor("all_featrue_combine:0", shape=(?, 133), dtype=float32)
Tensor("all_feat

In [20]:
def get_batch(X, y, batch_size):
    for start in range(0, len(X), batch_size):
        end = min(start + batch_size, len(X))
        yield X[start:end], y[start: end]

In [93]:
from sklearn.model_selection import StratifiedKFold
from sklearn.model_selection import train_test_split
import numpy as np
import datetime

kf = StratifiedKFold(n_splits=5, shuffle=True)
# losses = {'train': [], 'test': []}
with tf.Session(graph=train_graph) as sess:
    sess.run(tf.global_variables_initializer())
    saver = tf.train.Saver()
    for epoch_i in range(10):
        ###将数据集分成训练集和测试集，随机种子不固定
        train_X, test_X = train_test_split(data_dict['data'], test_size=0.2, random_state=0)
#         print(train_X[CATEGORICAL_COLUMNS])
        train_X_cate = train_X[CATEGORICAL_COLUMNS].to_numpy()
        test_X_cate = test_X[CATEGORICAL_COLUMNS].to_numpy()
        train_X_num = train_X[NUMBER_COLUMNS].to_numpy()
        test_X_num = test_X[NUMBER_COLUMNS].to_numpy()
        train_y = train_X['label'].to_numpy()
        test_y = test_X['label'].to_numpy()
#         train_cate_ploy = train_X[['poly_featrue_{}'.format(i) for i in range(37)]].to_numpy()
#         test_cate_ploy = test_X[['poly_featrue_{}'.format(i) for i in range(37)]].to_numpy()
        
        
        train_batches = get_batch(train_X_cate, train_X_num, batch_size)
        test_batches = get_batch(test_X_cate, test_X_num, batch_size)
        train_label_batches = get_batch(train_X_cate, train_y, batch_size)
#         train_cate_batches = get_batch(train_cate_ploy, train_y, batch_size)
        test_label_batches = get_batch(test_X_cate, test_y, batch_size)
#         test_cate_batches = get_batch(test_cate_ploy, test_y, batch_size)
   
        for batch_i in range(len(train_X_cate) // batch_size):
            train_cate, train_num = next(train_batches)
#             test_cate, test_num = next(test_batches)
            _, train_label = next(train_label_batches)
#             train_poly, tets_poly = next(train_cate_batches)
#             for cate in CATEGORICAL_COLUMNS:
            train_label = train_label.reshape(-1, 1)
#             print(train_label.shape)
            feed_dict = {
                numerical_inputs: train_num.reshape(-1, 5),
                label: train_label,
#                 cross_featrue_inputs: train_poly,
            }
            cate_feed = {
                embed_dict[name]['holder']: np.reshape(train_cate[:, idx],[batch_size, 1]) for idx, name in enumerate(CATEGORICAL_COLUMNS)
            }
            feed_dict.update(cate_feed)
            step, train_loss, _  = sess.run([global_step, loss, train_op], feed_dict=feed_dict)
#             print(crorent_prediction__)
            if (epoch_i * (len(train_X_cate) // batch_size) + batch_i) % 20 == 0:
                time_str = datetime.datetime.now().isoformat()
                print('{:>3} Batch {:>4}/{} train_loss = {:.3f}'.format(time_str, batch_i, (len(train_X_cate) // batch_size), train_loss))
        for batch_i in range(len(test_X_cate) // batch_size):
            test_cate, test_num = next(test_batches)
#             test_cate, test_num = next(test_batches)
            _, test_label = next(test_label_batches)
#             print(test_label)
#             test_poly, _ = next(test_cate_batches)
#             for cate in CATEGORICAL_COLUMNS:
            test_label = test_label.reshape(-1, 1)
            feed_dict = {
                numerical_inputs: np.array(test_num).reshape(-1, 5),
                label: test_label,
#                 cross_featrue_inputs: test_poly,
            }
            feed_dict.update({
                embed_dict[name]['holder']: np.reshape(test_cate[:, idx],[batch_size, 1]) for idx, name in enumerate(CATEGORICAL_COLUMNS)
            })
#             step, test_loss, _ = sess.run([global_step, loss, train_op], feed_dict=feed_dict)
            y__ = sess.run(y_acc, feed_dict=feed_dict)
#             print('pred y:\n', [1 if i > 0.5 else 0 for i in y__])
#             print('vali label:\n', test_label.flatten())
            pred_y = [1 if i > 0.5 else 0 for i in y__]
            crorent_prediction = tf.equal(test_label.flatten(), pred_y)

            accuracy__ = tf.reduce_mean(tf.cast(crorent_prediction, tf.float32))
            test_acc = sess.run(accuracy__)
#             print(sess.run(accuracy__))
#             break
            if (epoch_i * (len(test_X_cate) // batch_size) + batch_i) % 20 == 0:
                time_str = datetime.datetime.now().isoformat()
                print('{:>3} Batch {:>4}/{} accuracy = {:.2f}'.format(time_str, batch_i, (len(test_X_cate) // batch_size), test_acc))


2020-07-06T15:17:45.830366 Batch    0/305 train_loss = 35.191
2020-07-06T15:17:45.861726 Batch   20/305 train_loss = 4.249
2020-07-06T15:17:45.890488 Batch   40/305 train_loss = 0.691
2020-07-06T15:17:45.918894 Batch   60/305 train_loss = 0.375
2020-07-06T15:17:45.946573 Batch   80/305 train_loss = 0.420
2020-07-06T15:17:45.974415 Batch  100/305 train_loss = 0.319
2020-07-06T15:17:46.000895 Batch  120/305 train_loss = 0.337
2020-07-06T15:17:46.030886 Batch  140/305 train_loss = 0.365
2020-07-06T15:17:46.063170 Batch  160/305 train_loss = 1.700
2020-07-06T15:17:46.094384 Batch  180/305 train_loss = 0.295
2020-07-06T15:17:46.123881 Batch  200/305 train_loss = 0.413
2020-07-06T15:17:46.152633 Batch  220/305 train_loss = 0.417
2020-07-06T15:17:46.182957 Batch  240/305 train_loss = 0.565
2020-07-06T15:17:46.209679 Batch  260/305 train_loss = 0.433
2020-07-06T15:17:46.236583 Batch  280/305 train_loss = 0.854
2020-07-06T15:17:46.263011 Batch  300/305 train_loss = 0.515
2020-07-06T15:17:46.803

2020-07-06T15:21:16.522819 Batch  165/305 train_loss = 0.473
2020-07-06T15:21:16.549743 Batch  185/305 train_loss = 0.318
2020-07-06T15:21:16.577327 Batch  205/305 train_loss = 0.325
2020-07-06T15:21:16.604657 Batch  225/305 train_loss = 0.646
2020-07-06T15:21:16.632688 Batch  245/305 train_loss = 0.410
2020-07-06T15:21:16.664269 Batch  265/305 train_loss = 0.505
2020-07-06T15:21:16.695715 Batch  285/305 train_loss = 0.392
2020-07-06T15:21:20.797417 Batch    8/76 accuracy = 0.84
2020-07-06T15:21:30.223374 Batch   28/76 accuracy = 0.84
2020-07-06T15:21:39.351429 Batch   48/76 accuracy = 0.86
2020-07-06T15:21:48.784305 Batch   68/76 accuracy = 0.91
2020-07-06T15:21:52.093030 Batch    0/305 train_loss = 0.379
2020-07-06T15:21:52.121549 Batch   20/305 train_loss = 0.442
2020-07-06T15:21:52.148225 Batch   40/305 train_loss = 0.332
2020-07-06T15:21:52.175556 Batch   60/305 train_loss = 0.399
2020-07-06T15:21:52.201708 Batch   80/305 train_loss = 0.369
2020-07-06T15:21:52.229012 Batch  100/30

In [250]:
{embed_dict[name]['holder']: data_dict['train_X_cate'][:, idx] for idx, name in enumerate(CATEGORICAL_COLUMNS)}

{<tf.Tensor 'workclass_inputs:0' shape=(?, 1) dtype=int32>: array([6, 5, 3, ..., 3, 3, 4]),
 <tf.Tensor 'education_inputs:0' shape=(?, 1) dtype=int32>: array([ 9,  9, 11, ..., 11, 11, 11]),
 <tf.Tensor 'marital_status_inputs:0' shape=(?, 1) dtype=int32>: array([4, 2, 0, ..., 6, 4, 2]),
 <tf.Tensor 'occupation_inputs:0' shape=(?, 1) dtype=int32>: array([0, 3, 5, ..., 0, 0, 3]),
 <tf.Tensor 'relationship_inputs:0' shape=(?, 1) dtype=int32>: array([1, 0, 1, ..., 4, 3, 5]),
 <tf.Tensor 'race_inputs:0' shape=(?, 1) dtype=int32>: array([4, 4, 4, ..., 4, 4, 4]),
 <tf.Tensor 'gender_inputs:0' shape=(?, 1) dtype=int32>: array([1, 1, 1, ..., 0, 1, 0]),
 <tf.Tensor 'native_country_inputs:0' shape=(?, 1) dtype=int32>: array([39, 39, 39, ..., 39, 39, 39])}

In [324]:
%matplotlib inline

In [297]:
data = pd.concat([data, pd.DataFrame(np.vstack([data_dict['test_cate_poly'], data_dict['train_cate_poly']]), columns=['poly_featrue_{}'.format(i) for i in range(37)] )] )

In [84]:
test_label.flatten()

array([0, 0, 0, 0, 0, 1, 0, 0, 1, 0, 0, 0, 1, 1, 0, 1, 0, 1, 1, 0, 0, 0,
       0, 1, 0, 1, 0, 0, 0, 0, 0, 0, 1, 1, 1, 0, 0, 0, 0, 1, 1, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 1, 1, 0, 0, 0, 1, 0, 0, 0, 0, 1, 0, 0, 0, 1,
       1, 1, 0, 0, 0, 1, 0, 0, 1, 0, 0, 0, 0, 1, 0, 0, 1, 1, 0, 1, 0, 0,
       1, 0, 0, 0, 1, 1, 0, 0, 0, 0, 0, 0, 1, 0, 1, 0, 0, 0, 0, 0, 0, 0,
       0, 1, 0, 0, 1, 0, 1, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 0])

In [305]:
train_X

Unnamed: 0,age,workclass,fnlwgt,education,education_num,marital_status,occupation,relationship,race,gender,...,poly_featrue_27,poly_featrue_28,poly_featrue_29,poly_featrue_30,poly_featrue_31,poly_featrue_32,poly_featrue_33,poly_featrue_34,poly_featrue_35,poly_featrue_36
5943,34,3.0,204461.0,9.0,13.0,2.0,10.0,0.0,4.0,1.0,...,,,,,,,,,,
17182,57,3.0,206343.0,11.0,9.0,5.0,6.0,1.0,4.0,1.0,...,,,,,,,,,,
30104,55,3.0,359972.0,9.0,13.0,2.0,3.0,0.0,4.0,1.0,...,,,,,,,,,,
20646,,,,,,,,,,,...,21.0,28.0,7.0,273.0,12.0,3.0,117.0,4.0,156.0,39.0
17186,,,,,,,,,,,...,3.0,12.0,0.0,117.0,4.0,0.0,39.0,0.0,156.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
21243,65,2.0,404601.0,11.0,9.0,2.0,7.0,0.0,4.0,1.0,...,,,,,,,,,,
45892,61,2.0,244856.0,9.0,13.0,2.0,7.0,0.0,4.0,1.0,...,,,,,,,,,,
42614,41,5.0,344624.0,15.0,10.0,2.0,10.0,0.0,4.0,1.0,...,,,,,,,,,,
43568,47,5.0,104489.0,9.0,13.0,2.0,12.0,0.0,4.0,1.0,...,,,,,,,,,,
