# Description
这里使用TensorFlow 1.5实现用于CTR预测的DeepFM，数据集选用的是kaggle上的criteo数据集。数据集的详细介绍可以参看[FM代码实现](https://github.com/Andr-Robot/ML_Model/blob/main/FM.ipynb)。

# DeepFM
DeepFM模型包含FM和DNN两部分，FM模型可以抽取low-order特征，DNN可以抽取high-order特征。相比于Wide & Deep模型无需人工特征工程。由于输入仅为原始特征，而且FM和DNN共享输入向量特征，DeepFM模型训练速度很快。    

<div align=center><img src="https://raw.githubusercontent.com/Andr-Robot/iMarkdownPhotos/master/Res/ml/deepfm_architecture.png" width="50%;" style="float:center"/></div>

In [13]:
import tensorflow as tf
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from tqdm import tqdm

In [2]:
# dense特征空值用0填充，并取对数， sparse特征空值用'-1'填充
def process_feat(data, dense_feats, sparse_feats):
    df = data.copy()
    # dense
    df[dense_feats] = df[dense_feats].fillna(0.0)
    for f in tqdm(dense_feats):
        df[f] = df[f].apply(lambda x: np.log(1 + x) if x > -1 else -1)
    # sparse
    df[sparse_feats] = df[sparse_feats].fillna('-1')
    return df

In [3]:
# 数据加载
file = './dataset/criteo_sampled_data.csv'
data = pd.read_csv(file, sep=',')
# dense 特征开头是I, sparse特征开头是C， label是标签
cols = data.columns.values
dense_feats = [f for f in cols if f[0] == 'I']
sparse_feats = [f for f in cols if f[0] == 'C']
ignore_feats = ['label']
# 数据预处理
data_new = process_feat(data, dense_feats, sparse_feats)
# 切分训练集和验证集
train, test = train_test_split(data_new, test_size=0.2, random_state=1)

100%|██████████| 13/13 [00:09<00:00,  1.32it/s]


In [4]:
# 遍历数据获取对应feature_dict，total_feature
def get_feature_dict(data, ignore_feats, dense_feats):
    feature_dict = {}
    total_feature = 0
    for col in tqdm(data.columns):
        if col in ignore_feats:
            continue
        elif col in dense_feats:
            feature_dict[col] = total_feature
            total_feature += 1
        else:
            unique_val = data[col].unique()
            feature_dict[col] = dict(
                zip(unique_val,
                    range(total_feature,
                          len(unique_val) + total_feature)))
            total_feature += len(unique_val)
    return feature_dict, total_feature

In [5]:
# 遍历数据获取对应feature_dict，total_feature
def get_feature_dict(data, ignore_feats, dense_feats):
    feature_dict = {}
    total_feature = 0
    for col in tqdm(data.columns):
        if col in ignore_feats:
            continue
        elif col in dense_feats:
            feature_dict[col] = total_feature
            total_feature += 1
        else:
            unique_val = data[col].unique()
            feature_dict[col] = dict(
                zip(unique_val,
                    range(total_feature,
                          len(unique_val) + total_feature)))
            total_feature += len(unique_val)
    return feature_dict, total_feature

In [6]:
# 对原始特征进行转换，便于后续训练
def data_tran(data, feature_dict, ignore_feats, dense_feats):
    labels = data['label']
    # 这里存储的是每个值对应在feature_dict的idx，将每一条数据转换为对应的特征索引
    feature_index = data.copy()
    # 这里存储的是每个值，将每一条数据转换为对应的特征值
    feature_value = data.copy()
    for col in tqdm(feature_index.columns):
        if col in ignore_feats:
            feature_index.drop(col, axis=1, inplace=True)
            feature_value.drop(col, axis=1, inplace=True)
        elif col in dense_feats:
            feature_index[col] = feature_dict[col]
        else:
            feature_index[col] = feature_index[col].map(feature_dict[col])
            feature_value[col] = 1
    return feature_index, feature_value, labels

In [7]:
feature_dict, total_feature = get_feature_dict(data_new, ignore_feats,
                                               dense_feats)
print('total_feature:', total_feature)
print('feature_dict size:', len(feature_dict))
# 产出用于训练的数据
train_feature_index, train_feature_value, train_labels = data_tran(
    train, feature_dict, ignore_feats, dense_feats)
test_feature_index, test_feature_value, test_labels = data_tran(
    test, feature_dict, ignore_feats, dense_feats)

100%|██████████| 40/40 [00:01<00:00, 35.97it/s] 


total_feature: 885697
feature_dict size: 39


100%|██████████| 40/40 [00:08<00:00,  4.48it/s]
100%|██████████| 40/40 [00:03<00:00, 11.41it/s]


In [8]:
deepfm_params = {
    'embedding_size': 8,
    'batch_size': 2000,
    'learning_rate': 0.00001,
    'epoch': 10,
    'optimizer': 'adam',
    'dnn_dropout': 0.5,
    'hidden_units': [256, 128, 64]
}
deepfm_params['feature_size'] = total_feature
deepfm_params['field_size'] = len(train_feature_index.columns)

In [14]:
# 开始构建模型
tf.reset_default_graph()  # 重置网络结构
# 定义模型输入
# 训练模型的输入有三个，分别是刚才转换得到的特征索引和特征值，以及label：
feat_index = tf.placeholder(tf.int32,
                            shape=[None, deepfm_params['field_size']],
                            name='feat_index')
feat_value = tf.placeholder(tf.float32,
                            shape=[None, deepfm_params['field_size']],
                            name='feat_value')
labels = tf.placeholder(tf.int32, shape=[None], name='labels')
training = tf.placeholder_with_default(False, shape=[], name='training')

'''FM part'''
# tf fm weights
weights = dict()
weights_initializer = tf.glorot_normal_initializer()
bias_initializer = tf.constant_initializer(0.0)
weights["feature_embeddings"] = tf.get_variable(
    name='weights',
    dtype=tf.float32,
    initializer=weights_initializer,
    regularizer=tf.contrib.layers.l2_regularizer(scale=1e-5),
    shape=[deepfm_params['feature_size'], deepfm_params['embedding_size']])
weights["weights_first_order"] = tf.get_variable(
    name='vectors',
    dtype=tf.float32,
    initializer=weights_initializer,
    regularizer=tf.contrib.layers.l2_regularizer(1e-5),
    shape=[deepfm_params['field_size'], 1])
weights["fm_bias"] = tf.get_variable(name='bias',
                                     dtype=tf.float32,
                                     initializer=bias_initializer,
                                     shape=[1])
embeddings = tf.nn.embedding_lookup(weights["feature_embeddings"],
                                    feat_index)  # shape=(?, 39, 8)
bias = weights['fm_bias']
#build function
##first order
first_order = tf.matmul(feat_value,
                        weights["weights_first_order"])  # shape=(?, 1)
##second order
### feature * embeddings
reshaped_feat_value = tf.reshape(feat_value,
                                 shape=[-1, deepfm_params['field_size'], 1])
f_e_m = tf.multiply(
    reshaped_feat_value,
    embeddings)  # multiply这个函数实现的是元素级别的相乘，也就是两个相乘的数元素各自相乘，而不是矩阵乘法
###  square(sum(feature * embedding))
f_e_m_sum = tf.reduce_sum(f_e_m, 1)
f_e_m_sum_square = tf.square(f_e_m_sum)
###  sum(square(feature * embedding))
f_e_m_square = tf.square(f_e_m)
f_e_m_square_sum = tf.reduce_sum(f_e_m_square, 1)
second_order = f_e_m_sum_square - f_e_m_square_sum
second_order = 0.5 * tf.reduce_sum(second_order, 1, keepdims=True)
##FM part objective function
fm_logits = second_order + first_order + bias

'''DNN part'''
# 这里相当于是将(?, 39, 8)中的第三维展开变成(?, 39 * 8)
x = tf.reshape(embeddings, shape=[-1, deepfm_params['field_size'] * deepfm_params['embedding_size']])
for i, hidden_unit in enumerate(deepfm_params['hidden_units']):
    x = tf.layers.dense(inputs=x, units=hidden_unit, activation=tf.nn.relu, name='hidden_%d' % i)
x = tf.layers.dropout(inputs=x, rate=deepfm_params['dnn_dropout'], training=training)
##Deep part objective function
deep_logits = tf.layers.dense(inputs=x, units=1, activation=None)

'''DeepFM output'''
logits = tf.add(fm_logits, deep_logits)
predicts = tf.sigmoid(logits)

##loss function
new_labels = tf.cast(tf.reshape(labels, shape=[-1, 1]), dtype=tf.float32)
sigmoid_loss = tf.nn.sigmoid_cross_entropy_with_logits(logits=logits,
                                                       labels=new_labels)
sigmoid_loss = tf.reduce_mean(sigmoid_loss)
l2_loss = tf.losses.get_regularization_loss()
loss = sigmoid_loss + l2_loss

# train op
if deepfm_params['optimizer'] == 'adagrad':
    optimizer = tf.train.AdagradOptimizer(
        learning_rate=deepfm_params['learning_rate'],
        initial_accumulator_value=1e-8)
elif deepfm_params['optimizer'] == 'adam':
    optimizer = tf.train.AdamOptimizer(
        learning_rate=deepfm_params['learning_rate'])
else:
    raise Exception('unknown optimizer', deepfm_params['optimizer'])
train_op = optimizer.minimize(loss)

# accuracy
one_tensor = tf.ones_like(predicts)
neg_predicts = tf.subtract(one_tensor, predicts)
prediction = tf.concat([neg_predicts, predicts], axis=1)
# new_labels = tf.cast(tf.reshape(labels, shape=[-1]), dtype=tf.int32)
# 如果labels的输入shape是[None, 1]则需要转成[None,]，这样才能供in_top_k使用
correct_prediction = tf.nn.in_top_k(prediction, labels, 1)
accuracy = tf.reduce_mean(tf.cast(correct_prediction, tf.float32))

In [15]:
# Initialize the variables (i.e. assign their default value)
init = tf.global_variables_initializer()
# Start training
with tf.Session() as sess:
    sess.run(init)

    # Training cycle
    for epoch in range(deepfm_params['epoch']):
        avg_cost = 0.
        avg_acc = 0.
        total_batch = int(train.shape[0] / deepfm_params['batch_size'])
        # Loop over all batches
        for i in range(total_batch):
            start_idx = i * deepfm_params['batch_size']
            end_idx = (i + 1) * deepfm_params['batch_size']
            batch_index = train_feature_index[start_idx:end_idx]
            batch_value = train_feature_value[start_idx:end_idx]
            batch_labels = train_labels[start_idx:end_idx]
            # Fit training using batch data
            _, c, acc = sess.run(
                [train_op, loss, accuracy],
                feed_dict={
                    feat_index: batch_index,
                    feat_value: batch_value,
                    labels: batch_labels,
                    training: True
                })
            # Compute average loss
            avg_cost += c / total_batch
            avg_acc += acc / total_batch
        # Display logs per epoch step
        if (epoch + 1) % 1 == 0:
            vloss, pred1, pred2, cprediction, vacc = sess.run(
                [loss, predicts, prediction, correct_prediction, accuracy],
                feed_dict={
                    feat_index: test_feature_index,
                    feat_value: test_feature_value,
                    labels: test_labels
                })
            print('Epoch:', '%04d' % (epoch + 1), 'cost=',
                  '{:.9f}'.format(avg_cost), 'acc=', '{:.9f}'.format(avg_acc),
                  'valid_loss=', '{:.9f}'.format(vloss), 'valid_acc=',
                  '{:.9f}'.format(vacc))

    print('Optimization Finished!')


Epoch: 0001 cost= 1.873662378 acc= 0.256718750 valid_loss= 1.780948043 valid_acc= 0.259458333
Epoch: 0002 cost= 1.642391616 acc= 0.259768750 valid_loss= 1.449141264 valid_acc= 0.268725008
Epoch: 0003 cost= 1.194329687 acc= 0.304004167 valid_loss= 0.915479779 valid_acc= 0.370358348
Epoch: 0004 cost= 0.771444572 acc= 0.515295834 valid_loss= 0.642312646 valid_acc= 0.652316689
Epoch: 0005 cost= 0.658318720 acc= 0.656402083 valid_loss= 0.607389033 valid_acc= 0.720274985
Epoch: 0006 cost= 0.641649251 acc= 0.683527084 valid_loss= 0.598297477 valid_acc= 0.727641642
Epoch: 0007 cost= 0.629298873 acc= 0.692437499 valid_loss= 0.589931548 valid_acc= 0.729099989
Epoch: 0008 cost= 0.616572655 acc= 0.699260417 valid_loss= 0.580828309 valid_acc= 0.731683314
Epoch: 0009 cost= 0.602857007 acc= 0.705931248 valid_loss= 0.571716487 valid_acc= 0.734591663
Epoch: 0010 cost= 0.587562498 acc= 0.714556253 valid_loss= 0.563073218 valid_acc= 0.737166643
Optimization Finished!
