In [1]:
import numpy as np
import pandas as pd
import tensorflow as tf
import tensorflow.keras.backend as K

In [2]:
import matplotlib.pyplot as plt
%matplotlib inline

In [3]:
# Load the sampled Criteo CSV and print its (rows, columns) shape.
data = pd.read_csv('./data/criteo_sampled_data.csv')
print(data.shape)

(600000, 40)


In [4]:
# Preview the first rows: a label column, numeric I* columns and categorical C* columns.
data.head()

Unnamed: 0,label,I1,I2,I3,I4,I5,I6,I7,I8,I9,...,C17,C18,C19,C20,C21,C22,C23,C24,C25,C26
0,0,1.0,1,5.0,0.0,1382.0,4.0,15.0,2.0,181.0,...,e5ba7672,f54016b9,21ddcdc9,b1252a9d,07b5194c,,3a171ecb,c5c50484,e8b83407,9727dd16
1,0,2.0,0,44.0,1.0,102.0,8.0,2.0,2.0,4.0,...,07c540c4,b04e4670,21ddcdc9,5840adea,60f6221e,,3a171ecb,43f13e8b,e8b83407,731c3655
2,0,2.0,0,1.0,14.0,767.0,89.0,4.0,2.0,245.0,...,8efede7f,3412118d,,,e587c466,ad3062eb,3a171ecb,3b183c5c,,
3,0,,893,,,4392.0,,0.0,0.0,0.0,...,1e88c74f,74ef3502,,,6b3a5ca6,,3a171ecb,9117a34a,,
4,0,3.0,-1,,0.0,2.0,0.0,3.0,0.0,0.0,...,1e88c74f,26b3c7a7,,,21c9516a,,32c7478e,b34f3128,,


In [5]:
# Per-column dtype and non-null counts; several I*/C* columns contain missing values.
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 600000 entries, 0 to 599999
Data columns (total 40 columns):
 #   Column  Non-Null Count   Dtype  
---  ------  --------------   -----  
 0   label   600000 non-null  int64  
 1   I1      346447 non-null  float64
 2   I2      600000 non-null  int64  
 3   I3      457859 non-null  float64
 4   I4      445879 non-null  float64
 5   I5      583403 non-null  float64
 6   I6      465077 non-null  float64
 7   I7      575030 non-null  float64
 8   I8      599558 non-null  float64
 9   I9      575030 non-null  float64
 10  I10     346447 non-null  float64
 11  I11     575030 non-null  float64
 12  I12     138927 non-null  float64
 13  I13     445879 non-null  float64
 14  C1      600000 non-null  object 
 15  C2      600000 non-null  object 
 16  C3      577541 non-null  object 
 17  C4      577541 non-null  object 
 18  C5      600000 non-null  object 
 19  C6      522807 non-null  object 
 20  C7      600000 non-null  object 
 21  C8      60

In [6]:
# Column names as an array; used below to split the features by name.
cols = data.columns.values

# 数据预处理
## 将数值型特征和类别型特征分开

In [7]:
# Split columns by name: numeric features contain 'I', categorical ones contain 'C'.
dense_feats = [name for name in cols if 'I' in name]
sparse_feats = [name for name in cols if 'C' in name]


### 开始处理特征

In [8]:
def process_dense_feats(data, feats):
    """Fill missing numeric features and compress their long tails.

    Missing values are replaced by 0.0. Every value greater than -1 is then
    mapped to ``log(x + 1)``; any other value is set to -1.

    Args:
        data: DataFrame containing at least the columns named in ``feats``.
        feats: list of numeric column names to process.

    Returns:
        A new DataFrame holding only the ``feats`` columns, transformed to
        float. ``data`` itself is not modified.
    """
    # Selecting with a list and calling fillna both return new objects, so an
    # explicit copy of the whole input frame is unnecessary.
    d = data[feats].fillna(0.0)
    for col in feats:
        values = d[col].values.astype(float)
        valid = values > -1
        # Vectorised transform: the log is evaluated only where x + 1 > 0,
        # which avoids a Python-level call per element and any log warnings.
        transformed = np.full(values.shape, -1.0)
        transformed[valid] = np.log(values[valid] + 1)
        d[col] = transformed
    return d

In [9]:
# Apply the dense-feature preprocessing to the numeric columns.
data_dense = process_dense_feats(data, dense_feats)

In [10]:
from sklearn.preprocessing import LabelEncoder

def process_sparse_feats(data, feats):
    """Label-encode categorical features.

    Missing values are replaced by the string "-1", then every column is
    encoded independently with its own ``LabelEncoder``.

    Args:
        data: DataFrame containing at least the columns named in ``feats``.
        feats: list of categorical column names to encode.

    Returns:
        A new DataFrame holding only the ``feats`` columns as integer codes.
        ``data`` itself is not modified.
    """
    # Selecting with a list and calling fillna both return new objects, so an
    # explicit copy of the whole input frame is unnecessary.
    d = data[feats].fillna("-1")
    for col in feats:
        d[col] = LabelEncoder().fit_transform(d[col])
    return d

In [11]:
# Apply the sparse-feature preprocessing (label encoding) to the categorical columns.
data_sparse = process_sparse_feats(data, sparse_feats)

In [12]:
# Join the processed dense and sparse features column-wise and attach the target column.
total_data = pd.concat([data_dense, data_sparse], axis=1).assign(label=data['label'])

# 构建模型
## 构建FM
### 一阶特征

In [13]:
from tensorflow.keras import layers as Kl

# One Keras Input of shape (?, 1) per dense feature, named after the feature.
# This plays the role of tf.placeholder(tf.float32, [None, 1], name=feat) in raw TF.
dense_inputs = [Kl.Input([1], name=feat) for feat in dense_feats]

In [14]:
# Inspect the list of dense input tensors.
dense_inputs

[<tf.Tensor 'I1:0' shape=(?, 1) dtype=float32>,
 <tf.Tensor 'I2:0' shape=(?, 1) dtype=float32>,
 <tf.Tensor 'I3:0' shape=(?, 1) dtype=float32>,
 <tf.Tensor 'I4:0' shape=(?, 1) dtype=float32>,
 <tf.Tensor 'I5:0' shape=(?, 1) dtype=float32>,
 <tf.Tensor 'I6:0' shape=(?, 1) dtype=float32>,
 <tf.Tensor 'I7:0' shape=(?, 1) dtype=float32>,
 <tf.Tensor 'I8:0' shape=(?, 1) dtype=float32>,
 <tf.Tensor 'I9:0' shape=(?, 1) dtype=float32>,
 <tf.Tensor 'I10:0' shape=(?, 1) dtype=float32>,
 <tf.Tensor 'I11:0' shape=(?, 1) dtype=float32>,
 <tf.Tensor 'I12:0' shape=(?, 1) dtype=float32>,
 <tf.Tensor 'I13:0' shape=(?, 1) dtype=float32>]

#### 这里这样写我不是很喜欢....应该遵循 机器学习设计五要素, 数据处理、模型构建(先定义超参数、数据接收器、特征处理(embedding等)、构建计算图、训练、保存)、模型kpi(选取对应的损失函数)、优化方法、评估标准

In [15]:
## Equivalent to tf.concat(dense_inputs, axis=1, name='inputs_combine')
concat_dense_inputs = Kl.Concatenate(axis=1)(dense_inputs) ## shape: (?, 13) -- thirteen (?, 1) inputs joined on axis 1

## First-order (linear) term of the dense features: a single Dense unit over the concatenation.
fst_order_dense_layer = Kl.Dense(1)(concat_dense_inputs)

Instructions for updating:
If using Keras pass *_constraint arguments to layers.


### sparse特征

In [16]:
# One Keras Input of shape (?, 1) per categorical feature, named after the feature.
sparse_inputs = [Kl.Input([1], name=feat) for feat in sparse_feats]

In [17]:
# First-order weights of the sparse features: a 1-dimensional embedding per
# category value, flattened from (?, 1, 1) to (?, 1).
sparse_embed = []
for feat, feat_input in zip(sparse_feats, sparse_inputs):
    # nunique() is taken on the raw column; the extra row presumably leaves
    # room for the "-1" class used for missing values -- TODO confirm.
    voc_size = data[feat].nunique()
    weight_lookup = Kl.Embedding(
        voc_size + 1,
        1,
        embeddings_regularizer=tf.keras.regularizers.l2(0.5),
    )
    sparse_embed.append(Kl.Flatten()(weight_lookup(feat_input)))

Instructions for updating:
Call initializer instance with the dtype argument instead of passing it to the constructor


In [18]:
# Inspect the per-feature first-order embedding tensors, each of shape (?, 1).
sparse_embed

[<tf.Tensor 'flatten/Reshape:0' shape=(?, 1) dtype=float32>,
 <tf.Tensor 'flatten_1/Reshape:0' shape=(?, 1) dtype=float32>,
 <tf.Tensor 'flatten_2/Reshape:0' shape=(?, 1) dtype=float32>,
 <tf.Tensor 'flatten_3/Reshape:0' shape=(?, 1) dtype=float32>,
 <tf.Tensor 'flatten_4/Reshape:0' shape=(?, 1) dtype=float32>,
 <tf.Tensor 'flatten_5/Reshape:0' shape=(?, 1) dtype=float32>,
 <tf.Tensor 'flatten_6/Reshape:0' shape=(?, 1) dtype=float32>,
 <tf.Tensor 'flatten_7/Reshape:0' shape=(?, 1) dtype=float32>,
 <tf.Tensor 'flatten_8/Reshape:0' shape=(?, 1) dtype=float32>,
 <tf.Tensor 'flatten_9/Reshape:0' shape=(?, 1) dtype=float32>,
 <tf.Tensor 'flatten_10/Reshape:0' shape=(?, 1) dtype=float32>,
 <tf.Tensor 'flatten_11/Reshape:0' shape=(?, 1) dtype=float32>,
 <tf.Tensor 'flatten_12/Reshape:0' shape=(?, 1) dtype=float32>,
 <tf.Tensor 'flatten_13/Reshape:0' shape=(?, 1) dtype=float32>,
 <tf.Tensor 'flatten_14/Reshape:0' shape=(?, 1) dtype=float32>,
 <tf.Tensor 'flatten_15/Reshape:0' shape=(?, 1) dtyp

In [19]:
# Sum the per-feature first-order weights into a single (?, 1) tensor.
fst_order_sparse_layer = Kl.Add()(sparse_embed)

In [20]:
# Inspect the summed sparse first-order tensor.
fst_order_sparse_layer

<tf.Tensor 'add/add_24:0' shape=(?, 1) dtype=float32>

## linear部分合并

In [21]:
# Linear part of the model: dense first-order term plus sparse first-order term.
linear_part = Kl.Add()([fst_order_dense_layer, fst_order_sparse_layer])

## 二阶特征交叉

In [23]:
## Dimension of the latent vector learned for every sparse feature value.
embed_dims = 8

In [25]:
## Only the sparse features take part in the second-order interactions.
# Each feature gets an embed_dims-dimensional embedding of shape (?, 1, embed_dims).
sparse_kd_embed = []
for feat, feat_input in zip(sparse_feats, sparse_inputs):
    voc_size = data[feat].nunique()
    latent_lookup = Kl.Embedding(
        voc_size + 1,
        embed_dims,
        embeddings_regularizer=tf.keras.regularizers.l2(0.7),
    )
    sparse_kd_embed.append(latent_lookup(feat_input))

In [26]:
# Inspect the per-feature latent embeddings, each of shape (?, 1, 8).
sparse_kd_embed

[<tf.Tensor 'embedding_26/embedding_lookup/Identity_1:0' shape=(?, 1, 8) dtype=float32>,
 <tf.Tensor 'embedding_27/embedding_lookup/Identity_1:0' shape=(?, 1, 8) dtype=float32>,
 <tf.Tensor 'embedding_28/embedding_lookup/Identity_1:0' shape=(?, 1, 8) dtype=float32>,
 <tf.Tensor 'embedding_29/embedding_lookup/Identity_1:0' shape=(?, 1, 8) dtype=float32>,
 <tf.Tensor 'embedding_30/embedding_lookup/Identity_1:0' shape=(?, 1, 8) dtype=float32>,
 <tf.Tensor 'embedding_31/embedding_lookup/Identity_1:0' shape=(?, 1, 8) dtype=float32>,
 <tf.Tensor 'embedding_32/embedding_lookup/Identity_1:0' shape=(?, 1, 8) dtype=float32>,
 <tf.Tensor 'embedding_33/embedding_lookup/Identity_1:0' shape=(?, 1, 8) dtype=float32>,
 <tf.Tensor 'embedding_34/embedding_lookup/Identity_1:0' shape=(?, 1, 8) dtype=float32>,
 <tf.Tensor 'embedding_35/embedding_lookup/Identity_1:0' shape=(?, 1, 8) dtype=float32>,
 <tf.Tensor 'embedding_36/embedding_lookup/Identity_1:0' shape=(?, 1, 8) dtype=float32>,
 <tf.Tensor 'embeddin

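#### 通过对交叉项化简, 可以将 $\sum_{i<j}\langle v_i, v_j\rangle x_i x_j$ 化为 $\frac{1}{2}\sum_{f=1}^{k}\left[\left(\sum_{i} v_{i,f}\,x_i\right)^2 - \sum_{i} v_{i,f}^2\,x_i^2\right]$, 即“和的平方”减去“平方的和”, 再乘以 0.5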


In [27]:
## Step 1: stack all sparse embeddings along axis 1 into one (n, embed_dims) matrix per sample, where n is the number of sparse features.
concat_sparse_kd_embed = Kl.Concatenate(axis=1)(sparse_kd_embed) #shape: ?, n, embed_dims

In [68]:
# Inspect the stacked embedding tensor.
concat_sparse_kd_embed

<tf.Tensor 'concatenate_1/concat:0' shape=(?, 26, 8) dtype=float32>

In [28]:
# Step 2: sum the embeddings over the feature axis, then square element-wise
# ("square of sum"); both tensors have shape (?, embed_dims).
sum_kd_embed = tf.reduce_sum(concat_sparse_kd_embed, axis=1)
square_sum_kd_embed = tf.square(sum_kd_embed)

In [31]:
## Square every embedding element first, then sum over the feature axis ("sum of squares").
square_kd_embed = tf.square(concat_sparse_kd_embed)
# NOTE(review): this rebinds `square_sum_kd_embed`, which the preceding cell set
# to the square-of-sum; from here on the name refers to the sum-of-squares.
square_sum_kd_embed = tf.reduce_sum(square_kd_embed, axis=1)

In [40]:
## Second-order FM term: 0.5 * sum_f [ (sum_i v_if)^2 - sum_i v_if^2 ]
# Both operands are built from `sum_kd_embed` and `square_kd_embed`, which are
# never reassigned. Do not use `square_sum_kd_embed` for both sides: that name
# is rebound between cells, and subtracting it from itself makes the pairwise
# interaction term identically zero.
square_of_sum = tf.square(sum_kd_embed)
sum_of_square = tf.reduce_sum(square_kd_embed, axis=1)
sub = tf.subtract(square_of_sum, sum_of_square)

sub_half = tf.constant(0.5)
sub = tf.multiply(sub, sub_half)
# `keepdims` is the supported spelling of the deprecated `keep_dims`; it keeps
# the reduced axis so the result has shape (?, 1).
snd_order_sparse_layer = tf.reduce_sum(sub, axis=1, keepdims=True)

Instructions for updating:
keep_dims is deprecated, use keepdims instead


In [41]:
# Inspect the second-order term; expected shape (?, 1).
snd_order_sparse_layer

<tf.Tensor 'Sum_3:0' shape=(?, 1) dtype=float32>

### DNN部分

In [43]:
## The DNN reuses the embedding matrices of the FM second-order part (shared embeddings), which is intended to speed up training.
flatten_sparse_embed = Kl.Flatten()(concat_sparse_kd_embed) ## shape: (?, n * embed_dims)

In [60]:
# Three fully connected ReLU layers of 256 units each on top of the flattened embeddings.
fc_layer_1 = Kl.Dense(256, activation='relu')(flatten_sparse_embed) ## shape: ?, 256
fc_layer_2 = Kl.Dense(256, activation='relu')(fc_layer_1)
fc_layer_3 = Kl.Dense(256, activation='relu')(fc_layer_2)

In [61]:
fc_layer_output = Kl.Dense(1)(fc_layer_3) ## no activation here; the sigmoid is applied after the three parts are summed

### 输出结果 

In [63]:
# Sum the linear part, the FM second-order term and the DNN output,
# then squash the sum with a sigmoid.
output_layer = Kl.Add()([linear_part, snd_order_sparse_layer, fc_layer_output])
output_layer = Kl.Activation('sigmoid')(output_layer)

In [47]:
###编译模型
##原生tf的计算图构建
from tensorflow.keras.models import Model
from tensorflow.keras.utils import plot_model

In [64]:
# Functional model: all dense and sparse Inputs in, the sigmoid output out.
model = Model(dense_inputs + sparse_inputs, output_layer)
# Write an architecture diagram; this needs pydot and graphviz to be installed.
plot_model(model, 'deepfm.png')

Failed to import pydot. You must install pydot and graphviz for `pydotprint` to work.


In [65]:
# Print the layer-by-layer summary of the model.
model.summary()

Model: "model_2"
__________________________________________________________________________________________________
Layer (type)                    Output Shape         Param #     Connected to                     
C1 (InputLayer)                 [(None, 1)]          0                                            
__________________________________________________________________________________________________
C2 (InputLayer)                 [(None, 1)]          0                                            
__________________________________________________________________________________________________
C3 (InputLayer)                 [(None, 1)]          0                                            
__________________________________________________________________________________________________
C4 (InputLayer)                 [(None, 1)]          0                                            
____________________________________________________________________________________________

In [66]:
# Adam optimizer with binary cross-entropy loss; cross-entropy and AUC are also tracked as metrics.
model.compile(optimizer='adam', loss='binary_crossentropy', metrics=['binary_crossentropy', tf.keras.metrics.AUC(name='auc')])

In [51]:
from tensorflow.keras.callbacks import TensorBoard

In [52]:
# TensorBoard callback that logs to ./logs and writes the graph.
# NOTE(review): with histogram_freq=0, write_grads=True presumably has no
# effect -- confirm against the Keras TensorBoard callback documentation.
tbCallBack = TensorBoard(log_dir='./logs',
                         histogram_freq=0,
                         write_graph=True,
                         write_grads=True,
                         embeddings_freq=0,
                         embeddings_layer_names=None,
                         embeddings_metadata=None
                        )

In [53]:
# Sequential split by index label: .loc slicing is inclusive, so labels
# 0..499999 go to training and labels from 500000 onward go to validation.
train_data = total_data.loc[: 500000-1]
valid_data = total_data.loc[500000:]

In [54]:
# One array per feature, in the same order as the model's Input list.
train_dense_x = [train_data[f].values for f in dense_feats]
train_sparse_x = [train_data[f].values for f in sparse_feats]

In [55]:
# Training targets, wrapped in a single-element list.
train_label = [train_data['label'].values]

In [56]:
# Validation inputs, built the same way as the training inputs.
val_dense_x = [valid_data[f].values for f in dense_feats]
val_sparse_x = [valid_data[f].values for f in sparse_feats]

In [57]:
# Validation targets, wrapped in a single-element list.
val_label = [valid_data['label'].values]

In [67]:
# Train for 5 epochs with batches of 256; inputs are the dense arrays followed by
# the sparse arrays, matching Model(dense_inputs + sparse_inputs, ...).
model.fit(train_dense_x + train_sparse_x, train_label, epochs=5, batch_size=256, validation_data=(val_dense_x + val_sparse_x, val_label),
         callbacks=[tbCallBack])

Train on 500000 samples, validate on 100000 samples
Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5


<tensorflow.python.keras.callbacks.History at 0x7fc5f80e5310>

In [59]:
# Serve the training logs with TensorBoard; the cell keeps running until it is interrupted.
!tensorboard --logdir ./logs

TensorBoard 1.15.0 at http://192.168.0.105:6006/ (Press CTRL+C to quit)
^C


In [70]:
# Second attempt at writing the architecture diagram, to a different file name; needs pydot and graphviz.
plot_model(model, to_file='./deep_fm.png')

Failed to import pydot. You must install pydot and graphviz for `pydotprint` to work.


In [4]:
import numpy as np

BATCH_START = 0
TIME_STEPS = 20
BATCH_SIZE = 32


def get__batch():
    """Return the next batch of sine/cosine sequences.

    Builds BATCH_SIZE consecutive windows of TIME_STEPS sample points, scaled
    by 1 / (10 * pi), and then advances the module-level BATCH_START by
    TIME_STEPS so the next call starts further along the curve.

    Returns:
        list: ``[seq, res, xs]`` where ``seq = sin(xs)`` and ``res = cos(xs)``
        both have shape (BATCH_SIZE, TIME_STEPS, 1) and ``xs`` has shape
        (BATCH_SIZE, TIME_STEPS).
    """
    global BATCH_START
    n_points = TIME_STEPS * BATCH_SIZE
    grid = np.arange(BATCH_START, BATCH_START + n_points)
    xs = grid.reshape((BATCH_SIZE, TIME_STEPS)) / (10 * np.pi)
    BATCH_START += TIME_STEPS
    seq, res = np.sin(xs), np.cos(xs)
    # Append a trailing axis of size 1: (batch, step) -> (batch, step, input).
    return [seq[..., np.newaxis], res[..., np.newaxis], xs]

In [7]:
# Fetch one batch; `res` is the whole [seq, res, xs] list returned by get__batch().
res = get__batch()

(32, 20, 1)