In [1]:
import tensorflow as tf
from tensorflow import keras
from tensorflow.keras import layers

2024-09-10 20:57:48.479364: I tensorflow/core/platform/cpu_feature_guard.cc:210] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.
To enable the following instructions: AVX2 AVX512F AVX512_VNNI FMA, in other operations, rebuild TensorFlow with the appropriate compiler flags.


In [3]:
class SparseFeat:
    def __init__(self, name, vocabulary_size, embedding_dim):
        self.name = name
        self.vocabulary_size = vocabulary_size
        self.embedding_dim = embedding_dim

class DenseFeat:
    def __init__(self, name, dimension):
        self.name = name
        self.dimension = dimension

In [4]:
# 假设我们有以下用户和物品特征
user_feature_columns = [
    SparseFeat('user_id', 10000, embedding_dim=64),
    DenseFeat('age', 1)
]

item_feature_columns = [
    SparseFeat('item_id', 100000, embedding_dim=64)
]

In [9]:
# 构建输入层
def build_input_layers(feature_columns):
    input_layers = {}
    for feature in feature_columns:
        if isinstance(feature, SparseFeat):  # 假设 SparseFeat 是稀疏特征
            input_layers[feature.name] = layers.Input(shape=(1,), name=feature.name, dtype=tf.int32)
        elif isinstance(feature, DenseFeat):  # 假设 DenseFeat 是密集特征
            input_layers[feature.name] = layers.Input(shape=(feature.dimension,), name=feature.name, dtype=tf.float32)
    return input_layers

In [10]:
# 为给定的特征创建嵌入层
# 嵌入层通常用于将稀疏的特征（类别特征）转换为密集的向量表示
def build_embedding_layers(feature_columns):
    embedding_layers = {}
    for feature in feature_columns:
        if isinstance(feature, SparseFeat):  # 假设 SparseFeat 是稀疏特征
            # 创建一个嵌入层，将词汇表大小的输入映射到指定的嵌入维度
            embedding_layers[feature.name] = layers.Embedding(
                input_dim=feature.vocabulary_size + 1,  # 词汇表大小 + 1，以处理0索引
                output_dim=feature.embedding_dim,
                input_length=1,  # 输入长度为1，因为我们处理的是单个特征值
                name=f"{feature.name}_embedding"
            )
    return embedding_layers

In [13]:
# embedding_lookup函数
# 将输入的稀疏特征通过嵌入层转换为嵌入向量
# 通常用于将类别特征转换为密集的低维向量表示

def embedding_lookup(feature_names, input_layer_dict, embedding_layer_dict):
    """
    使用嵌入层查找特征的嵌入向量。
    
    参数:
    - feature_names: 要查找的特征名称列表。
    - input_layer_dict: 包含输入层的字典。
    - embedding_layer_dict: 包含嵌入层的字典。
    
    返回:
    - 嵌入向量的列表。
    """
    embedding_vectors = []
    for name in feature_names:
        if name in input_layer_dict:
            input_layer = input_layer_dict[name]
            embedding_layer = embedding_layer_dict[name]
            # 使用嵌入层查找输入层的嵌入向量
            embedding_vector = embedding_layer(input_layer)
            embedding_vectors.append(embedding_vector)
        else:
            raise ValueError(f"Feature '{name}' not found in input or embedding layers.")
    return embedding_vectors

In [15]:
# PoolingLayer通常用于将多个嵌入向量合并为一个
from tensorflow.keras.layers import Layer
class PoolingLayer(Layer):
    def __init__(self, **kwargs):
        super(PoolingLayer, self).__init__(**kwargs)
    
    def call(self, inputs):
        # 假设inputs是一个嵌入向量的列表
        # 使用平均池化来合并向量
        return tf.reduce_mean(inputs, axis=0)

    def compute_output_shape(self, input_shape):
        # 返回池化后的输出形状
        return input_shape[0]  # 假设所有输入具有相同的形状

    def get_config(self):
        config = super(PoolingLayer, self).get_config()
        return config

In [16]:
# CapsuleLayer是胶囊网络中的核心层
class CapsuleLayer(Layer):
    def __init__(self, input_units, output_units, max_len, k_max, **kwargs):
        super(CapsuleLayer, self).__init__(**kwargs)
        self.input_units = input_units
        self.output_units = output_units
        self.max_len = max_len
        self.k_max = k_max

    def build(self, input_shape):
        self.W = self.add_weight(shape=(input_shape[-1], self.output_units * self.max_len),
                                 initializer='random_normal',
                                 trainable=True)
        super(CapsuleLayer, self).build(input_shape)

    def call(self, inputs):
        # 这里只是一个简化示例，实际的胶囊层会更复杂
        inputs = tf.keras.layers.Reshape((self.max_len, self.input_units))(inputs)
        outputs = tf.keras.layers.Dot(axes=2)([inputs, self.W])
        return tf.keras.layers.Activation('sigmoid')(outputs)

    def compute_output_shape(self, input_shape):
        return (input_shape[0], self.output_units)

    def get_config(self):
        config = super(CapsuleLayer, self).get_config()
        config.update({
            'input_units': self.input_units,
            'output_units': self.output_units,
            'max_len': self.max_len,
            'k_max': self.k_max
        })
        return config

In [17]:
# DNN（深度神经网络），由多个全连接层组成
class DNN(tf.keras.Model):
    def __init__(self, hidden_units, activation, reg, dropout, use_bn, output_activation, seed, **kwargs):
        super(DNN, self).__init__(**kwargs)
        self.hidden_layers = []
        for units in hidden_units:
            self.hidden_layers.append(tf.keras.layers.Dense(units,
                                                            activation=activation,
                                                            kernel_regularizer=tf.keras.regularizers.l2(reg),
                                                            bias_regularizer=tf.keras.regularizers.l2(reg),
                                                            kernel_initializer=tf.keras.initializers.glorot_uniform(seed)))
            if use_bn:
                self.hidden_layers.append(tf.keras.layers.BatchNormalization())
            self.hidden_layers.append(tf.keras.layers.Dropout(dropout))
        self.output_layer = tf.keras.layers.Dense(1,
                                                  activation=output_activation,
                                                  kernel_initializer=tf.keras.initializers.glorot_uniform(seed))

    def call(self, inputs):
        x = inputs
        for layer in self.hidden_layers:
            x = layer(x)
        return self.output_layer(x)

In [18]:
# NoMask
# 用于处理序列掩码的自定义层或者函数
import tensorflow as tf

class NoMask(tf.keras.layers.Layer):
    def __init__(self, **kwargs):
        super(NoMask, self).__init__(**kwargs)
    
    def call(self, inputs):
        # 假设这个层只是简单地返回输入，不进行掩码处理
        return inputs

    def compute_output_shape(self, input_shape):
        return input_shape

    def get_config(self):
        config = super(NoMask, self).get_config()
        return config

In [19]:
# combined_dnn_input
# 用于合并不同类型输入（如嵌入向量和密集特征）
def combined_dnn_input(embedding_inputs, dense_inputs):
    # 将嵌入向量和密集特征拼接在一起
    if embedding_inputs and dense_inputs:
        input_combined = tf.keras.layers.Concatenate()(embedding_inputs + dense_inputs)
    elif embedding_inputs:
        input_combined = tf.keras.layers.Concatenate()(embedding_inputs)
    elif dense_inputs:
        input_combined = tf.keras.layers.Concatenate()(dense_inputs)
    else:
        raise ValueError("No input provided for DNN.")
    return input_combined

In [20]:
# tile_user_otherfeat：
# 用于扩展用户特征以匹配胶囊网络输入维度
def tile_user_otherfeat(inputs, k_max):
    # 将用户其他特征重复k_max次以匹配胶囊网络的输入维度
    tiled_inputs = tf.tile(inputs, [1, k_max, 1])
    return tiled_inputs

In [21]:
# LabelAwareAttention层
# 根据输入的keys、query和可选地length来计算注意力权重

class LabelAwareAttention(Layer):
    def __init__(self, k_max, pow_p=1, **kwargs):
        self.k_max = k_max
        self.pow_p = pow_p
        super(LabelAwareAttention, self).__init__(**kwargs)

    def build(self, input_shape):
        self.embedding_size = input_shape[0][-1]
        super(LabelAwareAttention, self).build(input_shape)

    def call(self, inputs, training=None, **kwargs):
        keys = inputs[0]
        query = inputs[1]
        weight = tf.reduce_sum(keys * query, axis=-1, keepdims=True)
        weight = tf.pow(weight, self.pow_p)  # [batch_size, k_max, 1]
        if len(inputs) == 3:
            k_user = tf.cast(tf.maximum(
                1.,
                tf.minimum(
                    tf.cast(self.k_max, dtype="float32"),
                    tf.math.log1p(tf.cast(inputs[2], dtype="float32")) / tf.math.log(2.)
                )
            ), dtype="int64")
            seq_mask = tf.transpose(tf.sequence_mask(k_user, self.k_max), [0, 2, 1])
            padding = tf.ones_like(seq_mask, dtype=tf.float32) * (-2 ** 32 + 1)
            weight = tf.where(seq_mask, weight, padding)
        weight = tf.nn.softmax(weight, name="weight")
        output = tf.reduce_sum(keys * weight, axis=1)
        return output

    def compute_output_shape(self, input_shape):
        return (input_shape[0][0], self.embedding_size)

    def get_config(self):
        config = {'k_max': self.k_max, 'pow_p': self.pow_p}
        base_config = super(LabelAwareAttention, self).get_config()
        return dict(list(base_config.items()) + list(config.items()))

In [22]:
# EmbeddingIndex
# 返回一个常数索引
class EmbeddingIndex(Layer):
    def __init__(self, index, **kwargs):
        self.index = index
        super(EmbeddingIndex, self).__init__(**kwargs)

    def build(self, input_shape):
        super(EmbeddingIndex, self).build(input_shape)

    def call(self, x, **kwargs):
        return tf.constant(self.index)

    def get_config(self):
        config = {'index': self.index}
        base_config = super(EmbeddingIndex, self).get_config()
        return dict(list(base_config.items()) + list(config.items()))

In [23]:
# get_item_embedding
# 获取物品的嵌入向量
def get_item_embedding(pooling_item_embedding_weight, item_input_layer):
    # 假设 pooling_item_embedding_weight 是一个嵌入层的输出
    # item_input_layer 是对应的输入层
    item_embedding = pooling_item_embedding_weight[item_input_layer]
    return item_embedding

In [24]:
# SampledSoftmaxLayer
class SampledSoftmaxLayer(Layer):
    def __init__(self, num_sampled, **kwargs):
        super(SampledSoftmaxLayer, self).__init__(**kwargs)
        self.num_sampled = num_sampled

    def build(self, input_shape):
        # 创建层的权重
        self.W = self.add_weight(shape=(input_shape[-1],),
                                 initializer='random_normal',
                                 trainable=True)
        self.b = self.add_weight(shape=(input_shape[-1],),
                                 initializer='zeros',
                                 trainable=True)
        super(SampledSoftmaxLayer, self).build(input_shape)

    def call(self, inputs):
        # 这里只是一个简化示例，实际的采样softmax会更复杂
        logits = tf.matmul(inputs, self.W) + self.b
        sampled_logits = tf.nn.sampled_softmax(logits, self.num_sampled)
        return sampled_logits

    def compute_output_shape(self, input_shape):
        return input_shape

    def get_config(self):
        config = super(SampledSoftmaxLayer, self).get_config()
        config.update({'num_sampled': self.num_sampled})
        return config

In [27]:
class VarLenSparseFeat:
    def __init__(self, sparsefeat, maxlen, combiner='mean', length_name=None, weight_name=None, weight_norm=True):
        self.sparsefeat = sparsefeat
        self.maxlen = maxlen
        self.combiner = combiner
        self.length_name = length_name
        self.weight_name = weight_name
        self.weight_norm = weight_norm

# 示例使用
# 假设我们有一个SparseFeat实例，名为sparse_feature
sparse_feature = SparseFeat('sparse_feature', 10000, 64)
varlen_feature = VarLenSparseFeat(sparse_feature, maxlen=10)

In [25]:
from tensorflow.keras.layers import Concatenate
def MIND(user_feature_columns, item_feature_columns, num_sampled=5, k_max=2,p=1.0, dynamic_k=False, user_dnn_hidden_units=(64,32),
         dnn_activation='relu',dnn_use_bn=False, reg_dnn=0, reg_embedding=1e-6, dnn_dropout=0, output_activation='linear', seed=1024):
    """
    user_feature_columns: 用户特征列
    item_feature_columns: 物品特征列
    num_sampled: 负采样的数量
    k_max: 胶囊网络中胶囊的最大数量
    p: Label-aware Attention中的指数参数
    user_dnn_hidden_units: 用户DNN层的隐藏单元
    dnn_use_bn: 是否在DNN层使用批量归一化
    reg_dnn: DNN层的正则化系数
    reg_embedding: 嵌入层的正则化系数
    dnn_dropout: DNN层的dropout率
    output_activation: 输出层的激活函数
    seed: 随机种子
    """
    
    # 参数检查
    # 目前只支持item_feature_columns为1的情况
    if len(item_feature_columns)>1:
        raise ValueError('Now MIND only support 1 item feature like item_id')
    
    # 获取物品特征配置
    # 从item_feature_columns中提取物品特征的名称、词汇表大小和嵌入维度
    # 获取item相关的配置参数
    item_feature_column=item_feature_columns[0]
    item_feature_name=item_feature_column.name
    item_vocabulary_size=item_feature_column.vocabulary_size
    item_embedding_dim=item_feature_column.embedding_dim
    
    behavior_feature_list=[item_feature_name]
    
    # 利用build_input_layers为用户和物品特征构建输入层
    # 为用户特征创建input层
    user_input_layer_dict=build_input_layers(user_feature_columns)
    item_input_layer_dict=build_input_layers(item_feature_columns)
    # 将input层转换成列表的形式作为model的输入
    user_input_layers=list(user_input_layer_dict.values())
    item_input_layers=list(item_input_layer_dict.values())
    
    # 特征筛选：筛选出用户特征中的稀疏特征、密集特征和变长稀疏特征
    # 筛选出特征中的sparse特征和dense特征，方便单独处理
    sparse_feature_columns=list(filter(lambda x: isinstance(x, SparseFeat), user_feature_columns)) if user_feature_columns else []
    dense_feature_columns=list(filter(lambda x: isinstance(x, DenseFeat), user_feature_columns)) if user_feature_columns else []
    varlen_feature_columns=list(filter(lambda x: isinstance(x, VarLenSparseFeat), user_feature_columns)) if user_feature_columns else []
    
    # 由于这个变长序列俩面只有历史点击文章，没有类别等，所以这里可以直接使用varlen_feature_columns
    # deepctr这里单独把点击文章放到了history_feature_columns
    seq_max_len=varlen_feature_columns[0].maxlen
    
    # 使用build_embedding_layers构建嵌入层
    embedding_layer_dict=build_embedding_layers(user_feature_columns+item_feature_columns)
    
    # 使用embedding_lookup函数获取行为特征和用户特征的嵌入向量
    # 获取当前的行为特征的embedding，这里面可能有多个类别特征，因此需要pooling
    query_embed_list=embedding_lookup(behavior_feature_list, item_input_layer_dict, embedding_layer_dict) # 长度为1
    # 获取行为序列(doc_id序列,hist_doc_id)对应的embedd，这里有可能有多个行为产生了行为序列，所以需要使用列表将其放在一起
    keys_embed_list=embedding_lookup([varlen_feature_columns[0].name], user_input_layer_dict, embedding_layer_dict) # 长度为1
    
    # 用户离散特征的输入层和embedding层拼接
    dnn_input_emb_list=embedding_lookup([col.name for col in sparse_feature_columns],user_input_layer_dict, embedding_layer_dict)
    
    # 获取dense
    dnn_dense_input=[]
    for fc in dense_feature_columns:
        if fc.name!='hist_len': # 连续特征不要这个
            dnn_dense_input.append(user_input_layer_dict[fc.name])
            
    # 把keys_emb_list和query_emb_listpooling操作
    # 因为每个商品不仅有id，还可能有类别、品牌等多个embedding向量，这种需要pooling成一个
    history_emb=PoolingLayer()(NoMask()(keys_embed_list))
    target_emb=PoolingLayer()(NoMask()(query_embed_list))
    
    hist_len=user_input_layer_dict['hist_len']
    
    # 胶囊网络，用于学习用户的兴趣表示
    high_capsule=CapsuleLayer(input_units=item_embedding_dim, out_units=item_embedding_dim, 
                              max_len=seq_max_len, k_max=k_max)((history_emb,hist_len))
    
    # 把用户的其他特征拼接到胶囊网络上
    if len(dnn_input_emb_list)>0 or len(dnn_dense_input)>0:
        user_other_feature=combined_dnn_input(dnn_input_emb_list, dnn_dense_input)
        other_feature_tile=tf.keras.layers.Lambda(tile_user_otherfeat, aruments={'k_max':k_max})(user_other_feature)
        user_deep_input=Concatenate()([NoMask()(other_feature_tile),high_capsule])
    else:
        user_deep_input=high_capsule  
        
    # 接下来经过一个DNN层，获取最终的用户表示向量
    user_embeddings=DNN(user_dnn_hidden_units, dnn_activation, reg_dnn, dnn_dropout, dnn_use_bn, output_activation=output_activation, seed=seed, name='user_embedding')(user_deep_input)
    
    # 接下来，过Label-aware Layer
    # 利用LabelAwareAttention来增强用户表示，使其更加关注用户的历史行为
    if dynamic_k:
        user_embedding_final=LabelAwareAttention(k_max, k_max, pow_p=p, )((user_embedding, target_emb, hist_len))
    else:
        user_embedding_final=LabelAwareAttention(k_max=k_max, pow_p=p, )((user_embedding, target_emb))
        
    item_embedding_matrix=embedding_layer_dict[item_feature_name] # 获取doc_id的embedding层
    item_index=EmbeddingIndex(list(range(item_vocabulary_size)))(item_input_layer_dict[item_feature_name]) # 所有doc_id的索引  
    item_embedding_weight=NoMask()(item_embedding_matrix(item_index)) # 拿到所有item的embedding
    pooling_item_embedding_weight=PoolingLayer()([item_embedding_weight])
    
    # 传入整个doc_id的embedding、user_embedding，以及用户点击的doc_id，然后进行负采样计算损失操作
    output=SampledSoftmaxLayer(num_sampled)([pooling_item_embedding_weight, user_embedding_final, item_input_layer_dict[item_feature_name]])
    
    # 使用Model类构建整个推荐系统模型，包括用户和物品的输入层、嵌入层、胶囊网络、DNN层和输出层
    model=Model(inputs=user_input_layers+item_input_layers,outputs=output)
    
    # 等模型训练完之后，获取用户和item的embedding
    model.__setattr__('user_input',user_input_layers)
    model.__setattr__('user_embedding',user_embeddings)
    model.__setattr__('item_input',item_input_layers)
    model.__setattr__('item_embedding',get_item_embedding(pooling_item_embedding_weight, item_input_layer_dict[item_feature_name]))
    
    return model # 返回构建好的模型

In [None]:
# 函数式API搭建模型
# 需要传入封装好的用户特征描述以及item特征描述
# 建立模型
user_feature_columns = [
        SparseFeat('user_id', feature_max_idx['user_id'], embedding_dim),
        VarLenSparseFeat(SparseFeat('hist_doc_ids', feature_max_idx['article_id'], embedding_dim,
                                                        embedding_name="click_doc_id"), his_seq_maxlen, 'mean', 'hist_len'),    
        DenseFeat('hist_len', 1),
        SparseFeat('u_city', feature_max_idx['city'], embedding_dim),
        SparseFeat('u_age', feature_max_idx['age'], embedding_dim),
        SparseFeat('u_gender', feature_max_idx['gender'], embedding_dim),
    ]
doc_feature_columns = [
    SparseFeat('click_doc_id', feature_max_idx['article_id'], embedding_dim)
    # 这里后面也可以把文章的类别画像特征加入
]


In [28]:
# 构建模型
model = MIND(user_feature_columns, item_feature_columns)

IndexError: list index out of range

In [None]:
# 把keys_emb_list和query_emb_listpooling操作
history_emb=PoolingLayer()(NoMask()(keys_embed_list))
target_emb=PoolingLayer()(NoMask()(query_embed_list))

In [None]:
# 胶囊网络
high_capsule=CapsuleLayer(input_units=item_embedding_dim, out_units=item_embedding_dim, max_len=seq_max_len, k_max=k_max)((history_emb, hist_len))

In [None]:
# 把用户的其他特征拼接到胶囊网络上
if len(dnn_input_emb_list)>0 or len(dnn_dense_input)>0:
    user_other_feature=combined_dnn_input(dnn_input_emb_list, dnn_dense_input)
    other_feature_tile=tf.keras.layers.Lambda(tile_user_otherfeat, argument={'k_max':k_max})(user_other_feature)
    user_deep_input=Concatenate()([NoMask()(other_feature_tile), high_capsule])
else:
    user_deep_input=high_capsule
    

In [None]:
# 经过一个DNN层，获取最终的用户表示向量
user_embeddings = DNN(user_dnn_hidden_units, dnn_activation, l2_reg_dnn,
                          dnn_dropout, dnn_use_bn, output_activation=output_activation, seed=seed,
                          name="user_embedding")(user_deep_input) 

In [None]:
# 经过LabelAwareAttention层
# 对这两个兴趣向量与当前item的相关性加注意力权重，最后变成1个用户的最终向量
user_embedding_final = LabelAwareAttention(k_max=k_max, pow_p=p,)((user_embeddings, target_emb))
