In [1]:
import numpy as np

import gc
import os
import matplotlib.pyplot as plt

import tensorflow as tf
import tensorflow.keras.backend as K

from tensorflow.keras.layers import *
from tensorflow.python.keras.layers import Layer
from tensorflow.keras import regularizers

from tensorflow.keras.models import Model,load_model
from tensorflow.keras.utils import plot_model
from tensorflow.keras.callbacks import ModelCheckpoint,ReduceLROnPlateau,EarlyStopping

from tensorflow.keras import optimizers,initializers
from tensorflow.python.keras.initializers import glorot_normal
import joblib
import json

from tensorflow.keras import layers, Model
import importlib

import utils

# Reload BEFORE the star-import: importlib.reload() does not rebind names that
# were already copied into this namespace by `from utils import *`, so the
# previous order (star-import, then reload) left stale helpers in scope after
# utils.py was edited.
importlib.reload(utils)
from utils import *

from features import *
from tensorflow.keras.callbacks import LambdaCallback


In [2]:
# Load the train / val / test frames and the encoder object from disk.
# NOTE(review): joblib.load unpickles arbitrary objects; only load trusted files.
train, val, test, encoder = map(
    joblib.load,
    (
        './data_and_feature/train.txt',
        './data_and_feature/val.txt',
        './data_and_feature/test.txt',
        './data_and_feature/encoder.txt',
    ),
)

# Number of rows in the training split.
train_num = len(train)

In [3]:
# Feature-configuration JSON. The default is the original machine-specific
# absolute path; set the EMBEDDING_FEAT_DICT_PATH environment variable to point
# at the file when running the notebook on another machine.
embedding_feat_dict = read_json_file(
    os.environ.get(
        'EMBEDDING_FEAT_DICT_PATH',
        '/Users/bytedance/Desktop/wechat_multi_task_learning/config/embedding_feat_dict.json',
    )
)

In [4]:
# Display the loaded feature configuration (dense / sparse / sequence groups).
embedding_feat_dict

{'dense': ['videoplayseconds'],
 'sparse': {'userid': {'vocab_size': 20001,
   'embedding_dim': 14,
   'dtype': 'int64'},
  'feedid': {'vocab_size': 99172, 'embedding_dim': 16, 'dtype': 'int64'},
  'authorid': {'vocab_size': 18624, 'embedding_dim': 14, 'dtype': 'int64'},
  'bgm_song_id': {'vocab_size': 23740, 'embedding_dim': 14, 'dtype': 'int64'},
  'bgm_singer_id': {'vocab_size': 16603,
   'embedding_dim': 14,
   'dtype': 'int64'}},
 'sequence': {'manual_tag_list': {'vocab_size': 12, 'embedding_dim': 4},
  'manual_keyword_list': {'vocab_size': 19, 'embedding_dim': 4}}}

In [5]:
# Preview the first rows of the training frame.
train.head()

Unnamed: 0,videoplayseconds,userid,feedid,authorid,bgm_song_id,bgm_singer_id,manual_tag_list,manual_keyword_list,read_comment,like,click_avatar,forward
1493914,1.503301,4095,6889,3254,749,666,"[44, 32, 9, 2, 0, 0, 0, 0, 0, 0, 0]","[1715, 1100, 707, 779, 0, 0, 0, 0, 0, 0, 0, 0,...",0.0,0.0,0.0,0.0
3165952,1.439569,8673,49911,4996,20,20,"[208, 90, 9, 2, 0, 0, 0, 0, 0, 0, 0]","[1176, 13155, 906, 13156, 0, 0, 0, 0, 0, 0, 0,...",0.0,0.0,0.0,0.0
6882393,1.63137,18789,11461,1828,2,2,"[182, 72, 9, 2, 0, 0, 0, 0, 0, 0, 0]","[183, 4224, 4225, 4226, 0, 0, 0, 0, 0, 0, 0, 0...",0.0,0.0,0.0,0.0
1659385,1.327761,4533,11871,2576,3734,2978,"[5, 2, 3, 3, 0, 0, 0, 0, 0, 0, 0]","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...",0.0,0.0,0.0,0.0
3597267,1.248441,9815,20583,6880,6356,4961,"[5, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0]","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...",0.0,0.0,0.0,0.0


In [6]:
# List the column names of the training frame.
train.columns

Index(['videoplayseconds', 'userid', 'feedid', 'authorid', 'bgm_song_id',
       'bgm_singer_id', 'manual_tag_list', 'manual_keyword_list',
       'read_comment', 'like', 'click_avatar', 'forward'],
      dtype='object')

In [7]:
# Label columns (one per task) and the feature columns grouped by kind.
target = ["read_comment", "like", "click_avatar", "forward"]
sparse_features = ['userid', 'feedid', 'authorid', 'bgm_song_id', 'bgm_singer_id']
varlen_features = ['manual_tag_list','manual_keyword_list']
dense_features = ['videoplayseconds']

# Build the input feature list: sparse, then variable-length, then dense.
# Deriving the per-feature sizes from the encoder is currently disabled:
# sparse_max_len = {f:len(encoder[f]) + 1 for f in sparse_features}
# varlens_max_len = {f:len(encoder[f]) + 1 for f in varlen_features}
feature_names = [*sparse_features, *varlen_features, *dense_features]


def _as_model_input(frame):
    """Map each feature name to its column; variable-length columns are stacked into one array."""
    return {
        name: np.stack(frame[name]) if name in varlen_features else frame[name]
        for name in feature_names
    }


def _as_label_list(frame):
    """Return one label array per task, in the order given by `target`."""
    return [frame[task].values for task in target]


# Model inputs: dict of feature name -> values, one dict per split.
train_model_input = _as_model_input(train)
val_model_input = _as_model_input(val)
test_model_input = _as_model_input(test)

# Labels for the splits that are used during fitting.
train_labels = _as_label_list(train)
val_labels = _as_label_list(val)



In [8]:
# Re-bind the feature groups to their entries in the loaded configuration.
# From here on `sparse_features` and `sequence_features` are dicts keyed by
# feature name (they are iterated with .items() further down).
sparse_features, sequence_features, dense_features = (
    embedding_feat_dict[group] for group in ('sparse', 'sequence', 'dense')
)

# Task names; used to name the model's output layers.
task_names = ["read_comment", "like", "click_avatar", "forward"]


In [9]:
def build_inputs():
    """Create one named Keras Input per configured feature.

    Reads the module-level ``sparse_features``, ``sequence_features`` and
    ``dense_features`` collections.

    Returns:
        dict mapping feature name -> ``tf.keras.Input``. Sparse features are
        int32 with shape ``()``, sequence features are int32 with shape
        ``(None,)`` and dense features are float32 with shape ``(1,)``.
    """
    # (name, per-sample shape, dtype) for every feature, in the order
    # sparse -> sequence -> dense.
    specs = (
        [(feat, (), tf.int32) for feat in sparse_features]
        + [(feat, (None,), tf.int32) for feat in sequence_features]
        + [(feat, (1,), tf.float32) for feat in dense_features]
    )
    return {
        feat: tf.keras.Input(shape=shape, name=feat, dtype=dtype)
        for feat, shape, dtype in specs
    }

In [10]:
def build_embeddings(inputs):
    """Create one embedding tensor per sparse and per sequence feature.

    Reads the module-level ``sparse_features`` and ``sequence_features`` dicts
    (feature name -> config with ``vocab_size`` and ``embedding_dim``).

    Args:
        inputs: dict mapping feature name -> Keras input tensor, e.g. the
            result of ``build_inputs()``.

    Returns:
        list of tensors, one per feature: first the sparse features, then the
        sequence features (average-pooled over the sequence axis).
    """
    embed_list = []

    # Sparse features: hash each id into [0, vocab_size) and look it up.
    for feat, conf in sparse_features.items():
        x = tf.keras.layers.Hashing(num_bins=conf["vocab_size"])(inputs[feat])
        embed = tf.keras.layers.Embedding(input_dim=conf["vocab_size"], output_dim=conf["embedding_dim"])(x)
        embed_list.append(embed)

    # Sequence features
    for feat, conf in sequence_features.items():
        # The configured vocab_size for sequence features (12 / 19) is far
        # smaller than ids that actually occur in the data (e.g. 208, 13155),
        # so feeding raw ids to the Embedding indexes past the table. Bucket
        # the ids exactly like the sparse features; mask_value=0 keeps the
        # padding id 0 mapped to index 0.
        # NOTE(review): the real fix is a correct vocab_size in the config;
        # with so few bins many distinct ids share one embedding row.
        bucketed = tf.keras.layers.Hashing(num_bins=conf["vocab_size"], mask_value=0)(inputs[feat])
        seq_embed = tf.keras.layers.Embedding(input_dim=conf["vocab_size"], output_dim=conf["embedding_dim"])(bucketed)
        # Mean over the sequence axis (padding positions are included).
        pooled = tf.keras.layers.GlobalAveragePooling1D()(seq_embed)
        embed_list.append(pooled)

    return embed_list

In [11]:
class MMoE(layers.Layer):
    """Multi-gate mixture-of-experts layer.

    A shared pool of ``num_experts`` ReLU dense experts is combined separately
    for each task by a per-task softmax gate over the experts.

    Args:
        units: output width of every expert.
        num_experts: number of shared experts.
        num_tasks: number of gates, i.e. number of tensors returned by ``call``.
        **kwargs: standard Keras ``Layer`` arguments (``name``, ``dtype``, ...).
    """

    def __init__(self, units, num_experts, num_tasks, **kwargs):
        # Forward name/dtype/etc. to the base class so callers can name the layer.
        super().__init__(**kwargs)
        self.units = units
        self.num_experts = num_experts
        self.num_tasks = num_tasks
        self.experts = [layers.Dense(units, activation='relu') for _ in range(num_experts)]
        self.gates = [layers.Dense(num_experts, activation='softmax') for _ in range(num_tasks)]

    def call(self, inputs):
        """Return a list with one gated expert mixture per task.

        Assumes ``inputs`` is 2-D ``(batch, features)``, which is what the
        stacking and reduction axes below rely on.
        """
        # Every expert sees the same input: list of (batch, units) tensors.
        expert_outputs = [expert(inputs) for expert in self.experts]
        expert_stack = tf.stack(expert_outputs, axis=1)  # (batch, num_experts, units)

        task_outputs = []
        for gate in self.gates:
            gate_weights = gate(inputs)  # (batch, num_experts)
            gate_weights = tf.expand_dims(gate_weights, axis=-1)  # (batch, num_experts, 1)
            # Weighted sum over the expert axis.
            gated_output = tf.reduce_sum(expert_stack * gate_weights, axis=1)  # (batch, units)
            task_outputs.append(gated_output)
        return task_outputs  # list of (batch, units), one per task

    def get_config(self):
        """Return the constructor arguments so the layer can be serialized and re-created."""
        config = super().get_config()
        config.update(
            {
                "units": self.units,
                "num_experts": self.num_experts,
                "num_tasks": self.num_tasks,
            }
        )
        return config


In [12]:
def build_mmoe_model(units=64, num_experts=8, tower_units=32):
    """Assemble the multi-task model: inputs -> embeddings -> MMoE -> per-task towers.

    Reads the module-level ``dense_features`` and ``task_names``.

    Args:
        units: output width of each MMoE expert.
        num_experts: number of shared experts.
        tower_units: width of the hidden layer in each task tower.

    Returns:
        An uncompiled Keras ``Model`` whose inputs are the dict from
        ``build_inputs()`` and whose outputs are one sigmoid unit per entry of
        ``task_names``; each output layer is named after its task.
    """
    inputs = build_inputs()
    embed_list = build_embeddings(inputs)

    # Dense features are fed through unchanged next to the embeddings.
    embed_list.extend(inputs[feat] for feat in dense_features)

    concat_embed = layers.Concatenate()(embed_list)

    # One gate per task: derive the count from task_names so the gates and the
    # named output heads cannot get out of sync.
    mmoe_outputs = MMoE(units=units, num_experts=num_experts, num_tasks=len(task_names))(concat_embed)

    # Task-specific towers
    task_outputs = []
    for task_name, task_output in zip(task_names, mmoe_outputs):
        hidden = layers.Dense(tower_units, activation='relu')(task_output)
        # Sigmoid output, i.e. a probability rather than a raw logit.
        prob = layers.Dense(1, activation='sigmoid', name=task_name)(hidden)
        task_outputs.append(prob)

    return Model(inputs=inputs, outputs=task_outputs)


In [13]:
# Build the network and configure training; a single optimizer / loss / metric
# specification is given for all outputs.
model = build_mmoe_model()
model.compile(optimizer='adam', loss='binary_crossentropy', metrics=['AUC'])

# Print the layer-by-layer summary.
model.summary()


In [14]:
def preprocess_data(df, sparse_cols=None, dense_cols=None, sequence_cols=None):
    """Convert a feature frame into the dict of arrays fed to the model.

    Args:
        df: DataFrame holding one column per feature.
        sparse_cols: names of id columns; defaults to the module-level
            ``sparse_features``.
        dense_cols: names of numeric columns; defaults to the module-level
            ``dense_features``.
        sequence_cols: names of columns whose cells are equal-length id
            sequences; defaults to the module-level ``varlen_features``.

    Returns:
        dict mapping feature name -> numpy array: int64 for sparse columns,
        float32 for dense columns and a 2-D int64 array (rows x sequence
        length) for sequence columns.

    Raises:
        ValueError: if the sequences of a column do not all have the same
            length (they cannot be stacked into a rectangular array).
    """
    if sparse_cols is None:
        sparse_cols = sparse_features
    if dense_cols is None:
        dense_cols = dense_features
    if sequence_cols is None:
        sequence_cols = varlen_features

    model_input = {}
    for feat in sparse_cols:
        model_input[feat] = df[feat].values.astype('int64')
    for feat in dense_cols:
        model_input[feat] = df[feat].values.astype('float32')
    for feat in sequence_cols:
        # Stack the per-row sequences into one rectangular array; a nested
        # Python list is not a valid array input for model.fit.
        model_input[feat] = np.asarray(df[feat].tolist(), dtype='int64')
    return model_input

In [15]:
# Convert the training frame into the dict of per-feature inputs passed to fit.
model_input=preprocess_data(train)

In [16]:
# One float32 label array per task, keyed by the task's column name.
label_dict = dict(
    (name, train[name].values.astype('float32')) for name in target
)

In [None]:

# Echo the aggregate loss at the end of every epoch.
print_callback = LambdaCallback(
    on_epoch_end=lambda epoch, logs: print(f"Epoch {epoch + 1}: loss = {logs['loss']:.4f}")
)

# Validate on the dedicated `val` split loaded at the top of the notebook.
# validation_split=0.1 ignored it and instead held out the last 10% of the
# training rows, shrinking the training set.
val_input = preprocess_data(val)
val_label_dict = {name: val[name].values.astype('float32') for name in target}

# Keep the History object so the per-epoch metrics can be inspected afterwards.
history = model.fit(
    model_input,
    label_dict,
    batch_size=256,
    epochs=10,
    validation_data=(val_input, val_label_dict),
    verbose=1,
    callbacks=[print_callback],
)