In [2]:
# -*- coding:utf-8 -*-
"""
Author:
    zanshuxun, zanshuxun@aliyun.com
    songwei, magic_24k@163.com

Reference:
    [1] [Jiaqi Ma, Zhe Zhao, Xinyang Yi, et al. Modeling Task Relationships in Multi-task Learning with Multi-gate Mixture-of-Experts[C]](https://dl.acm.org/doi/10.1145/3219819.3220007)
"""
import torch
import torch.nn as nn

from deepctr_torch.models.basemodel import BaseModel
from deepctr_torch.inputs import combined_dnn_input
from deepctr_torch.layers import DNN, PredictionLayer, CIN
import pandas as pd

class MMOELayer(nn.Module):
    """
    The Multi-gate Mixture-of-Experts layer in MMOE model
      Input shape
        - 2D tensor with shape: ``(batch_size, input_dim)``.

      Output shape
        - A list with **num_tasks** elements, each a 2D tensor with shape: ``(batch_size, output_dim)``.
          The shape is 2D even when ``batch_size`` or ``output_dim`` equals 1.

      Arguments
        - **input_dim** : Positive integer, dimensionality of input features.
        - **num_tasks**: integer, the number of tasks, equal to the number of outputs.
        - **num_experts**: integer, the number of experts.
        - **output_dim**: integer, the dimension of each output of MMOELayer.

    References
      - [Jiaqi Ma, Zhe Zhao, Xinyang Yi, et al. Modeling Task Relationships in Multi-task Learning with Multi-gate Mixture-of-Experts[C]](https://dl.acm.org/doi/10.1145/3219819.3220007)
    """

    def __init__(self, input_dim, num_tasks, num_experts, output_dim):
        super(MMOELayer, self).__init__()
        self.input_dim = input_dim
        self.num_experts = num_experts
        self.num_tasks = num_tasks
        self.output_dim = output_dim
        # All experts share one linear map; its flat output is split per expert in forward().
        self.expert_network = nn.Linear(self.input_dim, self.num_experts * self.output_dim, bias=True)
        # One bias-free gate per task, producing a score for every expert.
        self.gating_networks = nn.ModuleList(
            [nn.Linear(self.input_dim, self.num_experts, bias=False) for _ in range(self.num_tasks)])
        # Re-initialise every linear weight with nn.init.normal_ (its default parameters).
        for m in self.modules():
            if isinstance(m, nn.Linear):
                nn.init.normal_(m.weight)

    def forward(self, inputs):
        """Mix the expert outputs once per task.

        :param inputs: 2D tensor of shape ``(batch_size, input_dim)``.
        :return: list of ``num_tasks`` tensors, each ``(batch_size, output_dim)``.
        """
        # View the flat expert output as (batch_size, output_dim, num_experts).
        expert_out = self.expert_network(inputs)
        expert_out = expert_out.reshape([-1, self.output_dim, self.num_experts])

        outputs = []
        for gate in self.gating_networks:
            # Softmax over experts, then a trailing axis so bmm sees (batch_size, num_experts, 1).
            gate_out = gate(inputs).softmax(1).unsqueeze(-1)
            # Drop only the trailing bmm axis. A bare squeeze() would also remove the
            # batch axis for a single sample, or the feature axis when output_dim == 1.
            outputs.append(torch.bmm(expert_out, gate_out).squeeze(-1))
        return outputs


class MMOE(BaseModel):
    """Instantiates the Multi-gate Mixture-of-Experts architecture with an optional CIN branch.

    The shared DNN feeds an ``MMOELayer``; each task then has an optional task-specific DNN and a
    linear tower. When the CIN branch is enabled, a single CIN logit computed from the sparse
    embeddings is added to every task logit before the prediction layer.

    :param dnn_feature_columns: An iterable containing all the features used by deep part of the model.
    :param num_tasks: integer, number of tasks, equal to number of outputs, must be greater than 1.
    :param tasks: list of str, indicating the loss of each tasks, ``"binary"`` for  binary logloss, ``"regression"`` for regression loss. e.g. ['binary', 'regression']
    :param num_experts: integer, number of experts.
    :param expert_dim: integer, the hidden units of each expert.
    :param dnn_hidden_units: non-empty list/tuple of positive integers, the layer number and units in each layer of shared-bottom DNN (its last entry is the MMOE layer input size)
    :param l2_reg_embedding: float. L2 regularizer strength applied to embedding vector
    :param l2_reg_dnn: float. L2 regularizer strength applied to the shared DNN weights
    :param init_std: float,to use as the initialize std of embedding vector
    :param task_dnn_units: list or tuple of positive integers, empty, or None; the layer number and units in each layer of task-specific DNN. A final layer of ``expert_dim`` units is always appended. ``None`` disables the task-specific DNNs.
    :param seed: integer ,to use as random seed.
    :param dnn_dropout: float in [0,1), the probability we will drop out a given DNN coordinate.
    :param dnn_activation: Activation function to use in DNN
    :param dnn_use_bn: bool. Whether use BatchNormalization before activation or not in DNN
    :param device: str, ``"cpu"`` or ``"cuda:0"``
    :param gpus: accepted for call compatibility only; this class does not use it.
    :param cin_layer_size: list/tuple of positive integers, feature maps in each CIN layer. An empty value disables the CIN branch.
    :param cin_split_half: bool, passed to CIN; also determines the width of the CIN output fed to the CIN linear layer.
    :param cin_activation: activation function name passed to CIN.
    :param l2_reg_cin: float. L2 regularizer strength applied to CIN weights.

    :return: A PyTorch model instance.
    """

    def __init__(self, dnn_feature_columns, num_tasks, tasks, num_experts=4, expert_dim=8, dnn_hidden_units=(128, 128),
                 l2_reg_embedding=1e-5, l2_reg_dnn=0, init_std=0.0001, task_dnn_units=None, seed=1024, dnn_dropout=0,
                 dnn_activation='relu', dnn_use_bn=False, device='cpu', gpus=None,
                 cin_layer_size=(256, 128, 64,), cin_split_half=True, cin_activation='relu', l2_reg_cin=0):

        super(MMOE, self).__init__(linear_feature_columns=[], dnn_feature_columns=dnn_feature_columns,
                                   l2_reg_embedding=l2_reg_embedding, seed=seed, device=device)
        if num_tasks <= 1:
            raise ValueError("num_tasks must be greater than 1")
        if len(tasks) != num_tasks:
            raise ValueError("num_tasks must be equal to the length of tasks")
        for task in tasks:
            if task not in ['binary', 'regression']:
                raise ValueError("task must be binary or regression, {} is illegal".format(task))

        self.tasks = tasks
        self.task_dnn_units = task_dnn_units
        self.dnn = DNN(self.compute_input_dim(dnn_feature_columns), dnn_hidden_units,
                       activation=dnn_activation, l2_reg=l2_reg_dnn, dropout_rate=dnn_dropout, use_bn=dnn_use_bn,
                       init_std=init_std, device=device)
        # Register the shared DNN weights so l2_reg_dnn actually contributes to the loss.
        self.add_regularization_weight(
            filter(lambda x: 'weight' in x[0] and 'bn' not in x[0], self.dnn.named_parameters()), l2=l2_reg_dnn)
        self.mmoe_layer = MMOELayer(dnn_hidden_units[-1], num_tasks, num_experts, expert_dim)
        if task_dnn_units is not None:
            # The last layer of each task DNN must be expert_dim so the tower input size is unchanged.
            # tuple(...) lets callers pass a list as well as a tuple.
            task_hidden_units = tuple(task_dnn_units) + (expert_dim,)
            self.task_dnn = nn.ModuleList([DNN(expert_dim, task_hidden_units) for _ in range(num_tasks)])
        self.tower_network = nn.ModuleList([nn.Linear(expert_dim, 1, bias=False) for _ in range(num_tasks)])
        self.out = nn.ModuleList([PredictionLayer(task) for task in self.tasks])
        self.to(device)

        # Optional CIN branch built on the embedded fields.
        self.cin_layer_size = cin_layer_size
        self.use_cin = len(self.cin_layer_size) > 0 and len(dnn_feature_columns) > 0
        if self.use_cin:
            field_num = len(self.embedding_dict)
            if cin_split_half:
                # With split_half every layer except the last contributes half of its feature maps.
                self.featuremap_num = sum(
                    cin_layer_size[:-1]) // 2 + cin_layer_size[-1]
            else:
                self.featuremap_num = sum(cin_layer_size)
            self.cin = CIN(field_num, cin_layer_size,
                           cin_activation, cin_split_half, l2_reg_cin, seed, device=device)
            self.cin_linear = nn.Linear(self.featuremap_num, 1, bias=False).to(device)
            self.add_regularization_weight(filter(lambda x: 'weight' in x[0], self.cin.named_parameters()),
                                           l2=l2_reg_cin)

    def forward(self, X):
        """Compute the per-task predictions for a batch.

        :param X: input batch, forwarded to ``input_from_feature_columns``.
        :return: tensor with the task outputs concatenated along the last dimension.
        """
        sparse_embedding_list, dense_value_list = self.input_from_feature_columns(X, self.dnn_feature_columns,
                                                                                  self.embedding_dict)
        # CIN branch: one logit per sample, shared by all tasks. Left as None when the branch is off.
        cin_logit = None
        if self.use_cin:
            cin_input = torch.cat(sparse_embedding_list, dim=1)
            cin_output = self.cin(cin_input)
            cin_logit = self.cin_linear(cin_output)

        # Shared DNN followed by the multi-gate mixture of experts.
        dnn_input = combined_dnn_input(sparse_embedding_list, dense_value_list)
        dnn_output = self.dnn(dnn_input)
        mmoe_outs = self.mmoe_layer(dnn_output)
        if self.task_dnn_units is not None:
            mmoe_outs = [self.task_dnn[i](mmoe_out) for i, mmoe_out in enumerate(mmoe_outs)]

        task_outputs = []
        for i, mmoe_out in enumerate(mmoe_outs):
            logit = self.tower_network[i](mmoe_out)
            if cin_logit is not None:
                # Add the CIN logit to the task tower logit.
                logit = logit + cin_logit
            task_outputs.append(self.out[i](logit))

        return torch.cat(task_outputs, -1)


In [3]:
import os
import torch
import pandas as pd
import numpy as np
import os
# os.environ["CUDA_VISIBLE_DEVICES"] = '0'
import sys
BASE_DIR = '.'
sys.path.append(os.path.join(BASE_DIR, '../../config'))
sys.path.append(os.path.join(BASE_DIR, '../model'))
sys.path.append(os.path.join(BASE_DIR, '../utils'))
from config import *
from time import time
from deepctr_torch.inputs import SparseFeat, DenseFeat, get_feature_names
from sklearn.preprocessing import MinMaxScaler
import datatable as dt
from mmoe import MMOE
from evaluation import evaluate_deepctr
import pickle

# Training configuration
ONLINE_FLAG = False  # whether we are preparing an online submission

# Select the GPU; fall back to CPU when CUDA is not available
os.environ["CUDA_VISIBLE_DEVICES"] = "0"
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
# device = 'cpu'
epochs = 3
batch_size = 1024
embedding_dim = 10

# Label columns and the input feature columns
target = ['read_comment', 'like', 'click_avatar', 'forward', 'comment', 'favorite', 'follow']
# tagids = ['manual_tag_' + str(tagid) for tagid in range(11)] # 11
# keyids = ['manual_key_' + str(keyid) for keyid in range(18)] # 18
sparse_features = ['userid', 'feedid', 'authorid', 'bgm_song_id', 'bgm_singer_id']
dense_features = ['videoplayseconds', ]

# Load the user-action log and the feed metadata as pandas frames
data = dt.fread(USER_ACTION).to_pandas()
feed = dt.fread(FEED_INFO).to_pandas()
# tag = dt.fread(FEATURE_PATH + '/feed_info_tags_keys_des.csv')
# tag = tag.to_pandas()[tagids + keyids + ['feedid']]
# feed_emb = np.load(FEATURE_PATH + '/feed_embeddings_PCA64.npy')
# feed_emb = torch.from_numpy(feed_emb).float().to(device)
# pkl = open(FEATURE_PATH + '/feedid_map.pkl', 'rb')
# feedid_map = pickle.load(pkl)
# pkl.close()

# Shift the bgm ids by one so that 0 can stand for "unknown", then fill the gaps with 0
bgm_id_columns = ["bgm_song_id", "bgm_singer_id"]
feed_fill_columns = ["bgm_song_id", "bgm_singer_id", "videoplayseconds"]
feed[bgm_id_columns] += 1
feed[feed_fill_columns] = feed[feed_fill_columns].fillna(0)
for bgm_column in bgm_id_columns:
    feed[bgm_column] = feed[bgm_column].astype('int64')

# Attach the feed attributes to every action row
feed_join_columns = ['feedid', 'authorid', 'videoplayseconds', 'bgm_song_id', 'bgm_singer_id']
data = pd.merge(data, feed[feed_join_columns], how='left', on='feedid')
# data['feedid'] = feedid_map.transform(data['feedid'])
# data = data.merge(tag, how='left', on='feedid')
# test = dt.fread(TEST_FILE)
# test = test.to_pandas()
# test = test.merge(feed[['feedid', 'authorid', 'videoplayseconds', 'bgm_song_id', 'bgm_singer_id']], how='left',
#                   on='feedid')

# Fill missing dense values with 0 and scale them into [0, 1]
data[dense_features] = data[dense_features].fillna(0)
# test[dense_features] = test[dense_features].fillna(0, )
mms = MinMaxScaler(feature_range=(0, 1))
data[dense_features] = mms.fit_transform(data[dense_features])
# test[dense_features] = mms.fit_transform(test[dense_features])

print('data.shape', data.shape)
print('data.columns', data.columns.tolist())
print('unique date_: ', data['date_'].unique())

# Online runs train on every day; offline runs hold out day 14
train = data if ONLINE_FLAG else data[data['date_'] < 14]
val = data[data['date_'] == 14]  # day-14 samples form the validation set, used when ONLINE_FLAG=False


Please check the latest version manually on https://pypi.org/project/deepctr-torch/#history
data.shape (73175511, 17)
data.columns ['userid', 'feedid', 'date_', 'device', 'read_comment', 'comment', 'like', 'play', 'stay', 'click_avatar', 'forward', 'follow', 'favorite', 'authorid', 'videoplayseconds', 'bgm_song_id', 'bgm_singer_id']
unique date_:  [ 2  3  4  5  7  8  9 10 11  6 14  1 12 13]


In [4]:
# Feature columns: one embedding per sparse id feature plus one scalar per dense feature.
fixlen_feature_columns = [SparseFeat(feat, vocabulary_size=data[feat].max() + 1, embedding_dim=embedding_dim)
                          for feat in sparse_features] + [DenseFeat(feat, 1) for feat in dense_features]
# fixlen_feature_columns += [SparseFeat(feat, vocabulary_size=350 + 1, embedding_dim=embedding_dim)
#                           for feat in tagids]
# fixlen_feature_columns += [SparseFeat(feat, vocabulary_size=23262 + 1, embedding_dim=embedding_dim)
#                           for feat in keyids]
# fixlen_feature_columns += [SparseFeat('feed_embedding', 112871+1, embedding_dim=64)]

dnn_feature_columns = fixlen_feature_columns
feature_names = get_feature_names(dnn_feature_columns)

# 'feed_embedding' is skipped here because it is not a column of the data frames
# (the commented lines below fed it from 'feedid').
train_model_input = {name: train[name] for name in feature_names if name not in ['feed_embedding']}
val_model_input = {name: val[name] for name in feature_names if name not in ['feed_embedding']}
# train_model_input['feed_embedding'] = train_model_input['feedid']
# val_model_input['feed_embedding'] = val_model_input['feedid']

userid_list = val['userid'].astype(str).tolist()
# test_model_input = {name: test[name] for name in feature_names}

train_labels = train[target].values
val_labels = [val[y].values for y in target]

# 4.Define Model,train,predict and evaluate
# The task count and task types are derived from `target` so they cannot drift from the label columns.
num_tasks = len(target)
train_model = MMOE(dnn_feature_columns, num_tasks=num_tasks, num_experts=13, expert_dim=256,
                   dnn_hidden_units=(128, 128, 128, 64),
                   task_dnn_units=(128, 128, 64),
#                    dnn_activation='dice',
                   tasks=['binary'] * num_tasks, device=device)
# train_model.embedding_dict['feed_embedding'] = nn.Embedding.from_pretrained(feed_emb)

train_model.compile("adagrad", loss='binary_crossentropy')
# print(train_model.summary())
for epoch in range(epochs):
    # Train one epoch at a time so the validation metric can be reported after each epoch.
    history = train_model.fit(train_model_input, train_labels,
                              batch_size=batch_size, epochs=1, verbose=1)
    if not ONLINE_FLAG:
        val_pred_ans = train_model.predict(val_model_input, batch_size=batch_size * 4)
        # predict() returns one column per task, i.e. (num_samples, num_tasks), which differs
        # from the TF version of MMOE; transpose() yields one row per task for the evaluator.
        evaluate_deepctr(val_labels, val_pred_ans.transpose(), userid_list, target)

# t1 = time()
# pred_ans = train_model.predict(test_model_input, batch_size=batch_size * 20)
# pred_ans = pred_ans.transpose()
# t2 = time()
# print('7个目标行为%d条样本预测耗时（毫秒）：%.3f' % (len(test), (t2 - t1) * 1000.0))
# ts = (t2 - t1) * 1000.0* 2000.0 / (len(test)*7.0) 
# print('7个目标行为2000条样本平均预测耗时（毫秒）：%.3f' % ts)


cuda
Train on 67071556 samples, validate on 0 samples, 65500 steps per epoch


65500it [57:37, 18.94it/s]


Epoch 1/1
3457s - loss:  0.2591
{'read_comment': 0.6281239123358137, 'like': 0.6222426049697215, 'click_avatar': 0.71415674846787, 'forward': 0.6826159944676095, 'comment': 0.5709973119489229, 'favorite': 0.7225344655460522, 'follow': 0.6890881204851775}
Weighted uAUC:  0.651752
cuda
Train on 67071556 samples, validate on 0 samples, 65500 steps per epoch


65500it [57:40, 18.93it/s]


Epoch 1/1
3460s - loss:  0.2405
{'read_comment': 0.6344485807956676, 'like': 0.6234827659183279, 'click_avatar': 0.7157806271524819, 'forward': 0.6913918767484204, 'comment': 0.5799734092727782, 'favorite': 0.7289536672410949, 'follow': 0.6845573121278574}
Weighted uAUC:  0.655745
cuda
Train on 67071556 samples, validate on 0 samples, 65500 steps per epoch


65500it [56:58, 19.16it/s]


Epoch 1/1
3418s - loss:  0.2362
{'read_comment': 0.6366060823656043, 'like': 0.6231750422031819, 'click_avatar': 0.7181202166078691, 'forward': 0.6947179428982619, 'comment': 0.5791335498635278, 'favorite': 0.7265931063021266, 'follow': 0.6837912814482804}
Weighted uAUC:  0.656648
