In [38]:
# -*- coding:utf-8 -*-
"""
Author:
    zanshuxun, zanshuxun@aliyun.com
    songwei, magic_24k@163.com

Reference:
    [1] [Jiaqi Ma, Zhe Zhao, Xinyang Yi, et al. Modeling Task Relationships in Multi-task Learning with Multi-gate Mixture-of-Experts[C]](https://dl.acm.org/doi/10.1145/3219819.3220007)
"""
import torch
import torch.nn as nn

from deepctr_torch.models.basemodel import BaseModel
from deepctr_torch.inputs import combined_dnn_input, embedding_lookup, maxlen_lookup
from deepctr_torch.layers import DNN, PredictionLayer, CIN, concat_fun, InteractingLayer
from deepctr_torch.layers.sequence import AttentionSequencePoolingLayer
import pandas as pd

class MMOELayer(nn.Module):
    """
    The Multi-gate Mixture-of-Experts layer in MMOE model
      Input shape
        - 2D tensor with shape: ``(batch_size,units)``.

      Output shape
        - A list with **num_tasks** elements, which is a 2D tensor with shape: ``(batch_size, output_dim)`` .

      Arguments
        - **input_dim** : Positive integer, dimensionality of input features.
        - **num_tasks**: integer, the number of tasks, equal to the number of outputs.
        - **num_experts**: integer, the number of experts.
        - **output_dim**: integer, the dimension of each output of MMOELayer.

    References
      - [Jiaqi Ma, Zhe Zhao, Xinyang Yi, et al. Modeling Task Relationships in Multi-task Learning with Multi-gate Mixture-of-Experts[C]](https://dl.acm.org/doi/10.1145/3219819.3220007)
    """

    def __init__(self, input_dim, num_tasks, num_experts, output_dim):
        super(MMOELayer, self).__init__()
        self.input_dim = input_dim
        self.num_experts = num_experts
        self.num_tasks = num_tasks
        self.output_dim = output_dim
        self.expert_network = nn.Linear(self.input_dim, self.num_experts * self.output_dim, bias=True)
        self.gating_networks = nn.ModuleList(
            [nn.Linear(self.input_dim, self.num_experts, bias=False) for _ in range(self.num_tasks)])
        # initial model
        for m in self.modules():
            if isinstance(m, nn.Linear):
                nn.init.normal_(m.weight)

    def forward(self, inputs):
        outputs = []
        expert_out = self.expert_network(inputs)
        expert_out = expert_out.reshape([-1, self.output_dim, self.num_experts])
        for i in range(self.num_tasks):
            gate_out = self.gating_networks[i](inputs)
            gate_out = gate_out.softmax(1).unsqueeze(-1)
            output = torch.bmm(expert_out, gate_out).squeeze()
            outputs.append(output)
        return outputs


class MMOE(BaseModel):
    """Instantiates the Multi-gate Mixture-of-Experts architecture.

    :param dnn_feature_columns: An iterable containing all the features used by deep part of the model.
    :param num_tasks: integer, number of tasks, equal to number of outputs, must be greater than 1.
    :param tasks: list of str, indicating the loss of each tasks, ``"binary"`` for  binary logloss, ``"regression"`` for regression loss. e.g. ['binary', 'regression']
    :param num_experts: integer, number of experts.
    :param expert_dim: integer, the hidden units of each expert.
    :param dnn_hidden_units: list,list of positive integer or empty list, the layer number and units in each layer of shared-bottom DNN
    :param l2_reg_embedding: float. L2 regularizer strength applied to embedding vector
    :param l2_reg_dnn: float. L2 regularizer strength applied to DNN
    :param init_std: float,to use as the initialize std of embedding vector
    :param task_dnn_units: list,list of positive integer or empty list, the layer number and units in each layer of task-specific DNN
    :param seed: integer ,to use as random seed.
    :param dnn_dropout: float in [0,1), the probability we will drop out a given DNN coordinate.
    :param dnn_activation: Activation function to use in DNN
    :param dnn_use_bn: bool. Whether use BatchNormalization before activation or not in DNN
    :param device: str, ``"cpu"`` or ``"cuda:0"``

    :return: A PyTorch model instance.
    """

    def __init__(self, dnn_feature_columns, history_feature_list, num_tasks, tasks, num_experts=4, expert_dim=8, dnn_hidden_units=(128, 128),
                 l2_reg_embedding=1e-5, l2_reg_dnn=0, init_std=0.0001, task_dnn_units=None, seed=1024, dnn_dropout=0,
                 dnn_activation='relu', dnn_use_bn=False, device='cpu', gpus=[0, 1]):
        
        super(MMOE, self).__init__(linear_feature_columns=[], dnn_feature_columns=dnn_feature_columns,
                                   l2_reg_embedding=l2_reg_embedding, seed=seed, device=device)
        if num_tasks <= 1:
            raise ValueError("num_tasks must be greater than 1")
        if len(tasks) != num_tasks:
            raise ValueError("num_tasks must be equal to the length of tasks")
        for task in tasks:
            if task not in ['binary', 'regression']:
                raise ValueError("task must be binary or regression, {} is illegal".format(task))
        self.sparse_feature_columns = list(
            filter(lambda x: isinstance(x, SparseFeat), dnn_feature_columns)) if dnn_feature_columns else []
        self.varlen_sparse_feature_columns = list(
            filter(lambda x: isinstance(x, VarLenSparseFeat), dnn_feature_columns)) if dnn_feature_columns else []
        
        # atten tag key
        self.history_feature_list = history_feature_list
        self.history_feature_columns = []
        self.sparse_varlen_feature_columns = []
        self.history_fc_names = list(map(lambda x: "hist_" + x, history_feature_list))
        for fc in self.varlen_sparse_feature_columns:
            feature_name = fc.name
            if feature_name in self.history_fc_names:
                self.history_feature_columns.append(fc)
            else:
                self.sparse_varlen_feature_columns.append(fc)
        # din component
#         att_emb_dim = self._compute_interest_dim()
#         print(att_emb_dim)
#         att_activation='Dice'
#         att_weight_normalization=False
#         att_hidden_size=(128, 128, 64)
#         self.attention = AttentionSequencePoolingLayer(att_hidden_units=att_hidden_size,
#                                                        embedding_dim=att_emb_dim,
#                                                        att_activation=att_activation,
#                                                        return_score=False,
#                                                        supports_masking=False,
#                                                        weight_normalization=att_weight_normalization)
        
        self.din_linear = nn.Linear(40, 1, bias=False).to(device)
#         self.din_out = PredictionLayer('binary')
        # MMOE
        self.tasks = tasks
        self.task_dnn_units = task_dnn_units
        self.dnn = DNN(self.compute_input_dim(dnn_feature_columns), dnn_hidden_units,
                       activation=dnn_activation, l2_reg=l2_reg_dnn, dropout_rate=dnn_dropout, use_bn=dnn_use_bn,
                       init_std=init_std, device=device)
#         self.before_mmoe = DNN(dnn_hidden_units[-1]+600+1200+12600-10800-1575, dnn_hidden_units,
#                        activation=dnn_activation, l2_reg=l2_reg_dnn, dropout_rate=dnn_dropout, use_bn=dnn_use_bn,
#                        init_std=init_std, device=device)
        self.mmoe_layer = MMOELayer(dnn_hidden_units[-1]+600+1200+12600-10800-1575, num_tasks, num_experts, expert_dim)
        if task_dnn_units is not None:
            # the last layer of task_dnn should be expert_dim
            self.task_dnn = nn.ModuleList([DNN(expert_dim, task_dnn_units+(expert_dim,)) for _ in range(num_tasks)])
        self.tower_network = nn.ModuleList([nn.Linear(expert_dim, 1, bias=False) for _ in range(num_tasks)])
        self.out = nn.ModuleList([PredictionLayer(task) for task in self.tasks])
        self.to(device)
        
        # 加入cin
        cin_layer_size=(256, 128, 64)
        self.cin_layer_size=cin_layer_size
        cin_split_half=True
        cin_activation='relu'
        l2_reg_cin=0
        self.use_cin = len(self.cin_layer_size) > 0 and len(dnn_feature_columns) > 0
        if self.use_cin:
            field_num = len(self.sparse_feature_columns)
            if cin_split_half == True:
                self.featuremap_num = sum(
                    cin_layer_size[:-1]) // 2 + cin_layer_size[-1]
            else:
                self.featuremap_num = sum(cin_layer_size)
            self.cin = CIN(field_num, cin_layer_size,
                           cin_activation, cin_split_half, l2_reg_cin, seed, device=device)
            self.cin_linear = nn.Linear(self.featuremap_num, 1, bias=False).to(device)
            self.add_regularization_weight(filter(lambda x: 'weight' in x[0], self.cin.named_parameters()),
                                           l2=l2_reg_cin)
        
        # multi-head atten
        att_embedding_size=15
        att_head_num=15
        att_layer_num=3
        att_res=True
        self.int_layers = nn.ModuleList(
            [InteractingLayer(self.embedding_size if i == 0 else att_embedding_size * att_head_num,
                              att_embedding_size, att_head_num, att_res, device=device) for i in range(att_layer_num)])
        
        print('sparse_features: ', self.sparse_feature_columns)
        print('varlen_features: ', self.varlen_sparse_feature_columns)
    
    def _compute_interest_dim(self):
        interest_dim = 0
        for feat in self.sparse_feature_columns:
            if feat.name in self.history_feature_list:
                interest_dim += feat.embedding_dim
        return interest_dim
    
    def forward(self, X):
#         print(self.embedding_dict)
        _, dense_value_list = self.input_from_feature_columns(X, self.dnn_feature_columns,
                                                                           self.embedding_dict)
        sparse_embedding_list = embedding_lookup(X, self.embedding_dict, self.feature_index, self.sparse_feature_columns,
                                              to_list=True)
        # user tag_key atten
#         query_emb_list = embedding_lookup(X, self.embedding_dict, self.feature_index, self.sparse_feature_columns,
#                                           return_feat_list=self.history_feature_list, to_list=True)
#         keys_emb_list = embedding_lookup(X, self.embedding_dict, self.feature_index, self.history_feature_columns,
#                                          return_feat_list=self.history_fc_names, to_list=True)
#         query_emb = torch.cat(query_emb_list, dim=-1)                     # [B, 1, E]
#         keys_emb = torch.cat(keys_emb_list, dim=-1)                       # [B, T, E]
#         keys_length_feature_name = [feat.length_name for feat in self.varlen_sparse_feature_columns if
#                                     feat.length_name is not None]
#         keys_length = torch.squeeze(maxlen_lookup(X, self.feature_index, keys_length_feature_name), 1)  # [B, 1]
#         hist = self.attention(query_emb, keys_emb, keys_length)           # [B, 1, E]
#         din_logit = self.din_linear(hist).squeeze(1)
        
        
        # 加入cin模块
        if self.use_cin:
            cin_input = torch.cat(sparse_embedding_list, dim=1)
            cin_output = self.cin(cin_input) # 1024, 256
            cin_logit = self.cin_linear(cin_output)
            
#         print('cin_logit: ', cin_logit)  
#         print('din_logit: ', din_logit, din_logit.shape)
        # muti-head
        att_input = concat_fun(sparse_embedding_list, axis=1)
        for layer in self.int_layers:
            att_input = layer(att_input)
        att_output = torch.flatten(att_input, start_dim=1)
        
        dnn_input = combined_dnn_input(sparse_embedding_list, dense_value_list) # 1024, 101
#         print('dnn_input: ', dnn_input.shape)
#         print('hist: ', hist.shape)
        dnn_input = torch.cat((dnn_input, hist.squeeze(1)), dim=-1)
        dnn_output = self.dnn(dnn_input)
        dnn_output = concat_fun([att_output, dnn_output])
#         dnn_output = self.before_mmoe(dnn_output)
        mmoe_outs = self.mmoe_layer(dnn_output)
        if self.task_dnn_units is not None:
            mmoe_outs = [self.task_dnn[i](mmoe_out) for i, mmoe_out in enumerate(mmoe_outs)]

        task_outputs = []
        for i, mmoe_out in enumerate(mmoe_outs):
            logit = self.tower_network[i](mmoe_out) + cin_logit + din_logit
            output = self.out[i](logit)
            task_outputs.append(output)
#         print(cin_logit.shape, din_logit.shape)
#         print(task_outputs.shape)
        task_outputs = torch.cat(task_outputs, -1)
        return task_outputs


In [39]:
import os
import torch
import pandas as pd
import numpy as np
import os
# os.environ["CUDA_VISIBLE_DEVICES"] = '0'
import sys
# BASE_DIR = os.path.dirname(os.path.abspath(__file__))
BASE_DIR = '.'
sys.path.append(os.path.join(BASE_DIR, '../../config'))
sys.path.append(os.path.join(BASE_DIR, '../model'))
sys.path.append(os.path.join(BASE_DIR, '../utils'))
from config import *
from time import time
from deepctr_torch.inputs import SparseFeat, DenseFeat, get_feature_names, VarLenSparseFeat
from sklearn.preprocessing import MinMaxScaler
import datatable as dt
# from mmoe import MMOE
from evaluation import evaluate_deepctr
import pickle
import gc

# 训练相关参数设置
ONLINE_FLAG = False  # 是否准备线上提交

# 指定GPU
os.environ["CUDA_VISIBLE_DEVICES"] = "1"
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
# device = 'cpu'

vocab_dict = {
    'bgm_song_id': 25158+1,
    'bgm_singer_id': 17499+1,
    'userid': 199999,
    'feedid': 112871+1,
    'authorid': 18788+1,
}

# if __name__ == "__main__":
epochs = 1
batch_size = 1024
embedding_dim = 20
target = ['read_comment', 'like', 'click_avatar', 'forward', 'comment', 'favorite', 'follow']
tagids = ['manual_tag_' + str(tagid) for tagid in range(11)] # 11
keyids = ['manual_key_' + str(keyid) for keyid in range(11)] # 18
sparse_features = ['userid', 'feedid', 'authorid', 'bgm_song_id', 'bgm_singer_id']
dense_features = ['videoplayseconds', ]

feed = dt.fread(FEED_INFO)
feed = feed.to_pandas()
tag = dt.fread(FEATURE_PATH + '/feed_info_tags_keys_des_seq_len.csv')
tag = tag.to_pandas()[tagids + keyids + ['feedid', 'tag_seq_len', 'key_seq_len']]

pkl = open(FEATURE_PATH + '/user_encoder.pkl', 'rb')
userid_map = pickle.load(pkl)
pkl.close()
mms = MinMaxScaler(feature_range=(0, 1))

user_emb1 = np.load(FEATURE_PATH + '/user_emb_normal.npy')
user_emb1 = torch.from_numpy(user_emb1).float().to(device)
user_emb2 = np.load(FEATURE_PATH + '/user_emb_adjust.npy')
user_emb2 = torch.from_numpy(user_emb2).float().to(device)

feed[["bgm_song_id", "bgm_singer_id"]] += 1  # 0 用于填未知
feed[["bgm_song_id", "bgm_singer_id", "videoplayseconds"]] = \
    feed[["bgm_song_id", "bgm_singer_id", "videoplayseconds"]].fillna(0)
feed['bgm_song_id'] = feed['bgm_song_id'].astype('int64')
feed['bgm_singer_id'] = feed['bgm_singer_id'].astype('int64')

if ONLINE_FLAG:
    data = pd.read_csv(USER_ACTION, iterator=True)
else:
    val = pd.read_csv(FEATURE_PATH + '/val_data.csv')
    data = pd.read_csv(FEATURE_PATH + '/train_data.csv', iterator=True)



#     fixlen_feature_columns = [SparseFeat(feat, vocabulary_size=data[feat].max() + 1, embedding_dim=embedding_dim)
#                           for feat in sparse_features] + [DenseFeat(feat, 1) for feat in dense_features]
fixlen_feature_columns = [SparseFeat(feat, vocabulary_size=vocab_dict[feat] + 1, embedding_dim=embedding_dim)
                      for feat in vocab_dict.keys()] + [DenseFeat(feat, 1) for feat in dense_features]
fixlen_feature_columns += [VarLenSparseFeat(SparseFeat('tagids', vocabulary_size=350 + 1, embedding_dim=embedding_dim), 11, length_name='tag_seq_len')]
fixlen_feature_columns += [VarLenSparseFeat(SparseFeat('keyids', vocabulary_size=23262 + 1, embedding_dim=embedding_dim), 11, length_name='key_seq_len')]
# fixlen_feature_columns += [SparseFeat('tagids', vocabulary_size=200000, embedding_dim=embedding_dim)]
# fixlen_feature_columns += [SparseFeat('keyids', vocabulary_size=200000, embedding_dim=embedding_dim)]

# fixlen_feature_columns += [SparseFeat(feat, vocabulary_size=350 + 1, embedding_dim=embedding_dim)
#                           for feat in tagids]
# fixlen_feature_columns += [SparseFeat(feat, vocabulary_size=23262 + 1, embedding_dim=embedding_dim)
#                           for feat in keyids]
# fixlen_feature_columns += [SparseFeat('feed_embedding', 112871+1, embedding_dim=64)]
fixlen_feature_columns += [SparseFeat('user_embedding_normal', 200000, embedding_dim=embedding_dim)]
fixlen_feature_columns += [SparseFeat('user_embedding_adjust', 200000, embedding_dim=embedding_dim)]

dnn_feature_columns = fixlen_feature_columns
feature_names = get_feature_names(dnn_feature_columns)
print('use features: ', feature_names)


if ONLINE_FLAG:
    # 加入test
    test = dt.fread(TEST_FILE)
    test = test.to_pandas()
    test = test.merge(feed[['feedid', 'authorid', 'videoplayseconds', 'bgm_song_id', 'bgm_singer_id']], how='left',
                      on='feedid')
    test = test.merge(tag, how='left', on='feedid')
    test[dense_features] = test[dense_features].fillna(0, )
    test[dense_features] = mms.fit_transform(test[dense_features])
    test['userid'] = userid_map.transform(test['userid'])

    test_model_input = {name: test[name] for name in feature_names if name not in ['feed_embedding', 'user_embedding_normal', 'user_embedding_adjust', 'hist_tagids', 'hist_keyids', 'tagids', 'keyids']}
    test_model_input['user_embedding_normal'] = test_model_input['userid']
    test_model_input['user_embedding_adjust'] = test_model_input['userid']
    test_model_input['tagids'] = test[['manual_tag_' + str(index) for index in range(11)]].values
    test_model_input['keyids'] = test[['manual_key_' + str(index) for index in range(11)]].values
else:
    val = val.merge(feed[['feedid', 'authorid', 'videoplayseconds', 'bgm_song_id', 'bgm_singer_id']], how='left',
                  on='feedid')
    val = val.merge(tag, how='left', on='feedid')
    val[dense_features] = val[dense_features].fillna(0, )
    val[dense_features] = mms.fit_transform(val[dense_features])
    val['userid'] = userid_map.transform(val['userid'])
    val_model_input = {name: val[name] for name in feature_names if name not in ['hist_tagids', 'hist_keyids', 'feed_embedding', 'user_embedding_normal', 'user_embedding_adjust', 'tagids', 'keyids']}
    val_model_input['tagids'] = val[['manual_tag_' + str(index) for index in range(11)]].values
    val_model_input['keyids'] = val[['manual_key_' + str(index) for index in range(11)]].values
#     val_model_input['tagids'] = val_model_input['userid']
#     val_model_input['keyids'] = val_model_input['userid']
    val_model_input['user_embedding_normal'] = val_model_input['userid']
    val_model_input['user_embedding_adjust'] = val_model_input['userid']
    userid_list = val['userid'].astype(str).tolist()
    val_labels = [val[y].values for y in target]



use features:  ['bgm_song_id', 'bgm_singer_id', 'userid', 'feedid', 'authorid', 'videoplayseconds', 'tagids', 'tag_seq_len', 'keyids', 'key_seq_len', 'user_embedding_normal', 'user_embedding_adjust']


In [40]:
train_model = MMOE(dnn_feature_columns, history_feature_list=[], num_tasks=7, num_experts=13, expert_dim=128, dnn_hidden_units=(128, 128, 64),
                   task_dnn_units=(128, 128, 64),
                   tasks=['binary', 'binary', 'binary', 'binary', 'binary', 'binary', 'binary'], device=device)
train_model.compile("adagrad", loss='binary_crossentropy')
train_model.embedding_dict['user_embedding_normal'] = nn.Embedding.from_pretrained(user_emb1, freeze=False)
train_model.embedding_dict['user_embedding_adjust'] = nn.Embedding.from_pretrained(user_emb2, freeze=False)
train_model.embedding_dict['tagids'] = train_model.embedding_dict['keyids']

loop = True
cnt = 0
while loop:
    try:
        cnt += 1
        print('chunk: ', cnt)
        train = data.get_chunk(1000*10000)
        train = train.merge(feed[['feedid', 'authorid', 'videoplayseconds', 'bgm_song_id', 'bgm_singer_id']], how='left',
                  on='feedid')
        train = train.merge(tag, how='left', on='feedid')
        train[dense_features] = train[dense_features].fillna(0, )
        train[dense_features] = mms.fit_transform(train[dense_features])
        train['userid'] = userid_map.transform(train['userid'])                

        train_model_input = {name: train[name] for name in feature_names if name not in ['hist_tagids', 'hist_keyids', 'feed_embedding', 'user_embedding_normal', 'user_embedding_adjust', 'tagids', 'keyids']}
        # train_model_input['feed_embedding'] = train_model_input['feedid']
        # val_model_input['feed_embedding'] = val_model_input['feedid']
        train_model_input['tagids'] = train[['manual_tag_' + str(index) for index in range(11)]].values
        train_model_input['keyids'] = train[['manual_key_' + str(index) for index in range(11)]].values
#         train_model_input['tagids'] = train_model_input['userid']
#         train_model_input['keyids'] = train_model_input['userid']
        train_model_input['user_embedding_normal'] = train_model_input['userid']
        train_model_input['user_embedding_adjust'] = train_model_input['userid']
        train_labels = train[target].values

        for epoch in range(epochs):
            history = train_model.fit(train_model_input, train_labels,
                              batch_size=batch_size, epochs=1, verbose=1)
        if not ONLINE_FLAG:
            val_pred_ans = train_model.predict(val_model_input, batch_size=batch_size * 4)
            # 模型predict()返回值格式为(?, 4)，与tf版mmoe不同。因此下方用到了transpose()进行变化。
            evaluate_deepctr(val_labels, val_pred_ans.transpose(), userid_list, target)

    except StopIteration:
        loop=False
        print('Finished all train')

if ONLINE_FLAG:
    t1 = time()
    pred_ans = train_model.predict(test_model_input, batch_size=batch_size * 20)
    pred_ans = pred_ans.transpose()
    t2 = time()
    print('7个目标行为%d条样本预测耗时（毫秒）：%.3f' % (len(test), (t2 - t1) * 1000.0))
    ts = (t2 - t1) * 1000.0* 2000 / (len(test)*7.0) 
    print('7个目标行为2000条样本平均预测耗时（毫秒）：%.3f' % ts)

    # # 5.生成提交文件
    for i, action in enumerate(target):
        test[action] = pred_ans[i]
    test['userid'] = userid_map.inverse_transform(test['userid'])
    test[['userid', 'feedid'] + target].to_csv(SUBMIT_DIR + '/mmoe_cin_multi_user_tag_key.csv', index=None, float_format='%.6f')
    print('to_csv ok')

sparse_features:  [SparseFeat(name='bgm_song_id', vocabulary_size=25160, embedding_dim=20, use_hash=False, dtype='int32', embedding_name='bgm_song_id', group_name='default_group'), SparseFeat(name='bgm_singer_id', vocabulary_size=17501, embedding_dim=20, use_hash=False, dtype='int32', embedding_name='bgm_singer_id', group_name='default_group'), SparseFeat(name='userid', vocabulary_size=200000, embedding_dim=20, use_hash=False, dtype='int32', embedding_name='userid', group_name='default_group'), SparseFeat(name='feedid', vocabulary_size=112873, embedding_dim=20, use_hash=False, dtype='int32', embedding_name='feedid', group_name='default_group'), SparseFeat(name='authorid', vocabulary_size=18790, embedding_dim=20, use_hash=False, dtype='int32', embedding_name='authorid', group_name='default_group'), SparseFeat(name='user_embedding_normal', vocabulary_size=200000, embedding_dim=20, use_hash=False, dtype='int32', embedding_name='user_embedding_normal', group_name='default_group'), SparseFe

0it [00:01, ?it/s]


NameError: name 'hist' is not defined

In [8]:
train_model.feature_index

OrderedDict([('bgm_song_id', (0, 1)),
             ('bgm_singer_id', (1, 2)),
             ('userid', (2, 3)),
             ('feedid', (3, 4)),
             ('authorid', (4, 5)),
             ('videoplayseconds', (5, 6)),
             ('hist_tagids', (6, 17)),
             ('tag_seq_len', (17, 18)),
             ('hist_keyids', (18, 29)),
             ('key_seq_len', (29, 30)),
             ('tagids', (30, 31)),
             ('keyids', (31, 32)),
             ('user_embedding_normal', (32, 33)),
             ('user_embedding_adjust', (33, 34))])

In [30]:
3664 - 2089

1575

In [19]:
14464-3664

10800