In [1]:
# ! pip install deepctr==0.8.5 --no-deps
# ! pip install torch==1.7.0 torchvision==0.8.1 
# ! pip install tensorflow-gpu==1.13.1
# ! pip install numba

In [2]:
import sys
sys.path.append('..')
sys.path.append('../../config/')
from config_prosper import *
import os
import pandas as pd
import numpy as np
import tensorflow as tf

from time import time
from deepctr.feature_column import SparseFeat, DenseFeat, get_feature_names,VarLenSparseFeat
from mytools.utils.myfile import savePkl,loadPkl
from mmoe_tf import MMOE
from evaluation import evaluate_deepctr
from tensorflow.python.keras.utils import multi_gpu_model
from tqdm import tqdm as tqdm


import tensorflow as tf

from deepctr.feature_column import build_input_features, input_from_feature_columns
from deepctr.layers.utils import combined_dnn_input
from deepctr.layers.core import PredictionLayer, DNN

from tensorflow.python.keras.initializers import glorot_normal
from tensorflow.python.keras.layers import Layer

BASE_DIR(目录): /home/tione/notebook


In [3]:
# GPU settings
# os.environ["CUDA_VISIBLE_DEVICES"] = "0"
# Let TensorFlow grow its GPU memory on demand instead of reserving it all up front.
config = tf.ConfigProto()
config.gpu_options.allow_growth = True
# NOTE(review): tf.keras' own default session is believed to enable soft placement;
# keep it on so ops without a GPU kernel can still fall back to the CPU once this
# session is the one Keras uses (relevant for multi_gpu_model) - confirm for the
# pinned TensorFlow version.
config.allow_soft_placement = True
sess = tf.Session(config=config)
# Register the session with Keras. Without this call the session above is never
# handed to Keras, which would lazily create its own session with its own config.
tf.keras.backend.set_session(sess)

In [4]:
def loadFeedinfo():
    """Load the feed-info table and normalise its BGM id and duration columns.

    The table is read with ``loadPkl(FEED_INFO_DEAL)``. Both BGM id columns are
    shifted up by one so that 0 can stand for "unknown"; missing ids and missing
    ``videoplayseconds`` are then filled with 0 and the ids are cast to int64.

    Returns:
        The processed feed-info DataFrame (the object returned by ``loadPkl``,
        modified in place).
    """
    feed = loadPkl(FEED_INFO_DEAL)

    for id_col in ("bgm_song_id", "bgm_singer_id"):
        # Shift first, then fill: a missing id ends up as 0 (the "unknown" slot).
        feed[id_col] = (feed[id_col] + 1).fillna(0).astype('int64')

    feed["videoplayseconds"] = feed["videoplayseconds"].fillna(0)

    print('feedinfo loading over...')
    return feed

def getFeedembeddings(df):
    """Left-join the pickled feed embeddings onto ``df`` by ``feedid``.

    The embeddings are read from ``feedembedings.pkl`` under ``FEATURE_PATH``
    (presumably already dimension-reduced upstream - TODO confirm).

    Args:
        df: DataFrame with a ``feedid`` column.

    Returns:
        ``(merged, dense)`` where ``merged`` is ``df`` with the embedding columns
        attached and ``dense`` lists every embedding column except ``feedid``.
    """
    embeddings = loadPkl(os.path.join(FEATURE_PATH, 'feedembedings.pkl'))
    merged = df.merge(embeddings, how='left', on='feedid')
    dense = [col for col in embeddings.columns if col != 'feedid']
    return merged, dense

def getSvdembeddings(df):
    """Left-join the three pickled SVD embedding tables onto ``df``.

    Tables are loaded from ``FEATURE_PATH`` and merged in this order:
    userid-feedid SVD (on ``userid``), userid-authorid SVD (on ``userid``),
    and text SVD (on ``feedid``, after casting its key to int32).

    Args:
        df: DataFrame with ``userid`` and ``feedid`` columns.

    Returns:
        ``(merged, dense)`` where ``dense`` lists the non-key columns of every
        table, in merge order.
    """
    # (file name, join key, cast the table's key to int32 before merging)
    sources = (
        ('svd_userid_feedid_embedding.pkl', 'userid', False),
        ('svd_userid_authorid_embedding.pkl', 'userid', False),
        ('texts_svd_embedding.pkl', 'feedid', True),
    )

    dense = []
    for filename, key, cast_key in sources:
        table = loadPkl(os.path.join(FEATURE_PATH, filename))
        if cast_key:
            table[key] = table[key].astype(np.int32)
        df = df.merge(table, on=[key], how='left')
        dense.extend(col for col in table.columns if col != key)

    return df, dense
def myLeftjoin(left,right,on):
    """Left-join ``right`` onto ``left`` by column ``on``, pre-filtering ``right``.

    Rows of ``right`` whose key never occurs in ``left`` are dropped first, then
    the remainder is indexed by ``on`` and merged against ``left[on]``.

    Args:
        left: Left DataFrame; all of its rows are kept.
        right: Right DataFrame containing column ``on``.
        on: Name of the join column present in both frames.

    Returns:
        The merged DataFrame.
    """
    key_is_needed = right[on].isin(left[on])
    lookup = right[key_is_needed].set_index(on)
    return left.merge(lookup, how='left', left_on=on, right_index=True)
def getHistFeatures(df,hist_features):
    """Left-join per-sample history features onto ``df``.

    Columns of ``hist_features`` that are not already in ``df`` and whose name
    does not contain ``'hist_seq'`` are treated as dense features and merged on
    ``(userid, feedid, date_, device)``. ``'hist_seq'`` columns are not merged.

    Args:
        df: Sample frame containing the four join-key columns.
        hist_features: History-feature frame containing the same key columns.

    Returns:
        ``(merged, dense)``: ``df`` with the dense history columns attached, and
        the list of those column names.
    """
    join_keys = ['userid', 'feedid', 'date_', 'device']
    dense = [col for col in hist_features.columns
             if col not in df.columns and 'hist_seq' not in col]

    # Only rows for users that occur in df can match, so drop the rest before merging.
    known_users = hist_features['userid'].isin(df['userid'].unique())
    df = df.merge(hist_features.loc[known_users, join_keys + dense],
                  how='left', on=join_keys)
    return (df, dense)

In [5]:
class myDataGenerator(tf.keras.utils.Sequence):
    """Keras ``Sequence`` that serves ``(inputs, labels)`` batches from a DataFrame.

    Inputs are a dict keyed by feature name (the names returned by
    ``get_feature_names(dnn_feature_columns)``); labels are a list holding one
    array per entry of ``ACTION_LIST``.

    Args:
        data: Frame holding every feature column and every label column. It is
            copied, so later changes to the caller's frame are not seen here.
        feedinfo: Stored as ``self.feedinfo``; no method of this class reads it.
        dnn_feature_columns: deepctr feature columns; their names select which
            columns of ``data`` are served as model inputs.
        batch_size: Rows per batch; the last batch may be smaller.
        shuffle: If true, the row order is shuffled at construction and again
            at the end of every epoch.

    Raises:
        ValueError: If ``batch_size`` is not positive.
    """

    def __init__(self, data: pd.DataFrame,feedinfo,dnn_feature_columns,batch_size=2048, shuffle=True):
        if batch_size <= 0:
            raise ValueError('batch_size must be a positive integer')

        self.data = data.copy()
        self.target = ACTION_LIST
        self.batch_size = batch_size
        self.shuffle = shuffle
        # Positional row order; batches are cut from this array.
        self.indexes = np.arange(self.data.shape[0])

        self.feedinfo = feedinfo
        # NOTE(review): loaded on every construction but not read by any method
        # below (the per-batch embedding merges are disabled). Kept so the
        # attribute remains available to outside code.
        self.feed_embeddings = loadPkl(os.path.join(FEATURE_PATH,'feedembedings.pkl'))

        self.dnn_feature_columns = dnn_feature_columns
        self.feature_names = get_feature_names(self.dnn_feature_columns)

        self._shuffle_indexes()

    def _shuffle_indexes(self):
        """Shuffle the row order in place when shuffling is enabled."""
        if self.shuffle:
            print('shuffle data index ing...')
            np.random.shuffle(self.indexes)

    def __len__(self):
        """Return the number of batches per epoch."""
        n_rows = self.data.shape[0]
        # Ceiling division. The previous `n_rows // batch_size + 1` reported one
        # extra, empty batch whenever n_rows was an exact multiple of batch_size.
        return (n_rows + self.batch_size - 1) // self.batch_size

    def __getitem__(self, index):
        """Return the ``(inputs, labels)`` pair for batch number ``index``."""
        start = index * self.batch_size
        batch_indexs = self.indexes[start:start + self.batch_size]
        batch_data = self.data.iloc[batch_indexs, :]

        return self.get_feature_on_batch(batch_data)

    def on_epoch_end(self):
        """Reshuffle the row order after each epoch (no-op when shuffle is off)."""
        self._shuffle_indexes()

    def on_epoch_begain(self):
        """Reshuffle the row order; kept under its original (misspelled) name.

        NOTE(review): the Keras ``Sequence`` documentation only describes an
        ``on_epoch_end`` hook, and nothing visible here calls this method -
        confirm whether it is still needed.
        """
        self._shuffle_indexes()

    def get_feature_on_batch(self, batch):
        """Convert a slice of ``self.data`` into model inputs and labels.

        Args:
            batch: Rows of ``self.data`` selected for one batch.

        Returns:
            ``(x, y)``: ``x`` maps each feature name to a NumPy array; ``y`` is a
            list with one label array per entry of ``self.target``.
        """
        x = {name: batch[name].values for name in self.feature_names}
        # List-valued columns are rebuilt from Python lists
        # (assumes every row holds a list of the same length - TODO confirm).
        for col in ['manual_tag_list','manual_keyword_list','machine_keyword_list']:
            x[col] = np.array(batch[col].tolist())
        y = [batch[action].values for action in self.target]

        return x,y

In [6]:
# Set DEBUG to True to keep only the first 1,000,000 action rows for a quick run.
DEBUG = False
data = loadPkl(USER_ACTION)
if DEBUG:
    data = data.head(1000000)
feedinfo = loadFeedinfo()

embedding_dim = 8
sparse_features = ['userid', 'feedid', 'authorid', 'bgm_song_id', 'bgm_singer_id']
dense_features = ['videoplayseconds']
# NOTE: the SVD and feed-embedding dense features are not used in this run;
# 'videoplayseconds' is the only dense input.

# Attach the per-feed attributes (ids, duration, tag/keyword lists) to every action row.
data = data.merge(
    feedinfo[[
        'feedid', 'authorid', 'videoplayseconds', 'bgm_song_id', 'bgm_singer_id',
        'manual_tag_list', 'manual_keyword_list', 'machine_keyword_list',
    ]],
    on='feedid',
    how='left',
)

# Dense feature processing: fill missing durations with 0, then apply log(x + 1).
data['videoplayseconds'] = np.log(data['videoplayseconds'].fillna(0) + 1.0)

# Rows with date_ == 14 form the validation set; every other row is training data.
val = data[data['date_'] == 14]
train = data[data['date_'] != 14]

feedinfo loading over...


In [7]:

# Fixed-length inputs, in order: feed-side sparse ids, dense features, then userid.
# Feed-side vocabularies are sized from feedinfo; the userid vocabulary from the action data.
fixlen_feature_columns = [
    SparseFeat(name,
               vocabulary_size=feedinfo[name].max() + 1,
               embedding_dim=embedding_dim)
    for name in sparse_features
    if name != 'userid'
]
fixlen_feature_columns += [DenseFeat(name, 1) for name in dense_features]
fixlen_feature_columns.append(
    SparseFeat('userid',
               vocabulary_size=data['userid'].max() + 1,
               embedding_dim=embedding_dim)
)

# Variable-length inputs: each list column is declared with maxlen=4.
tag_columns = [
    VarLenSparseFeat(
        SparseFeat('manual_tag_list', vocabulary_size=TAG_MAX, embedding_dim=8),
        maxlen=4,
    )
]
key_words_columns = [
    VarLenSparseFeat(
        SparseFeat(name, vocabulary_size=KEY_WORDS_MAX, embedding_dim=16),
        maxlen=4,
    )
    for name in ('manual_keyword_list', 'machine_keyword_list')
]

dnn_feature_columns = fixlen_feature_columns + tag_columns + key_words_columns

In [9]:
# Build the multi-task MMOE model with one binary output per action in ACTION_LIST.
num_tasks = len(ACTION_LIST)
train_model = MMOE(dnn_feature_columns,
                   num_tasks=num_tasks,
                   task_types=['binary'] * num_tasks,
                   task_names=ACTION_LIST,
                   num_experts=5,
                   multi_head_num=3,
                   tower_dnn_units_lists=[[16, 8] for _ in range(num_tasks)])
train_model.summary()

train_loader = myDataGenerator(train, feedinfo, dnn_feature_columns, batch_size=4096)
# shuffle must stay False here: predictions are compared row-by-row with val_loader.data.
val_loader = myDataGenerator(val, feedinfo, dnn_feature_columns, batch_size=4096 * 4, shuffle=False)

# Replicate the model across 2 GPUs (requires two visible GPUs).
train_model = multi_gpu_model(train_model, gpus=2)

optimizer = tf.keras.optimizers.Adagrad(
    lr=0.01, epsilon=1e-07,
)
# Pass the configured instance. compile() previously received the string "adagrad",
# so the optimizer object above was built but never used.
train_model.compile(optimizer, loss='binary_crossentropy')

Instructions for updating:
Colocations handled automatically by placer.
Instructions for updating:
keep_dims is deprecated, use keepdims instead
Instructions for updating:
Deprecated in favor of operator or tf.math.divide.
Instructions for updating:
Please use `rate` instead of `keep_prob`. Rate should be set to `rate = 1 - keep_prob`.
__________________________________________________________________________________________________
Layer (type)                    Output Shape         Param #     Connected to                     
manual_tag_list (InputLayer)    (None, 4)            0                                            
__________________________________________________________________________________________________
manual_keyword_list (InputLayer (None, 4)            0                                            
__________________________________________________________________________________________________
machine_keyword_list (InputLaye (None, 4)            0              

## Offline training (validate on `date_ == 14`)

In [10]:
# Offline training: fit one epoch at a time, score the validation loader after each
# epoch, save the weights whenever the score improves, and stop once `early_stop`
# consecutive epochs bring no improvement.
best_score = -1
early_stop = 1
no_imporove = 0
for epoch in range(5):
    history = train_model.fit(
        train_loader,
        epochs=1,
        verbose=1,
        workers=8,
        use_multiprocessing=True,
        max_queue_size=100,
    )

    # The per-task outputs are joined along axis 1 so each action gets its own column
    # (presumably predict_generator returns one array per task - TODO confirm).
    pred_ans = pd.DataFrame(
        np.concatenate(train_model.predict_generator(val_loader), 1),
        columns=ACTION_LIST,
    )
    weightauc, uaucs = evaluate_deepctr(
        val_loader.data[ACTION_LIST],
        pred_ans,
        val_loader.data['userid'].values,
        ACTION_LIST,
    )

    if weightauc > best_score:
        best_score = weightauc
        train_model.save_weights(os.path.join(MODEL_PATH,'tf_models/MMOE_offline'))
        no_imporove = 0
    else:
        no_imporove += 1

    if no_imporove >= early_stop:
        print('-----stoped on epoch %s ------- ' % (epoch))
        break
    

Instructions for updating:
Use tf.cast instead.
  536/18013 [..............................] - ETA: 16:40 - loss: 0.3716 - read_comment_loss: 0.1319 - like_loss: 0.1183 - click_avatar_loss: 0.0489 - forward_loss: 0.0298 - comment_loss: 0.0088 - follow_loss: 0.0104 - favorite_loss: 0.0134Please check the latest version manually on https://pypi.org/project/deepctr/#history
【UAUC：0.6688168684035513】 [0.6442163270878852, 0.6319739517051614, 0.732864862538752, 0.7154539690431767, 0.5770451726320346, 0.7111260719002528, 0.7524771871261724]

Consider using a TensorFlow optimizer from `tf.train`.
【UAUC：0.6714256216871646】 [0.6454274356105509, 0.6325404055900603, 0.7348875058440113, 0.719499716525733, 0.588191412350803, 0.7152915098310446, 0.7564444723251524]

Consider using a TensorFlow optimizer from `tf.train`.
【UAUC：0.6724157020173478】 [0.6452748300278229, 0.6330958680480433, 0.7345311288305045, 0.7205336532994379, 0.5962759783560465, 0.7177223660874967, 0.7574229465661096]

Consider using 

In [11]:
# Restore the weights stored at 'tf_models/MMOE_offline' under MODEL_PATH
# (the path the offline training loop saves to whenever its score improves).
train_model.load_weights(os.path.join(MODEL_PATH,'tf_models/MMOE_offline'))

<tensorflow.python.training.checkpointable.util.CheckpointLoadStatus at 0x7ff8eec48208>

In [12]:
2649

1

In [13]:
# Inspect the state left by the offline loop: epochs without improvement,
# the most recent validation score, and the best validation score.
no_imporove , weightauc , best_score

(0, 0.6731532994737466, 0.6731532994737466)

## Online training (fit on all data, then predict the test set)

In [14]:
# Online training: one more epoch over the full action log, then save the weights.
# NOTE: `data` still contains the date_ == 14 rows that back `val_loader`, so the
# score computed below is measured on rows the model has just been trained on.
data_loader = myDataGenerator(data, feedinfo, dnn_feature_columns, batch_size=4096)
for epoch in range(1):
    history = train_model.fit(
        data_loader,
        epochs=1,
        verbose=1,
        workers=8,
        use_multiprocessing=True,
        max_queue_size=100,
    )
    pred_ans = pd.DataFrame(
        np.concatenate(train_model.predict_generator(val_loader), 1),
        columns=ACTION_LIST,
    )
    weightauc, uaucs = evaluate_deepctr(
        val_loader.data[ACTION_LIST],
        pred_ans,
        val_loader.data['userid'].values,
        ACTION_LIST,
    )
train_model.save_weights(os.path.join(MODEL_PATH,'tf_models/MMOE_online'))

shuffle data index ing...
【UAUC：0.715024333901372】 [0.6894087927885463, 0.6640912978871467, 0.7687057346141821, 0.777389251111727, 0.6699674490840437, 0.7658881829525023, 0.7947509235255732]

Consider using a TensorFlow optimizer from `tf.train`.


In [15]:
# Load the test set and give it the same feed attributes and duration transform
# as the training data.
test = pd.read_csv('../../data/wedata/wechat_algo_data2/test_a.csv')
test = test.merge(
    feedinfo[[
        'feedid', 'authorid', 'videoplayseconds', 'bgm_song_id', 'bgm_singer_id',
        'manual_tag_list', 'manual_keyword_list', 'machine_keyword_list',
    ]],
    on='feedid',
    how='left',
)
test['videoplayseconds'] = np.log(test['videoplayseconds'].fillna(0) + 1.0)
# Placeholder labels: the generator reads every ACTION_LIST column when it builds a batch.
test[ACTION_LIST] = 0

# Time generator construction plus prediction together.
t1 = time()
test_loader = myDataGenerator(test, feedinfo, dnn_feature_columns,
                              shuffle=False, batch_size=4096 * 20)
pred_ans = train_model.predict(test_loader)
t2 = time()
print('7个目标行为%d条样本预测耗时（毫秒）：%.3f' % (len(test), (t2 - t1) * 1000.0))
ts = (t2 - t1) * 1000.0 / len(test) * 2000.0
print('7个目标行为2000条样本平均预测耗时（毫秒）：%.3f' % ts)

# Step 5: write the submission file (pred_ans[i] holds the scores for ACTION_LIST[i]).
for i, action in enumerate(ACTION_LIST):
    test[action] = pred_ans[i]
test[['userid', 'feedid'] + ACTION_LIST].to_csv(
    os.path.join(SUMIT_DIR,'tf_mmoe_base4.csv'),
    index=None,
    float_format='%.6f',
)
print('to_csv ok')

7个目标行为4252097条样本预测耗时（毫秒）：8824.591
7个目标行为2000条样本平均预测耗时（毫秒）：4.151
to_csv ok


In [16]:
# Step 5: write the current predictions in `pred_ans` to a second submission file
# (pred_ans[i] holds the scores for ACTION_LIST[i]).
for i, action in enumerate(ACTION_LIST):
    test[action] = pred_ans[i]
test[['userid', 'feedid'] + ACTION_LIST].to_csv(
    os.path.join(SUMIT_DIR,'tf_mmoe_base3.csv'),
    index=None,
    float_format='%.6f',
)
print('to_csv ok')

to_csv ok


In [17]:
# Preview the submission columns: the two id columns plus one score column per action.
test[['userid', 'feedid'] + ACTION_LIST]

Unnamed: 0,userid,feedid,read_comment,like,click_avatar,forward,comment,follow,favorite
0,175282,50458,0.024931,0.010128,0.001030,0.025858,7.793009e-04,0.000272,6.718934e-04
1,80036,42329,0.008010,0.004921,0.012007,0.001349,3.859997e-04,0.001910,6.186068e-04
2,145791,85242,0.000433,0.008639,0.000319,0.000313,1.952052e-05,0.000012,1.084805e-05
3,28430,9425,0.005326,0.004971,0.043480,0.033652,2.826899e-03,0.025757,5.800337e-03
4,44393,11866,0.000025,0.001973,0.000075,0.000180,3.576279e-07,0.000002,8.642673e-07
...,...,...,...,...,...,...,...,...,...
4252092,153322,51633,0.000334,0.005208,0.003492,0.002972,7.057190e-05,0.000343,5.510747e-04
4252093,39430,20147,0.003300,0.007864,0.000141,0.000056,1.356006e-05,0.000006,3.129244e-06
4252094,2524,89043,0.000507,0.018232,0.002092,0.000521,3.284216e-05,0.000200,2.564192e-04
4252095,69629,27238,0.033721,0.006551,0.018737,0.000892,1.761019e-04,0.001717,1.875758e-04
