In [None]:
# ! pip install deepctr==0.8.7 --no-deps
# ! pip install torch==1.7.0 torchvision==0.8.1 
# ! pip install tensorflow-gpu==1.13.1
# ! pip install numba

In [2]:
import sys
sys.path.append('..')
sys.path.append('../../config/')
from config_prosper import *
import os
import gc
import pandas as pd
import numpy as np
import tensorflow as tf

from time import time
from deepctr.feature_column import SparseFeat, DenseFeat, get_feature_names,VarLenSparseFeat,build_input_features,input_from_feature_columns

from mytools.utils.myfile import savePkl,loadPkl
from mmoe_tf import MMOE,MMOE_FefM,MMOE_mutihead,Shared_Bottom
from evaluation import evaluate_deepctr
from tensorflow.python.keras.utils import multi_gpu_model
from tqdm import tqdm as tqdm
import warnings
import tensorflow as tf
print(tf.test.is_gpu_available())

BASE_DIR(目录): /home/tione/notebook
True


In [3]:
# GPU / runtime settings.
# NOTE(review): TensorFlow is already imported (and queried via
# tf.test.is_gpu_available()) in the import cell, so the two environment
# variables below may be set too late to influence that process-wide
# initialisation - confirm, and move them above the TensorFlow import if so.
warnings.filterwarnings('ignore')
os.environ["TF_CPP_MIN_LOG_LEVEL"] = "2"
os.environ["CUDA_VISIBLE_DEVICES"] = "0"
tf.compat.v1.logging.set_verbosity(tf.compat.v1.logging.ERROR)

# Seed the random number generators. SEED is embedded in every saved weight
# file name ("model_seed%s"), so it has to actually drive the randomness
# (index shuffling in the data generator, weight initialisation in TF).
SEED = 100
np.random.seed(SEED)
tf.set_random_seed(SEED)

# Let TensorFlow grow GPU memory on demand instead of reserving it all up front.
config = tf.ConfigProto()
config.gpu_options.allow_growth = True
sess = tf.Session(config=config)
# Register the session with Keras; without this the Keras models create their
# own default session and the allow_growth option above is never applied.
tf.keras.backend.set_session(sess)

In [4]:
def loadFeedinfo():
    """Load the pre-processed feed table and normalise its id columns.

    The background-music ids are shifted up by one so that 0 can stand for
    "unknown"; missing values in those ids and in ``videoplayseconds`` are
    then filled with 0 and the ids are cast to ``int64``.

    Returns:
        The feed DataFrame read from ``FEED_INFO_DEAL``.
    """
    bgm_columns = ["bgm_song_id", "bgm_singer_id"]
    fill_columns = bgm_columns + ["videoplayseconds"]

    feed = loadPkl(FEED_INFO_DEAL)
    # Shift ids by one: 0 is reserved for the unknown / missing value.
    feed[bgm_columns] += 1
    feed[fill_columns] = feed[fill_columns].fillna(0)
    for column in bgm_columns:
        feed[column] = feed[column].astype('int64')
    print('feedinfo loading over...')
    return feed
def myLeftjoin(left,right,on):
    """Left-join ``right`` onto ``left`` using the single key column ``on``.

    ``right`` is first reduced to the rows whose key occurs in ``left`` and
    re-indexed by that key, so the merge only has to look at relevant rows.

    Args:
        left: DataFrame whose rows are all kept.
        right: DataFrame supplying the extra columns.
        on: Name of the key column present in both frames.

    Returns:
        ``left`` with the non-key columns of ``right`` appended.
    """
    key_is_needed = right[on].isin(left[on])
    lookup = right[key_is_needed].set_index(on)
    return left.merge(lookup, how='left', left_on=on, right_index=True)


In [5]:
class myDataGenerator(tf.keras.utils.Sequence):
    """Keras ``Sequence`` that builds model inputs one batch at a time.

    Each batch of action rows is left-joined with the user/feed embedding
    tables and with the feed metadata, then turned into a dict of input arrays
    (keyed by feature name) and a list of label arrays (one per action in
    ``ACTION_LIST``).

    In ``'train'`` mode the DeepCTR feature columns are built and pickled to
    ``MODEL_PATH/feature_columns_all.pkl``; in ``'test'`` mode they are loaded
    from that file so that inference uses the columns saved at training time.
    """

    def __init__(self, data: pd.DataFrame,batch_size=2048, shuffle=True,mode = 'train'):
        """Load all lookup tables and prepare the feature columns.

        Args:
            data: Rows to iterate over; must contain ``userid``, ``feedid``
                and every column named in ``ACTION_LIST``.
            batch_size: Number of rows per batch.
            shuffle: Whether to shuffle the row order now and after each epoch.
            mode: ``'train'`` (build and save feature columns) or ``'test'``
                (load the saved feature columns).

        Raises:
            AssertionError: If ``mode`` is neither ``'train'`` nor ``'test'``.
            ValueError: If ``mode == 'test'`` is combined with shuffling,
                because predictions are matched to rows by position.
        """
        assert mode == 'train' or mode == 'test'
        if mode == 'test' and shuffle:
            raise ValueError('测试数据打乱了！')

        # reset_index(drop=True) already returns a new frame, so the caller's
        # DataFrame is left untouched without an additional explicit copy.
        self.data = data.reset_index(drop=True)
        self.target = ACTION_LIST
        self.batch_size = batch_size
        self.shuffle = shuffle
        self.indexes = np.arange(self.data.shape[0])
        self.feedinfo = loadFeedinfo()

        # Sorted (instead of raw set order) so that the feature-column order,
        # and therefore the model's input layout, is identical across kernel
        # restarts regardless of string hash randomisation.
        self.sparse_features = sorted(set(
            ['userid', 'feedid', 'authorid', 'bgm_song_id', 'bgm_singer_id']
            + [x for x in self.feedinfo.columns if 'manual_tag_list' in x]
            + [x for x in self.feedinfo.columns if 'manual_keyword_list' in x]
            + [x for x in self.feedinfo.columns if 'machine_keyword_list' in x]
        ))
        self.var_len_features = ['manual_tag_list', 'manual_keyword_list', 'machine_keyword_list']
        self.dense_features = ['videoplayseconds',]

        # Pre-computed embedding tables, each keyed by 'userid' or 'feedid'.
        self.graph_emb8 = loadPkl(os.path.join(MODEL_PATH,'emb/graph_walk_emb_8.pkl'))
        self.feed_emb_16 = loadPkl(os.path.join(MODEL_PATH,'emb/feed_embeddings_16.pkl'))
        self.weight_emb8 = loadPkl(os.path.join(MODEL_PATH,'emb/user_weight_emd_8.pkl'))
        self.weight_emb8 = self.weight_emb8.drop('user_date_weight_emd',axis = 1)
        self.keyword_w2v_8 = loadPkl(os.path.join(MODEL_PATH,'emb/keyword_w2v_8.pkl'))
        # This table also includes the preliminary-round data.
        self.userid_feedid_d2v_all_16 = loadPkl(os.path.join(MODEL_PATH,'emb/userid_feedid_d2v_all_16.pkl'))
        self.all_text_data_v8 = loadPkl(os.path.join(MODEL_PATH,'emb/all_text_data_v8.pkl'))
        self.userid_authorid_d2v_all_16 = loadPkl(os.path.join(MODEL_PATH,'emb/userid_authorid_d2v_all_16.pkl'))

        # Feed metadata restricted to the columns used as model features; it is
        # computed once here instead of being re-derived for every batch.
        wanted_columns = set(self.var_len_features + self.sparse_features + self.dense_features)
        self._feedinfo_features = self.feedinfo[
            [x for x in self.feedinfo.columns if x in wanted_columns]
        ]

        feature_columns_path = os.path.join(MODEL_PATH,'feature_columns_all.pkl')
        if mode == 'train':
            self.dnn_feature_columns = self.getFeatureColumns()
        else:
            self.dnn_feature_columns = loadPkl(feature_columns_path)
        self.feature_names = get_feature_names(self.dnn_feature_columns)
        self.feature_index = build_input_features(self.dnn_feature_columns)
        if mode == 'train':
            savePkl(self.dnn_feature_columns, feature_columns_path)
            print('feature columns have saved')
        else:
            print('load feature columns' , feature_columns_path)

        self._shuffle_indexes()

    def _shuffle_indexes(self):
        """Shuffle the row order in place when shuffling is enabled."""
        if self.shuffle:
            print('shuffle data index ing...')
            np.random.shuffle(self.indexes)

    def __len__(self):
        """Return the number of batches per epoch.

        Uses ceiling division so that a row count that is an exact multiple of
        ``batch_size`` does not produce an extra, empty final batch.
        """
        return (self.data.shape[0] + self.batch_size - 1) // self.batch_size

    def __getitem__(self, index):
        """Return the ``(inputs, labels)`` pair for batch number ``index``."""
        batch_indexs = self.indexes[index * self.batch_size:(index + 1) * self.batch_size]
        batch_data = self.data.iloc[batch_indexs, :]
        return self.get_feature_on_batch(batch_data)

    def on_epoch_end(self):
        """Reshuffle the row order after each epoch (when shuffling is on)."""
        self._shuffle_indexes()

    def on_epoch_begain(self):
        """Reshuffle the row order; kept under its original name for callers.

        NOTE(review): nothing in this notebook calls this method - confirm
        whether it is still needed.
        """
        self._shuffle_indexes()

    def get_feature_on_batch(self, batch):
        """Join all lookup tables onto ``batch`` and build the model inputs.

        Args:
            batch: Slice of ``self.data`` holding the rows of one batch.

        Returns:
            A tuple ``(x, y)``: ``x`` maps each feature name to an array for
            the batch and ``y`` is a list with one label array per action.
        """
        # (table, join key) pairs, merged in this fixed order.
        embedding_joins = (
            (self.graph_emb8, 'userid'),
            (self.feed_emb_16, 'feedid'),
            (self.weight_emb8, 'userid'),
            (self.keyword_w2v_8, 'feedid'),
            (self.userid_feedid_d2v_all_16, 'userid'),
            (self.all_text_data_v8, 'feedid'),
            (self.userid_authorid_d2v_all_16, 'userid'),
        )
        for table, key in embedding_joins:
            batch = batch.merge(table, how='left', on=key)
        batch = batch.merge(self._feedinfo_features, how='left', on='feedid')

        x = {name: batch[name].values for name in self.feature_names}
        # Variable-length features hold one list per row; stack them into a
        # single array per feature.
        for col in self.var_len_features:
            x[col] = np.array(batch[col].tolist())
        y = [batch[action].values for action in self.target]
        return x,y

    def getFeatureColumns(self,):
        """Build the DeepCTR feature columns for all model inputs.

        Returns:
            A list made of, in order: fixed-length sparse columns, the
            variable-length tag and keyword columns, one dense column per
            dense feature (``videoplayseconds`` plus every embedding
            dimension), and finally the ``userid`` column.
        """
        embedding_dim = 16
        # Length of the variable-length lists and number of per-position
        # columns (suffix 0..3) derived from them.
        var_len_size = 4

        # Exclude the per-position columns of the variable-length features;
        # they are added explicitly below with the shared vocabulary sizes.
        sparse_features = [x for x in self.sparse_features if '_list' not in x]

        # Work on a copy: extending self.dense_features in place would change
        # the instance state and duplicate columns on a second call.
        dense_features = list(self.dense_features)
        for df in [
                self.graph_emb8,
                self.feed_emb_16,
                self.weight_emb8,
                self.keyword_w2v_8,
                self.userid_feedid_d2v_all_16,
                self.all_text_data_v8,
                self.userid_authorid_d2v_all_16
        ]:
            dense_features += [
                x for x in df.columns if x not in ['userid', 'feedid']
            ]

        # User id and variable-length columns.
        userid_columns = [
            SparseFeat('userid',
                       vocabulary_size=USERID_MAX,
                       embedding_dim=embedding_dim)
        ]

        tag_columns = [
            VarLenSparseFeat(SparseFeat('manual_tag_list',
                                        vocabulary_size=TAG_MAX,
                                        embedding_dim=embedding_dim),
                             maxlen=var_len_size)
        ]

        key_words_columns = [
            VarLenSparseFeat(SparseFeat(name,
                                        vocabulary_size=KEY_WORDS_MAX,
                                        embedding_dim=embedding_dim),
                             maxlen=var_len_size)
            for name in ('manual_keyword_list', 'machine_keyword_list')
        ]

        # Fixed-length sparse columns: feed-level ids sized from the feed
        # table, followed by the per-position tag / keyword columns.
        fixlen_feature_columns = [
            SparseFeat(feat,
                       vocabulary_size=self.feedinfo[feat].max() + 1,
                       embedding_dim=embedding_dim) for feat in sparse_features
            if feat !='userid'
        ] + [SparseFeat('manual_tag_list' + str(x),
                        vocabulary_size=TAG_MAX,
                        embedding_dim=embedding_dim) for x in range(var_len_size)
        ] + [SparseFeat('manual_keyword_list' + str(x),
                        vocabulary_size=KEY_WORDS_MAX,
                        embedding_dim=embedding_dim) for x in range(var_len_size)
        ] + [SparseFeat('machine_keyword_list' + str(x),
                        vocabulary_size=KEY_WORDS_MAX,
                        embedding_dim=embedding_dim) for x in range(var_len_size)
        ]

        # Dense columns, one scalar input per feature.
        dense_feature_columns = [DenseFeat(feat, 1) for feat in dense_features]

        dnn_feature_columns = fixlen_feature_columns + tag_columns + key_words_columns + dense_feature_columns + userid_columns
        return dnn_feature_columns

In [6]:
def get_Shared_Bottom(dnn_feature_columns):
    """Create and compile the Shared-Bottom multi-task model.

    One binary task (with its own [64, 32] tower) is created per entry of
    ``ACTION_LIST`` on top of a shared [512, 512] bottom network.

    Args:
        dnn_feature_columns: DeepCTR feature columns describing the inputs.

    Returns:
        The model, compiled with the 'adagrad' optimizer and binary
        cross-entropy loss.
    """
    task_count = len(ACTION_LIST)
    task_kinds = ['binary'] * task_count
    tower_units = [[64, 32] for _ in range(task_count)]

    shared_bottom = Shared_Bottom(
        dnn_feature_columns=dnn_feature_columns,
        num_tasks=task_count,
        bottom_dnn_units=[512, 512],
        task_types=task_kinds,
        task_names=ACTION_LIST,
        tower_dnn_units_lists=tower_units,
    )
    shared_bottom.compile('adagrad', loss='binary_crossentropy')
    return shared_bottom

def get_MMOE_FEFM(dnn_feature_columns):
    """Create and compile the ``MMOE_FefM`` multi-task model.

    Uses 7 experts of dimension 32, (512, 512) hidden units, and one binary
    task with a [64, 32] tower per entry of ``ACTION_LIST``.

    Args:
        dnn_feature_columns: DeepCTR feature columns describing the inputs.

    Returns:
        The model, compiled with the 'adagrad' optimizer and binary
        cross-entropy loss.
    """
    task_count = len(ACTION_LIST)
    task_kinds = ['binary'] * task_count
    tower_units = [[64, 32] for _ in range(task_count)]

    mmoe_fefm = MMOE_FefM(
        dnn_feature_columns=dnn_feature_columns,
        num_tasks=task_count,
        task_types=task_kinds,
        task_names=ACTION_LIST,
        num_experts=7,
        tower_dnn_units_lists=tower_units,
        dnn_hidden_units=(512, 512),
        expert_dim=32,
    )
    mmoe_fefm.compile('adagrad', loss='binary_crossentropy')
    return mmoe_fefm

def get_MMOE_MutiHead(dnn_feature_columns):
    """Create and compile the ``MMOE_mutihead`` multi-task model.

    Uses 7 experts of dimension 32, 3 heads, (512, 512) hidden units, and one
    binary task with a [64, 32] tower per entry of ``ACTION_LIST``.

    Args:
        dnn_feature_columns: DeepCTR feature columns describing the inputs.

    Returns:
        The model, compiled with the 'adagrad' optimizer and binary
        cross-entropy loss.
    """
    task_count = len(ACTION_LIST)
    task_kinds = ['binary'] * task_count
    tower_units = [[64, 32] for _ in range(task_count)]

    mmoe_multihead = MMOE_mutihead(
        dnn_feature_columns,
        num_tasks=task_count,
        task_types=task_kinds,
        task_names=ACTION_LIST,
        num_experts=7,
        tower_dnn_units_lists=tower_units,
        dnn_hidden_units=(512, 512),
        expert_dim=32,
        multi_head_num=3,
    )
    mmoe_multihead.compile('adagrad', loss='binary_crossentropy')
    return mmoe_multihead


def trainer(train_model,train_loader,val_loader,epochs,model_path,load_model = False):
    """Train epoch by epoch, keeping the weights with the best validation score.

    After every epoch the model is evaluated on ``val_loader`` with
    ``evaluate_deepctr``; the weights are written to ``model_path`` whenever
    the weighted score improves, and training stops at the first epoch that
    does not improve it.

    Args:
        train_model: Compiled Keras model to train.
        train_loader: Data generator used for fitting.
        val_loader: Unshuffled data generator used for validation; its
            ``data`` attribute supplies the labels and user ids.
        epochs: Maximum number of epochs (forced to 1 when the notebook-level
            ``DEBUG`` flag is set).
        model_path: Path the best weights are saved to, and loaded from when
            ``load_model`` is true.
        load_model: Whether to start from the weights stored at ``model_path``.

    Returns:
        The best weighted validation score seen, or -1 if no epoch ran.
    """
    if load_model:
        train_model.load_weights(model_path)
        # Interpolate the path; passing it as a second print argument left the
        # literal "%s" in the message.
        print('load weights from %s success!' % model_path)
    epochs = 1 if DEBUG else epochs
    best_score = -1
    early_stop = 1  # number of non-improving epochs tolerated before stopping
    no_improve = 0
    print('run...')
    for epoch in range(epochs):
        train_model.fit(train_loader,
                        epochs=1, verbose=1,workers = 8,use_multiprocessing=True,max_queue_size=20)
        # The per-task outputs are concatenated column-wise into a single
        # frame with one column per action.
        pred_ans = train_model.predict_generator(val_loader)
        pred_ans = np.concatenate(pred_ans,1)
        pred_ans = pd.DataFrame(pred_ans,columns=ACTION_LIST)
        weightauc,uaucs = evaluate_deepctr(val_loader.data[ACTION_LIST],pred_ans,val_loader.data['userid'].values,ACTION_LIST)
        if best_score < weightauc:
            best_score = weightauc
            train_model.save_weights(model_path)
            no_improve = 0
        else :
            no_improve += 1
        if no_improve >= early_stop:
            print('-----stoped on epoch %s ------- ' % (epoch))
            break
    gc.collect()
    return best_score

## offline

In [7]:
# Offline setup: load the user-action log and split it by day.
DEBUG = False
data = loadPkl(USER_ACTION)
if DEBUG:
    # Debug runs only use the first 10M rows.
    data = data.head(10000000)

# Day 14 is held out for validation; every other day is used for training.
train = data[data.date_ != 14]
val = data[data.date_ == 14]

# Construction order matters: the 'train' generators save the feature columns
# that the 'test' generator then loads.
train_loader = myDataGenerator(train, batch_size=4096, mode='train')
# shuffle must be False for validation so predictions line up with the rows.
val_loader = myDataGenerator(val, batch_size=4096 * 4, shuffle=False, mode='test')
# Generator over the full data set, used for the final refit.
data_loader = myDataGenerator(data, batch_size=4096, mode='train')
dnn_feature_columns = train_loader.dnn_feature_columns

feedinfo loading over...
feature columns have saved
shuffle data index ing...
feedinfo loading over...
load feature columns /home/tione/notebook/src/model/feature_columns_all.pkl
feedinfo loading over...
feature columns have saved
shuffle data index ing...


In [8]:
# Shared-Bottom: train with early stopping on the day-14 validation split,
# then reload the best weights and fit one more epoch on the full data set.
model = get_Shared_Bottom(dnn_feature_columns)
share_bottom_weights = os.path.join(MODEL_PATH, 'tf_models/share_bottom/model_seed%s' % (SEED))

trainer(train_model=model,
        train_loader=train_loader,
        val_loader=val_loader,
        epochs=5,
        model_path=share_bottom_weights,
        load_model=False)

trainer(train_model=model,
        train_loader=data_loader,
        val_loader=val_loader,
        epochs=1,
        model_path=share_bottom_weights,
        load_model=True)

del model
gc.collect()

(?, 441)
run...
 1032/18013 [>.............................] - ETA: 20:33 - loss: 0.3153 - read_comment_loss: 0.1115 - like_loss: 0.1063 - click_avatar_loss: 0.0408 - forward_loss: 0.0236 - comment_loss: 0.0044 - follow_loss: 0.0064 - favorite_loss: 0.0099Please check the latest version manually on https://pypi.org/project/deepctr/#history
【UAUC：0.6757386159546404】 [0.648873681780666, 0.636559524295809, 0.7344300748916343, 0.7123027853804781, 0.6148019020572298, 0.7254138514090799, 0.7580500187701781]
【UAUC：0.679562846105203】 [0.6499696386040911, 0.6412158728358297, 0.7373617916629019, 0.7211492515852386, 0.6218241192030254, 0.7312517650715787, 0.7618421072581396]
【UAUC：0.6811825120607657】 [0.6509333973682085, 0.6419097640028734, 0.7389046945555691, 0.7234632343882255, 0.6239568489473829, 0.7344333976096872, 0.7662469052520682]
【UAUC：0.6821020708965885】 [0.6514191342890899, 0.6434581795446119, 0.7395441452078749, 0.7259963179729161, 0.6257038501029598, 0.7332007373250147, 0.76728665004

3999

In [9]:
# MMOE-FefM: train with early stopping on the day-14 validation split, then
# reload the best weights and fit one more epoch on the full data set.
model = get_MMOE_FEFM(dnn_feature_columns)
mmoe_fefm_weights = os.path.join(MODEL_PATH, 'tf_models/MMOE_FEFM/model_seed%s' % (SEED))

trainer(train_model=model,
        train_loader=train_loader,
        val_loader=val_loader,
        epochs=5,
        model_path=mmoe_fefm_weights,
        load_model=False)

# NOTE(review): unlike the other training cells, this one rebuilds the
# full-data generator (reloading every lookup table) - confirm it is needed.
data_loader = myDataGenerator(data, batch_size=4096, mode='train')
trainer(train_model=model,
        train_loader=data_loader,
        val_loader=val_loader,
        epochs=1,
        model_path=mmoe_fefm_weights,
        load_model=True)

del model
gc.collect()

dnn input shape (?, 631)
run...
【UAUC：0.6748156382531703】 [0.6470347057456309, 0.6361064764892721, 0.7330581243010069, 0.7144073603642603, 0.6156334540210163, 0.723760418151552, 0.7562275637020318]
【UAUC：0.6800269772478583】 [0.6510471160818493, 0.640478883562879, 0.7382783153497652, 0.7226600796585096, 0.6194604490041427, 0.7318757527865146, 0.7641726770574278]
【UAUC：0.6808588083224175】 [0.6518854055101205, 0.6413012802225531, 0.7397465796736771, 0.7264320028977703, 0.6144596467375549, 0.7333005516475218, 0.7660336848530852]
【UAUC：0.6816369344604346】 [0.6508266508542827, 0.6424857503817742, 0.7400733688210708, 0.7254429284051194, 0.6205611421399627, 0.7354407103028805, 0.768924774933092]
【UAUC：0.6827323927006403】 [0.6514865708170201, 0.643338319590411, 0.7407845749938212, 0.7259359579119418, 0.6249861524698803, 0.7382981029167124, 0.7687704997828354]
feedinfo loading over...
feature columns have saved
shuffle data index ing...
load weights from %s success! /home/tione/notebook/src/mode

6318

In [10]:
# MMOE multi-head: train with early stopping on the day-14 validation split,
# then reload the best weights and fit one more epoch on the full data set.
model = get_MMOE_MutiHead(dnn_feature_columns)
mmoe_multihead_weights = os.path.join(MODEL_PATH, 'tf_models/MMOE_MutiHead/model_seed%s' % (SEED))

trainer(train_model=model,
        train_loader=train_loader,
        val_loader=val_loader,
        epochs=5,
        model_path=mmoe_multihead_weights,
        load_model=False)

trainer(train_model=model,
        train_loader=data_loader,
        val_loader=val_loader,
        epochs=1,
        model_path=mmoe_multihead_weights,
        load_model=True)

del model
gc.collect()

dnn input shape (?, 441)
run...
shuffle data index ing...
【UAUC：0.6750931202234038】 [0.6484409755037416, 0.6364666901003417, 0.7334541609204336, 0.7154858631683588, 0.6156687568067091, 0.7220715080664079, 0.7529121407059143]
【UAUC：0.6794813835129887】 [0.6503264971931634, 0.6409647742654809, 0.7365479846289802, 0.7238791787479897, 0.6232238917466122, 0.7282354168182583, 0.7606232175289364]
【UAUC：0.6806075781274679】 [0.6512953338528548, 0.6418066111216622, 0.7385482020078451, 0.7254729442769028, 0.6216876016285523, 0.7305331879655977, 0.7625072089939343]
【UAUC：0.6816308248750063】 [0.6507960902248635, 0.6425447536158542, 0.7395323603703333, 0.7264946815352933, 0.6293272705871811, 0.7317606438686602, 0.7637347848962627]
【UAUC：0.6822756836375596】 [0.6513834119540877, 0.6428767559270716, 0.7397088662771112, 0.7274923731503128, 0.6327942173824372, 0.7311882430657901, 0.7645274055379466]
load weights from %s success! /home/tione/notebook/src/model/tf_models/MMOE_MutiHead/model_seed100
run...
【

3803

# online

In [11]:
def infer(test_loader,model,model_weights_path,):
    """Load saved weights and predict every action for the loader's rows.

    Args:
        test_loader: Unshuffled data generator; its ``data`` attribute holds
            the rows being scored.
        model: Model whose architecture matches the saved weights.
        model_weights_path: Path of the weights to load.

    Returns:
        A DataFrame with ``userid``, ``feedid`` and one prediction column per
        action in ``ACTION_LIST``.
    """
    # Imported locally because the notebook's import cell binds the bare name
    # `time` to the function (`from time import time`), not to the module.
    import time as _time

    t1 = _time.time()
    sub = test_loader.data.copy()
    model.load_weights(model_weights_path)
    print('model weights load from %s' % (model_weights_path))
    pred_ans = model.predict(test_loader,workers = 4,use_multiprocessing=True,max_queue_size=200)
    for i, action in enumerate(ACTION_LIST):
        # pred_ans is indexed per action; flatten so that the column receives a
        # 1-D array whatever the output's trailing dimension is.
        sub[action] = np.ravel(pred_ans[i])
    t2 = _time.time()

    # Report timings for the rows actually scored, not for a notebook global.
    n_rows = len(sub)
    elapsed_ms = (t2 - t1) * 1000.0
    print('7个目标行为%d条样本预测耗时（毫秒）：%.3f' % (n_rows, elapsed_ms))
    # Average cost per 2000 samples; guarded against an empty input.
    ts = elapsed_ms / n_rows * 2000.0 if n_rows else 0.0
    print('7个目标行为2000条样本平均预测耗时（毫秒）：%.3f' % ts)
    return sub[['userid', 'feedid'] + ACTION_LIST]

In [12]:
# Online inference: score the test file with each of the three trained models.
# Rebinds the name `time` to the module (the import cell bound it to the
# function via `from time import time`); the time.time() calls below need it.
import time
if __name__ == "__main__":
    argv = sys.argv
    # The real command-line arguments are replaced with fixed values here.
    argv = ['python','submit','../../data/wedata/wechat_algo_data2/test_a.csv']
#     params = xdeepfm_params
    t = time.time() 
    stage = argv[1]
    print('Stage: %s'%stage)
    test_path = ''
    if len(argv)==3:
        test_path = argv[2]
        t1 = time.time()
        test = pd.read_csv(test_path)
        # Add the action columns, filled with 0, which the generator reads as labels.
        test[ACTION_LIST] = 0
        # 'test' mode loads the feature columns saved during training; shuffling
        # must stay off so predictions line up with the rows.
        test_loader = myDataGenerator(test,shuffle=False,batch_size=4096*40,mode ='test')
        dnn_feature_columns = test_loader.dnn_feature_columns
        print('Get test input cost: %.4f s'%(time.time()-t1))
    
    # NOTE(review): the four names below are not used anywhere in this cell.
    eval_dict = {}
    predict_dict = {}
    predict_time_cost = {}
    ids = None
    
    # Each model is rebuilt from the feature columns and loads its saved weights.
    print('开始预测share bottom...')
    share_bottom_model = get_Shared_Bottom(dnn_feature_columns)
    submission1 = infer(test_loader,share_bottom_model,os.path.join(MODEL_PATH,'tf_models/share_bottom/model_seed%s' % (SEED)))
    
    print('开始预测MMOE FEFM...')
    mmoe_fefm_model = get_MMOE_FEFM(dnn_feature_columns)
    submission2 = infer(test_loader,mmoe_fefm_model,os.path.join(MODEL_PATH,'tf_models/MMOE_FEFM/model_seed%s' % (SEED)))
    
    print('开始预测MMOE MUTI_HEAD...')
    mmoe_multihead_model = get_MMOE_MutiHead(dnn_feature_columns)
    submission3 = infer(test_loader,mmoe_multihead_model,os.path.join(MODEL_PATH,'tf_models/MMOE_MutiHead/model_seed%s' % (SEED)))
    
#     print('开始预测MMOE FEFM...')
#     mmoe_fefm_model = get_MMOE_FEFM(dnn_feature_columns)
#     submission2 = infer(test_loader,mmoe_fefm_model,os.path.join(MODEL_PATH,'tf_models/MMOE_FEFM/model_seed%s' % (SEED)))
    
    
    print('Time cost: %.2f s'%(time.time()-t))

Stage: submit
feedinfo loading over...
load feature columns /home/tione/notebook/src/model/feature_columns_all.pkl
Get test input cost: 4.0133 s
开始预测share bottom...
(?, 441)
model weights load from /home/tione/notebook/src/model/tf_models/share_bottom/model_seed100
7个目标行为4252097条样本预测耗时（毫秒）：71620.384
7个目标行为2000条样本平均预测耗时（毫秒）：33.687
开始预测MMOE FEFM...
dnn input shape (?, 631)
model weights load from /home/tione/notebook/src/model/tf_models/MMOE_FEFM/model_seed100
7个目标行为4252097条样本预测耗时（毫秒）：95479.959
7个目标行为2000条样本平均预测耗时（毫秒）：44.910
开始预测MMOE MUTI_HEAD...
dnn input shape (?, 441)
model weights load from /home/tione/notebook/src/model/tf_models/MMOE_MutiHead/model_seed100
7个目标行为4252097条样本预测耗时（毫秒）：111828.876
7个目标行为2000条样本平均预测耗时（毫秒）：52.599
Time cost: 311.79 s


In [13]:
# Write each model's predictions to its own CSV file in the submission folder.
for submission, file_name in (
        (submission1, 'share_bottom.csv'),
        (submission2, 'MMOE_FEFM.csv'),
        (submission3, 'MMOE_MutiHead.csv'),
):
    submission.to_csv(os.path.join(SUMIT_DIR, file_name), index=None)