In [1]:
# ! pip install deepctr==0.8.5 --no-deps
# ! pip install torch==1.7.0 torchvision==0.8.1 
# ! pip install tensorflow-gpu==1.13.1
# ! pip install numba

In [2]:
import sys
sys.path.append('..')
sys.path.append('../../config/')
from config_prosper import *
import os
import pandas as pd
import numpy as np
import tensorflow as tf

from time import time
from deepctr.feature_column import SparseFeat, DenseFeat, get_feature_names,VarLenSparseFeat
from mytools.utils.myfile import savePkl,loadPkl
from mmoe_tf import MMOE
from evaluation import evaluate_deepctr
from tensorflow.python.keras.utils import multi_gpu_model
from tqdm import tqdm as tqdm

BASE_DIR(目录): /home/tione/notebook


In [3]:
# GPU相关设置
# os.environ["CUDA_VISIBLE_DEVICES"] = "0"
# 设置GPU按需增长
config = tf.ConfigProto()
config.gpu_options.allow_growth = True
sess = tf.Session(config=config)

In [4]:
def loadFeedinfo():
    feed = loadPkl(FEED_INFO_DEAL)
    feed[["bgm_song_id", "bgm_singer_id"]] += 1  # 0 用于填未知
    feed[["bgm_song_id", "bgm_singer_id", "videoplayseconds"]] = \
        feed[["bgm_song_id", "bgm_singer_id", "videoplayseconds"]].fillna(0)
    feed['bgm_song_id'] = feed['bgm_song_id'].astype('int64')
    feed['bgm_singer_id'] = feed['bgm_singer_id'].astype('int64')
    print('feedinfo loading over...')
    return feed

def getFeedembeddings(df):
    #feedembeddings 降维

    feed_embedding_path = os.path.join(FEATURE_PATH,'feedembedings.pkl')
    feed_embeddings = loadPkl(feed_embedding_path)
    df = df.merge(feed_embeddings,on='feedid',how='left')
    dense = [x for x in list(feed_embeddings.columns) if x != 'feedid' ]
    
    return df,dense

def getSvdembeddings(df):
    dense = []
    #userid-feedid svd
    svd_embedding = loadPkl(os.path.join(FEATURE_PATH,'svd_userid_feedid_embedding.pkl'))
    df = df.merge(svd_embedding,on = ['userid'],how='left')
    dense += [x for x in list(svd_embedding.columns) if x not in ['userid']]
                            
    #userid_authorid svd
    svd_embedding = loadPkl(os.path.join(FEATURE_PATH,'svd_userid_authorid_embedding.pkl'))
    df  = df.merge(svd_embedding,on = ['userid'],how='left')
    dense += [x for x in list(svd_embedding.columns) if x not in ['userid']]
    
    #text svd
    svd_embedding = loadPkl(os.path.join(FEATURE_PATH,'texts_svd_embedding.pkl'))
    svd_embedding['feedid']  = svd_embedding['feedid'].astype(np.int32) 
    df  = df.merge(svd_embedding,on = ['feedid'],how='left')
    dense += [x for x in list(svd_embedding.columns) if x not in ['feedid']]
    
    return df, dense
def myLeftjoin(left,right,on):
    return left.merge(right[right[on].isin(left[on])].set_index(on),how='left',left_on=on,right_index=True)
def getHistFeatures(df,hist_features):
    dense = [x for x in hist_features.columns if x not in df.columns and  'hist_seq' not in x ]
    varlen = [x for x in hist_features.columns if 'hist_seq' in x]
    df = df.merge(hist_features[hist_features.userid.isin(df.userid.unique())][['userid','feedid','date_','device'] + dense],how = 'left',on = ['userid','feedid','date_','device'])
    return (df,dense)

In [5]:
class myDataGenerator(tf.keras.utils.Sequence):
    def __init__(self, data: pd.DataFrame,feedinfo,dnn_feature_columns,batch_size=2048, shuffle=True):
        self.data = data.copy()
        self.target = ACTION_LIST
        self.batch_size = batch_size
        self.shuffle = shuffle
        self.indexes = np.arange(self.data.shape[0])
        
        self.feedinfo = feedinfo
        self.user_feed_svd_embedding = loadPkl(os.path.join(FEATURE_PATH,'svd_userid_feedid_embedding.pkl'))
        self.user_author_svd_embedding = loadPkl(os.path.join(FEATURE_PATH,'svd_userid_authorid_embedding.pkl'))
        self.text_svd_embedding = loadPkl(os.path.join(FEATURE_PATH,'texts_svd_embedding.pkl'))
        self.text_svd_embedding['feedid'] = self.text_svd_embedding['feedid'].astype(int)
        self.dnn_feature_columns = dnn_feature_columns
        self.feature_names = get_feature_names(self.dnn_feature_columns)
        
        if self.shuffle:
            print('shuffle data index ing...')
            np.random.shuffle(self.indexes)

    def __len__(self):

        return (self.data.shape[0] // self.batch_size) + 1

    def __getitem__(self, index):
        batch_indexs = self.indexes[index * self.batch_size:(index + 1) *
                                    self.batch_size]
        batch_data = self.data.iloc[batch_indexs, :]
        
        return self.get_feature_on_batch(batch_data)

    def on_epoch_end(self):
        if self.shuffle:
            print('shuffle data index ing...')
            np.random.shuffle(self.indexes)
    def on_epoch_begain(self):
        if self.shuffle:
            print('shuffle data index ing...')
            np.random.shuffle(self.indexes)

    def get_feature_on_batch(self, batch):
        
        batch = batch.merge(self.user_feed_svd_embedding,on='userid',how='left')
        batch = batch.merge(self.user_author_svd_embedding,on='userid',how='left')
        batch = batch.merge(self.text_svd_embedding,on='feedid',how='left')
        
        x = {name: batch[name].values for name in self.feature_names}
        for col in ['manual_tag_list','manual_keyword_list','machine_keyword_list']:
            x[col] = np.array(batch[col].tolist())
        y = [batch[y].values for y in ACTION_LIST]
        
        return x,y

In [6]:
DEBUG = True
data = loadPkl(USER_ACTION)
data = data.head(1000000) if DEBUG else data
feedinfo = loadFeedinfo()
user_feed_svd_embedding = loadPkl(os.path.join(FEATURE_PATH,'svd_userid_feedid_embedding.pkl'))
user_author_svd_embedding = loadPkl(os.path.join(FEATURE_PATH,'svd_userid_authorid_embedding.pkl'))
text_svd_embedding = loadPkl(os.path.join(FEATURE_PATH,'texts_svd_embedding.pkl'))
embedding_dim = 8
sparse_features = ['userid', 'feedid', 'authorid', 'bgm_song_id', 'bgm_singer_id' ]
dense_features = ['videoplayseconds',]
dense_features += [x for x in list(user_feed_svd_embedding.columns) if x not in ['userid']]
dense_features += [x for x in list(user_author_svd_embedding.columns) if x not in ['userid']]
dense_features += [x for x in list(text_svd_embedding.columns) if x not in ['feedid']]
data = data.merge(feedinfo[[
    'feedid', 'authorid', 'videoplayseconds', 'bgm_song_id',
    'bgm_singer_id'
] + ['manual_tag_list', 'manual_keyword_list', 'machine_keyword_list'
     ]],
                    how='left',
                    on='feedid')

#dense 特征处理
data['videoplayseconds'] = data['videoplayseconds'].fillna(0, )
data['videoplayseconds'] = np.log(data['videoplayseconds'] + 1.0)
train = data[data.date_ != 14]
val = data[data.date_==14]

feedinfo loading over...


In [7]:

fixlen_feature_columns = [
    SparseFeat(feat,
               vocabulary_size = feedinfo[feat].max() + 1,
               embedding_dim=embedding_dim) for feat in sparse_features if feat !='userid'
] + [DenseFeat(feat, 1) for feat in dense_features
] + [SparseFeat('userid',
               vocabulary_size= data['userid'].max() + 1,
               embedding_dim=embedding_dim)]
tag_columns = [
    VarLenSparseFeat(SparseFeat('manual_tag_list',
                                vocabulary_size=TAG_MAX,
                                embedding_dim=8),
                     maxlen=4)
]
key_words_columns = [
    VarLenSparseFeat(SparseFeat('manual_keyword_list',
                                vocabulary_size=KEY_WORDS_MAX,
                                embedding_dim=16),
                     maxlen=4),
    VarLenSparseFeat(SparseFeat('machine_keyword_list',
                                vocabulary_size=KEY_WORDS_MAX,
                                embedding_dim=16),
                     maxlen=4),
]
dnn_feature_columns =  fixlen_feature_columns + tag_columns + key_words_columns

In [8]:
num_tasks = len(ACTION_LIST)
train_model = MMOE(dnn_feature_columns, num_tasks=num_tasks,task_types = ['binary' for i in range(num_tasks)],task_names = ACTION_LIST,num_experts=5,tower_dnn_units_lists = [[16,8] for i in range(num_tasks) ])
# train_model.summary()
train_loader = myDataGenerator(train,feedinfo,dnn_feature_columns,batch_size=4096)
val_loader = myDataGenerator(val,feedinfo,dnn_feature_columns,batch_size=4096 * 4,shuffle = False) # shuffle 必须为False
len(train_loader)
train_model = multi_gpu_model(train_model, gpus=2)
optimizer = tf.keras.optimizers.Adagrad(
    lr=0.01, epsilon=1e-07,
)
train_model.compile("adagrad", loss='binary_crossentropy')

Instructions for updating:
Colocations handled automatically by placer.
Instructions for updating:
keep_dims is deprecated, use keepdims instead
Instructions for updating:
Deprecated in favor of operator or tf.math.divide.
Instructions for updating:
Please use `rate` instead of `keep_prob`. Rate should be set to `rate = 1 - keep_prob`.
shuffle data index ing...


In [9]:
best_score = -1
early_stop = 1
no_imporove = 0
for epoch in range(10):
    history = train_model.fit(train_loader,
                              epochs=1, verbose=1,workers = 8,use_multiprocessing=True,max_queue_size=50)
    pred_ans = train_model.predict_generator(val_loader)
    pred_ans = np.concatenate(pred_ans,1)
    pred_ans = pd.DataFrame(pred_ans,columns=ACTION_LIST)
    weightauc,uaucs = evaluate_deepctr(val_loader.data[ACTION_LIST],pred_ans,val_loader.data['userid'].values,ACTION_LIST)
    
    if best_score < weightauc:
        best_score = weightauc
        train_model.save(os.path.join(MODEL_PATH,'tf_models/MMOE'))
        no_imporove = 0    
    else :
        no_imporove += 1
    if no_imporove >= early_stop:
        print('-----stoped on epoch %s ------- ' % (epoch))
        break
    

Instructions for updating:
Use tf.cast instead.
Weighted uAUC:  (0.6003140039059883, [0.5863341814663644, 0.5845323787571292, 0.6712884723628464, 0.540491346556399, 0.6558612797377892, 0.5376930479155573, 0.6285255697055652])
Weighted uAUC:  (0.6176179914021787, [0.6014899074304247, 0.602993406304328, 0.6783747935563645, 0.6059868464756202, 0.6770513588984823, 0.5498011506803312, 0.6245050964264789])
Weighted uAUC:  (0.6165445301817929, [0.5965897780517457, 0.5913614932506234, 0.679887530976639, 0.6492626464950926, 0.6852745200222693, 0.5326967994299628, 0.6276262725038523])
-----stoped on epoch 2 ------- 


In [10]:
1

1

In [11]:
--

SyntaxError: invalid syntax (<ipython-input-11-4071a1fba099>, line 1)

In [None]:
test = pd.read_csv('../../data/wedata/wechat_algo_data2/test_a.csv')
test = test.merge(feed[['feedid', 'authorid', 'videoplayseconds', 'bgm_song_id', 'bgm_singer_id']+ ['manual_tag_list','manual_keyword_list','machine_keyword_list']], how='left',on='feedid')
test[dense_features] = test[dense_features].fillna(0, )
test[dense_features] = np.log(test[dense_features] + 1.0)

t1 = time()
test_model_input = {name: test[name] for name in feature_names}
for col in tqdm(['manual_tag_list','manual_keyword_list','machine_keyword_list'] ):
    test_model_input[col] = np.array(test[col].tolist())
pred_ans = train_model.predict(test_model_input, batch_size=batch_size * 20)
t2 = time()
print('7个目标行为%d条样本预测耗时（毫秒）：%.3f' % (len(test), (t2 - t1) * 1000.0))
ts = (t2 - t1) * 1000.0 / len(test) * 2000.0
print('7个目标行为2000条样本平均预测耗时（毫秒）：%.3f' % ts)

# 5.生成提交文件
for i, action in enumerate(target):
    test[action] = pred_ans[i]
test[['userid', 'feedid'] + target].to_csv(os.path.join(SUMIT_DIR,'tf_mmoe_base2.csv'), index=None, float_format='%.6f')
print('to_csv ok')

In [None]:
test[['userid', 'feedid'] + target]

Please check the latest version manually on https://pypi.org/project/deepctr/#history
