In [2]:
import pandas as pd
import numpy as np
import os
import time
from sklearn.preprocessing import LabelEncoder
import tensorflow as tf
from deepctr.models import DeepFM,WDL,xDeepFM,AutoInt
from tensorflow.python.keras.optimizers import Adam,Adagrad
from deepctr.feature_column import SparseFeat, DenseFeat,get_feature_names
from tensorflow.python.keras.models import save_model,load_model
from deepctr.layers import custom_objects

from evaluation_v2 import uAUC,compute_weighted_score

In [3]:
# 存储数据的根目录
ROOT_PATH = "/testcbd017_gujinfang/GJFCode/WeChat_2021/Code/data"
TEST_FILE=ROOT_PATH+'/wechat_algo_data1/test_a_concat.csv'
SUB_PATH=ROOT_PATH+'/submit'

In [4]:
# GPU相关设置
os.environ["CUDA_VISIBLE_DEVICES"] = "0"
# 设置GPU按需增长
config = tf.ConfigProto()
config.gpu_options.allow_growth = True
sess = tf.Session(config=config)

In [5]:
ACTION_LIST = ["read_comment", "like", "click_avatar",  "forward"]
FEA_COLUMN_LIST = ["read_comment", "like", "click_avatar",  "forward", "comment", "follow", "favorite"]

Please check the latest version manually on https://pypi.org/project/deepctr/#history


In [12]:
epochs=10
batch_size=2048
embedding_dim=16
model_name='wdl'

In [7]:
def mkdir(path):
    folder = os.path.exists(path)
    if not folder:
        os.makedirs(path)       
    else:
        return

In [8]:
# 读取某个action的sample_conat数据；最后一天为val，其他为train 
def get_df_data(action,day=14):
    df=pd.read_csv('{}/generater_data/{}_{}_concat_sample.csv'.format(ROOT_PATH,action,day))
#     return df,df[df['date_']<day],df[df['date_']==day]
    return pd.DataFrame(df)

#### 不同的action其test数据是一样的

In [9]:
test=pd.read_csv(TEST_FILE)
# test.head(5)

In [80]:
# test_model_input = {name:test[name].values for name in feature_names}

#### 为每个action构造单独的Model

In [14]:
sparse_features = ['userid', 'feedid', 'authorid', 'bgm_song_id', 'bgm_singer_id',\
                   'watch_count_group','video_time_group']

dense_features = ['videoplayseconds',"watch_count","play_times"]+\
[b+"_sum" for b in FEA_COLUMN_LIST]+[b+"_mean" for b in FEA_COLUMN_LIST]

dense_noliner_cols=['videoplayseconds',"watch_count"]+[b+"_sum" for b in FEA_COLUMN_LIST]+[b+"_mean" for b in FEA_COLUMN_LIST]

dense_features=dense_features+[b+'_log' for b in dense_noliner_cols]+\
[b+'_square' for b in dense_noliner_cols]+[b+'_exp' for b in dense_noliner_cols]


## TODO 对于dense列 进行非线性扩展 x^2 logx e^x...

sub_predict=test[['userid', 'feedid']]
best_auc=dict(zip(ACTION_LIST,[0.0]*len(ACTION_LIST)))
predict_best=dict(zip(ACTION_LIST,[]*len(ACTION_LIST)))# 保存最好auc时的预测结果

for action in ACTION_LIST:
    print('******************{}********************'.format(action))
    # 1 读取 train val 数据集
    df=get_df_data(action,day=14)

    # 2 生成特征列
    fixlen_sparse_columns=[SparseFeat(feat, vocabulary_size=df[feat].max() + 1,embedding_dim=embedding_dim)
                           for feat in sparse_features]
    fixlen_dense_columns= [DenseFeat(feat, 1,) for feat in dense_features]

    # 挑选dnn列和linear列
    dnn_feature_columns = fixlen_sparse_columns+fixlen_dense_columns
    linear_feature_columns=fixlen_dense_columns

    feature_names = get_feature_names(linear_feature_columns + dnn_feature_columns)

    train,val=df[df['date_']<14],df[df['date_']==14]

    # 3 生成模型的输入数据
    train_model_input = {name:train[name].values for name in feature_names}
    test_model_input = {name:test[name].values for name in feature_names}

    val_model_input = {name: val[name] for name in feature_names}
    userid_list = val['userid'].astype(str).tolist() # val中所有uid列表 计算auc需要使用
    test_model_input = {name: test[name] for name in feature_names}
    train_labels = train[action].values
    val_labels = val[action].values

    # 4 构造模型 训练
#     model = DeepFM(linear_feature_columns,dnn_feature_columns,fm_group=sparse_features,
#                    dnn_hidden_units=[128,128,64],seed=1997,task='binary')

    model=WDL(linear_feature_columns,dnn_feature_columns,
              dnn_hidden_units=[128,128,64],seed=1997,task='binary')
#     model=xDeepFM(linear_feature_columns,dnn_feature_columns,
#                    dnn_hidden_units=[128,128,64],seed=1997,task='binary')
#     model=AutoInt(linear_feature_columns, dnn_feature_columns,
#                  seed=1997,task='binary')

    model.compile('adam', "binary_crossentropy",
                  metrics=['binary_crossentropy'], )
    
    for epoch in range(epochs):
        history = model.fit(train_model_input, train_labels,shuffle=True,
                                  batch_size=batch_size, epochs=1, verbose=1)

        val_pred_ans = model.predict(val_model_input, batch_size=batch_size * 4)
        auc=uAUC(val_labels, val_pred_ans, userid_list)
        # 当前模型auc更好时 保存模型 进行预测
        if(auc>best_auc[action]):
            best_auc[action]=auc
            model_root_path='{}/MyModel/{}'.format(ROOT_PATH,model_name)
            mkdir(model_root_path)
            save_model(model, '{}/{}_{}.h5'.format(model_root_path,model_name,action))
            predict_best[action]=model.predict(test_model_input , batch_size=batch_size * 4)[:,0]
#             sub_predict.loc[:,action]=model.predict(test_model_input , batch_size=batch_size * 4)[:,0]
            
        print('epoch:{},auc:{}'.format(epoch+1,auc))

        
weight_dict = {"read_comment": 4, "like": 3, "click_avatar": 2, "favorite": 1, "forward": 1,
                   "comment": 1, "follow": 1}
# 所有模型train完 输出auc
print(best_auc)
# 保存sub_dict
weight_auc=compute_weighted_score(best_auc,weight_dict)
print(weight_auc)
# for action,predict in predict_best.items():
#     syb_preduct[action]=predict
# sub_predict.to_csv('{}/sub_{}_{}.csv'.format(SUB_PATH,weight_auc,model_name),index=False)

******************read_comment********************
Instructions for updating:
keep_dims is deprecated, use keepdims instead
Train on 2422008 samples
epoch:1,auc:0.5940726440792606
Train on 2422008 samples
epoch:2,auc:0.6051314016845029
Train on 2422008 samples
epoch:3,auc:0.6161997374420425
Train on 2422008 samples
epoch:4,auc:0.6162993078771793
Train on 2422008 samples
epoch:5,auc:0.6150741448370238
Train on 2422008 samples
epoch:6,auc:0.6160947358980152
Train on 2422008 samples
epoch:7,auc:0.6135381396925399
Train on 2422008 samples
epoch:8,auc:0.6149826931043069
Train on 2422008 samples
epoch:9,auc:0.6135231169071619
Train on 2422008 samples
epoch:10,auc:0.6144263062317429
******************like********************
Train on 1708750 samples
epoch:1,auc:0.5953048715246398
Train on 1708750 samples
epoch:2,auc:0.5986387247050908
Train on 1708750 samples
epoch:3,auc:0.5976926587874897
Train on 1708750 samples
epoch:4,auc:0.5999416175029504
Train on 1708750 samples
epoch:5,auc:0.600841799

In [15]:
for action,predict in predict_best.items():
    sub_predict[action]=predict
sub_predict.to_csv('{}/sub_{}_{}.csv'.format(SUB_PATH,weight_auc,model_name),index=False)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  


In [110]:
# weight_dict = {"read_comment": 4, "like": 3, "click_avatar": 2, "favorite": 1, "forward": 1,
#                    "comment": 1, "follow": 1}
# # 所有模型train完 输出auc
# print(best_auc)
# # 保存sub_dict
# weight_auc=compute_weighted_score(best_auc,weight_dict)
# print(weight_auc)
# sub_predict.to_csv('{}/sub_{}_{}.csv'.format(SUB_PATH,weight_auc,model_name),index=False)

In [94]:
# # 读取模型
# model = load_model('{}/deepfm_{}.h5'.format(model_root_path,action),custom_objects)
# val_pred_ans = model.predict(val_model_input, batch_size=batch_size * 4)
# auc=uAUC(val_labels, val_pred_ans, userid_list)
# print(auc)

In [114]:
# sparse_features = ['userid', 'feedid', 'authorid', 'bgm_song_id', 'bgm_singer_id',\
#                    'watch_count_group','video_time_group']

# dense_features = ['videoplayseconds',"watch_count","play_times"]+\
# [b+"_sum" for b in FEA_COLUMN_LIST]+[b+"_mean" for b in FEA_COLUMN_LIST]

# # dense_features = ['videoplayseconds']

# sparse_features_lens_dict={
#     'userid':30000, 
#     'feedid':120000, 
#     'authorid':20000, 
#     'bgm_song_id':30000,
#     'bgm_singer_id':20000,
#     'watch_count_group':25,
#     'video_time_group':25,
# }
# action='like'

# # 1 读取 train val 数据集
# df=get_df_data(action,day=14)
# # print(train.columns)

# # 2 生成特征列
# # fixlen_feature_columns=[SparseFeat(feat, vocabulary_size=lens,embedding_dim=16)
# #                        for feat,lens in sparse_features_lens_dict.items()]+[DenseFeat(feat, 1,) for feat in dense_features]
# fixlen_sparse_columns=[SparseFeat(feat, vocabulary_size=df[feat].max() + 1,embedding_dim=embedding_dim)
#                        for feat in sparse_features]
# fixlen_dense_columns= [DenseFeat(feat, 1,) for feat in dense_features]
# # id的encoding
# # for feat in sparse_features:
# #     lbe = LabelEncoder()
# #     df[feat] = lbe.fit_transform(df[feat])

# dnn_feature_columns = fixlen_sparse_columns+fixlen_dense_columns
# # linear_feature_columns = fixlen_feature_columns
# linear_feature_columns=fixlen_dense_columns

# feature_names = get_feature_names(linear_feature_columns + dnn_feature_columns)

# train,val=df[df['date_']<14],df[df['date_']==14]

# # 3 生成模型的输入数据
# train_model_input = {name:train[name].values for name in feature_names}
# # test_model_input = {name:test[name].values for name in feature_names}

# val_model_input = {name: val[name] for name in feature_names}
# userid_list = val['userid'].astype(str).tolist() # val中所有uid列表 计算auc需要使用
# # test_model_input = {name: test[name] for name in feature_names}
# train_labels = train[action].values
# val_labels = val[action].values

# # 4 构造模型 训练
# model = DeepFM(linear_feature_columns,dnn_feature_columns,fm_group=sparse_features,
#                dnn_hidden_units=[128,128,64],seed=6666,task='binary')

# #model=WDL(linear_feature_columns,dnn_feature_columns,dnn_hidden_units=[128,128],seed=6666,task='binary')

# model.compile('adam', "binary_crossentropy",
#               metrics=['binary_crossentropy'], )
# best_auc=0.0
# for epoch in range(epochs):
#     history = model.fit(train_model_input, train_labels,
#                               batch_size=batch_size, epochs=1, verbose=1)

#     val_pred_ans = model.predict(val_model_input, batch_size=batch_size * 4)
#     auc=uAUC(val_labels, val_pred_ans, userid_list)
#     if(auc>best_auc):
#         sub_predict[action]=model.predict(test_model_input , batch_size=batch_size * 4)[:,0]
#         best_auc=auc
#         model_root_path=ROOT_PATH+'/MyModel/deepfm'
#         mkdir(model_root_path)
#         save_model(model, '{}/deepfm_{}.h5'.format(model_root_path,action))
#     print('epoch:{},auc:{}'.format(epoch+1,auc))

In [None]:
### 之前单独写的

# sparse_features = ['userid', 'feedid', 'authorid', 'bgm_song_id', 'bgm_singer_id',\
#                    'watch_count_group','video_time_group']

# dense_features = ['videoplayseconds',"watch_count","play_times"]+\
# [b+"_sum" for b in FEA_COLUMN_LIST]+[b+"_mean" for b in FEA_COLUMN_LIST]

# # dense_features = ['videoplayseconds']

# sparse_features_lens_dict={
#     'userid':30000, 
#     'feedid':120000, 
#     'authorid':20000, 
#     'bgm_song_id':30000,
#     'bgm_singer_id':20000,
#     'watch_count_group':25,
#     'video_time_group':25,
# }
# action='click_avatar'

# # 1 读取 train val 数据集
# df=get_df_data(action,day=14)
# # print(train.columns)

# # 2 生成特征列
# # fixlen_feature_columns=[SparseFeat(feat, vocabulary_size=lens,embedding_dim=16)
# #                        for feat,lens in sparse_features_lens_dict.items()]+[DenseFeat(feat, 1,) for feat in dense_features]
# fixlen_sparse_columns=[SparseFeat(feat, vocabulary_size=df[feat].max() + 1,embedding_dim=embedding_dim)
#                        for feat in sparse_features]
# fixlen_dense_columns= [DenseFeat(feat, 1,) for feat in dense_features]
# # id的encoding
# # for feat in sparse_features:
# #     lbe = LabelEncoder()
# #     df[feat] = lbe.fit_transform(df[feat])

# dnn_feature_columns = fixlen_sparse_columns+fixlen_dense_columns
# # linear_feature_columns = fixlen_feature_columns
# linear_feature_columns=fixlen_dense_columns

# feature_names = get_feature_names(linear_feature_columns + dnn_feature_columns)

# train,val=df[df['date_']<14],df[df['date_']==14]

# # 3 生成模型的输入数据
# train_model_input = {name:train[name].values for name in feature_names}
# # test_model_input = {name:test[name].values for name in feature_names}

# val_model_input = {name: val[name] for name in feature_names}
# userid_list = val['userid'].astype(str).tolist() # val中所有uid列表 计算auc需要使用
# # test_model_input = {name: test[name] for name in feature_names}
# train_labels = train[action].values
# val_labels = val[action].values

# # 4 构造模型 训练
# # model = DeepFM(linear_feature_columns,dnn_feature_columns,fm_group=sparse_features,
# #                dnn_hidden_units=[128,128,64],seed=6666,task='binary')

# model=WDL(linear_feature_columns,dnn_feature_columns,dnn_hidden_units=[128,128],seed=6666,task='binary')

# model.compile('adam', "binary_crossentropy",
#               metrics=['binary_crossentropy'], )
# best_auc=0.0
# for epoch in range(epochs):
#     history = model.fit(train_model_input, train_labels,
#                               batch_size=batch_size, epochs=1, verbose=1)

#     val_pred_ans = model.predict(val_model_input, batch_size=batch_size * 4)
#     auc=uAUC(val_labels, val_pred_ans, userid_list)
#     if(auc>best_auc):
#         best_auc=auc
#         model_root_path=ROOT_PATH+'/MyModel/deepfm'
#         mkdir(model_root_path)
#         save_model(model, '{}/deepfm_{}.h5'.format(model_root_path,action))
#     print('epoch:{},auc:{}'.format(epoch+1,auc))