# Input format

"""
input : {"creative_id":"1,2,3,4,5",
        "ad_id":"1,2,3,4,5",
        "advertiser_id":"1,2,3,4,5",
        "product_id":"1,2,3,4,5",
        "product_category":"1,2,3,4,5",
        "industry":"1,2,3,4,5"}

label1 : age     softmax (multi-class head in EMNN.model_fn)
label2 : gender  sigmoid (binary head in EMNN.model_fn)
"""

# NOTE: this cell uses names that are defined in cells appearing later in this
# export (loadData, encode, transform, pd, pickle, random, base_path,
# test_size); those cells have to be executed first.

# Load the raw tables
df_ad,df_click_log,df_user,df_click_log_test = loadData("./train/",mode="pro")

# Build the wide tables: click log joined with user labels (train only) and ad
# attributes; missing values and the literal "\N" marker both become "UNK".
df_total_train = df_click_log.merge(df_user, how='left', on="user_id")
df_total_train = df_total_train.merge(df_ad, how='left', on="creative_id")
df_total_train = df_total_train.fillna("UNK").replace(["\\N"],"UNK")
df_total_train.head(5)

df_total_test = df_click_log_test.merge(df_ad, how='left', on="creative_id")
df_total_test = df_total_test.fillna("UNK").replace(["\\N"],"UNK")
df_total_test.head(5)

# Parameters
features = ["creative_id","ad_id","product_id","product_category","advertiser_id","industry"]
SEQ_LEN = 45
# Length every exported per-user sequence is padded / truncated to.
# NOTE(review): SEQ_LEN above says 45, but the export has always used 60, which
# is also the "seq_len" value in the model params; 60 is kept here so the
# exported files do not change. Confirm which one is intended.
PAD_LEN = 60
negsample = 0
feature_max_idx = {}

dict_table = {}
max_num = {}

# Encode every categorical feature with a table built from train + test
# together, so both frames share one id space per feature.
for key in features:
    ll = [df_total_train[key].tolist(),df_total_test[key].tolist()]
    table,_table = encode(ll)
    df_total_train[key] = transform(df_total_train[key],table)
    df_total_test[key] = transform(df_total_test[key],table)
    dict_table[key] = table
    max_num[key] = max(table.values())

# Use a context manager so the file is flushed and closed deterministically.
with open(base_path+"data.table",'wb') as table_file:
    pickle.dump(dict_table,table_file)
#dict_table = pickle.load(open("./"+method+"/data.table",'rb'))

data_train = df_total_train.sort_values("time")

# Reload cached frames. The second read used to be assigned to `data_train`
# as well, which replaced the training data with the test CSV right before the
# age / gender columns are used; it now gets its own name.
# NOTE(review): the reload still replaces the time-sorted frame built above,
# and the test export below still reads `df_total_test`, not `data_test`.
# Confirm which sources are intended.
data_train = pd.read_csv("dataTrain.csv")
data_test = pd.read_csv("dataTest.csv")

# Shift the labels down by one so they start at 0.
data_train["age"] = data_train["age"] - 1
data_train["gender"] = data_train["gender"] - 1

# Line format of the exported files: "age#gender#cseq#adseq#pseq#pcseq#aseq#iseq"
from tensorflow.python.keras.preprocessing.sequence import pad_sequences


def _padded_feature_fields(hist, feature_names, maxlen=PAD_LEN):
    """Return one comma-joined, '0'-padded id string per feature for one user.

    Args:
        hist: the rows of a single user (one group of a ``groupby("user_id")``).
        feature_names: columns to export, in output order.
        maxlen: length each sequence is padded / truncated to by pad_sequences.

    Returns:
        A list with one string per entry of ``feature_names``.
    """
    fields = []
    for name in feature_names:
        ids = hist[name].astype(str)
        padded = pad_sequences([ids], maxlen=maxlen, padding='post', value='0',dtype=object).tolist()[0]
        fields.append(",".join(padded))
    return fields


# Each user becomes one line; a uniform draw per user decides whether the line
# goes to the training file or to the validation file.
with open(base_path+"tfdata.train","w") as f,open(base_path+"tfdata.val","w") as f1:
    for indx,hist in data_train.groupby("user_id"):
        age = str(hist["age"].max())
        gender = str(hist["gender"].max())
        line = "#".join([age,gender] + _padded_feature_fields(hist,features)) + "\n"
        if random.random() >= test_size:
            f.write(line)
        else:
            f1.write(line)


# NOTE(review): test lines carry no age / gender fields, whereas EMNN.input_fn
# reads fields 0 and 1 as labels and the feature sequences from field 2 on.
# Confirm how this file is meant to be consumed.
with open(base_path+"tfdata.test","w") as f:
    for indx,hist in df_total_test.groupby("user_id"):
        f.write("#".join(_padded_feature_fields(hist,features))+"\n")

# In [3]:
from sklearn.preprocessing import LabelEncoder
from utils.utils import loadData,encode,transform,getSeq,getUserAvgSeq,getUserAvgSeq,upload
from model.W2V import wvmodel
from model.LGB import base_predict,base_train
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from utils.utils import encode,transform
import os
import tensorflow as tf
import pickle
import shutil
import random

# Experiment name; also the name of the working directory for generated files.
method = "test_nn_seq"
# Derived from `method` so the two can never point at different directories.
base_path = "./" + method + "/"
# Threshold compared against a uniform random draw per user when the export
# cell splits users between the training and validation files.
test_size = 0.2

# exist_ok=True replaces the check-then-create pair, which could race with
# another process creating the directory between the check and the mkdir.
os.makedirs("./" + method, exist_ok=True)


# In [13]:
def setConfig(dynamic_alloc=True,gpu_no="1",save_checkpoints_steps=1000,print_steps=1000,seed=2020):
    """Build the ``tf.estimator.RunConfig`` handed to the Estimator.

    Args:
        dynamic_alloc: value assigned to ``gpu_options.allow_growth``.
        gpu_no: value assigned to ``gpu_options.visible_device_list``.
        save_checkpoints_steps: forwarded to the run config unchanged.
        print_steps: forwarded as ``log_step_count_steps``.
        seed: forwarded as ``tf_random_seed``.

    Returns:
        A RunConfig carrying the session config plus the fixed settings
        ``save_summary_steps=10000`` and ``keep_checkpoint_max=20``.
    """
    session_options = {
        "log_device_placement": True,
        "inter_op_parallelism_threads": 0,
        "intra_op_parallelism_threads": 0,
        "allow_soft_placement": True,
    }
    session_config = tf.ConfigProto(**session_options)
    session_config.gpu_options.visible_device_list = gpu_no
    session_config.gpu_options.allow_growth = dynamic_alloc

    overrides = {
        "save_checkpoints_steps": save_checkpoints_steps,
        "tf_random_seed": seed,
        "session_config": session_config,
        "log_step_count_steps": print_steps,
        "save_summary_steps": 10000,
        "keep_checkpoint_max": 20,
    }
    # Start from the defaults and override, exactly as a bare RunConfig() would.
    return tf.estimator.RunConfig().replace(**overrides)


class EMNN:
    """Embedding + MLP model with an age head and a gender head (tf.estimator).

    Each of the six id sequences is embedded and mean-pooled over the sequence
    axis, the pooled vectors are concatenated and passed through a shared MLP,
    and two small towers produce a softmax over age classes and a sigmoid for
    gender.
    """

    # (key in the feature dict, embedding variable name, prefix of the
    # "<prefix>_size" / "<prefix>_emb_size" entries in params). The order is
    # both the field order expected in the data files and the concat order.
    _FEATURES = (
        ("creative_id", "creative_embedding", "creative"),
        ("ad_id", "ad_embedding", "ad"),
        ("product_id", "product_embedding", "product"),
        ("product_category", "pcate_embedding", "product_category"),
        ("advertiser_id", "adver_embedding", "advertiser"),
        ("industry", "industry_embedding", "industry"),
    )

    def __init__(self, config, params):
        """Store the run config and the hyper-parameter dict.

        Args:
            config: run configuration passed to ``tf.estimator.Estimator``.
            params: hyper-parameter dict (see ``model_fn`` for the keys read).
        """
        # The module-level `fparams` is still published for code that reads
        # it, but the instance now keeps its own reference so two models no
        # longer share whatever was constructed last.
        global fparams
        fparams = params
        self.config = config
        self.params = params
        self.model = None

    def input_fn(self, filenames, batch_size=32, num_epochs=1, perform_shuffle=False):
        """Read '#'-separated text lines and return one batch of tensors.

        Every line is expected to be ``age#gender#seq1#...#seq6`` where each
        ``seq`` is a comma-separated list of integer ids.

        Args:
            filenames: file name(s) handed to ``tf.data.TextLineDataset``.
            batch_size: number of lines per batch.
            num_epochs: how many times the dataset is repeated.
            perform_shuffle: shuffle with a 256-element buffer when true.

        Returns:
            ``(features, labels)``: a dict of int32 id tensors keyed by feature
            name, and a dict with float32 ``"age"`` and ``"gender"`` tensors.
        """
        print('Parsing', filenames)
        features = [key for key, _, _ in self._FEATURES]

        def process(columns, features):
            # Fields 0 and 1 are the labels, so feature i lives in field i + 2.
            res = []
            for indx in range(2, len(features) + 2):
                ids = tf.string_split([columns.values[indx]], ",")
                # dense_shape is [1, n]: one row holding the n ids of the field.
                ids = tf.reshape(ids.values, ids.dense_shape)
                ids = tf.string_to_number(ids, out_type=tf.int32)
                res.append(ids)
            return dict(zip(features, res))

        def decode_libsvm(line):
            columns = tf.string_split([line], '#')
            age = tf.string_to_number(columns.values[0], out_type=tf.float32)
            gender = tf.string_to_number(columns.values[1], out_type=tf.float32)
            feature = process(columns, features)
            labels = {"age": age, "gender": gender}
            return feature, labels

        dataset = tf.data.TextLineDataset(filenames).map(decode_libsvm, num_parallel_calls=10).prefetch(500000)
        if perform_shuffle:
            dataset = dataset.shuffle(buffer_size=256)
        dataset = dataset.repeat(num_epochs)
        dataset = dataset.batch(batch_size)
        iterator = dataset.make_one_shot_iterator()
        batch_features, batch_labels = iterator.get_next()
        return batch_features, batch_labels

    @staticmethod
    def Pooling(input_tensor, axis=2, method="sum", name=None):
        """Reduce ``input_tensor`` along one axis.

        Args:
            input_tensor: tensor of any shape.
            axis: axis to reduce; must be a valid non-negative axis index.
            method: ``"min"``, ``"avg"`` or ``"max"``; any other value sums.
            name: unused, kept for interface compatibility.

        Returns:
            The reduced tensor (the reduced axis is dropped).

        Raises:
            ValueError: if ``axis`` is not in ``range(rank)``.
        """
        valid_axes = list(range(len(input_tensor.shape)))
        if axis not in valid_axes:
            raise ValueError(
                "Unexpected axis %d, axis expect to include in %s" % (axis, valid_axes))
        reducers = {
            "min": tf.reduce_min,
            "avg": tf.reduce_mean,
            "max": tf.reduce_max,
        }
        # Unknown method names fall back to a sum, as before.
        return reducers.get(method, tf.reduce_sum)(input_tensor, axis=axis)

    @staticmethod
    def _build_optimizer(name, learning_rate):
        """Create the optimizer called ``name``; raise ValueError if unknown."""
        # Lambdas defer construction so only the selected optimizer is built.
        factories = {
            'Adam': lambda: tf.train.AdamOptimizer(
                learning_rate=learning_rate, beta1=0.9, beta2=0.999, epsilon=1e-8),
            'Adagrad': lambda: tf.train.AdagradOptimizer(
                learning_rate=learning_rate, initial_accumulator_value=1e-8),
            'Momentum': lambda: tf.train.MomentumOptimizer(
                learning_rate=learning_rate, momentum=0.95),
            'ftrl': lambda: tf.train.FtrlOptimizer(learning_rate),
        }
        if name not in factories:
            # Previously an unknown name surfaced later as a NameError.
            raise ValueError(
                "Unsupported optimizer %r, expected one of %s" % (name, sorted(factories)))
        return factories[name]()

    def model_fn(self, features, labels, mode, params):
        """Build the model graph f(x) for the Estimator.

        Args:
            features: dict of id tensors keyed by the six feature names.
            labels: dict with ``"age"`` and ``"gender"``; ``None`` in PREDICT.
            mode: one of ``tf.estimator.ModeKeys``.
            params: dict providing ``<prefix>_size`` and ``<prefix>_emb_size``
                for every feature, plus ``seq_len``, ``b_layers``,
                ``t_layers``, ``l2_reg``, ``learning_rate``, ``num_class`` and
                ``optimizer``.

        Returns:
            The ``tf.estimator.EstimatorSpec`` for ``mode``.
        """
        ## training parameters ##
        seq_len = params["seq_len"]
        b_layers = params["b_layers"]
        t_layers = params["t_layers"]
        l2_reg = params["l2_reg"]
        learning_rate = params["learning_rate"]
        num_class = params["num_class"]

        ## build embedding weights ##
        # One extra row per table: the size parameter is incremented by one.
        emb_tables = [
            tf.get_variable(
                name=var_name,
                shape=[params[prefix + "_size"] + 1, params[prefix + "_emb_size"]],
                initializer=tf.glorot_normal_initializer())
            for _, var_name, prefix in self._FEATURES
        ]

        ## load features ##
        # The input pipeline yields [batch, 1, seq_len]; flatten to [batch, seq_len].
        id_seqs = [tf.reshape(features[key], [-1, seq_len]) for key, _, _ in self._FEATURES]

        # The Estimator passes labels=None in PREDICT mode, so labels must not
        # be touched there.
        if mode != tf.estimator.ModeKeys.PREDICT:
            age = tf.one_hot(tf.cast(labels["age"], tf.int32), num_class)
            gender = labels["gender"]

        ## embedding ##  -> [batch, seq_len, emb_size]
        embedded = [tf.nn.embedding_lookup(table, ids) for table, ids in zip(emb_tables, id_seqs)]

        ## pooling ##  mean over the sequence axis -> [batch, emb_size]
        pooled = [EMNN.Pooling(emb, axis=1, method="avg") for emb in embedded]

        ## build input for DNN ##
        deep_inputs = tf.concat(pooled, -1)

        with tf.variable_scope("shallow_layer"):
            for i, l in enumerate(b_layers):
                deep_inputs = tf.contrib.layers.fully_connected(inputs=deep_inputs, num_outputs=l,activation_fn=tf.nn.relu,
                                                weights_regularizer=tf.contrib.layers.l2_regularizer(l2_reg),
                                                scope='mlp%d' % i)

        ## output layer 1: gender tower ##
        with tf.variable_scope("gender_layer"):
            h1 = tf.contrib.layers.fully_connected(inputs=deep_inputs, num_outputs=t_layers,activation_fn=tf.nn.relu,
                                                weights_regularizer=tf.contrib.layers.l2_regularizer(l2_reg))

        with tf.variable_scope("gender_out"):
            out1 = tf.contrib.layers.fully_connected(inputs=h1, num_outputs=1,activation_fn=tf.identity,
                                                weights_regularizer=tf.contrib.layers.l2_regularizer(l2_reg))

        ## output layer 2: age tower ##
        with tf.variable_scope("age_layer"):
            h2 = tf.contrib.layers.fully_connected(inputs=deep_inputs, num_outputs=t_layers,activation_fn=tf.nn.relu,
                                                weights_regularizer=tf.contrib.layers.l2_regularizer(l2_reg))

        with tf.variable_scope("age_out"):
            # Width follows num_class so it always matches the one-hot labels.
            out2 = tf.contrib.layers.fully_connected(inputs=h2, num_outputs=num_class,activation_fn=tf.identity,
                                                weights_regularizer=tf.contrib.layers.l2_regularizer(l2_reg))

        pred_gender_logit = tf.reshape(out1, shape=[-1])
        pred_age_logit = out2
        pred_gender = tf.nn.sigmoid(pred_gender_logit)
        # Argmax over the class axis (1). Axis 0 picked the best example per
        # class instead of the best class per example.
        pred_age = tf.argmax(tf.nn.softmax(pred_age_logit), axis=1)
        predictions = {"age": pred_age, "gender": pred_gender}

        export_outputs = {tf.saved_model.signature_constants.DEFAULT_SERVING_SIGNATURE_DEF_KEY: tf.estimator.export.PredictOutput(predictions)}
        # Provide an estimator spec for `ModeKeys.PREDICT`
        if mode == tf.estimator.ModeKeys.PREDICT:
            return tf.estimator.EstimatorSpec(
                mode=mode,
                predictions=predictions,
                export_outputs=export_outputs)

        #------build loss------
        age_loss = tf.reduce_mean(tf.nn.softmax_cross_entropy_with_logits(labels=age, logits=pred_age_logit))
        gender_loss = tf.reduce_mean(tf.nn.sigmoid_cross_entropy_with_logits(labels=gender, logits=pred_gender_logit))
        # NOTE(review): the l2 regularizers attached to the dense layers are
        # not added to `loss` here. Confirm whether that is intended.
        loss = 0.4 * gender_loss + 0.6 * age_loss

        # Provide an estimator spec for `ModeKeys.EVAL`
        if mode == tf.estimator.ModeKeys.EVAL:
            eval_metric_ops = {
                # Class index per example on both sides (axis 1 of the one-hot).
                "acc_age": tf.compat.v1.metrics.accuracy(tf.argmax(age, axis=1), pred_age),
                "auc_gender": tf.compat.v1.metrics.auc(gender, pred_gender)
            }
            return tf.estimator.EstimatorSpec(
                mode=mode,
                predictions=predictions,
                loss=loss,
                eval_metric_ops=eval_metric_ops)

        #------build optimizer------
        optimizer = EMNN._build_optimizer(params["optimizer"], learning_rate)
        train_op = optimizer.minimize(loss, global_step=tf.train.get_global_step())

        # Provide an estimator spec for `ModeKeys.TRAIN` modes
        if mode == tf.estimator.ModeKeys.TRAIN:
            return tf.estimator.EstimatorSpec(
                mode=mode,
                predictions=predictions,
                loss=loss,
                train_op=train_op)
        return tf.estimator.EstimatorSpec(mode, loss=loss, train_op=train_op)

    def compile(self, clear_existing_model=True):
        """Create the Estimator, optionally wiping its checkpoint directory.

        Args:
            clear_existing_model: delete ``./tmp/checkpoint/<ClassName>``
                first when true; otherwise existing checkpoints are reused.
        """
        run_config = self.config
        model_dir = './tmp/checkpoint/{}'.format(self.__class__.__name__)
        if clear_existing_model:
            try:
                shutil.rmtree(model_dir)
            except OSError as e:
                # Best effort: a missing directory is not an error here.
                print(e, "at clear_existing_model")
            else:
                print("existing model cleaned at {}".format(model_dir))
        else:
            # The old message claimed the model had been cleaned in exactly the
            # case where nothing is removed.
            print("reusing existing model at {}".format(model_dir))
        self.model = tf.estimator.Estimator(model_fn=self.model_fn, model_dir=model_dir, params=self.params,
                                            config=run_config)

    def train(self, tr_files, va_files):
        """Train on ``tr_files``; ``va_files`` is accepted but not used."""
        self.model.train(input_fn=lambda: self.input_fn(tr_files, num_epochs=self.params["num_epochs"],
                                                        batch_size=self.params["batch_size"]))

    def evaluate(self, va_files):
        """Run one evaluation pass over ``va_files`` and return the metrics."""
        return self.model.evaluate(
            input_fn=lambda: self.input_fn(va_files, num_epochs=1, batch_size=self.params["batch_size"]))

    def train_and_evaluate(self, tr_files, va_files):
        """Train on ``tr_files``, evaluating on ``va_files`` every ``val_itrs`` steps."""
        evaluator = tf.estimator.experimental.InMemoryEvaluatorHook(
            estimator=self.model,
            input_fn=lambda: self.input_fn(va_files, num_epochs=1, batch_size=self.params["batch_size"]),
            every_n_iter=self.params["val_itrs"])
        self.model.train(
            input_fn=lambda: self.input_fn(tr_files, num_epochs=self.params["num_epochs"],
                                           batch_size=self.params["batch_size"]),
            hooks=[evaluator])

    def predict(self, te_files, isSave=False, numToSave=10):
        """Predict age and gender for the lines of ``te_files``.

        Args:
            te_files: path of the input file.
                NOTE(review): ``input_fn`` reads fields 0 and 1 of every line
                as age / gender; confirm the file follows that layout.
            isSave: when true, write the first ``numToSave`` predictions next
                to their input lines into ``sample.unitest``.
            numToSave: maximum number of lines written when ``isSave`` is set.

        Returns:
            The prediction generator. If ``isSave`` was set, the saved
            predictions have already been consumed from it.
        """
        # No predict_keys filter: the model exposes "age" and "gender", and the
        # previously requested key "prob" does not exist.
        predictions = self.model.predict(
            input_fn=lambda: self.input_fn(te_files, num_epochs=1, batch_size=1))
        if isSave:
            with open(te_files, 'r') as f1, open('sample.unitest', "w") as f2:
                # zip stops at the shortest input, so a short file or a short
                # prediction stream ends the loop instead of raising.
                for _, sample, result in zip(range(numToSave), f1, predictions):
                    f2.write('\t'.join([str(result["age"]), str(result["gender"]), sample]))
        return predictions

# In [None]:
# Hyper-parameters handed to EMNN; the keys are the ones read by the model
# and by its train / evaluate helpers.
params = dict(
    # vocabulary sizes (the model adds one extra row to each table)
    creative_size=3412772,
    ad_size=2264191,
    product_size=33273,
    product_category_size=19,
    advertiser_size=52091,
    industry_size=326,
    # length of every id sequence in the exported files
    seq_len=60,

    # embedding widths
    creative_emb_size=128,         # table is about 1 GB
    ad_emb_size=128,               # table is about 1 GB
    product_emb_size=32,
    product_category_emb_size=4,
    advertiser_emb_size=32,
    industry_emb_size=4,

    # network shape
    a_size=4,
    num_class=10,
    b_layers=[32, 16],
    t_layers=8,

    # optimisation
    optimizer='Adam',
    num_epochs=10,
    batch_size=16,
    learning_rate=0.01,
    l2_reg=0.01,
    val_itrs=1000,
)

# Build the run config, create the estimator without wiping existing
# checkpoints, then train with periodic in-memory evaluation.
run_config = setConfig(
    dynamic_alloc=True,
    gpu_no='0',
    save_checkpoints_steps=1000,
    print_steps=100,
    seed=2020,
)
model = EMNN(config=run_config, params=params)
model.compile(clear_existing_model=False)
model.train_and_evaluate(
    tr_files="./test_nn_seq/tfdata.train",
    va_files="./test_nn_seq/tfdata.val",
)

existing model cleaned at ./tmp/checkpoint/EMNN
INFO:tensorflow:Using config: {'_model_dir': './tmp/checkpoint/EMNN', '_tf_random_seed': 2020, '_save_summary_steps': 10000, '_save_checkpoints_steps': 1000, '_save_checkpoints_secs': None, '_session_config': gpu_options {
  allow_growth: true
  visible_device_list: "0"
}
allow_soft_placement: true
log_device_placement: true
, '_keep_checkpoint_max': 20, '_keep_checkpoint_every_n_hours': 10000, '_log_step_count_steps': 100, '_train_distribute': None, '_device_fn': None, '_protocol': None, '_eval_distribute': None, '_experimental_distribute': None, '_experimental_max_worker_delay_secs': None, '_session_creation_timeout_secs': 7200, '_service': None, '_cluster_spec': <tensorflow.python.training.server_lib.ClusterSpec object at 0x7f09d44959b0>, '_task_type': 'worker', '_task_id': 0, '_global_id_in_cluster': 0, '_master': '', '_evaluation_master': '', '_is_chief': True, '_num_ps_replicas': 0, '_num_worker_replicas': 1}
Parsing ./test_nn_seq/t