# 参数配置


In [None]:
# Hyper-parameters shared by the rest of the notebook.
embed_size = 128    # width of every embedding matrix built below
maxlen = 150        # important: fixed sequence length fed to the model
random_seed = 123   # important: seed for the stratified K-fold split
batch_size = 256
epochs = 20


In [None]:
import os

# Create every output directory this notebook writes into.
# './result/prop' is included because the probability arrays are stored there
# with np.save, which does not create missing parent directories.
for _out_dir in ('./result', './result/prop', './temp'):
    os.makedirs(_out_dir, exist_ok=True)


In [None]:
from gensim.models import Word2Vec
import pandas as pd
import numpy as np
import os
import pickle
from datetime import datetime

from tqdm import tqdm_notebook
from tqdm import tqdm
from sklearn.model_selection import train_test_split

In [None]:
import tensorflow as tf
from tensorflow.keras.preprocessing.text import Tokenizer, text_to_word_sequence
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.utils import to_categorical,Sequence
from tensorflow.keras import Input,Model
from tensorflow.keras.layers import Embedding,Bidirectional,LSTM,GRU,Dense,concatenate,Activation,BatchNormalization
from tensorflow.keras.layers import TimeDistributed,Dropout,Lambda,Conv1D,GlobalMaxPooling1D,GlobalAveragePooling1D,CuDNNLSTM
from tensorflow.keras.layers import Conv2D,MaxPooling2D,Flatten,SpatialDropout1D,GlobalMaxPool1D,MaxPooling1D,Reshape
from tensorflow.keras.callbacks import EarlyStopping,ReduceLROnPlateau,ModelCheckpoint
from tensorflow.keras.optimizers import Adagrad
from sklearn.preprocessing import LabelEncoder
from sklearn.preprocessing import StandardScaler
from tensorflow.keras import regularizers

In [None]:
# File locations of the preliminary-round training set.
train_root_path ='./data/train_preliminary/'
train_ad_path = os.path.join(train_root_path,'ad.csv')
train_click_path = os.path.join(train_root_path,'click_log.csv')
train_user_path = os.path.join(train_root_path,'user.csv')

In [None]:
# File locations of the semi-final-round training set.
semi_train_root_path ='./data/train_semi_final/'
semi_train_ad_path = os.path.join(semi_train_root_path,'ad.csv')
semi_train_click_path = os.path.join(semi_train_root_path,'click_log.csv')
semi_train_user_path = os.path.join(semi_train_root_path,'user.csv')

In [None]:
# File locations of the test set (no user.csv path is defined for it).
test_root_path = './data/test/'
test_ad_path = os.path.join(test_root_path,'ad.csv')
test_click_path = os.path.join(test_root_path,'click_log.csv')

# 加载数据缓存，提升运行效率

In [None]:
# Load the cached vocabularies: word_indexs[col] maps each token of column
# `col` to its integer index.
# NOTE: unpickling can execute arbitrary code; only load caches produced locally.
with open("./data/cache/word_index.pkl", "rb") as _cache_file:
    word_indexs = pickle.load(_cache_file)

In [None]:
# Cached per-user sequence table; it is joined with the user labels on
# `user_id` further down.
df_doc = pd.read_parquet('./data/cache/final_padding.parquet_8input')

In [None]:
# Cached per-user click-times table; its `click_times` column is later
# renamed to `click_times_mask` and joined on `user_id`.
click_time_mask =pd.read_parquet('./data/cache/click_times_mask.parquet')

In [None]:
# Id-sequence columns; each one gets its own pretrained embedding and model input.
aggcol = ['creative_id','ad_id','product_id', 'advertiser_id','industry']
# Left out of the active list: 'click_times', 'time'
# Alternative wider list (disabled):
# aggcol = ['creative_id','ad_id','product_id','product_category', 'advertiser_id','industry','click_times','time']

# Bucketed column name; not referenced by the other cells visible in this notebook.
buckcol = ['click_times_buck']


### 加载常规 embedding

In [None]:
from gensim.models import KeyedVectors
import gc

# Build one (vocab_size + 1, embed_size) matrix per id column from the
# pretrained word2vec vectors. Row i holds the vector of the token whose
# index is i; rows without a pretrained vector stay all-zero.
# (The extra row presumably corresponds to padding index 0 - verify against
# how word_indexs was built.)
embeding_group = {}
for col in aggcol:
    wv = KeyedVectors.load(f"./model/word2vec_{col}_128.model", mmap='r')
    nb_words = len(word_indexs[col]) + 1
    embedding_matrix = np.zeros((nb_words, embed_size))
    count = 0
    for word, i in tqdm(word_indexs[col].items()):
        try:
            embedding_vector = wv[word]
        except KeyError:
            # Token is missing from the pretrained vocabulary: report it and
            # leave its row at zero. Any other error is left to propagate
            # instead of being silently counted as a missing token.
            print(word)
            count += 1
            continue
        embedding_matrix[i] = embedding_vector
    embeding_group[col] = embedding_matrix
    print(f"{col}: null cnt {count}")
    # Release the loaded vectors before loading the next column.
    del wv
    gc.collect()

### 加载随机游走生成的 embedding

In [None]:
from gensim.models import KeyedVectors
import gc

# Same construction as the regular embedding matrices, but reading the
# vectors stored under ./model/deepwalk/. Rows without a pretrained vector
# stay all-zero.
embeding_deepwalk_group = {}
for col in aggcol:
    wv = KeyedVectors.load(f"./model/deepwalk/word2vec_{col}_128.model", mmap='r')
    nb_words = len(word_indexs[col]) + 1
    embedding_matrix = np.zeros((nb_words, embed_size))
    count = 0
    for word, i in tqdm(word_indexs[col].items()):
        try:
            embedding_vector = wv[word]
        except KeyError:
            # Token is missing from the pretrained vocabulary: report it and
            # leave its row at zero. Any other error is left to propagate
            # instead of being silently counted as a missing token.
            print(word)
            count += 1
            continue
        embedding_matrix[i] = embedding_vector
    embeding_deepwalk_group[col] = embedding_matrix
    print(f"{col}: null cnt {count}")
    # Release the loaded vectors before loading the next column.
    del wv
    gc.collect()

In [None]:
# User tables of both rounds, stacked row-wise with exact duplicate rows removed.
df_user_pre = pd.read_csv(train_user_path)
df_user_semi = pd.read_csv(semi_train_user_path)
df_user = pd.concat([df_user_pre, df_user_semi], axis=0).drop_duplicates()

In [None]:
# The per-round frames are no longer needed once df_user is built.
del df_user_pre
del df_user_semi
import gc
gc.collect()

# 生成训练，测试数据

In [None]:
# Left join: every row of df_doc is kept; users absent from df_user get NaN
# in the label columns.
ad_docs_withuser = df_doc.merge(df_user,on=['user_id'],how='left')

In [None]:
# Rename so the column matches the name of the `click_times_mask` model input.
click_time_mask = click_time_mask.rename(columns={'click_times':'click_times_mask'})

In [None]:
# Attach the click-times mask to each user row (left join on user_id).
ad_docs_withuser = ad_docs_withuser.merge(click_time_mask,on=['user_id'],how='left')

In [None]:
# Rows without an age label form the test set; labelled rows are used for
# training and validation.
_age_missing = ad_docs_withuser['age'].isna()
df_test_final = ad_docs_withuser[_age_missing]
df_train_val_final = ad_docs_withuser[~_age_missing]

In [None]:
# Labels are shifted down by one before one-hot encoding; predictions are
# shifted back with `+ 1` when the submission is written.
label_age = to_categorical(df_train_val_final['age'] - 1)
label_gender = to_categorical(df_train_val_final['gender'] - 1) # alternatively: tf.one_hot(labels, num_cls)

In [None]:
# Convert to NumPy arrays so they can be indexed with the K-fold index arrays.
dataset_train_x ={}
dataset_test_x = {}
for col in aggcol +  ['click_times_mask','click_times_length']  :
    # np.stack turns the column of per-user values into one array per feature.
    dataset_train_x[col] = np.stack(df_train_val_final[col].values)
    dataset_test_x[col] =  np.stack(df_test_final[col].values)

# Keys match the names of the model output layers.
dataset_train_y = {'age_out':label_age, 'gender_out':label_gender}

In [None]:
# Keep the user ids next to the features; they are used when the submission
# files are written. These stay pandas Series, unlike the other entries.
dataset_train_x['user_id'] = df_train_val_final['user_id'] 
dataset_test_x['user_id'] = df_test_final['user_id'] 

In [None]:
# Drop the intermediate frames; everything needed is now in the dataset_* dicts.
del df_user
del ad_docs_withuser
del click_time_mask
import gc
gc.collect()

In [None]:
# Drop the vocabularies and remaining frames (the embedding matrices are already built).
del word_indexs
del df_train_val_final
del df_test_final
del df_doc
gc.collect()

# 模型定义

In [None]:
# NOTE: every `with mirrored_strategy.scope():` visible in this notebook is
# commented out, so the strategy is created but not applied to the models.
mirrored_strategy = tf.distribute.MirroredStrategy()


In [None]:
# Shared model inputs: one id-sequence input plus a frozen pretrained
# embedding per column in aggcol, followed by the mask and length inputs.
# Only embeding_group is used here; embeding_deepwalk_group is not referenced.
embeds= []
all_inputs = []
# Disabled: sdrop=SpatialDropout1D(rate=0.2)
# Disabled: with mirrored_strategy.scope():
for col in aggcol:
    input_x = Input(shape=(maxlen,),name =col)
    # The pretrained vectors are not fine-tuned.
    trainable = False
    embed_x = Embedding(
        input_dim=embeding_group[col].shape[0],
        output_dim=embeding_group[col].shape[1],
        weights=[embeding_group[col]],
        input_length=maxlen,
        trainable=trainable,
        name = col+'_embeding'
    )(input_x)
    embeds.append(embed_x)
    all_inputs.append(input_x)
# Input names must match the keys of the feature dicts passed to fit/predict.
click_buck_input = Input(shape=(maxlen,),name ='click_times_mask')
click_length_input = Input(shape=(),name ='click_times_length')
    # Disabled: usertfidf_input = Input(shape=(400,),name ='usertfidf')

all_inputs.append(click_buck_input)
all_inputs.append(click_length_input)
# Disabled: all_inputs.append(usertfidf_input)


In [None]:
# del embeding_group
# gc.collect()

# 年龄训练和预测

In [None]:
## Standard two-layer BiLSTM. The point of interest is that a mask is applied
## after each layer so padded positions interfere as little as possible.
def generate_model(embeds,all_inputs,click_buck_input,click_length_input):
    """Build and compile the age classifier.

    Args:
        embeds: list of embedded sequence tensors; they are concatenated on
            the last axis.
        all_inputs: the Keras inputs handed to ``Model(inputs=...)``.
        click_buck_input: mask input tensor; its values are clipped to
            [0, 1] and used both as a multiplicative mask and to build the
            pooling penalty.
        click_length_input: not used inside this function.

    Returns:
        A compiled Keras model with a single 10-way softmax output named
        ``age_out`` (Adam, lr 0.001, categorical cross-entropy).
    """
    # Add a trailing axis so the mask broadcasts over the feature dimension.
    mask_3d = click_buck_input[:, :, tf.newaxis]
    keep_mask = tf.clip_by_value(mask_3d, clip_value_min=0, clip_value_max=1)
    # Large negative offset wherever the clipped mask is 0, so those steps
    # cannot win the max pooling below.
    pool_penalty = (1.0 - keep_mask) * -10000.0

    merged = concatenate(embeds)
    first_pass = Bidirectional(
        CuDNNLSTM(480, return_sequences=True, recurrent_initializer='glorot_uniform')
    )(merged)
    # Zero out masked steps before the second recurrent layer.
    first_pass = first_pass * keep_mask
    second_pass = Bidirectional(CuDNNLSTM(480, return_sequences=True))(first_pass)
    penalised = second_pass + pool_penalty

    pooled = GlobalMaxPooling1D()(penalised)
    hidden = Dense(256, activation='relu')(pooled)
    hidden = Dense(128, activation='relu')(hidden)
    age_pred = Dense(10, activation='softmax', name='age_out')(hidden)

    model = Model(inputs=all_inputs, outputs=[age_pred])
    adam = tf.keras.optimizers.Adam(learning_rate=0.001)
    model.compile(
        optimizer=adam,
        loss='categorical_crossentropy',
        metrics={"age_out": "accuracy"},
    )
    return model
    

In [None]:
# Build a throw-away instance just to print the layer summary.
generate_model(embeds,all_inputs,click_buck_input,click_length_input).summary()

### 学习率衰减。通过观察可得，在第九个 epoch 下降效果较好


In [None]:
from tensorflow.keras.callbacks import LearningRateScheduler
def decay_schedule(epoch, lr):
    """Return the learning rate to use for ``epoch``.

    The incoming rate is multiplied by 0.1 exactly once, when ``epoch == 9``;
    for every other epoch it is returned unchanged. Keras passes a 0-based
    epoch index to scheduler functions, so index 9 is the tenth epoch.
    """
    return lr * 0.1 if epoch == 9 else lr
lr_scheduler = LearningRateScheduler(decay_schedule)

### 数据增强技术。每轮生成增强数据
#### 保留点击序列2/3以上的点击item，mask剩余部分
#### 打乱点击序列顺序，据数据分析，点击顺序跟年龄性别无关

In [None]:
class DataGenerator(tf.keras.utils.Sequence):
    """Keras ``Sequence`` that serves a freshly augmented training set each epoch.

    For every sample, a random subset of its first ``n`` positions is drawn
    (``n`` is the sample's ``click_times_length`` capped at the global
    ``maxlen``). The same positions are taken from every feature array, and
    the rest of the row is zero-padded up to ``maxlen``.

    Args:
        dataset_train: dict of feature name -> array indexed by sample. Must
            contain ``'click_times_length'``; a ``'user_id'`` entry, if
            present, is ignored.
        labels_train: label array indexed by sample.
        batch_size: number of samples per batch.
    """

    def __init__(self, dataset_train, labels_train, batch_size=128):
        """Store the data and build the first augmented epoch."""
        super().__init__()
        self.dataset_train = dataset_train
        self.labels_train = labels_train
        self.batch_size = batch_size
        self.click_time_length = self.dataset_train['click_times_length']
        self.total_size = self.click_time_length.shape[0]
        self.indexes = np.arange(self.total_size)

        self.enhanced_data = {}
        self.epoch_count = 0
        self.on_epoch_end()

    def on_epoch_end(self):
        """Re-sample the augmented arrays and reshuffle the sample order."""
        # Release the previous epoch's arrays before allocating new ones.
        self.enhanced_data = {}
        gc.collect()

        keys = [key for key in self.dataset_train
                if key not in ('click_times_length', 'user_id')]

        # Preallocate the zero-padded output instead of stacking per-row
        # arrays. The dtype matches what hstack-ing the source rows with
        # integer zero padding produced.
        augmented = {}
        for key in keys:
            source = np.asarray(self.dataset_train[key])
            augmented[key] = np.zeros(
                (self.total_size, maxlen),
                dtype=np.result_type(source.dtype, np.int_),
            )

        for index, value in tqdm(enumerate(self.click_time_length)):
            value = int(min(value, maxlen))
            if value <= 1:
                # np.random.randint(low, high) needs low < high, and a
                # one-item sequence would otherwise always be fully masked.
                # Keep such sequences unchanged.
                sampled_action = np.arange(max(value, 0))
            else:
                # Keep between int(2/3 * value) and value - 1 positions.
                big = np.random.randint(int(value / 3.0 * 2), value)
                # NOTE(review): replace=True draws with replacement, so the
                # same position can repeat; confirm this is intended rather
                # than a shuffled subset (replace=False).
                sampled_action = np.random.choice(value, big, replace=True)
            kept = len(sampled_action)
            for key in keys:
                augmented[key][index, :kept] = self.dataset_train[key][index][sampled_action]

        # The length feature is passed through unchanged.
        augmented['click_times_length'] = self.click_time_length
        self.enhanced_data = augmented
        np.random.shuffle(self.indexes)

    def __len__(self):
        """Return the number of batches per epoch, including a final partial one."""
        return int(np.ceil(self.total_size / self.batch_size))

    def __getitem__(self, index):
        """Return batch ``index`` as ``(feature_dict, labels)``."""
        start = index * self.batch_size
        # Slicing clamps at the end of the array, so the last batch may be short.
        selected = self.indexes[start:start + self.batch_size]
        batch_data = {key: values[selected]
                      for key, values in self.enhanced_data.items()}
        return batch_data, self.labels_train[selected]

# 五折运行

In [None]:
from sklearn.model_selection import StratifiedKFold
import gc
import pickle

# Build the 5-fold split once and cache it on disk, so later runs and both
# models reuse exactly the same train/validation indices.
folds = []
if not os.path.exists('./temp/folder.pkl'):
    skf = StratifiedKFold(n_splits=5, random_state=random_seed, shuffle=True)
    # NOTE(review): the stratification target is column 0 of the one-hot age
    # matrix (a binary "is first class" flag), not the full class label.
    # Left unchanged so that cached folds and saved checkpoints stay valid;
    # confirm whether np.argmax(..., axis=1) was intended.
    folds = list(skf.split(dataset_train_x['creative_id'], dataset_train_y['age_out'][:,0]))
    with open("./temp/folder.pkl", "wb") as f:
        pickle.dump(folds, f)
else:
    with open("./temp/folder.pkl", "rb") as f:
        folds = pickle.load(f)

In [None]:
# Write a log line per epoch so progress can still be inspected after the
# notebook session has expired.
class CustomCallback(tf.keras.callbacks.Callback):
    """Append the time, epoch index and ``val_acc`` to ``log.txt`` after each epoch."""

    def on_epoch_end(self, epoch, logs=None):
        """Append one line to ``log.txt``.

        Args:
            epoch: epoch index supplied by Keras.
            logs: metrics dict supplied by Keras; may be ``None``. A missing
                ``val_acc`` entry is logged as ``None`` instead of raising,
                so a logging problem cannot abort training.
        """
        logs = logs or {}
        current_time = datetime.now().strftime("%H:%M:%S")
        with open("log.txt", "a") as f:
            f.write(f"{current_time}, epoch:{epoch},val_acc:{logs.get('val_acc')}\n")

In [None]:
from sklearn.model_selection import StratifiedKFold
import gc
# Train one age model per cached fold and average the test-set probabilities.
score = []
sub = np.zeros((dataset_test_x['creative_id'].shape[0], 10))
# skf is only used below for its n_splits value; the splits come from `folds`.
skf = StratifiedKFold(n_splits=5, random_state=random_seed, shuffle=True)
count = 0
for i, (train_index, test_index) in enumerate(folds):
    print("FOLD | ", i+1)
    print("###"*35)
    gc.collect()
    filepath = f"model/age_{i}/nn_age_v1.tf"
    # Disabled: with mirrored_strategy.scope():

    # Disabled: model_age.load_weights(filepath)
    
    # Slice this fold's training and validation inputs out of the full arrays.
    fold_data_train_x ={}
    fold_data_val_x ={}
    for col in aggcol + ['click_times_mask','click_times_length']:
        fold_data_train_x[col] = dataset_train_x[col][train_index]
        fold_data_val_x[col] = dataset_train_x[col][test_index]
    fold_data_train_y = {'age_out':label_age[train_index]}  # gender labels are not used in this loop
    fold_data_val_y= {'age_out':label_age[test_index]}  # gender labels are not used in this loop
    # Training batches come from the augmenting generator; validation uses the raw arrays.
    data_generator =DataGenerator(fold_data_train_x,fold_data_train_y['age_out'],batch_size)

    # Only the weights of the best epoch (highest val_acc) are written to filepath.
    checkpoint = ModelCheckpoint(
        filepath, monitor='val_acc', verbose=1, save_best_only=True, mode='max',save_weights_only=True)
    # NOTE: reduce_lr is constructed but not included in `callbacks` below.
    reduce_lr = ReduceLROnPlateau(
        monitor='val_acc', factor=0.5, patience=3, min_lr=0.0001, verbose=1)
    earlystopping = EarlyStopping(
        monitor='val_acc', min_delta=0.001, patience=6, verbose=1, mode='max')
    logcallback = CustomCallback()
    callbacks = [checkpoint,logcallback, lr_scheduler, earlystopping]

    # A new model is built for every fold.
    model_age  = generate_model(embeds,all_inputs,click_buck_input,click_length_input)
    hist = model_age.fit(data_generator, epochs=epochs, 
                         validation_data=(fold_data_val_x, fold_data_val_y),
                         callbacks=callbacks, verbose=1, shuffle=True)
    # Disabled alternative: fit directly on fold_data_train_x / fold_data_train_y
    # with batch_size=batch_size, i.e. without the augmenting generator.
    del data_generator
    gc.collect()
    # Restore the best checkpoint of this fold before predicting on the test set.
    model_age.load_weights(filepath)
    tem = model_age.predict(dataset_test_x,batch_size=512,verbose=1)
    # Keep the per-fold probabilities as well as the running average.
    np.save(f'./temp/age_folder_{i}.npy',tem)
    sub += tem/skf.n_splits
    score.append(np.max(hist.history['val_acc']))
    count += 1
print('acc:', np.mean(score))

In [None]:
import gc
gc.collect()

In [None]:
# Keep the fold-averaged age probabilities; `sub` is reassigned by the gender loop.
age_result = sub

In [None]:
# np.save appends '.npy' to the name; the ./result/prop directory must already exist.
np.save('./result/prop/age_result',age_result)

In [None]:
# argmax gives a 0-based class index; + 1 undoes the shift applied to the labels.
age_arg = np.argmax(age_result,axis=1)+1
# Age-only file: predicted_gender is filled with the placeholder -1.
final = pd.DataFrame({'user_id':dataset_test_x['user_id'],'predicted_age':age_arg,'predicted_gender':[-1]*len(age_arg)})
final.to_csv('./temp/w2v_only_age_0710_05135.csv',index=False)

# 性别训练和预测

In [None]:
from tensorflow.keras.callbacks import LearningRateScheduler
def gender_decay_schedule(epoch, lr):
    """Return the learning rate to use for ``epoch`` of gender training.

    The incoming rate is multiplied by 0.1 exactly once, when ``epoch == 3``;
    for every other epoch it is returned unchanged. Keras passes a 0-based
    epoch index to scheduler functions, so index 3 is the fourth epoch.
    """
    return lr * 0.1 if epoch == 3 else lr
lr_gender_scheduler = LearningRateScheduler(gender_decay_schedule)

In [None]:
# Same architecture as the age model, with layer sizes tuned for the gender task.
def generate_gender_model(embeds,all_inputs,click_buck_input,click_length_input):
    """Build and compile the gender classifier.

    Args:
        embeds: list of embedded sequence tensors; they are concatenated on
            the last axis.
        all_inputs: the Keras inputs handed to ``Model(inputs=...)``.
        click_buck_input: mask input tensor; its values are clipped to
            [0, 1] and used both as a multiplicative mask and to build the
            pooling penalty.
        click_length_input: not used inside this function.

    Returns:
        A compiled Keras model with a single 2-way softmax output named
        ``gender_out`` (Adam, lr 0.001, categorical cross-entropy).
    """
    # Add a trailing axis so the mask broadcasts over the feature dimension.
    orgin_mask = click_buck_input[:, :, tf.newaxis]
    clip_mask = tf.clip_by_value(orgin_mask, clip_value_min=0, clip_value_max=1)
    # Large negative offset wherever the clipped mask is 0, so those steps
    # cannot win the max pooling below.
    max_mask = (1.0 - clip_mask) * -10000.0

    x = concatenate(embeds)
    x = Bidirectional(CuDNNLSTM(640, return_sequences=True, recurrent_initializer='glorot_uniform'))(x)
    # Zero out masked steps before the second recurrent layer.
    x = x * clip_mask
    x = Bidirectional(CuDNNLSTM(320, return_sequences=True))(x)
    for_max = x + max_mask

    # Fix: pool the penalised sequence. The previous code pooled `con_x`,
    # a name that is never defined, so building the model raised NameError.
    x = GlobalMaxPooling1D()(for_max)
    x = Dense(256, activation='relu')(x)
    x = Dense(128, activation='relu')(x)
    gender_pred = Dense(2, activation='softmax', name='gender_out')(x)
    model = Model(inputs=all_inputs, outputs=[gender_pred])
    optimizer = tf.keras.optimizers.Adam(learning_rate=0.001)
    model.compile(optimizer=optimizer, loss='categorical_crossentropy', metrics={"gender_out": "accuracy"})
    return model
    

In [None]:
from sklearn.model_selection import StratifiedKFold
import gc
# Train one gender model per cached fold and average the test-set probabilities.
score = []

sub = np.zeros((dataset_test_x['creative_id'].shape[0], 2))
# skf is only used below for its n_splits value; the splits come from `folds`.
skf = StratifiedKFold(n_splits=5, random_state=random_seed, shuffle=True)
count = 0
for i, (train_index, test_index) in enumerate(folds):
    print("FOLD | ", count+1)
    print("###"*35)
    gc.collect()
    filepath = f"model/gender_{i}/nn_gender_v1.tf"
    # Only the weights of the best epoch (highest val_acc) are written to filepath.
    checkpoint = ModelCheckpoint(
        filepath, monitor='val_acc', verbose=1, save_best_only=True, mode='max',save_weights_only=True)
    # NOTE: reduce_lr is constructed but not included in `callbacks` below.
    reduce_lr = ReduceLROnPlateau(
        monitor='val_acc', factor=0.5, patience=3, min_lr=0.0001, verbose=1)
    earlystopping = EarlyStopping(
        monitor='val_acc', min_delta=0.001, patience=5, verbose=1, mode='max')
    callbacks = [checkpoint, lr_gender_scheduler, earlystopping]

    # A new model is built for every fold.
    model_gender = generate_gender_model(embeds,all_inputs,click_buck_input,click_length_input)

    if i==0:
        model_gender.summary()
    # Slice this fold's training and validation inputs out of the full arrays.
    fold_data_train_x ={}
    fold_data_val_x ={}
    for col in aggcol + ['click_times_mask','click_times_length']:
        fold_data_train_x[col] = dataset_train_x[col][train_index]
        fold_data_val_x[col] = dataset_train_x[col][test_index]
    fold_data_train_y = {'gender_out':label_gender[train_index]}
    fold_data_val_y= { 'gender_out':label_gender[test_index]}
    # Training batches come from the augmenting generator; validation uses the raw arrays.
    data_generator =DataGenerator(fold_data_train_x,fold_data_train_y['gender_out'],batch_size)

    hist = model_gender.fit(data_generator, epochs=epochs, 
                         validation_data=(fold_data_val_x, fold_data_val_y),
                         callbacks=callbacks, verbose=1, shuffle=True)
    # Restore the best checkpoint of this fold before predicting on the test set.
    model_gender.load_weights(filepath)
    sub += model_gender.predict(dataset_test_x,batch_size=512,verbose=1)/skf.n_splits
    # Fix: read the same history key that the callbacks above monitor and
    # that the age loop reads ('val_acc'); this line previously used
    # 'val_accuracy', which disagrees with the rest of the notebook.
    score.append(np.max(hist.history['val_acc']))
    count += 1
print('acc:', np.mean(score))

In [None]:
# Keep the fold-averaged gender probabilities.
gender_result = sub

In [None]:
# np.save appends '.npy' to the name; the ./result/prop directory must already exist.
np.save('./result/prop/gender_result',gender_result)

In [None]:
# argmax gives a 0-based class index; + 1 undoes the shift applied to the labels.
age_arg = np.argmax(age_result,axis=1)+1
gender_arg = np.argmax(gender_result,axis=1)+1

In [None]:
# Combined submission: one row per test user with both predicted labels.
final = pd.DataFrame({'user_id':dataset_test_x['user_id'],'predicted_age':age_arg,'predicted_gender':gender_arg})
final.to_csv('./result/w2v_age_gender.csv',index=False)

# 后处理。进行权重搜索

In [None]:
####opt
from sklearn.metrics import accuracy_score
class_num=10
#ground_truth = np.argmax(dataset_train_y['age_out'],axis=1)  
def search_weight(valid_y, raw_prob, init_weight=None, step=0.001):
    """Greedy coordinate search for per-class weights that raise accuracy.

    For one class at a time, each candidate weight ``k * step`` with
    ``k = 0, 10, ..., 1990`` is tried while the other weights stay fixed; a
    candidate is kept whenever it strictly improves accuracy on the given
    validation data. Passes over all classes repeat until a full pass brings
    no improvement.

    Args:
        valid_y: true class indices of the validation samples.
        raw_prob: 2-D array of predicted probabilities, one column per class.
        init_weight: starting weights, one per class. Defaults to all ones,
            sized from ``raw_prob`` rather than from a module-level constant.
        step: multiplier turning the integer grid into candidate weights.

    Returns:
        list of per-class weights; multiply ``raw_prob`` by it before argmax.
    """
    if init_weight is None:
        weight = [1.0] * raw_prob.shape[1]
    else:
        weight = list(init_weight)

    # The baseline is the accuracy obtained with the starting weights, so a
    # candidate is only accepted if it actually beats them.
    f_best = accuracy_score(
        y_true=valid_y, y_pred=(raw_prob * np.array(weight)).argmax(axis=1))
    flag_score = 0
    round_num = 1
    while flag_score != f_best:
        print("round: ", round_num)
        round_num += 1
        flag_score = f_best
        for c in range(len(weight)):
            for n_w in range(0, 2000, 10):
                new_weight = weight.copy()
                new_weight[c] = n_w * step

                # The multiplication already yields a new array, so raw_prob
                # is left untouched without an explicit copy.
                weighted_pred = (raw_prob * np.array(new_weight)).argmax(axis=1)
                f = accuracy_score(y_true=valid_y, y_pred=weighted_pred)
                if f > f_best:
                    weight = new_weight
                    f_best = f
                    print(f)
    return weight


In [None]:
def get_weighted_result(i):
    """Re-weight the test predictions of fold ``i``'s age model.

    Loads the checkpoint of fold ``i``, searches per-class weights on that
    fold's validation split with ``search_weight``, and applies them to the
    model's test-set probabilities.

    Args:
        i: index into ``folds`` and into the ``model/age_{i}`` checkpoint paths.

    Returns:
        Tuple ``(weighted_test_probabilities, weights)``.
    """
    model_age = generate_model(embeds,all_inputs,click_buck_input,click_length_input)
    model_age.load_weights(f"model/age_{i}/nn_age_v1.tf")
    # Only the validation split is needed; the training-fold copies that were
    # built here before were never used and only cost memory.
    _, test_index = folds[i]

    fold_data_val_x = {
        col: dataset_train_x[col][test_index]
        for col in aggcol + ['click_times_mask','click_times_length']
    }

    val_truth = np.argmax(label_age[test_index], axis=1)
    val_predict = model_age.predict(fold_data_val_x, batch_size=512, verbose=1)

    val_acc = accuracy_score(y_true=val_truth, y_pred=val_predict.argmax(axis=1))
    print(f'origin acc:{val_acc}')
    param = search_weight(val_truth, val_predict)
    weight_acc = accuracy_score(y_true=val_truth, y_pred=(val_predict*param).argmax(axis=1))
    print(f'post acc:{weight_acc}')

    test_predict = model_age.predict(dataset_test_x, batch_size=512, verbose=1)
    return test_predict*param, param

In [None]:
# Weight search for the model of fold index 2.
result,weight = get_weighted_result(2)


In [None]:
# Stored divided by 5, the number of folds used in the training loops.
np.save('./temp/age_folder_2_post.npy',result/5)

In [None]:
# Weight search for the model of fold index 4.
result,weight = get_weighted_result(4)


In [None]:
# Stored divided by 5, the number of folds used in the training loops.
np.save('./temp/age_folder_4_post.npy',result/5)

In [None]:
# Inspect the row sums of the weighted probabilities (after weighting they need not sum to 1).
np.sum(result,axis=1)