@Copyright IQIYI 2021
http://challenge.ai.iqiyi.com/

In [1]:
import pandas as pd
import numpy as np
import json
import math
import tensorflow as tf
from sklearn.model_selection import KFold,StratifiedKFold
from tensorflow.keras.utils import Sequence

In [2]:
import os
#os.environ['CUDA_VISIBLE_DEVICES'] = '0'
# Ask TensorFlow which physical GPU devices it can see and print the list
# as a quick sanity check of the runtime.
gpus = tf.config.experimental.list_physical_devices('GPU')
print(gpus)

[PhysicalDevice(name='/physical_device:GPU:0', device_type='GPU')]


In [3]:
# Directory that holds the input tables (train_data.txt / test_data.txt).
data_dir = "./data/new_data_B/"
# Path *template* (not a directory) for per-fold checkpoints; the "{}" is
# filled in with the fold number via model_dir.format(...).
model_dir = "./data/model/model_kfold_{}.h5"
# Directory the submission CSV is written to.
submit_dir = "./data/submit/"

In [4]:
# Batch iterator for Keras: every element is one training step holding up to
# `batch_size` rows of the wrapped DataFrame.
# Reference: https://blog.csdn.net/weixin_37737254/article/details/103884255
class DataGenerator(Sequence):
    """Serve the rows of a DataFrame to Keras in consecutive fixed-size batches.

    Each batch is a pair ``((input_1, ..., input_5), output)`` where the first
    four inputs are the per-row sequence columns stacked into arrays, the fifth
    is the matrix of scalar feature columns listed in ``self.fea``, and
    ``output`` is the ``label`` column.
    """

    def __init__(self, df, batch_size):
        """Remember the frame, its row count, the batch size and the feature list.

        Args:
            df: DataFrame providing the sequence columns, the columns named in
                ``self.fea`` and a ``label`` column.
            batch_size: Maximum number of rows per batch.
        """
        self.batch_size = batch_size
        self.data = df
        self.num = df.shape[0]
        # Scalar (non-sequence) columns fed to the dense branch, in this order.
        self.fea = [
            'father_id_score', 'cast_id_score', 'tag_score',
            'device_type', 'device_ram', 'device_rom',
            'sex', 'age', 'education',
            'occupation_status', 'territory_score', 'launch_times',
            'launch_times_31', 'launch_times_15', 'launch_times_7',
            'playtime_31', 'playtime_15', 'playtime_7',
        ]

    def __len__(self):
        """Return the number of batches; the last one may be shorter."""
        batches = math.ceil(self.num / self.batch_size)
        return batches

    def __getitem__(self, idx):
        """Build batch number ``idx`` as ``((input_1..input_5), output)``."""
        start = idx * self.batch_size
        stop = start + self.batch_size
        rows = self.data.iloc[start:stop]

        def stacked(column):
            # Each cell holds one sequence; stacking the cells yields the
            # nested [[...], [...], ...] layout the model is fed with.
            return np.array(rows[column].tolist())

        input_1 = stacked("launch_seq_31")
        input_2 = stacked("playtime_seq")
        input_3 = stacked("duration_prefer")
        input_4 = stacked("interact_prefer")
        input_5 = np.array(rows[self.fea])

        output = np.array(rows["label"])

        return (input_1, input_2, input_3, input_4, input_5), output

In [5]:
def build_model(seq_len,dur_seq_len,inter_seq_len, feature_num):
    """Assemble the five-input Keras model.

    Four inputs of shape ``(steps, 1)`` are each summarised by a GRU layer, a
    fifth input of shape ``(feature_num,)`` goes through a 64-unit ReLU dense
    layer, and the five results are concatenated and passed through a 64-unit
    ReLU dense layer followed by a single ReLU output unit.

    Args:
        seq_len: Number of steps of the first and second sequence inputs.
        dur_seq_len: Number of steps of the fourth sequence input.
        inter_seq_len: Number of steps of the third sequence input.
        feature_num: Width of the fifth (flat feature) input.

    Returns:
        An uncompiled ``tf.keras.Model`` taking
        ``[input_1, input_2, input_3, input_4, input_5]``.
    """
    keras = tf.keras

    def gru_branch(steps, units):
        # One sequence input plus the GRU that reduces it to a `units` vector.
        branch_in = keras.Input(shape=(steps, 1))
        branch_out = keras.layers.GRU(units)(branch_in)
        return branch_in, branch_out

    input_1, output_1 = gru_branch(seq_len, 32)
    input_2, output_2 = gru_branch(seq_len, 32)
    # NOTE(review): input_3 is sized by inter_seq_len and input_4 by
    # dur_seq_len, while DataGenerator supplies duration_prefer as the third
    # input and interact_prefer as the fourth - confirm the pairing is intended.
    input_3, output_3 = gru_branch(inter_seq_len, 11)
    input_4, output_4 = gru_branch(dur_seq_len, 16)

    input_5 = keras.Input(shape=(feature_num, ))
    output_5 = keras.layers.Dense(64, activation="relu")(input_5)

    merged = tf.concat(
        [output_1, output_2, output_3, output_4, output_5], -1
    )
    # An extra Dense(128) + Dropout(0.15) stage used to sit here; per the
    # author's note, removing it raised the score by 0.002.
    hidden = keras.layers.Dense(64, activation="relu")(merged)
    prediction = keras.layers.Dense(1, activation="relu")(hidden)

    return keras.Model(
        inputs=[input_1, input_2, input_3, input_4, input_5],
        outputs=prediction,
    )

# train

In [6]:
# Training data: read the tab-separated table, then decode the four columns
# that store their sequences as JSON text into Python objects.
train = pd.read_csv(data_dir + "train_data.txt", sep="\t")
for seq_col in ("launch_seq_31", "playtime_seq", "duration_prefer", "interact_prefer"):
    train[seq_col] = train[seq_col].apply(json.loads)

In [7]:
# Display the column names of the loaded training frame.
train.columns

Index(['user_id', 'end_date', 'label', 'launch_date_len', 'launch_date',
       'launch_type', 'launch_times', 'launch_type_0', 'launch_type_1',
       'launch_type_01rate', 'start_end_launch', 'launch_seq_31',
       'launch_seq_15', 'launch_seq_7', 'launch_times_31', 'launch_times_15',
       'launch_times_7', 'playtime_31', 'playtime_15', 'playtime_7',
       'playtime_seq', 'duration_prefer', 'father_id_score', 'cast_id_score',
       'tag_score', 'device_type', 'device_ram', 'device_rom', 'sex', 'age',
       'education', 'occupation_status', 'territory_score', 'interact_prefer'],
      dtype='object')

In [8]:
# Shuffle the rows and renumber the index from 0.
# The shuffle is seeded (same seed as the K-fold splitter) so that the row
# order, and with it the fold membership and out-of-fold score, is
# reproducible from run to run.
train = train.sample(frac=1, random_state=2021).reset_index(drop=True)
#train = train.iloc[:-1]

In [9]:
# Display the number of training rows.
len(train)

600001

In [10]:
# Test data: read the tab-separated table, then decode the four columns that
# store their sequences as JSON text into Python objects.
test = pd.read_csv(data_dir + "test_data.txt", sep="\t")
for seq_col in ("launch_seq_31", "playtime_seq", "duration_prefer", "interact_prefer"):
    test[seq_col] = test[seq_col].apply(json.loads)

In [11]:
# TODO: target-encode a subset of the features (this cell is currently empty)

In [12]:
def aiyiqi_metric(y_true,y_pred):
    """Return the score ``100 * (1 - mean(|y_true - y_pred| / 7))``.

    A perfect prediction scores 100; the score drops linearly with the mean
    absolute error scaled by 7.

    Args:
        y_true: Iterable of ground-truth values.
        y_pred: Iterable of predicted values, one per ground-truth value.

    Returns:
        The score as a number (100 for an exact match).

    Raises:
        ValueError: If the inputs are empty or differ in length. Previously a
            longer ``y_pred`` was silently truncated and empty input surfaced
            as a bare ``ZeroDivisionError``.
    """
    y_true = list(y_true)
    y_pred = list(y_pred)
    if len(y_true) != len(y_pred):
        raise ValueError(
            "y_true and y_pred must have the same length, got {} and {}".format(
                len(y_true), len(y_pred)
            )
        )
    if not y_true:
        raise ValueError("y_true and y_pred must not be empty")

    # Accumulate sequentially (same order as before) so the floating-point
    # result is unchanged for valid input.
    score = 0
    for truth, pred in zip(y_true, y_pred):
        score += abs(truth - pred) / 7
    return 100 * (1 - score / len(y_true))

In [13]:
# # testing DataGenerator
# generator_test = DataGenerator(train[:20], batch_size=8)
# print(generator_test)
# for i, item in enumerate(generator_test):
#     if(i == len(generator_test)):
#         break
#     (input_1, input_2,input_3, input_4,input_5), output = item
#     print(i, input_1.shape, input_2.shape)
#     print(i, output.shape, output)

In [14]:
# Stratified 5-fold cross-validation: train one model per fold, collect the
# out-of-fold (OOF) predictions for scoring, and accumulate test predictions.
oof_pred = np.zeros(len(train))
# Sum of the per-fold test predictions; it is divided by the number of folds
# later, when the submission is written.
test_preds = np.zeros(len(test))

y = train['label']
x = train.drop('label',axis=1)
kfold = StratifiedKFold(random_state=2021,n_splits=5,shuffle=True)

new_test = DataGenerator(test,256)
for kf,(train_idx,val_idx) in enumerate(kfold.split(x,y)):
    print('#####第{}折####'.format(kf+1))
    print('train len: {}'.format(len(train_idx)))
    print('val len: {}'.format(len(val_idx)))
    # Re-attach the label to the feature columns: DataGenerator reads both
    # from a single frame.
    train_x = x.iloc[train_idx]
    train_y = y.iloc[train_idx]
    train_df = pd.concat([train_x,train_y],axis=1)
    val_x = x.iloc[val_idx]
    val_y = y.iloc[val_idx]
    val_df = pd.concat([val_x,val_y],axis=1)

    train_bt = DataGenerator(train_df,256)
    val_bt = DataGenerator(val_df,256)

    # Every fold starts from the weights stored in best_model.h5 instead of a
    # freshly built network.
    # NOTE(review): if that checkpoint was trained on rows that fall into this
    # fold's validation split, the OOF score below is optimistic - confirm.
    model = tf.keras.models.load_model('./data/model/best_model.h5')
    # Alternative: train from scratch.
    #model = build_model(seq_len=32,dur_seq_len=16,inter_seq_len=11,feature_num=18)
    model.compile(optimizer=tf.keras.optimizers.Adam(0.00008),loss="mse",metrics=["mse"])
    early_stopping = tf.keras.callbacks.EarlyStopping(monitor="val_mse", patience=3, restore_best_weights=True)
    lr_reduce = tf.keras.callbacks.ReduceLROnPlateau(patience=2,monitor='val_mse', factor=0.1)
    # No `monitor` is passed, so the checkpoint tracks its default quantity
    # (the training log reports it as val_loss).
    best_checkpoint = tf.keras.callbacks.ModelCheckpoint(model_dir.format(kf+1),save_best_only=True, save_weights_only=False,verbose=1)
    # Model.fit accepts Sequence objects directly; the deprecated
    # Model.fit_generator is no longer needed.
    model.fit(train_bt,
              steps_per_epoch=len(train_bt),
              epochs=20,
              verbose=1,
              validation_data=val_bt,
              validation_steps=len(val_bt),
              callbacks=[best_checkpoint,early_stopping,lr_reduce])

    # Reload the best checkpoint saved for the current fold.
    best_model = tf.keras.models.load_model(model_dir.format(kf+1))

    # Inference on the validation split -> out-of-fold predictions.
    val_pred =  best_model.predict(val_bt, steps=len(val_bt))[:,0]
    oof_pred[val_idx] = val_pred

    # Inference on the test set, accumulated across folds.
    test_pred =  best_model.predict(new_test, steps=len(new_test))[:,0]
    test_preds += test_pred


# Overall score of the out-of-fold predictions.
y_true = train.label
score = aiyiqi_metric(y_true,oof_pred)
print('多折验证集总体得分：{}'.format(score))

#####第1折####
train len: 480000
val len: 120001




Epoch 1/20

Epoch 00001: val_loss improved from inf to 1.59970, saving model to ./data/model\model_kfold_1.h5
Epoch 2/20

Epoch 00002: val_loss improved from 1.59970 to 1.59852, saving model to ./data/model\model_kfold_1.h5
Epoch 3/20

Epoch 00003: val_loss improved from 1.59852 to 1.59796, saving model to ./data/model\model_kfold_1.h5
Epoch 4/20

Epoch 00004: val_loss did not improve from 1.59796
Epoch 5/20

Epoch 00005: val_loss improved from 1.59796 to 1.59789, saving model to ./data/model\model_kfold_1.h5
Epoch 6/20

Epoch 00006: val_loss improved from 1.59789 to 1.59757, saving model to ./data/model\model_kfold_1.h5
Epoch 7/20

Epoch 00007: val_loss improved from 1.59757 to 1.59757, saving model to ./data/model\model_kfold_1.h5
Epoch 8/20

Epoch 00008: val_loss improved from 1.59757 to 1.59754, saving model to ./data/model\model_kfold_1.h5
Epoch 9/20

Epoch 00009: val_loss improved from 1.59754 to 1.59753, saving model to ./data/model\model_kfold_1.h5
Epoch 10/20

Epoch 00010: val


Epoch 00006: val_loss improved from 1.59176 to 1.59166, saving model to ./data/model\model_kfold_3.h5
Epoch 7/20

Epoch 00007: val_loss improved from 1.59166 to 1.59154, saving model to ./data/model\model_kfold_3.h5
Epoch 8/20

Epoch 00008: val_loss did not improve from 1.59154
Epoch 9/20

Epoch 00009: val_loss did not improve from 1.59154
Epoch 10/20

Epoch 00010: val_loss improved from 1.59154 to 1.59132, saving model to ./data/model\model_kfold_3.h5
Epoch 11/20

Epoch 00011: val_loss improved from 1.59132 to 1.59128, saving model to ./data/model\model_kfold_3.h5
Epoch 12/20

Epoch 00012: val_loss improved from 1.59128 to 1.59125, saving model to ./data/model\model_kfold_3.h5
Epoch 13/20

Epoch 00013: val_loss improved from 1.59125 to 1.59125, saving model to ./data/model\model_kfold_3.h5
Epoch 14/20

Epoch 00014: val_loss improved from 1.59125 to 1.59124, saving model to ./data/model\model_kfold_3.h5
Epoch 15/20

Epoch 00015: val_loss did not improve from 1.59124
Epoch 16/20

Epoch


Epoch 00010: val_loss did not improve from 1.61063
多折验证集总体得分：87.09650820267586


In [15]:
# Save the test-set result.
# test_preds holds the SUM of the per-fold predictions, so divide by the
# number of folds actually configured on the splitter rather than a
# hard-coded 5; the average then stays correct if n_splits is changed.
prediction = test_preds / kfold.get_n_splits()
test['prediction'] = prediction
res = test[["user_id", "prediction"]]
# The OOF score is embedded in the file name; predictions are written with
# two decimals, without header or index.
res.to_csv(submit_dir + "submit_{}.csv".format(score), index=False, header=False, float_format="%.2f")