@Copyright IQIYI 2021
http://challenge.ai.iqiyi.com/

In [1]:
import pandas as pd
import numpy as np
import json
import math
import tensorflow as tf
import os
from sklearn.preprocessing import LabelEncoder
from tensorflow.keras.utils import Sequence
from sklearn.model_selection import StratifiedKFold
from tqdm import tqdm

In [2]:
data_dir = "./data/new_data_B/"
model_dir = "./data/model/best_model.h5"
submit_dir = "./data/submit/"

In [3]:
#制作一个迭代器，迭代器里面的每个元素是一个bt=n的step
#https://blog.csdn.net/weixin_37737254/article/details/103884255
class DataGenerator(Sequence):
    def __init__(self, df, batch_size):
        self.data = df
        self.num = df.shape[0]
        self.batch_size = batch_size
        self.fea = ['father_id_score', 'cast_id_score', 'tag_score',
       'device_type', 'device_ram', 'device_rom', 'sex', 'age', 'education',
       'occupation_status', 'territory_score','launch_times', 
       'launch_times_31', 'launch_times_15', 'launch_times_7', 'playtime_31',
       'playtime_15', 'playtime_7']#'launch_date_len_target_enc','start_end_launch',目前最佳只有钱18个,'launch_date_len','launch_type_0', 'launch_type_1'

    def __len__(self):
        return math.ceil(self.num / self.batch_size)

    def __getitem__(self,idx):
        batch_data = self.data.iloc[idx*self.batch_size:(idx+1)*self.batch_size]

        input_1 = np.array([i for i in batch_data.launch_seq_31])
        input_2 = np.array([i for i in batch_data.playtime_seq])
        input_3 = np.array([i for i in batch_data.duration_prefer])
        input_4 = np.array([i for i in batch_data.interact_prefer])
        input_5 = np.array(batch_data[self.fea])
        #以上特征要做成[[][][]]这样的形式读取
        
        output = np.array(batch_data.label)

        return (input_1, input_2, input_3, input_4, input_5), output

In [4]:
testA = pd.read_csv("./data/new_data/test_data.txt", sep="\t")
testA["launch_seq_31"] = testA.launch_seq_31.apply(lambda x: json.loads(x))
testA["playtime_seq"] = testA.playtime_seq.apply(lambda x: json.loads(x))
testA["duration_prefer"] = testA.duration_prefer.apply(lambda x: json.loads(x))
testA["interact_prefer"] = testA.interact_prefer.apply(lambda x: json.loads(x))

In [5]:
testB = pd.read_csv("./data/new_data_B/test_data.txt", sep="\t")
testB["launch_seq_31"] = testB.launch_seq_31.apply(lambda x: json.loads(x))
testB["playtime_seq"] = testB.playtime_seq.apply(lambda x: json.loads(x))
testB["duration_prefer"] = testB.duration_prefer.apply(lambda x: json.loads(x))
testB["interact_prefer"] = testB.interact_prefer.apply(lambda x: json.loads(x))

# predict

In [6]:
new_test_A = DataGenerator(testA,100)
new_test_B = DataGenerator(testB,100)
#重新加载当前折最优的模型
best_model = tf.keras.models.load_model(model_dir)
#测试集推理
test_predA =  best_model.predict(new_test_A, steps=len(new_test_A))[:,0]
test_predB =  best_model.predict(new_test_B, steps=len(new_test_B))[:,0]

testA['prediction'] = list(test_predA)
testB['prediction'] = list(test_predB)

In [7]:
def select(x):
    if x<0.3:
        return 0
    elif x>0.7 and x<1.3:
        return 1
    elif x>1.7 and x<2.3:
        return 2
    elif x>2.7 and x<3.3:
        return 3
    elif x>3.7 and x<4.3:
        return 4
    elif x>4.7 and x<5.3:
        return 5
    elif x>5.7 and x<6.3:
        return 6
    elif x>6.7 and x<7.3:
        return 7
    else:
        return -1

In [8]:
tmpA = testA['prediction'].apply(lambda x:select(x))
tmpB = testB['prediction'].apply(lambda x:select(x))

testA['label'] = list(tmpA)
testB['label'] = list(tmpB)

test = pd.concat([testA,testB],axis=0)
test = test[test['label']!=-1]
print(test.head(),len(test))

    user_id  end_date  label                                    launch_date  \
0  10007813       205      0                                     [118, 141]   
1  10052988       210      0                                     [147, 149]   
2  10279068       200      1       [134, 158, 178, 179, 180, 181, 196, 197]   
3  10546696       216      1  [156, 178, 179, 184, 185, 187, 204, 207, 208]   
4  10406659       183      0                                [104, 113, 141]   

                   launch_type  launch_times  launch_type_0  launch_type_1  \
0                       [0, 1]     -0.498189      -0.558385       1.141663   
1                       [0, 0]     -0.498189      -0.457282      -0.611804   
2     [1, 1, 1, 0, 0, 0, 0, 0]      0.115362      -0.153974       4.648597   
3  [0, 1, 0, 0, 0, 0, 0, 0, 0]      0.217620       0.149333       1.141663   
4                    [0, 0, 0]     -0.395931      -0.356179      -0.611804   

   launch_type_01rate  start_end_launch  ... device_ram 

In [9]:
# train data
train = pd.read_csv(data_dir + "train_data.txt", sep="\t")
train["launch_seq_31"] = train.launch_seq_31.apply(lambda x: json.loads(x))
train["playtime_seq"] = train.playtime_seq.apply(lambda x: json.loads(x))
train["duration_prefer"] = train.duration_prefer.apply(lambda x: json.loads(x))
train["interact_prefer"] = train.interact_prefer.apply(lambda x: json.loads(x))

In [10]:
final_train = pd.concat([train,test],axis=0)

In [11]:
def aiyiqi_metric(y_true,y_pred):
    y_true = list(y_true)
    y_pred = list(y_pred)
    score = 0
    for i in range(len(y_true)):
        score += abs(y_true[i]-y_pred[i])/7
    return 100*(1-score/len(y_true))

In [13]:
oof_pred = np.zeros(len(final_train))
test_preds = np.zeros(len(testB))

y = final_train['label']
x = final_train.drop('label',axis=1)
kfold = StratifiedKFold(random_state=2021,n_splits=5,shuffle=True)

for kf,(train_idx,val_idx) in enumerate(kfold.split(x,y)):
    print('#####第{}折####'.format(kf+1))
    print('train len: {}'.format(len(train_idx)))
    print('val len: {}'.format(len(val_idx)))
    train_x = x.iloc[train_idx]
    train_y = y.iloc[train_idx]
    train_df = pd.concat([train_x,train_y],axis=1)
    val_x = x.iloc[val_idx]
    val_y = y.iloc[val_idx]
    val_df = pd.concat([val_x,val_y],axis=1)

    train_bt = DataGenerator(train_df,256)
    val_bt = DataGenerator(val_df,256)
    
    model = tf.keras.models.load_model('./data/model/best_model.h5')
    #model = build_model(seq_len=32,dur_seq_len=16,inter_seq_len=11,feature_num=18)
    model.compile(optimizer=tf.keras.optimizers.Adam(0.0008),loss="mse",metrics=["mse"])
    early_stopping = tf.keras.callbacks.EarlyStopping(monitor="val_mse", patience=3, restore_best_weights=True)
    lr_reduce = tf.keras.callbacks.ReduceLROnPlateau(patience=2,monitor='val_mse', factor=0.1)
    best_checkpoint = tf.keras.callbacks.ModelCheckpoint(model_dir.format(kf+1),save_best_only=True, save_weights_only=False,verbose=1)
    #model.fit(iter(train_bt),steps_per_epoch=len(train_bt),validation_data=iter(val_bt),validation_steps=len(val_bt),epochs=20,callbacks=[best_checkpoint,early_stopping,lr_reduce])
    #model.save('./data/model/model_fold{}.h5'.format(kf))
    model.fit_generator(generator=train_bt,
                    steps_per_epoch=len(train_bt),
                    epochs=20,
                    verbose=1,
                    validation_data=val_bt,
                    validation_steps=len(val_bt),
#                     use_multiprocessing=False,
#                     workers=1,
                    callbacks=[best_checkpoint,early_stopping,lr_reduce])
 
    #重新加载当前折最优的模型
    best_model = tf.keras.models.load_model(model_dir.format(kf+1))
    
    #验证集推理
    val_pred =  best_model.predict(val_bt, steps=len(val_bt))[:,0]
    oof_pred[val_idx] = val_pred
    
    #测试集推理
    test_pred =  best_model.predict(new_test_B, steps=len(new_test_B))[:,0]
    test_preds += test_pred
    
    
#计算整体验证集得分
y_true = final_train.label
score = aiyiqi_metric(y_true,oof_pred)
print('多折验证集总体得分：{}'.format(score))

#####第1折####
train len: 502774
val len: 125694




Epoch 1/20

Epoch 00001: val_loss improved from inf to 1.52512, saving model to ./data/model\best_model.h5
Epoch 2/20

Epoch 00002: val_loss improved from 1.52512 to 1.52488, saving model to ./data/model\best_model.h5
Epoch 3/20

Epoch 00003: val_loss did not improve from 1.52488
Epoch 4/20

Epoch 00004: val_loss improved from 1.52488 to 1.52464, saving model to ./data/model\best_model.h5
Epoch 5/20

Epoch 00005: val_loss did not improve from 1.52464
Epoch 6/20

Epoch 00006: val_loss did not improve from 1.52464
Epoch 7/20

Epoch 00007: val_loss improved from 1.52464 to 1.52209, saving model to ./data/model\best_model.h5
Epoch 8/20

Epoch 00008: val_loss improved from 1.52209 to 1.52142, saving model to ./data/model\best_model.h5
Epoch 9/20

Epoch 00009: val_loss did not improve from 1.52142
Epoch 10/20

Epoch 00010: val_loss did not improve from 1.52142
Epoch 11/20

Epoch 00011: val_loss did not improve from 1.52142
#####第2折####
train len: 502774
val len: 125694
Epoch 1/20

Epoch 0000

NameError: name 'aiyiqi_metric' is not defined

In [13]:
new_train = DataGenerator(final_train[10000:],100)
new_val = DataGenerator(final_train.iloc[:10000],100)
        
model = tf.keras.models.load_model('./data/model/best_model.h5')
model.compile(optimizer=tf.keras.optimizers.Adam(0.0008),loss="mse",metrics=["mse"])
early_stopping = tf.keras.callbacks.EarlyStopping(monitor="val_mse", patience=3, restore_best_weights=True)
lr_reduce = tf.keras.callbacks.ReduceLROnPlateau(patience=2,monitor='val_mse', factor=0.1)
best_checkpoint = tf.keras.callbacks.ModelCheckpoint('./data/model/model.h5',save_best_only=True, save_weights_only=False,verbose=1)
#model.fit(iter(train_bt),steps_per_epoch=len(train_bt),validation_data=iter(val_bt),validation_steps=len(val_bt),epochs=20,callbacks=[best_checkpoint,early_stopping,lr_reduce])
#model.save('./data/model/model_fold{}.h5'.format(kf))
model.fit_generator(generator=new_train,
                    steps_per_epoch=len(new_train),
                    epochs=20,
                    verbose=1,
                    validation_data=new_val,
                    validation_steps=len(new_val),
#                     use_multiprocessing=False,
#                     workers=1,
                    callbacks=[best_checkpoint,early_stopping,lr_reduce])
    
#重新加载当前折最优的模型
best_model = tf.keras.models.load_model('./data/model/model.h5')
#测试集推理
test_pred =  best_model.predict(new_test_B, steps=len(new_test_B))[:,0]
 
#验证集推理
val_pred =  best_model.predict(new_val, steps=len(new_val))[:,0]

#计算整体验证集得分
y_true = final_train.iloc[:10000]['label']
score = aiyiqi_metric(y_true,val_pred)
print('得分：{}'.format(score))

Epoch 1/20

Epoch 00001: val_loss improved from inf to 1.60212, saving model to ./data/model\model.h5
Epoch 2/20

Epoch 00002: val_loss did not improve from 1.60212
Epoch 3/20

Epoch 00003: val_loss did not improve from 1.60212
Epoch 4/20

Epoch 00004: val_loss improved from 1.60212 to 1.59892, saving model to ./data/model\model.h5
Epoch 5/20

Epoch 00005: val_loss did not improve from 1.59892
Epoch 6/20

Epoch 00006: val_loss did not improve from 1.59892
Epoch 7/20

Epoch 00007: val_loss did not improve from 1.59892
得分：87.1214099419968


In [14]:
#保存测试集结果
prediction = test_pred
testB['prediction'] = list(prediction)
res = testB[["user_id", "prediction"]]
res.to_csv(submit_dir + "submit_B.csv", index=False, header=False, float_format="%.2f")