In [1]:
import math
import numpy as np
import pandas as pd
import warnings
import matplotlib.pyplot as plt
import lightgbm as lgb
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.metrics import f1_score, fbeta_score, precision_score, recall_score, roc_auc_score
from sklearn.model_selection import StratifiedKFold,KFold
from sklearn.preprocessing import LabelEncoder
from tqdm import tqdm
from datetime import datetime
from gensim.models.word2vec import Word2Vec
import xgboost as xgb
import json
# Silence every warning for the rest of the session.
warnings.filterwarnings(action='ignore')

# Let pandas show up to 50 rows and 50 columns before truncating a display.
for display_option in ('display.max_rows', 'display.max_columns'):
    pd.set_option(display_option, 50)

In [2]:
# Load the training and test tables from their CSV files.
train = pd.read_csv(filepath_or_buffer='./data/new_data/lgbtrain.csv')
test = pd.read_csv(filepath_or_buffer='./data/new_data/lgbtest.csv')
#seq_fea = ['launch_seq','playtime_seq','duration_prefer','interact_prefer']              

In [3]:
# Display the column index of the training frame.
train.columns

Index(['user_id', 'end_date', 'label', 'launch_date', 'launch_type',
       'launch_times', 'launch_type_0', 'launch_type_1', 'launch_type_01rate',
       'start_end_launch',
       ...
       'interact_prefer_1', 'interact_prefer_2', 'interact_prefer_3',
       'interact_prefer_4', 'interact_prefer_5', 'interact_prefer_6',
       'interact_prefer_7', 'interact_prefer_8', 'interact_prefer_9',
       'interact_prefer_10'],
      dtype='object', length=124)

In [4]:
# Preview the first five training rows.
train.head(n=5)

Unnamed: 0,user_id,end_date,label,launch_date,launch_type,launch_times,launch_type_0,launch_type_1,launch_type_01rate,start_end_launch,launch_seq_31,launch_seq_15,launch_seq_7,launch_times_31,launch_times_15,launch_times_7,playtime_31,playtime_15,playtime_7,playtime_seq,duration_prefer,father_id_score,cast_id_score,tag_score,device_type,...,duration_prefer_2,duration_prefer_3,duration_prefer_4,duration_prefer_5,duration_prefer_6,duration_prefer_7,duration_prefer_8,duration_prefer_9,duration_prefer_10,duration_prefer_11,duration_prefer_12,duration_prefer_13,duration_prefer_14,duration_prefer_15,interact_prefer_0,interact_prefer_1,interact_prefer_2,interact_prefer_3,interact_prefer_4,interact_prefer_5,interact_prefer_6,interact_prefer_7,interact_prefer_8,interact_prefer_9,interact_prefer_10
0,10000000,203,0,"[131, 132, 141, 164, 179, 185, 187, 189, 191, ...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]",0.93343,0.958154,-0.611804,-0.528554,2.491569,"[0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 1, 0, ...","[0, 1, 0, 1, 1, 1, 0, 1, 1, 1, 1, 0, 0, 0, 1, 0]","[1, 1, 1, 0, 0, 0, 1, 0]",1.23221,1.783868,1.241501,-0.350271,-0.321856,-0.373771,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...",1.289467,1.205949,0.0,0.194954,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,10000001,214,0,"[114, 117, 118]","[0, 0, 0]",-0.395931,-0.356179,-0.611804,-0.528554,-0.716868,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]","[0, 0, 0, 0, 0, 0, 0, 0]",-0.775586,-0.798491,-0.776257,0.0,0.0,0.0,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]",0.0,0.0,0.0,2.431832,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,10000002,139,2,"[128, 129]","[0, 0]",-0.498189,-0.457282,-0.611804,-0.528554,-0.86053,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","[0, 0, 0, 0, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]","[0, 0, 0, 0, 0, 0, 0, 0]",-0.440953,-0.224633,-0.776257,-0.39101,-0.390543,-0.373771,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...",0.0,-1.536996,0.0,-2.041925,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,10000003,163,0,"[144, 144]","[1, 0]",-0.498189,-0.558385,1.141663,1.146369,-0.908417,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 2, 0, 0, ...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]","[0, 0, 0, 0, 0, 0, 0, 0]",-0.60827,-0.798491,-0.776257,-0.35938,-0.391076,-0.373771,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0.8136, 0...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]",0.0,0.0,0.0,0.194954,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,10000004,134,0,[],[],-0.702707,-0.659487,-0.611804,-0.528554,-0.908417,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]","[0, 0, 0, 0, 0, 0, 0, 0]",-0.775586,-0.798491,-0.776257,0.0,0.0,0.0,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]",0.0,0.0,0.0,0.194954,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [5]:
# Print the name of every training column whose dtype is `object`.
for column_name, column_dtype in train.dtypes.items():
    if column_dtype == 'object':
        print(column_name)

launch_date
launch_type
launch_seq_31
launch_seq_15
launch_seq_7
playtime_seq
duration_prefer
interact_prefer


In [6]:
# Columns removed from the combined frame before modelling.
# NOTE(review): "chuanyue" presumably marks these as leakage ("time-travel")
# features — confirm with the feature-engineering notebook.
chuanyue_fea = [
    'start_end_launch',
    'launch_type_0',
    'launch_type_1',
    'launch_type_01rate',
]
# Object-dtype columns that are dropped together with the list above.
object_fea = [
    'launch_type',
    'launch_seq_31',
    'launch_seq_15',
    'launch_seq_7',
    'playtime_seq',
]

In [7]:
# Give every test row the placeholder label -1.
test = test.assign(label=-1)

In [8]:
# Number of rows in the test frame.
test.shape[0]

15001

In [9]:
# Stack the training rows on top of the test rows; the original row indices
# of both frames are kept (no re-indexing is requested).
data = pd.concat(objs=[train, test], axis='index')
data

Unnamed: 0,user_id,end_date,label,launch_date,launch_type,launch_times,launch_type_0,launch_type_1,launch_type_01rate,start_end_launch,launch_seq_31,launch_seq_15,launch_seq_7,launch_times_31,launch_times_15,launch_times_7,playtime_31,playtime_15,playtime_7,playtime_seq,duration_prefer,father_id_score,cast_id_score,tag_score,device_type,...,duration_prefer_2,duration_prefer_3,duration_prefer_4,duration_prefer_5,duration_prefer_6,duration_prefer_7,duration_prefer_8,duration_prefer_9,duration_prefer_10,duration_prefer_11,duration_prefer_12,duration_prefer_13,duration_prefer_14,duration_prefer_15,interact_prefer_0,interact_prefer_1,interact_prefer_2,interact_prefer_3,interact_prefer_4,interact_prefer_5,interact_prefer_6,interact_prefer_7,interact_prefer_8,interact_prefer_9,interact_prefer_10
0,10000000,203,0,"[131, 132, 141, 164, 179, 185, 187, 189, 191, ...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]",0.933430,0.958154,-0.611804,-0.528554,2.491569,"[0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 1, 0, ...","[0, 1, 0, 1, 1, 1, 0, 1, 1, 1, 1, 0, 0, 0, 1, 0]","[1, 1, 1, 0, 0, 0, 1, 0]",1.232210,1.783868,1.241501,-0.350271,-0.321856,-0.373771,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...",1.289467,1.205949,0.0,0.194954,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,10000001,214,0,"[114, 117, 118]","[0, 0, 0]",-0.395931,-0.356179,-0.611804,-0.528554,-0.716868,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]","[0, 0, 0, 0, 0, 0, 0, 0]",-0.775586,-0.798491,-0.776257,0.000000,0.000000,0.000000,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]",0.000000,0.000000,0.0,2.431832,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,10000002,139,2,"[128, 129]","[0, 0]",-0.498189,-0.457282,-0.611804,-0.528554,-0.860530,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","[0, 0, 0, 0, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]","[0, 0, 0, 0, 0, 0, 0, 0]",-0.440953,-0.224633,-0.776257,-0.391010,-0.390543,-0.373771,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...",0.000000,-1.536996,0.0,-2.041925,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,10000003,163,0,"[144, 144]","[1, 0]",-0.498189,-0.558385,1.141663,1.146369,-0.908417,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 2, 0, 0, ...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]","[0, 0, 0, 0, 0, 0, 0, 0]",-0.608270,-0.798491,-0.776257,-0.359380,-0.391076,-0.373771,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0.8136, 0...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]",0.000000,0.000000,0.0,0.194954,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,10000004,134,0,[],[],-0.702707,-0.659487,-0.611804,-0.528554,-0.908417,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]","[0, 0, 0, 0, 0, 0, 0, 0]",-0.775586,-0.798491,-0.776257,0.000000,0.000000,0.000000,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]",0.000000,0.000000,0.0,0.194954,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
14996,10355586,205,-1,"[115, 118, 124]","[1, 0, 0]",-0.395931,-0.457282,1.141663,0.588061,-0.477433,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]","[0, 0, 0, 0, 0, 0, 0, 0]",-0.775586,-0.798491,-0.776257,0.000000,0.000000,0.000000,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]",0.000000,0.000000,0.0,0.194954,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
14997,10589773,210,-1,"[131, 132, 133, 134, 135, 137, 138, 139, 140, ...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...",7.068942,7.024308,-0.611804,-0.528554,2.874666,"[1, 1, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 1, 1, 1, ...","[0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]","[1, 1, 1, 1, 1, 1, 1, 1]",4.076587,3.218512,3.259258,-0.319115,-0.330408,-0.368000,"[0.8517, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0...","[0.0, 0.0, 0.0, 0.0, 1.0, 0.5, 0.5, 0.0, 0.0, ...",-0.659227,-1.985434,0.0,0.194954,...,0.0,0.0,1.0,0.5,0.5,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
14998,10181954,218,-1,"[142, 143, 144, 145, 150, 151, 152, 153, 154, ...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]",0.626654,0.654846,-0.611804,-0.528554,-0.094336,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]","[0, 0, 0, 0, 0, 0, 0, 0]",-0.775586,-0.798491,-0.776257,0.000000,0.000000,0.000000,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]",0.000000,0.000000,0.0,0.194954,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
14999,10544736,164,-1,"[105, 106, 112, 113, 114, 115, 116, 117, 138, ...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]",0.626654,0.654846,-0.611804,-0.528554,1.629601,"[0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","[0, 0, 0, 0, 0, 1, 0, 1, 0, 1, 0, 0, 0, 0, 0, 0]","[0, 1, 0, 0, 0, 0, 0, 0]",0.060995,0.062296,-0.271817,-0.354178,-0.332943,-0.290420,"[0, 0, 0, 0, 0, 0.0621, 0, 0, 0, 0, 0, 0, 0, 0...","[0.0, 0.0, 0.0, 0.0, 1.0, 0.5, 0.5, 0.0, 0.0, ...",0.560821,0.752010,0.0,2.431832,...,0.0,0.0,1.0,0.5,0.5,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [10]:
# Number of rows in the combined train + test frame.
data.shape[0]

615002

In [11]:
# Remove the columns named in both feature lists from the combined frame.
data = data.drop(columns=chuanyue_fea + object_fea)

In [12]:
# Compute the sequence length of launch_date.
def _launch_date_count(raw_value):
    """Return how many launch dates one `launch_date` cell holds.

    The column is loaded from CSV with dtype `object`, i.e. each cell is the
    text form of a list such as "[114, 117, 118]". Calling `len` on that text
    counts characters rather than dates, so text cells are parsed first.
    Cells that are already sequences are counted directly.
    """
    if isinstance(raw_value, str):
        raw_value = json.loads(raw_value)
    return len(raw_value)


# Assign a plain list so the values are set by position; the combined frame
# still carries the (overlapping) row indices of the train and test frames.
data['launch_date_len'] = [_launch_date_count(cell) for cell in data['launch_date']]

In [13]:
# def kfold_mean(df_train, df_test, target, target_mean_list):
#     folds = StratifiedKFold(n_splits=5)

#     mean_of_target = df_train[target].mean()

#     for fold_, (trn_idx, val_idx) in tqdm(enumerate(folds.split(df_train, y=df_train['label']))):
#         tr_x = df_train.iloc[trn_idx, :]
#         vl_x = df_train.iloc[val_idx, :]

#         for col in target_mean_list:
#             df_train.loc[vl_x.index, f'{col}_target_enc'] = vl_x[col].map(tr_x.groupby(col)[target].mean())

#     for col in target_mean_list:
#         df_train[f'{col}_target_enc'].fillna(mean_of_target, inplace=True)

#         df_test[f'{col}_target_enc'] = df_test[col].map(df_train.groupby(col)[f'{col}_target_enc'].mean())

#         df_test[f'{col}_target_enc'].fillna(mean_of_target, inplace=True)
#     return pd.concat([df_train, df_test], ignore_index=True)

# feature_list =  ['launch_date_len']
# data = kfold_mean(data.iloc[:600001], data[600001:],'label',feature_list)
# print(data)

In [14]:
# data['launch_date'] = data['launch_date'].apply(lambda x: eval(x))
# sentences = data['launch_date'].values.tolist()
# for i in range(len(sentences)):
#     sentences[i] = [str(x) for x in sentences[i]]   #将每个tagid转化成str格式
# print('预处理完毕')
# emb_size = 32
# model = Word2Vec(sentences, size=emb_size, sg=1, hs=1 ,seed=1, iter=3)#sentences, size=emb_size, window=3, min_count=5, sg=1, hs=1 ,seed=1, iter=3
# print("训练完毕")
# model.save('./data/w2v_model/launch_date.model')

# emb_matrix = []
# for seq in sentences:
#     vec = []
#     for w in seq:
#         if w in model.wv.vocab:
#             vec.append(model.wv[w])
#     if len(vec) > 0:
#         emb_matrix.append(np.mean(vec, axis=0))
#     else:
#         emb_matrix.append([0] * emb_size)
# emb_matrix = np.array(emb_matrix)
# for i in range(emb_size):
#     data['launch_date_emb_{}'.format(i)] = emb_matrix[:, i]
    


In [15]:
# for i,j in zip(data,data.dtypes):
#     if j == 'object':
#         print(i)

In [16]:
#特征筛选样本
# dropfea = ['duration_prefer_0', 'duration_prefer_14', 'duration_prefer_15', 'interact_prefer_5']

In [13]:
# Split the combined frame back into its training and test parts.
# Test rows were tagged with label -1 before the frames were concatenated, so
# the split follows that sentinel instead of a hard-coded row count and keeps
# working when the dataset size changes.
is_test_row = data['label'] == -1
train = data[~is_test_row]
test = data[is_test_row]

In [17]:
# Cross-validation settings shared by the splitter, the model, the prediction
# buffer and the per-fold averaging, so they cannot drift apart.
N_SPLITS = 5
N_CLASSES = 8
SEED = 2021

# Columns that are not used as model inputs (the target plus the columns named
# below). user_id is intentionally kept as a feature to see how it affects the result.
excluded_columns = ['label', 'end_date', "launch_date", "interact_type", "date_list",
                    'launch_seq', 'playtime_seq', 'duration_prefer', 'interact_prefer']
features = [i for i in train.columns if i not in excluded_columns]

y = train['label']
x = train[features]

new_test = test[features]
KF = StratifiedKFold(n_splits=N_SPLITS, random_state=SEED, shuffle=True)
# Out-of-fold predicted class for every training row.
oof_lgb = np.zeros(len(train))
# Test-set class probabilities, summed over the folds (not yet averaged).
predictions_lgb = np.zeros((len(new_test), N_CLASSES))

# Feature importance, averaged over the folds (float so the running mean keeps its dtype).
feat_imp_df = pd.DataFrame({'feat': features, 'imp': 0.0})

# NOTE(review): LightGBM applies row subsampling only when subsample_freq
# (bagging_freq) is greater than 0; it is not set here, so subsample=0.8
# presumably has no effect — confirm against the LightGBM parameter docs.
model = lgb.LGBMClassifier( num_leaves=32,
                           max_depth=6,
                           learning_rate=0.08,
                           n_estimators=10000,
                           subsample=0.8,
                           feature_fraction=0.8,
                           reg_alpha=0.5,
                           reg_lambda=0.5,
                           random_state=SEED,
                           objective='multiclass', #''regression'
                           num_class=N_CLASSES,
                           metric='multi_logloss',#'mse',
                           #metric='mse',
                           device='gpu')
# Stratified cross-validation with N_SPLITS (5) folds
for fold_, (trn_idx, val_idx) in enumerate(KF.split(x,y)):
    print("##########第{}折############".format(fold_+1))

    Xtrain = x.iloc[trn_idx]
    Ytrain = y.iloc[trn_idx]

    X_val = x.iloc[val_idx]
    Y_val = y.iloc[val_idx]

    # The same estimator object is refit on every fold; early stopping
    # monitors the held-out fold.
    model = model.fit(Xtrain,
                      Ytrain,
                      eval_metric="multi_logloss",
                      eval_set=[(X_val, Y_val)],
                      verbose=100,
                      early_stopping_rounds=100
                    )

    feat_imp_df['imp'] += model.feature_importances_ / N_SPLITS
    # Class labels for the held-out rows, class probabilities for the test rows.
    oof_lgb[val_idx] = model.predict(X_val,num_iteration=model.best_iteration_)
    predictions_lgb += model.predict_proba(new_test,num_iteration=model.best_iteration_)
    print()
    print()

##########第1折############
[100]	valid_0's multi_logloss: 1.21785
[200]	valid_0's multi_logloss: 1.21656

##########第2折############
[100]	valid_0's multi_logloss: 1.21689
[200]	valid_0's multi_logloss: 1.21595

##########第3折############
[100]	valid_0's multi_logloss: 1.22053
[200]	valid_0's multi_logloss: 1.21943
[300]	valid_0's multi_logloss: 1.22

##########第4折############
[100]	valid_0's multi_logloss: 1.21954
[200]	valid_0's multi_logloss: 1.21846

##########第5折############
[100]	valid_0's multi_logloss: 1.21613
[200]	valid_0's multi_logloss: 1.21524
[300]	valid_0's multi_logloss: 1.21567



In [28]:
# Index of the largest accumulated value in each row of predictions_lgb.
tmp = list(np.argmax(predictions_lgb, axis=1))
tmp

[0,
 0,
 0,
 0,
 0,
 1,
 3,
 4,
 0,
 7,
 0,
 1,
 0,
 1,
 0,
 6,
 0,
 7,
 0,
 0,
 7,
 7,
 3,
 7,
 0,
 0,
 4,
 0,
 0,
 0,
 3,
 0,
 0,
 1,
 0,
 0,
 0,
 0,
 7,
 5,
 0,
 6,
 7,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 2,
 7,
 1,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 7,
 6,
 7,
 0,
 0,
 0,
 2,
 0,
 0,
 3,
 5,
 0,
 0,
 0,
 0,
 7,
 0,
 0,
 0,
 0,
 1,
 0,
 0,
 0,
 0,
 3,
 0,
 1,
 7,
 7,
 6,
 0,
 0,
 0,
 7,
 0,
 0,
 0,
 0,
 0,
 0,
 1,
 0,
 0,
 7,
 0,
 2,
 0,
 0,
 1,
 0,
 7,
 0,
 0,
 0,
 7,
 0,
 0,
 3,
 2,
 0,
 5,
 0,
 0,
 0,
 0,
 0,
 6,
 0,
 0,
 0,
 7,
 0,
 1,
 3,
 0,
 1,
 0,
 5,
 2,
 5,
 7,
 7,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 6,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 5,
 0,
 0,
 0,
 0,
 7,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 1,
 7,
 2,
 0,
 0,
 4,
 7,
 1,
 0,
 0,
 7,
 0,
 0,
 0,
 0,
 7,
 0,
 6,
 0,
 4,
 0,
 0,
 0,
 0,
 0,
 7,
 0,
 5,
 0,
 7,
 0,
 5,
 0,
 0,
 0,
 7,
 0,
 0,
 1,
 7,
 0,
 7,
 0,
 0,
 0,
 2,
 0,
 0,
 7,
 2,
 0,
 0,
 1,
 7,
 7,
 0,
 0,
 0,
 0,
 0,
 7,
 0,
 1,
 7,
 7,
 0,
 1,
 0,
 6,
 6,
 1,
 0,
 0,
 2,
 0,
 0,


In [16]:
# Class probabilities for the test rows from whatever estimator `model`
# currently holds, together with the number of rows returned.
t = model.predict_proba(new_test, num_iteration=model.best_iteration_)
(t, t.shape[0])

(array([[9.34245205e-01, 4.57088330e-02, 1.07272689e-02, ...,
         1.08554605e-03, 5.16587315e-04, 4.30904019e-04],
        [9.66826701e-01, 2.37331038e-02, 5.10931298e-03, ...,
         5.20124961e-04, 3.02322504e-04, 2.79315378e-04],
        [3.79736616e-01, 3.31260191e-01, 1.81775010e-01, ...,
         9.01524787e-03, 4.89680055e-03, 1.68605432e-03],
        ...,
        [7.91876010e-01, 1.39373221e-01, 3.50475657e-02, ...,
         2.35449988e-03, 6.62891097e-03, 1.08672259e-03],
        [3.28778761e-01, 3.13010356e-01, 1.84705202e-01, ...,
         2.19909186e-02, 1.06067807e-02, 2.57910139e-03],
        [5.07788128e-01, 3.61445289e-01, 8.81904001e-02, ...,
         3.67961110e-03, 1.78772336e-03, 8.59502868e-04]]),
 15001)

In [20]:
def aiyiqi_metric(y_true, y_pred, max_error=7):
    """Score predictions as 100 * (1 - mean(|y_true - y_pred|) / max_error).

    A perfect prediction scores 100; being off by `max_error` on every sample
    scores 0.

    Parameters
    ----------
    y_true : iterable of numbers
        Ground-truth values.
    y_pred : iterable of numbers
        Predicted values, one per ground-truth value.
    max_error : number, default 7
        Divisor that normalises the absolute error (the default keeps the
        original hard-coded value).

    Returns
    -------
    float
        The score described above.

    Raises
    ------
    ValueError
        If the inputs are empty or do not have the same length. Previously an
        empty input failed with ZeroDivisionError and surplus predictions were
        silently ignored.
    """
    true_values = np.asarray(list(y_true), dtype=float)
    pred_values = np.asarray(list(y_pred), dtype=float)
    if true_values.shape != pred_values.shape:
        raise ValueError(
            "y_true and y_pred must have the same length, got {} and {}".format(
                len(true_values), len(pred_values)))
    if true_values.size == 0:
        raise ValueError("y_true and y_pred must not be empty")

    # Vectorised mean absolute error instead of a per-sample Python loop.
    mean_abs_error = np.abs(true_values - pred_values).mean()
    return float(100 * (1 - mean_abs_error / max_error))

In [21]:
# Score the out-of-fold predictions against the training labels and report it.
score = aiyiqi_metric(y_true=y, y_pred=oof_lgb)
print(f"aiqyiqi score: {score}")

aiqyiqi score: 87.0804387155972


In [22]:
# predictions_lgb holds one row of class probabilities per test user, summed
# over the folds. It cannot be stored in a single column as-is, so reduce each
# row to its most likely class. Dividing by the fold count would not change the
# arg-max, so the summed probabilities are used directly. This mirrors how the
# out-of-fold predictions behind `score` were produced (predicted class labels).
submit = pd.DataFrame({
    'user_id': test['user_id'].to_numpy(),
    'pred': np.argmax(predictions_lgb, axis=1),
})
submit.to_csv("./data/submit/lgb_submit_Nonormalize_{}.csv".format(score), index=False, header=False, float_format="%.2f")

In [None]:
# How often each predicted value appears in the submission.
submit['pred'].value_counts()