In [1]:
import pandas as pd
import numpy as np
from tqdm import tqdm
tqdm.pandas()
from sklearn.metrics import roc_auc_score
from lightgbm.sklearn import LGBMClassifier, LGBMRegressor
import lightgbm as lgb
from collections import defaultdict
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import KFold 
import gc
import random
import time
import os
import pickle
from utils import reduce_mem, uAUC, ProNE, HyperParam, get_logger, fast_auc

pd.set_option('display.max_columns', None)

# 读取特征

## 读取数值特征

In [2]:
%%time
df1 = pd.read_feather("data/features/df_stat1_v3.feather")
df2 = pd.read_feather("data/features/df_stat2_v3.feather")
df3 = pd.read_feather("data/features/df_stat2_v2.feather")
print(df1.shape, df2.shape, df3.shape)

del_cols = ['is_watch', 'is_collect', 'is_comment', 'watch_playseconds',
            'watch_label_cls_1', 'watch_label_cls_2', 'watch_label_cls_3', 
            'watch_label_cls_4', 'watch_label_cls_5', 'watch_label_cls_6',
            'watch_label_cls_7', 'watch_label_cls_8', 'watch_label_cls_9']
df1.drop(columns=del_cols, inplace=True)

## 拼接特征df2
df = pd.concat([df1, df2[df2.columns[29:]]], axis=1)
del df1, df2
gc.collect()

## 拼接特征df3
df = pd.concat([df, df3[df3.columns[19:]]], axis=1)
print(df.shape)
del df3
gc.collect()

## 去除第一天的样本特征
df = df[df['date_'] != 1].reset_index(drop=True)


## 需要提交的
submit = df[df['date_'] == 15][['userid', 'feedid']].reset_index(drop=True)
print(df.shape, submit.shape)
df.head()

(83099036, 93) (83099036, 61) (83099036, 39)
(83099036, 132)
(77515926, 132) (2822180, 2)
CPU times: user 3min 56s, sys: 9min 49s, total: 13min 45s
Wall time: 15min 6s


## embedding特征

In [3]:
fid_w2v_emb = pd.read_pickle("data/features/fid_w2v_emb.pkl")
fid_tfidf_svd_emb = pd.read_pickle("data/features/fid_tfidf_svd_emb.pkl")
fid_prone_emb = pd.read_pickle("data/features/feed_prone_emb.pkl")

uid_tfidf_svd_emb = pd.read_pickle("data/features/uid_tfidf_svd_emb.pkl")
uid_prone_emb = pd.read_pickle("data/features/user_prone_emb.pkl")


fid_w2v_emb = reduce_mem(fid_w2v_emb)
fid_tfidf_svd_emb = reduce_mem(fid_tfidf_svd_emb)
fid_prone_emb = reduce_mem(fid_prone_emb)
uid_tfidf_svd_emb = reduce_mem(uid_tfidf_svd_emb)
uid_prone_emb = reduce_mem(uid_prone_emb)

print(fid_w2v_emb.shape, fid_tfidf_svd_emb.shape, fid_prone_emb.shape, uid_tfidf_svd_emb.shape, uid_prone_emb.shape)


## 合并
fid_emb_df = fid_w2v_emb.merge(fid_tfidf_svd_emb, how='left', on=['feedid'])
fid_emb_df = fid_emb_df.merge(fid_prone_emb, how='left', on=['feedid'])
uid_emb_df = uid_tfidf_svd_emb.merge(uid_prone_emb, how='left', on=['userid'])
print(uid_emb_df.shape, fid_emb_df.shape)

del fid_w2v_emb, fid_tfidf_svd_emb, uid_tfidf_svd_emb, uid_prone_emb, fid_prone_emb
gc.collect()

100%|██████████| 33/33 [00:00<00:00, 495.14it/s]
100%|██████████| 33/33 [00:00<00:00, 982.99it/s]
100%|██████████| 33/33 [00:00<00:00, 549.24it/s]
  0%|          | 0/33 [00:00<?, ?it/s]

4.32 Mb, 2.22 Mb (48.48 %)
4.71 Mb, 2.49 Mb (47.22 %)
4.32 Mb, 2.22 Mb (48.48 %)


100%|██████████| 33/33 [00:01<00:00, 22.56it/s]
  6%|▌         | 2/33 [00:00<00:02, 11.73it/s]

552.51 Mb, 291.60 Mb (47.22 %)


100%|██████████| 33/33 [00:03<00:00,  8.58it/s]


506.47 Mb, 260.91 Mb (48.48 %)
(34304, 33) (34304, 33) (34304, 33) (4023240, 33) (4023240, 33)
(4023240, 65) (34304, 97)


0

## 切分训练、验证、测试

In [4]:
##  For模型验证效果
valid_14 = df[df['date_'] == 14].reset_index(drop=True)
print(valid_14.shape)

(5529636, 132)


In [5]:
%%time

train_tmp = df[df['date_'] <= 14]
test = df[df['date_'] == 15].reset_index(drop=True)
print(train_tmp.shape, test.shape)

del df
gc.collect()

y_list = ['is_share', 'watch_label']
train_tmp['y_sum'] = train_tmp[y_list].sum(axis=1)
train_pos = train_tmp[train_tmp['y_sum'] >= 1]
train_neg = train_tmp[train_tmp['y_sum'] == 0].sample(frac=0.15, random_state=202108)

print("正负样本个数：", train_pos.shape, train_neg.shape)
train = pd.concat([train_pos, train_neg], ignore_index=True)
valid = train[train['date_'] >= 14].reset_index(drop=True)
train = train[train['date_'] < 14]
train = train.sample(frac=1.0).reset_index(drop=True)
print(train.shape, valid.shape, test.shape)

del train_tmp, train_pos, train_neg, train['y_sum']
gc.collect()

(74693746, 132) (2822180, 132)
正负样本个数： (2029985, 133) (10899564, 133)
(11976009, 133) (953540, 133) (2822180, 132)
CPU times: user 2min 29s, sys: 5min 1s, total: 7min 30s
Wall time: 9min 37s


## 拼接embedding特征

In [6]:
%%time

train = train.merge(fid_emb_df, how='left', on=['feedid'])
train = train.merge(uid_emb_df, how='left', on=['userid'])
print(train.shape)

valid = valid.merge(fid_emb_df, how='left', on=['feedid'])
valid = valid.merge(uid_emb_df, how='left', on=['userid'])

test = test.merge(fid_emb_df, how='left', on=['feedid'])
test = test.merge(uid_emb_df, how='left', on=['userid'])

valid_14 = valid_14.merge(fid_emb_df, how='left', on=['feedid'])
valid_14 = valid_14.merge(uid_emb_df, how='left', on=['userid'])
print(valid.shape, valid_14.shape, test.shape)

(11976009, 292)
(953540, 293) (5529636, 292) (2822180, 292)
CPU times: user 1min 34s, sys: 43.2 s, total: 2min 17s
Wall time: 2min 21s


In [7]:
train.to_feather("data/nn_features/train.feather")
valid.to_feather("data/nn_features/valid.feather")
valid_14.to_feather("data/nn_features/valid_14.feather")
test.to_feather("data/nn_features/test.feather")

In [2]:
test = pd.read_feather("data/nn_features/test.feather")

## 定义特征列

In [3]:
cate_cols = ['userid', 'feedid', 'age', 'gender', 'country', 'province', 'city', 'city_level', 'device_name']
y_list = ['is_watch', 'is_share', 'is_collect', 'is_comment', 'watch_label']

# for col in tqdm(cate_cols):
#     lbl = LabelEncoder()
#     df[col] = lbl.fit_transform(df[col])
    
## lgb训练模型所需要的特征列
cols = [f for f in test.columns if (f not in ['date_'] + y_list)]
print("特征总数：{}".format(len(cols)))

特征总数：289


In [4]:
train['is_share'].value_counts(), valid['is_share'].value_counts()

NameError: name 'train' is not defined

# 训练 is_share

In [10]:
%%time
clf2 = LGBMClassifier(
            learning_rate=0.01,
            n_estimators=2000,
            num_leaves=63,
            subsample=0.85,
            colsample_bytree=0.85,
            random_state=2021,
            metric='None',
            importance_type='gain',
            n_jobs=8)
    
clf2.fit(train[cols].values.astype(np.float32), 
         train['is_share'].values,
         eval_set=[(train[cols].values.astype(np.float32),  train['is_share'].values), 
                   (valid[cols].values.astype(np.float32),  valid['is_share'].values)],
         eval_names=['train', 'valid'],
         eval_metric='auc',
         early_stopping_rounds=50,
         verbose=100)
    
submit['is_share_pred'] = clf2.predict_proba(test[cols].values)[:, 1]
valid_14['is_share_pred'] = clf2.predict_proba(valid_14[cols].values)[:, 1]
pickle.dump(clf2, open("data/lgb_save_model/lgb_is_share_cls.pkl", 'wb'))

Training until validation scores don't improve for 50 rounds
[100]	train's auc: 0.856612	valid's auc: 0.839816
[200]	train's auc: 0.888037	valid's auc: 0.843516
[300]	train's auc: 0.914422	valid's auc: 0.847286
[400]	train's auc: 0.935715	valid's auc: 0.848583
[500]	train's auc: 0.947522	valid's auc: 0.850404
[600]	train's auc: 0.960006	valid's auc: 0.851375
Early stopping, best iteration is:
[601]	train's auc: 0.960054	valid's auc: 0.851394
CPU times: user 5h 29min 36s, sys: 7min 49s, total: 5h 37min 25s
Wall time: 55min 34s


In [6]:
'''
[523]	train's auc: 0.930459	valid's auc: 0.848794

'''

## 特征重要性
fea_imp = pd.DataFrame({'fea': cols, 'imp': clf2.feature_importances_})
fea_imp.sort_values('imp', inplace=True, ascending=False)
fea_imp.head(20)

Unnamed: 0,fea,imp
241,userid_feedid_tfidf_svd_16,248712.038439
15,userid_7day_is_share_mean,176396.836979
114,userid_date__cnt,174385.045159
112,global_userid_cnt,127996.751253
115,userid_date_ratio,121947.363485
33,feedid_7day_is_comment_mean,96468.43004
31,feedid_7day_is_share_mean,89181.929817
7,device_name,55767.861621
95,feedid_3day_is_share_mean,51849.631718
113,global_userid_dcnt,44592.176512


In [5]:
clf2 = pickle.load(open("data/lgb_save_model/lgb_is_share_cls.pkl", 'rb'))

# 训练 watch_label

In [9]:
train_y_watch = train['watch_label'].values
valid_y_watch = valid['watch_label'].values

def onehot_encode(nums, k):
    res = np.zeros((len(nums), k))
    for i, x in tqdm(enumerate(nums)):
        res[i, int(x)] = 1
    res = res.astype(np.float16)
    return res

train_y_watch = onehot_encode(train_y_watch, 10)
valid_y_watch = onehot_encode(valid_y_watch, 10)
print(train_y_watch.shape, valid_y_watch.shape)

11976009it [00:05, 2090202.99it/s]
953540it [00:00, 1955221.39it/s]


(11976009, 10) (953540, 10)


In [None]:
for i in range(1, 10):
    print("======= watch_label: {} =======".format(i))
    start_time = time.time()
    clf = LGBMClassifier(
            learning_rate=0.04,
            n_estimators=2000,
            num_leaves=63,
            subsample=0.5,
            colsample_bytree=0.6,
            random_state=2021,
            metric='None',
            importance_type='gain',
            n_jobs=8)
    
    clf.fit(train[cols].values.astype(np.float32), 
            train_y_watch[:,i],
            eval_set=[(train[cols].values.astype(np.float32),  train_y_watch[:,i]), 
                      (valid[cols].values.astype(np.float32),  valid_y_watch[:,i])],
            eval_names=['train', 'valid'],
            eval_metric='auc',
            early_stopping_rounds=80,
            verbose=100)
    
    submit['watch_label_pred_{}'.format(i)] = clf.predict_proba(test[cols].values)[:, 1]
    valid_14['watch_label_pred_{}'.format(i)] = clf.predict_proba(valid_14[cols].values)[:, 1]
    print("**** time cost {}(s) ****".format(round(time.time() - start_time)))
    pickle.dump(clf, open("data/lgb_save_model/lgb_watch_label_cls_{}.pkl".format(i), 'wb'))

Training until validation scores don't improve for 80 rounds
[100]	train's auc: 0.782439	valid's auc: 0.759254
[200]	train's auc: 0.793159	valid's auc: 0.768156
[300]	train's auc: 0.799643	valid's auc: 0.773095
[400]	train's auc: 0.803822	valid's auc: 0.775107
[500]	train's auc: 0.807536	valid's auc: 0.776893
[600]	train's auc: 0.810662	valid's auc: 0.777814
[700]	train's auc: 0.813766	valid's auc: 0.778787
[800]	train's auc: 0.816679	valid's auc: 0.779458
[900]	train's auc: 0.819371	valid's auc: 0.780145
[1000]	train's auc: 0.822035	valid's auc: 0.78068
[1100]	train's auc: 0.824477	valid's auc: 0.781081
[1200]	train's auc: 0.826891	valid's auc: 0.781537
[1300]	train's auc: 0.82943	valid's auc: 0.78204
[1400]	train's auc: 0.831733	valid's auc: 0.782367
[1500]	train's auc: 0.833971	valid's auc: 0.782805
[1600]	train's auc: 0.836127	valid's auc: 0.783127
[1700]	train's auc: 0.838216	valid's auc: 0.783426
[1800]	train's auc: 0.840305	valid's auc: 0.783701
[1900]	train's auc: 0.842218	vali

In [18]:
## 特征重要性
fea_imp = pd.DataFrame({'fea': cols, 'imp': clf.feature_importances_})
fea_imp.sort_values('imp', inplace=True, ascending=False)
fea_imp.head(20)

Unnamed: 0,fea,imp
44,feedid_7day_watch_label_cls_9_mean,1206915.0
114,userid_date__cnt,607221.0
28,userid_7day_watch_label_cls_9_mean,605313.8
108,feedid_3day_watch_label_cls_9_mean,585661.5
115,userid_date_ratio,276091.5
111,feedid_date__cnt,178603.4
127,global_userid_video_release_ndays_mean,138430.7
9,video_duration,130484.1
19,userid_7day_watch_playseconds_mean,121276.4
113,global_userid_dcnt,112749.1


In [22]:
valid_14['is_share_pred'] = clf2.predict_proba(valid_14[cols].values)[:, 1]

In [57]:
# def get_deal_y_pred(valid_14):
#     watch_label = np.array([650000] * 9) * np.array([1.00, 0.5, 0.4, 0.3, 0.2, 0.1, 0.1, 0.1, 0.1]) * 0.51
    
#     y_pred = np.zeros(valid_14.shape[0])
#     idx_vis = set()
#     for i, n in enumerate(watch_label):
#         idxes = np.argsort(valid_14['watch_label_pred_{}'.format(9-i)]).values
#         idxes = idxes[::-1]
#         idx = 0
#         cnt = 0
#         while cnt < n:
#             if idxes[idx] not in idx_vis:
#                 y_pred[idxes[idx]] = 9 - i
#                 cnt += 1
#                 idx_vis.add(idxes[idx])
#             idx += 1
#         print('cls:', 9-i, idx, cnt)
#     return y_pred


# submit['watch_label'] = get_deal_y_pred(submit)

# ## for submit
# submit_final = submit[['userid', 'feedid', 'watch_label', 'is_share_pred']]
# submit_final.columns = ['user_id', 'video_id', 'watch_label', 'is_share']
# print(submit_final.shape)
# submit_final['watch_label'] = submit_final['watch_label'].astype(int)
# submit_final['is_share'] = np.round(submit_final['is_share'], 7)

# submit_final.to_csv("data/submit/temp/submission.csv", index=None)

In [24]:
submit

Unnamed: 0,userid,feedid,is_share_pred
0,1688013,32645,0.000702
1,4502598,41270,0.010934
2,5585629,16345,0.000160
3,1635520,28149,0.000664
4,4160191,40554,0.000574
...,...,...,...
2822175,5019057,18766,0.000848
2822176,5019057,12968,0.002034
2822177,4255762,21794,0.000469
2822178,171497,21578,0.000148


In [25]:
submit_final = pd.read_csv("data/submit/temp/submission.csv")
submit_final['is_share'] = np.round(submit['is_share_pred'], 7)
submit_final.to_csv("data/submit/submission.csv", index=None)

In [26]:
submit_final

Unnamed: 0,user_id,video_id,watch_label,is_share
0,1688013,32645,6,0.000702
1,4502598,41270,0,0.010934
2,5585629,16345,5,0.000160
3,1635520,28149,9,0.000664
4,4160191,40554,0,0.000574
...,...,...,...,...
2822175,5019057,18766,0,0.000848
2822176,5019057,12968,0,0.002034
2822177,4255762,21794,8,0.000469
2822178,171497,21578,0,0.000148


# 根据valid_14评估模型

In [23]:
## is_share AUC
y1_auc = roc_auc_score(valid_14['is_share'], valid_14['is_share_pred'])
print(y1_auc)

'''
0.8539189728196664

0.8584187404378384
'''

0.8584187404378384


'\n0.8539189728196664\n\n'

In [58]:
def onehot_encode(nums, k):
    res = np.zeros((len(nums), k))
    for i, x in (enumerate(nums)):
        res[i, int(x)] = 1
    res = res.astype(np.int32)
    return res


watch_y_true = onehot_encode(valid_14['watch_label'].values, 10)

In [95]:
## 得到后处理的预测标签
def get_deal_y_pred(valid_14):
    watch_label = np.array([int(valid_14.shape[0] * 0.119)] * 9) * np.array([1.00, 0.5, 0.4, 0.3, 0.2, 0.1, 0.1, 0.1, 0.1])
    
    y_pred = np.zeros(valid_14.shape[0])
    idx_vis = set()
    for i, n in enumerate(watch_label):
        idxes = np.argsort(valid_14['watch_label_pred_{}'.format(9-i)]).values
        idxes = idxes[::-1]
        idx = 0
        cnt = 0
        while cnt < n:
            if idxes[idx] not in idx_vis:
                y_pred[idxes[idx]] = 9 - i
                cnt += 1
                idx_vis.add(idxes[idx])
            idx += 1
        print('cls:', 9-i, idx, cnt)
    return y_pred

In [96]:
%%time

## 计算 watch_label的AUC
valid_14['watch_label_pred'] = get_deal_y_pred(valid_14)
watch_y_pred = onehot_encode(valid_14['watch_label_pred'].values, 10)

auc_list = []
for i in range(1, 10):
    score = fast_auc(watch_y_true[:, i], watch_y_pred[:, i])
    auc_list.append(score)

y2_auc = sum(np.array(auc_list) * np.array([0.1, 0.2, 0.3, 0.4, 0.5, 0.6, 0.7, 0.8, 0.9]))
weighted_auc = y1_auc * 0.3 + y2_auc * 0.7

print(y1_auc, y2_auc, weighted_auc)
print([round(x, 6) for x in auc_list])

cls: 9 658026 658026
cls: 8 811075 329013
cls: 7 1053710 263211
cls: 6 1232419 197408
cls: 5 1280693 131606
cls: 4 1133459 65803
cls: 3 1092163 65803
cls: 2 976081 65803
cls: 1 716645 65803
0.8539189728196664 2.506893618079534 2.0110012245015736
[0.510954, 0.505769, 0.502948, 0.501141, 0.504008, 0.509943, 0.519241, 0.551382, 0.711955]
Wall time: 15.7 s


In [None]:
0.8539189728196664 2.506733029424714 2.0108888124431994
[0.510968, 0.505916, 0.502884, 0.500874, 0.504205, 0.509988, 0.519845, 0.551694, 0.710996]
np.array([650000] * 9) * np.array([1.00, 0.5, 0.4, 0.3, 0.2, 0.1, 0.1, 0.1, 0.1])

0.8539189728196664 2.5071290226271907 2.0111660076849334
[0.511009, 0.505776, 0.502827, 0.501166, 0.503766, 0.510121, 0.519232, 0.551499, 0.712157]
np.array([660000] * 9) * np.array([1.00, 0.5, 0.4, 0.3, 0.2, 0.1, 0.1, 0.1, 0.1])

In [64]:
roc_auc_score(watch_y_true[:, 9], valid_14['watch_label_pred_9'])

0.8295073418131829

In [68]:
roc_auc_score(y_true[:, 9], valid_14['watch_label_pred_9'])

0.8275152540555288

In [207]:
fast_auc(y_true[:, 9], tmp)

0.6888267735005451

In [206]:
idxes = np.argsort(valid_14['watch_label_pred_9'].values)[::-1][:500000]
# print(valid_14['watch_label_pred_9'].values[idxes])
tmp = np.zeros(y_true.shape[0])
tmp[idxes] = 1

In [84]:
pd.Series(tmp).value_counts()

0.0    5529466
1.0        170
dtype: int64

In [58]:
# watch_rate = np.array([0.972935398825, 0.006935353601, 0.003910049047, 0.002728063657, 0.002145601016, 
#                        0.00177880621, 0.001556090139, 0.001463139979, 0.001724308664, 0.004823188861])
watch_rate = np.round(watch_rate * valid_14.shape[0], 0)
watch_rate = watch_rate.astype(int)

watch_rate = watch_rate[::-1][:-1]
print(watch_rate)

[26670  9535  8091  8605  9836 11864 15085 21621 38350]


In [353]:
watch_label = np.array([750000] * 9) * np.array([1.00, 0.5, 0.4, 0.3, 0.2, 0.1, 0.1, 0.1, 0.1])
watch_label

array([750000., 450000., 375000., 300000., 225000., 150000.,  75000.,
        75000.,  75000.])

In [354]:
%%time

y_pred = np.zeros(valid_14.shape[0])
idx_vis = set()
for i, n in enumerate(watch_label):
    print('watch_label_pred_{}'.format(9-i))
    idxes = np.argsort(valid_14['watch_label_pred_{}'.format(9-i)]).values
    idxes = idxes[::-1]
    idx = 0
    cnt = 0
    while cnt < n:
        if idxes[idx] not in idx_vis:
            y_pred[idxes[idx]] = 9 - i
            cnt += 1
            idx_vis.add(idxes[idx])
        idx += 1
    print(9-i, idx, cnt)
    
valid_14['watch_label_pred'] = y_pred
y_pred

watch_label_pred_9
9 750000 750000
watch_label_pred_8
8 1057698 450000
watch_label_pred_7
7 1428253 375000
watch_label_pred_6
6 1716645 300000
watch_label_pred_5
5 1889569 225000
watch_label_pred_4
4 1944265 150000
watch_label_pred_3
3 1843214 75000
watch_label_pred_2
2 1644953 75000
watch_label_pred_1
1 1216586 75000
Wall time: 14.9 s


In [356]:
%%time

def onehot_encode(nums, k):
    res = np.zeros((len(nums), k))
    for i, x in tqdm(enumerate(nums)):
        res[i, int(x)] = 1
    res = res.astype(np.int16)
    return res

# y_true = onehot_encode(valid_14['watch_label'].values, 10)
y_pred = onehot_encode(valid_14['watch_label_pred'].values, 10)

auc_list = []
for i in range(1, 10):
    score = fast_auc(y_true[:, i], y_pred[:, i])
    auc_list.append(score)


y2_auc = sum(np.array(auc_list) * np.array([0.1, 0.2, 0.3, 0.4, 0.5, 0.6, 0.7, 0.8, 0.9]))

print(y1_auc, y2_auc, 0.3 * y1_auc + 0.7 * y2_auc)
print(np.round(auc_list, 4))

5529636it [00:02, 2021003.58it/s]


0.851171913134093 2.5016789444737078 2.0065268350718233
[0.5075 0.5026 0.4985 0.4991 0.4988 0.5049 0.5148 0.5525 0.7183]
Wall time: 4.97 s


In [None]:
0.851171913134093 2.5046285712645817 2.008591573825435
[0.5092 0.5036 0.5009 0.5025 0.5037 0.5122 0.5295 0.5736 0.6811]

0.851171913134093 2.5048668127399663 2.0087583428582043
[0.5096 0.5039 0.501  0.5015 0.5022 0.5084 0.5187 0.5635 0.7024]

0.851171913134093 2.505009505003932 2.00885822744298
[0.5113 0.506  0.5029 0.5025 0.5035 0.5095 0.5254 0.5695 0.6888]

0.851171913134093 2.5047284053746037 2.00866145770245
[0.5111 0.5048 0.5018 0.5016 0.5031 0.5121 0.5265 0.5678 0.6888]


0.851171913134093 2.505281761448482 2.009048806954165
[0.5089 0.5037 0.5001 0.5016 0.501  0.5076 0.5134 0.5477 0.7226]

0.851171913134093 2.5064220594257316 2.0098470155382397
[0.5101 0.5043 0.5015 0.501  0.5026 0.5085 0.5159 0.5495 0.7183]

In [194]:
valid_14.shape

(5529636, 100)

In [195]:
submit

Unnamed: 0,userid,feedid,watch_label_pred_1,watch_label_pred_2,watch_label_pred_3,watch_label_pred_4,watch_label_pred_5,watch_label_pred_6,watch_label_pred_7,watch_label_pred_8,watch_label_pred_9
0,1688013,32645,0.041565,0.041517,0.030585,0.027136,0.015797,0.011883,0.012303,0.011018,0.008197
1,4502598,41270,0.018992,0.010146,0.007159,0.003494,0.003189,0.003902,0.002678,0.003397,0.013966
2,5585629,16345,0.029463,0.029721,0.028063,0.025890,0.016546,0.012276,0.013646,0.015225,0.043648
3,1635520,28149,0.110297,0.030300,0.013785,0.009262,0.004985,0.005673,0.004008,0.003359,0.004886
4,4160191,40554,0.009025,0.005172,0.003592,0.003160,0.002245,0.002353,0.001976,0.008813,0.008634
...,...,...,...,...,...,...,...,...,...,...,...
2822175,5019057,18766,0.031938,0.009035,0.006516,0.005338,0.007723,0.004180,0.003425,0.004030,0.013048
2822176,5019057,12968,0.041120,0.010992,0.016624,0.005029,0.007728,0.004238,0.004196,0.004611,0.018988
2822177,4255762,21794,0.088459,0.084583,0.091509,0.095091,0.074704,0.140358,0.030148,0.027044,0.064275
2822178,171497,21578,0.015035,0.007645,0.006633,0.004905,0.005170,0.004669,0.004579,0.005178,0.101693


## submit后处理

In [290]:
watch_label = np.array([500000] * 9) * np.array([1.00, 0.9, 0.75, 0.35, 0.25, 0.15, 0.15, 0.15, 0.15]) * 0.51
watch_label

array([255000., 229500., 191250.,  89250.,  63750.,  38250.,  38250.,
        38250.,  38250.])

In [291]:
%%time

y_pred = np.zeros(submit.shape[0])
idx_vis = set()
for i, n in enumerate(watch_label):
    print('watch_label_pred_{}'.format(9-i))
    idxes = np.argsort(submit['watch_label_pred_{}'.format(9-i)]).values
    idxes = idxes[::-1]
    idx = 0
    cnt = 0
    while cnt < n:
        if idxes[idx] not in idx_vis:
            y_pred[idxes[idx]] = 9 - i
            cnt += 1
            idx_vis.add(idxes[idx])
        idx += 1
    print(9-i, idx, cnt)
    
submit['watch_label_pred'] = y_pred
y_pred

watch_label_pred_9
9 255000 255000
watch_label_pred_8
8 422002 229500
watch_label_pred_7
7 595639 191250
watch_label_pred_6
6 617118 89250
watch_label_pred_5
5 631209 63750
watch_label_pred_4
4 583606 38250
watch_label_pred_3
3 564253 38250
watch_label_pred_2
2 470477 38250
watch_label_pred_1
1 286955 38250
Wall time: 5.68 s


In [297]:
pd.Series(y_pred).value_counts()

0.0    1840430
9.0     255000
8.0     229500
7.0     191250
6.0      89250
5.0      63750
1.0      38250
4.0      38250
3.0      38250
2.0      38250
dtype: int64

In [301]:
submit_final = pd.read_csv("data/submit/submission.csv")
submit_final['watch_label'] = y_pred
submit_final['watch_label'] = submit_final['watch_label'].astype(int)

In [303]:
submit_final.to_csv("data/submit/submission.csv", index=None)