In [1]:
import pandas as pd
import numpy as np
from tqdm import tqdm
tqdm.pandas()
from sklearn.metrics import roc_auc_score
from lightgbm.sklearn import LGBMClassifier, LGBMRegressor
import lightgbm as lgb
from collections import defaultdict
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import KFold 
import gc
import random
import time
import os
import pickle
from utils import reduce_mem, uAUC, ProNE, HyperParam, get_logger, fast_auc

pd.set_option('display.max_columns', None)

In [2]:
valid_pred_lgb1 = pd.read_feather("../data/submit/temp/lgb_pred_file/valid_pred_lgb_1.feather")
valid_pred_lgb2 = pd.read_feather("../data/submit/temp/lgb_pred_file/valid_pred_lgb_2.feather")
valid_pred_nn = pd.read_feather("../data/submit/valid_pred_nn_1.feather")
print(valid_pred_lgb1.shape, valid_pred_lgb2.shape, valid_pred_nn.shape)

test_pred_lgb1 = pd.read_feather("../data/submit/temp/lgb_pred_file/test_pred_lgb_1.feather")
test_pred_lgb2 = pd.read_feather("../data/submit/temp/lgb_pred_file/test_pred_lgb_2.feather")
test_pred_nn = pd.read_feather("../data/submit/test_pred_nn_1.feather")
print(test_pred_lgb1.shape, test_pred_lgb2.shape, test_pred_nn.shape)

(5529636, 14) (5529636, 14) (5529636, 14)
(2822180, 12) (2822180, 12) (2822180, 12)


In [3]:
valid_pred_lgb = valid_pred_lgb1.copy()
test_pred_lgb = test_pred_lgb1.copy()
for col in ['is_share_pred',
        'watch_label_pred_1', 'watch_label_pred_2', 'watch_label_pred_3',
        'watch_label_pred_4', 'watch_label_pred_5', 'watch_label_pred_6',
        'watch_label_pred_7', 'watch_label_pred_8', 'watch_label_pred_9']:
    valid_pred_lgb[col] = valid_pred_lgb1[col] * 0.3 + valid_pred_lgb2[col] * 0.7
    test_pred_lgb[col] = test_pred_lgb1[col] * 0.3 + test_pred_lgb2[col] * 0.7

In [4]:
def onehot_encode(nums, k):
    res = np.zeros((len(nums), k))
    for i, x in (enumerate(nums)):
        res[i, int(x)] = 1
    res = res.astype(np.int32)
    return res

watch_y_true = onehot_encode(valid_pred_lgb1['watch_label'].values, 10)

def get_all_auc(df):
    auc_list = [roc_auc_score(df['is_share'].values, df['is_share_pred'].values)]
    for i in range(1, 10):
        auc_list.append(fast_auc(watch_y_true[:, i], df['watch_label_pred_{}'.format(i)]))
    return list(np.round(auc_list, 6))

print(get_all_auc(valid_pred_lgb1))
print(get_all_auc(valid_pred_lgb2))
print(get_all_auc(valid_pred_lgb))
print(get_all_auc(valid_pred_nn))

[0.861784, 0.819352, 0.828006, 0.831072, 0.83361, 0.834384, 0.836793, 0.835352, 0.843117, 0.854872]
[0.864865, 0.819446, 0.829088, 0.832254, 0.834484, 0.834519, 0.836179, 0.835452, 0.842956, 0.855551]
[0.865041, 0.820255, 0.829681, 0.833031, 0.835437, 0.835762, 0.837607, 0.836836, 0.84427, 0.856045]
[0.848253, 0.822733, 0.836269, 0.840908, 0.84335, 0.846115, 0.84586, 0.847088, 0.850502, 0.858048]


In [5]:
valid_pred_ronghe = valid_pred_lgb.copy()
test_pred_ronghe = test_pred_lgb.copy()

for col in ['is_share_pred',
        'watch_label_pred_1', 'watch_label_pred_2', 'watch_label_pred_3',
        'watch_label_pred_4', 'watch_label_pred_5', 'watch_label_pred_6',
        'watch_label_pred_7', 'watch_label_pred_8', 'watch_label_pred_9']:
    if col == 'is_share_pred':
        valid_pred_ronghe[col] = valid_pred_nn[col] * 0.15 + valid_pred_lgb[col] * 0.85
        test_pred_ronghe[col] = test_pred_nn[col] * 0.15 + test_pred_lgb[col] * 0.85
    else:
        valid_pred_ronghe[col] = valid_pred_nn[col] * 0.6 + valid_pred_lgb[col] * 0.4
        test_pred_ronghe[col] = test_pred_nn[col] * 0.6 + test_pred_lgb[col] * 0.4
    
auc_res = get_all_auc(valid_pred_ronghe)
print(auc_res)
print(np.sum(auc_res[1:] * np.array([1, 2, 3, 4, 4, 4, 4, 4, 4]) / 30))
# [0.86538, 0.824749, 0.835458, 0.839549, 0.842635, 0.843386, 0.844963, 0.844947, 0.851841, 0.86071]
# [0.865656, 0.825691, 0.838127, 0.842723, 0.845767, 0.847418, 0.848257, 0.848998, 0.854561, 0.861673]
# 0.8485603333333334

[0.865656, 0.826177, 0.838215, 0.842787, 0.84575, 0.847206, 0.848186, 0.848697, 0.854721, 0.862186]
0.8485984


In [10]:
'''
def get_deal_y_pred(valid_14):
    nums = valid_14.shape[0]    # 数据集大小
    ## 每类的取值个数 [9,8,7,6,5,4,3,2,1]
    class_nums = np.array([int(nums*0.035)]*9) * np.array([5.0, 3.0, 1.3, 1.2, 1.1, 2.0, 1.0, 1.0, 1.5])
    ## 需要返回的结果
    y_pred_res = np.zeros(nums)
    
    cls_2_fws = []
    for fwd in range(995, 949, -5):
        fwd = fwd / 10
        cls_2_fws.append({i: np.percentile(valid_14['watch_label_pred_{}'.format(i)], fwd) for i in range(1, 10)})
        
    
#     cls_2_996fws = {i: np.percentile(valid_14['watch_label_pred_{}'.format(i)], 99.6) 
#                    for i in range(1, 10)}   # 每个类对应的99分位数  概率更大
    
#     cls_2_99fws = {i: np.percentile(valid_14['watch_label_pred_{}'.format(i)], 99) 
#                    for i in range(1, 10)}   # 每个类对应的99分位数  概率更大
    
#     cls_2_97fws = {i: np.percentile(valid_14['watch_label_pred_{}'.format(i)], 97) 
#                    for i in range(1, 10)}   # 每个类对应的97分位数  概率更小
    
#     cls_2_90fws = {i: np.percentile(valid_14['watch_label_pred_{}'.format(i)], 95) 
#                    for i in range(1, 10)}   # 每个类对应的90分位数  概率更小
    
    idx2cls_prob = {}   ## 已经访问过的记录： index -> (cls, prob)
    
    for i, n in enumerate(class_nums):
        probs = valid_14['watch_label_pred_{}'.format(9-i)]   # 概率，最先是第9类
        idxes = np.argsort(-1 * probs).values
        
        idx = 0   # 开始遍历的索引的位置，需要赋值到答案中
        cnt = 0   # 总共遍历得到的个数
        while cnt < n:
            if (idxes[idx] not in idx2cls_prob):
                y_pred_res[idxes[idx]] = 9 - i
                cnt += 1
                idx2cls_prob[idxes[idx]] = (9-i, probs[idxes[idx]])
            else:
                flag = False
                for rate, cls_2_xxfws in enumerate(cls_2_fws):
                    rate = nums * 0.005 * (rate + 1)
                    if  cnt <= rate and i <= 5 and idx2cls_prob[idxes[idx]][1] <= cls_2_xxfws[idx2cls_prob[idxes[idx]][0]]:
                        flag = True
                        break
                if flag == False:
                    if (cnt <= (nums * 0.003) and i > 5 and idx2cls_prob[idxes[idx]][1] <= cls_2_fws[-1][idx2cls_prob[idxes[idx]][0]]):
                        flag = True
                if flag:
                    y_pred_res[idxes[idx]] = 9 - i
                    cnt += 1
                    idx2cls_prob[idxes[idx]] = (9-i, probs[idxes[idx]])
            idx += 1
        print('cls:', 9-i, idx, cnt)
    print(pd.Series(y_pred_res).value_counts())
    return y_pred_res
'''


def get_deal_y_pred(valid_14):
    nums = valid_14.shape[0]    # 数据集大小
    ## 每类的取值个数 [9,8,7,6,5,4,3,2,1]
    class_nums = np.array([int(nums*0.035)]*9) * np.array([3.5, 1.5, 1.4, 1.3, 1.2, 1.5, 1.0, 1.0, 1.5])
    ## 需要返回的结果
    y_pred_res = np.zeros(nums)
    
    cls_2_998fws = {i: np.percentile(valid_14['watch_label_pred_{}'.format(i)], 99.9) 
                   for i in range(1, 10)}   # 每个类对应的998分位数  概率更大
    
    cls_2_996fws = {i: np.percentile(valid_14['watch_label_pred_{}'.format(i)], 99.7) 
                   for i in range(1, 10)}   # 每个类对应的996分位数  概率更大
    
    cls_2_99fws = {i: np.percentile(valid_14['watch_label_pred_{}'.format(i)], 99) 
                   for i in range(1, 10)}   # 每个类对应的99分位数  概率更大
    
    cls_2_97fws = {i: np.percentile(valid_14['watch_label_pred_{}'.format(i)], 97) 
                   for i in range(1, 10)}   # 每个类对应的97分位数  概率更小
    
    cls_2_95fws = {i: np.percentile(valid_14['watch_label_pred_{}'.format(i)], 95) 
                   for i in range(1, 10)}   # 每个类对应的95分位数  概率更小
    
    cls_2_90fws = {i: np.percentile(valid_14['watch_label_pred_{}'.format(i)], 90) 
                   for i in range(1, 10)}   # 每个类对应的90分位数  概率更小
    
    idx2cls_prob = {}   ## 已经访问过的记录： index -> (cls, prob)
    
    for i, n in enumerate(class_nums):
        probs = valid_14['watch_label_pred_{}'.format(9-i)]   # 概率，最先是第9类
        idxes = np.argsort(-1 * probs).values
        
        idx = 0   # 开始遍历的索引的位置，需要赋值到答案中
        cnt = 0   # 总共遍历得到的个数
        while cnt < n:
            if idxes[idx] not in idx2cls_prob:
                y_pred_res[idxes[idx]] = 9 - i
                cnt += 1
                idx2cls_prob[idxes[idx]] = (9-i, probs[idxes[idx]])
            else:
                flag = False
                prob = idx2cls_prob[idxes[idx]][1]
                if cnt <= (nums * 0.002) and i <= 5 and  prob<=cls_2_998fws[idx2cls_prob[idxes[idx]][0]]:
                    flag = True
                elif cnt <= (nums * 0.005) and i <= 5 and  prob<=cls_2_996fws[idx2cls_prob[idxes[idx]][0]]:
                    flag = True
                elif cnt <= (nums * 0.01) and i <= 5 and prob <= cls_2_99fws[idx2cls_prob[idxes[idx]][0]]:
                    flag = True
                elif cnt <= (nums * 0.02) and i <= 5 and prob <= cls_2_97fws[idx2cls_prob[idxes[idx]][0]]:
                    flag = True
                elif cnt <= (nums * 0.001) and i > 5 and prob <= cls_2_97fws[idx2cls_prob[idxes[idx]][0]]:
                    flag = True
                elif cnt <= (nums * 0.003) and i > 5 and prob <= cls_2_95fws[idx2cls_prob[idxes[idx]][0]]:
                    flag = True
                elif cnt <= (nums * 0.01) and i > 5 and prob <= cls_2_90fws[idx2cls_prob[idxes[idx]][0]]:
                    flag = True
                if flag:
                    y_pred_res[idxes[idx]] = 9 - i
                    cnt += 1
                    idx2cls_prob[idxes[idx]] = (9-i, probs[idxes[idx]])
            idx += 1
        print('cls:', 9-i, idx, cnt)
    print(pd.Series(y_pred_res).value_counts())
    return y_pred_res



def calc_weighted_auc(valid_14):
    ## 计算 watch_label的AUC
    valid_14['watch_label_pred'] = get_deal_y_pred(valid_14)
    watch_y_pred = onehot_encode(valid_14['watch_label_pred'].values, 10)
    auc_list = []
    for i in range(1, 10):
        score = fast_auc(watch_y_true[:, i], watch_y_pred[:, i])
        auc_list.append(score)
    
    y2_auc = sum(np.array(auc_list) * np.array([0.1, 0.2, 0.3, 0.4, 0.4, 0.4, 0.4, 0.4, 0.4]))
    y1_auc = roc_auc_score(valid_14['is_share'], valid_14['is_share_pred'])
    weighted_auc = y1_auc * 0.3 + y2_auc * 0.7
    return weighted_auc, list(np.round([y1_auc] + auc_list, 4))

In [11]:
%%time 
print(calc_weighted_auc(valid_pred_ronghe))

cls: 9 677380 677380
cls: 8 768006 290306
cls: 7 902532 270952
cls: 6 1043797 251599
cls: 5 1169628 232245
cls: 4 1366409 290306
cls: 3 1496039 193537
cls: 2 1593811 193537
cls: 1 1796255 290306
0.0    3537404
9.0     469705
1.0     290306
4.0     269760
2.0     181003
3.0     176744
6.0     157010
8.0     153561
5.0     148196
7.0     145947
dtype: int64
(1.4228847768721944, [0.8657, 0.5259, 0.519, 0.5157, 0.5672, 0.5276, 0.5316, 0.5341, 0.5543, 0.6618])
CPU times: user 46.6 s, sys: 677 ms, total: 47.3 s
Wall time: 47.5 s


In [315]:
valid_pred_ronghe.shape[0] * 0.005

27648.18

In [None]:
(1.4253379689399241, [0.8657, 0.5249, 0.5169, 0.5136, 0.5625, 0.5373, 0.5373, 0.5404, 0.5549, 0.6558])

(1.4249067653760719, [0.8657, 0.5239, 0.5159, 0.5111, 0.5635, 0.5373, 0.5373, 0.5404, 0.5549, 0.6558])

(1.424825557077972, [0.8657, 0.5241, 0.5162, 0.5113, 0.5637, 0.5372, 0.5375, 0.5404, 0.5547, 0.6551])

(1.4246763646425489, [0.8657, 0.5203, 0.5138, 0.5093, 0.564, 0.5377, 0.5378, 0.5411, 0.5551, 0.6561])

(1.424578272556965, [0.8657, 0.5202, 0.5139, 0.5086, 0.564, 0.5377, 0.5378, 0.541, 0.5551, 0.6562])

(1.4245673257440807, [0.8657, 0.5203, 0.514, 0.5086, 0.5623, 0.5375, 0.5382, 0.5409, 0.5557, 0.6572])

(1.4244797028995602, [0.8657, 0.5203, 0.5138, 0.5093, 0.5609, 0.5375, 0.5385, 0.5408, 0.5558, 0.6576])

(1.424342000493192, [0.8657, 0.5188, 0.5116, 0.5076, 0.563, 0.538, 0.5385, 0.5404, 0.5559, 0.6575])

(1.4239698802085936, [0.8657, 0.5188, 0.5116, 0.5076, 0.5618, 0.5372, 0.5387, 0.54, 0.5564, 0.6578])

(1.4238246200049947, [0.8657, 0.5188, 0.5116, 0.5076, 0.5595, 0.5369, 0.5395, 0.5398, 0.5566, 0.6591]

(1.423768894926328, [0.8657, 0.5191, 0.5117, 0.5072, 0.5605, 0.5363, 0.54, 0.5385, 0.557, 0.6591])

(1.4234753063001597, [0.8657, 0.519, 0.5115, 0.5071, 0.5567, 0.537, 0.5388, 0.5382, 0.5574, 0.6625])

In [None]:
(1.4071755274530888, [0.865, 0.5108, 0.5051, 0.5052, 0.513, 0.517, 0.5247, 0.5371, 0.5608, 0.6871])
[2.0, 1.0, 1.0, 1.0, 1.0, 2.0, 1.0, 1.0, 1.6]

In [200]:
valid_pred_lgb.shape

(5529636, 15)

In [136]:
roc_auc_score(valid_pred_lgb['is_share'], 
              valid_pred_lgb['is_share_pred'] * 0.9 + valid_pred_nn['is_share_pred'] * 0.1)

0.8653532609744193

In [252]:
tmp = np.argsort(-1 * valid_pred_ronghe['watch_label_pred_8']).values
tmp

array([2465512, 4178718, 1170852, ...,  404309, 5087299, 3995671])

In [258]:
valid_pred_lgb['watch_label_pred_9'][tmp[int(tmp.shape[0] * 0.005)]]

0.061238400665924175

In [None]:
auc = 1.4042242245680665
auc list = [0.8627, 0.5115, 0.5052, 0.5047, 0.5117, 0.517, 0.5249, 0.5355, 0.5595, 0.683]
watch_rate = np.array([int(ds * 0.03)] * 9) 
watch_rate = watch_rate * np.array([2.0, 1.0, 1.0, 1.0, 1.0, 2.0, 1.0, 1.0, 1.0])

In [267]:
submit_final

Unnamed: 0,user_id,video_id,watch_label,is_share
0,1688013,32645,0,0.000448
1,4502598,41270,0,0.005435
2,5585629,16345,4,0.000089
3,1635520,28149,2,0.000824
4,4160191,40554,0,0.000216
...,...,...,...,...
2822175,5019057,18766,0,0.000534
2822176,5019057,12968,0,0.001657
2822177,4255762,21794,6,0.000274
2822178,171497,21578,0,0.000096


In [264]:
test_pred_ronghe

Unnamed: 0,userid,feedid,is_share_pred,watch_label_pred_1,watch_label_pred_2,watch_label_pred_3,watch_label_pred_4,watch_label_pred_5,watch_label_pred_6,watch_label_pred_7,watch_label_pred_8,watch_label_pred_9,watch_label
0,1688013,32645,0.000447,0.033794,0.025580,0.016646,0.012379,0.016207,0.008363,0.016327,0.005603,0.011711,0.0
1,4502598,41270,0.005559,0.015746,0.011756,0.009328,0.005713,0.003914,0.004203,0.002426,0.004887,0.013072,0.0
2,5585629,16345,0.000098,0.035145,0.036693,0.032595,0.027493,0.020886,0.015152,0.015899,0.014769,0.050129,5.0
3,1635520,28149,0.000553,0.107687,0.042827,0.025364,0.010008,0.007457,0.005436,0.005902,0.004992,0.004729,3.0
4,4160191,40554,0.000259,0.008871,0.008151,0.003503,0.005704,0.003078,0.002982,0.001485,0.001810,0.006197,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...
2822175,5019057,18766,0.000677,0.021451,0.007275,0.003553,0.003701,0.002729,0.002421,0.002137,0.002051,0.006627,0.0
2822176,5019057,12968,0.001717,0.027354,0.010249,0.006614,0.003875,0.003941,0.003396,0.002627,0.002448,0.009732,0.0
2822177,4255762,21794,0.000264,0.053778,0.055915,0.061187,0.057394,0.024828,0.024003,0.017203,0.018602,0.040693,4.0
2822178,171497,21578,0.000089,0.016492,0.008509,0.007108,0.005001,0.004602,0.003731,0.003642,0.004073,0.013245,0.0


In [389]:
test_pred_ronghe['watch_label'] = get_deal_y_pred(test_pred_ronghe)
submit_final = test_pred_ronghe[['userid', 'feedid', 'watch_label', 'is_share_pred']]
submit_final.columns = ['user_id', 'video_id', 'watch_label', 'is_share']
submit_final['is_share'] = np.round(submit_final['is_share'], 8)
submit_final['watch_label'] = submit_final['watch_label'].astype(int)

submit_final.to_csv("submission.csv", index=None)

cls: 9 345716 345716
cls: 8 307461 138287
cls: 7 357938 128409
cls: 6 413028 118532
cls: 5 450594 108654
cls: 4 586456 148164
cls: 3 675269 98776
cls: 2 738194 98776
cls: 1 842284 148164
0.0    1768365
9.0     241389
1.0     148164
4.0     139494
2.0      95849
3.0      94596
5.0      85235
6.0      85127
8.0      83663
7.0      80298
dtype: int64


In [61]:
valid_pred_lgb.to_feather("../data/submit/valid_pred_lgb.feather")
test_pred_lgb.to_feather("../data/submit/test_pred_lgb.feather")

## 验证每个类的AUC

In [57]:
valid_pred_nn = pd.read_feather("../data/submit/valid_pred_nn.feather")
print(calc_weighted_auc(valid_pred_nn))

cls: 9 331778 331778
cls: 8 471105 165889
cls: 7 631372 165889
cls: 6 799774 165889
cls: 5 973767 165889
cls: 4 1317714 331778
cls: 3 1483450 165889
cls: 2 1638730 165889
cls: 1 1713458 165889
(1.389676254563362, [0.8463, 0.5038, 0.502, 0.5014, 0.5109, 0.5112, 0.5168, 0.5315, 0.5573, 0.6756])


In [58]:
for i in range(1, 10):
    print(i, fast_auc(watch_y_true[:, i], valid_pred_lgb['watch_label_pred_{}'.format(i)]))

1 0.8204684312658791
2 0.8312688231428056
3 0.8358106167371372
4 0.8388184166027295
5 0.8398017460547668
6 0.840805452552308
7 0.8412869127286081
8 0.8474089199720032
9 0.854785566250586


In [59]:
for i in range(1, 10):
    print(i, fast_auc(watch_y_true[:, i], valid_pred_nn['watch_label_pred_{}'.format(i)]))

1 0.8176254325139984
2 0.8315665826518253
3 0.8365536768507117
4 0.8392822962601287
5 0.8414716028055667
6 0.8407906219314308
7 0.8428801426873801
8 0.8461030032704183
9 0.8516852426035216


In [61]:
auc_list = []
for i in range(1, 10):
    nn_pred = valid_pred_nn['watch_label_pred_{}'.format(i)]
    lgb_pred = valid_pred_lgb['watch_label_pred_{}'.format(i)]
    score = fast_auc(watch_y_true[:, i], nn_pred * 0.4 + lgb_pred * 0.6)
    auc_list.append(score)
print(auc_list)
print(sum(np.array(auc_list) * np.array([0.1, 0.2, 0.3, 0.4, 0.4, 0.4, 0.4, 0.4, 0.4])) / 3.0)

# 0.8403756136922859
# 0.8413960074774893

[0.8232599399817404, 0.835214017588494, 0.8402376088336702, 0.8431899522427675, 0.8446730511752154, 0.8453345267917939, 0.8462283796749352, 0.8519437201502028, 0.8581133390859573]
0.8457444226047744


In [36]:
auc_list = [0.7923662967956898,0.807365790512706,0.8124565195308958, 
            0.8149153442850553,0.8158433717305262, 0.8150350317200666,
            0.8164819177881378, 0.8203396688181431, 0.8261589965286208]
wauc = sum(np.array(auc_list) * np.array([0.1, 0.2, 0.3, 0.4, 0.4, 0.4, 0.4, 0.4, 0.4]))
wauc / 3.0

0.8159854919965331

In [37]:
auc_list = [0.8161075033448782,
 0.825195625654561,
0.8287671623155689,
 0.830859231649306,
 0.832416803626909,
 0.834209622234266,
 0.8340127054655736,
 0.8413445439089122,
 0.851749874869514]
wauc = sum(np.array(auc_list) * np.array([0.1, 0.2, 0.3, 0.4, 0.4, 0.4, 0.4, 0.4, 0.4]))
wauc / 3.0

0.8350390456206211

In [45]:
valid_pred_lgb

Unnamed: 0,userid,feedid,is_share,watch_label,is_share_pred,watch_label_pred_1,watch_label_pred_2,watch_label_pred_3,watch_label_pred_4,watch_label_pred_5,watch_label_pred_6,watch_label_pred_7,watch_label_pred_8,watch_label_pred_9,watch_label_pred
0,4866878,1759,0.0,0.0,0.000418,0.012677,0.002941,0.001699,0.001354,0.001237,0.001015,0.000950,0.001164,0.002783,0.0
1,4866878,25144,0.0,0.0,0.000911,0.029733,0.008756,0.006393,0.005491,0.003290,0.002214,0.002567,0.002842,0.004891,0.0
2,4866878,6693,0.0,0.0,0.001121,0.005774,0.001828,0.001286,0.001226,0.000919,0.000826,0.000863,0.000981,0.001596,0.0
3,951097,12968,0.0,0.0,0.003867,0.025786,0.018319,0.013750,0.009624,0.007628,0.007235,0.006858,0.007319,0.037426,0.0
4,951097,3636,0.0,0.0,0.006019,0.010155,0.002680,0.001626,0.001112,0.000905,0.000905,0.000656,0.000991,0.002908,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
5529631,3824966,29998,0.0,0.0,0.000299,0.055854,0.037241,0.028434,0.022137,0.020169,0.017898,0.015856,0.024097,0.032193,6.0
5529632,3824966,26273,0.0,0.0,0.000109,0.020432,0.014438,0.010923,0.007143,0.007135,0.004888,0.004258,0.005956,0.011829,0.0
5529633,1687377,17585,0.0,0.0,0.037426,0.030489,0.009203,0.005521,0.003650,0.003132,0.002562,0.002213,0.002848,0.009592,0.0
5529634,1286468,41270,0.0,0.0,0.005920,0.032904,0.021192,0.014144,0.009696,0.008966,0.007898,0.007117,0.008542,0.024772,2.0


In [53]:
valid_pred_nn['watch_label_pred_9'].mean()

0.02128458209335804

In [54]:
valid_pred_lgb['watch_label_pred_9'].mean()

0.022054674928990547

In [55]:
valid_pred_nn['watch_label_pred_9'].std()

0.04068950191140175

In [56]:
valid_pred_lgb['watch_label_pred_9'].std()

0.03279500886956574

In [282]:

def get_deal_y_pred(valid_14):
    nums = valid_14.shape[0]    # 数据集大小
    ## 每类的取值个数 [9,8,7,6,5,4,3,2,1]
    class_nums = np.array([int(nums*0.035)]*9) * np.array([3.5, 1.4, 1.3, 1.2, 1.1, 2.0, 1.0, 1.0, 1.5])
    ## 需要返回的结果
    y_pred_res = np.zeros(nums)
    
    cls_2_996fws = {i: np.percentile(valid_14['watch_label_pred_{}'.format(i)], 99.6) 
                   for i in range(1, 10)}   # 每个类对应的99分位数  概率更大
    
    cls_2_99fws = {i: np.percentile(valid_14['watch_label_pred_{}'.format(i)], 99) 
                   for i in range(1, 10)}   # 每个类对应的99分位数  概率更大
    
    cls_2_97fws = {i: np.percentile(valid_14['watch_label_pred_{}'.format(i)], 97) 
                   for i in range(1, 10)}   # 每个类对应的97分位数  概率更小
    
    cls_2_90fws = {i: np.percentile(valid_14['watch_label_pred_{}'.format(i)], 95) 
                   for i in range(1, 10)}   # 每个类对应的90分位数  概率更小
    
    idx2cls_prob = {}   ## 已经访问过的记录： index -> (cls, prob)
    
    for i, n in enumerate(class_nums):
        probs = valid_14['watch_label_pred_{}'.format(9-i)]   # 概率，最先是第9类
        idxes = np.argsort(-1 * probs).values
        
        idx = 0   # 开始遍历的索引的位置，需要赋值到答案中
        cnt = 0   # 总共遍历得到的个数
        while cnt < n:
            if (idxes[idx] not in idx2cls_prob) or (cnt <= (nums * 0.005) and i <= 5 and 
                                                    idx2cls_prob[idxes[idx]][1] <= cls_2_996fws[idx2cls_prob[idxes[idx]][0]]):
                y_pred_res[idxes[idx]] = 9 - i
                cnt += 1
                idx2cls_prob[idxes[idx]] = (9-i, probs[idxes[idx]])
            elif (idxes[idx] not in idx2cls_prob) or (cnt <= (nums * 0.01) and i <= 5 and 
                                                    idx2cls_prob[idxes[idx]][1] <= cls_2_99fws[idx2cls_prob[idxes[idx]][0]]):
                y_pred_res[idxes[idx]] = 9 - i
                cnt += 1
                idx2cls_prob[idxes[idx]] = (9-i, probs[idxes[idx]])
            elif (idxes[idx] not in idx2cls_prob) or ((nums * 0.01) <= cnt and cnt <= (nums * 0.02) and i <= 5 and 
                                                    idx2cls_prob[idxes[idx]][1] <= cls_2_97fws[idx2cls_prob[idxes[idx]][0]]):
                y_pred_res[idxes[idx]] = 9 - i
                cnt += 1
                idx2cls_prob[idxes[idx]] = (9-i, probs[idxes[idx]])
                
            elif (idxes[idx] not in idx2cls_prob) or (cnt <= (nums * 0.003) and i > 5 and 
                                                    idx2cls_prob[idxes[idx]][1] <= cls_2_90fws[idx2cls_prob[idxes[idx]][0]]):
                y_pred_res[idxes[idx]] = 9 - i
                cnt += 1
                idx2cls_prob[idxes[idx]] = (9-i, probs[idxes[idx]])
            idx += 1
        print('cls:', 9-i, idx, cnt)
    print(pd.Series(y_pred_res).value_counts())
    return y_pred_res