In [1]:
import pandas as pd
import numpy as np
from tqdm import tqdm
tqdm.pandas()
from sklearn.metrics import roc_auc_score
from lightgbm.sklearn import LGBMClassifier, LGBMRegressor
from collections import defaultdict
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import KFold 
import gc
import random
import time
import os
import pickle
from utils import reduce_mem, uAUC, ProNE, HyperParam, get_logger

pd.set_option('display.max_columns', None)

In [2]:
%%time
df = pd.read_feather("data/features/df_v1.feather")
print(df.shape)

df = df[df['date_'] != 1]
submit = df[df['date_'] == 15][['userid', 'feedid']]
print(df.shape, submit.shape)
df.head()

(34321552, 130)
(32264112, 130) (2822180, 2)
CPU times: user 40.1 s, sys: 37.9 s, total: 1min 18s
Wall time: 46.2 s


In [3]:
## merge embedding特征
fid_w2v_emb = pd.read_pickle("data/features/fid_w2v_emb.pkl")
fid_tfidf_svd_emb = pd.read_pickle("data/features/fid_tfidf_svd_emb.pkl")
uid_tfidf_svd_emb = pd.read_pickle("data/features/uid_tfidf_svd_emb.pkl")
fid_w2v_emb = reduce_mem(fid_w2v_emb)
fid_tfidf_svd_emb = reduce_mem(fid_tfidf_svd_emb)
uid_tfidf_svd_emb = reduce_mem(uid_tfidf_svd_emb)

print(fid_w2v_emb.shape, fid_tfidf_svd_emb.shape, uid_tfidf_svd_emb.shape)

100%|██████████| 33/33 [00:00<00:00, 628.21it/s]
100%|██████████| 33/33 [00:00<00:00, 999.36it/s]
  0%|          | 0/33 [00:00<?, ?it/s]

4.00 Mb, 2.00 Mb (50.00 %)
4.24 Mb, 2.24 Mb (47.22 %)


100%|██████████| 33/33 [00:01<00:00, 27.70it/s]

445.07 Mb, 234.90 Mb (47.22 %)
(30862, 33) (30862, 33) (3240903, 33)





In [6]:
%%time

df = df.merge(fid_w2v_emb, how='left', on=['feedid'])
df = df.merge(fid_tfidf_svd_emb, how='left', on=['feedid'])
print(df.shape)
df = df.merge(uid_tfidf_svd_emb, how='left', on=['userid'])
print(df.shape)

(32264112, 194)
(32264112, 226)


In [10]:
df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 32264112 entries, 0 to 32264111
Columns: 226 entries, userid to userid_feedid_tfidf_svd_31
dtypes: float16(185), float32(15), float64(15), int16(2), int32(3), int8(6)
memory usage: 17.4 GB


In [11]:
df.to_feather("data/features/df_emb_v1.feather")

### 训练集采样

In [2]:
df = pd.read_feather("data/features/df_emb_v1.feather")
print(df.shape)

train = df[(df['date_'] <= 14)].reset_index(drop=True)
test = df[df['date_'] == 15].reset_index(drop=True)
del df
gc.collect()
print(train.shape, test.shape)

train_pos = train[(train['watch_label'] > 0) | (train['is_share'] > 0)]
train_neg = train[(train['watch_label'] ==0) & (train['is_share'] == 0)]
print("正样本个数：{}".format(train_pos.shape))
print("负样本个数：{}".format(train_neg.shape))

(32264112, 226)
(29441932, 226) (2822180, 226)
正样本个数：(2029555, 226)
负样本个数：(27412377, 226)


In [3]:
train_neg = train_neg.sample(n=8000000, random_state=2021).reset_index(drop=True)
train = pd.concat([train_pos, train_neg], ignore_index=True)
train = train.sample(frac=1.0, random_state=2021).reset_index(drop=True)
print(train.shape)

(10029555, 226)


In [5]:
## 切分训练、验证、测试
cate_cols = ['userid', 'feedid', 'age', 'gender', 'country', 'province', 'city', 'city_level', 'device_name']
y_list = ['is_watch', 'is_share', 'is_collect', 'is_comment', 'watch_label']
# train['is_share'] = train['is_share'].astype(np.int32)
# train['watch_label'] = train['watch_label'].astype(np.int32)

# for col in tqdm(cate_cols):
#     lbl = LabelEncoder()
#     df[col] = lbl.fit_transform(df[col])
    
## lgb训练模型所需要的特征列
cols = [f for f in test.columns if (f not in ['date_'] + y_list)]
print("特征总数：{}".format(len(cols)))

特征总数：220


In [6]:
print(train['is_share'].value_counts())
print(train['watch_label'].value_counts())

0.0    10016060
1.0       13495
Name: is_share, dtype: int64
0.0    8010143
1.0     519376
9.0     358426
2.0     292520
3.0     203305
4.0     159738
5.0     132749
8.0     128385
6.0     115711
7.0     109202
Name: watch_label, dtype: int64


In [7]:
train_fea = train[cols].values.astype(np.float32)
test_fea = test[cols].values.astype(np.float32)
train_y_watch = train['watch_label'].values.astype(np.float32)
train_y_share = train['is_share'].values.astype(np.float32)
print(train_fea.shape, train_y_watch.shape, train_y_share.shape, test_fea.shape)

submit = test[['userid', 'feedid']]
submit.columns = ['user_id', 'video_id']
print(submit.shape)

del train, test, train_neg, train_pos
gc.collect()

(10029555, 220) (10029555,) (10029555,) (2822180, 220)
(2822180, 2)


0

## 训练 is_share

In [8]:
clf = LGBMClassifier(
            learning_rate=0.02,
            n_estimators=1000,
            num_leaves=63,
            subsample=0.85,
            colsample_bytree=0.85,
            random_state=2021,
            metric='None',
            n_jobs=4)

N = 8000000
clf.fit(train_fea[:N], train_y_share[:N],
        eval_set=[(train_fea[:N], train_y_share[:N]), (train_fea[N:], train_y_share[N:])],
        eval_names=['train', 'valid'],
        eval_metric='auc',
        early_stopping_rounds=50,
        verbose=50)

Training until validation scores don't improve for 50 rounds
[50]	train's auc: 0.867213	valid's auc: 0.838654
[100]	train's auc: 0.895724	valid's auc: 0.844968
[150]	train's auc: 0.91772	valid's auc: 0.848076
[200]	train's auc: 0.935507	valid's auc: 0.849549
[250]	train's auc: 0.947584	valid's auc: 0.849986
[300]	train's auc: 0.954952	valid's auc: 0.850736
[350]	train's auc: 0.958668	valid's auc: 0.850899
[400]	train's auc: 0.963528	valid's auc: 0.850984
[450]	train's auc: 0.9669	valid's auc: 0.851213
[500]	train's auc: 0.970365	valid's auc: 0.851539
Early stopping, best iteration is:
[494]	train's auc: 0.969777	valid's auc: 0.851556


LGBMClassifier(boosting_type='gbdt', class_weight=None, colsample_bytree=0.85,
        importance_type='split', learning_rate=0.02, max_depth=-1,
        metric='None', min_child_samples=20, min_child_weight=0.001,
        min_split_gain=0.0, n_estimators=1000, n_jobs=4, num_leaves=63,
        objective=None, random_state=2021, reg_alpha=0.0, reg_lambda=0.0,
        silent=True, subsample=0.85, subsample_for_bin=200000,
        subsample_freq=0)

In [9]:
submit['is_share'] = clf.predict_proba(test_fea)[:, 1]
submit['is_share'] = np.round(submit['is_share'], 6)

In [None]:
# valid's auc: 0.846132

## 训练 watch_label 多分类

In [13]:
# clf = LGBMClassifier(
#             learning_rate=0.1,
#             objective='multiclass',
#             n_estimators=2000,
#             num_leaves=63,
#             subsample=0.8,
#             colsample_bytree=0.8,
#             random_state=2021,
#             metric='None',
#             n_jobs=4)
        
# clf.fit(train[train['date_'] != 14][cols], train[train['date_'] != 14]['watch_label'],
#         eval_set=[(train[train['date_'] == 14][cols], train[train['date_'] == 14]['watch_label'])],
#         eval_metric='logloss',
#         early_stopping_rounds=50,
#         verbose=20)

## 训练 watch_label 回归

In [10]:
train_y_watch = np.log2(train_y_watch + 1)

In [11]:
clf = LGBMRegressor(
            learning_rate=0.1,
            n_estimators=2000,
            num_leaves=63,
            subsample=0.85,
            colsample_bytree=0.85,
            random_state=2021,
            metric='None',
            n_jobs=4)


N = 8000000
clf.fit(train_fea[:N], train_y_watch[:N], 
        eval_set=[(train_fea[:N], train_y_watch[:N]), (train_fea[N:], train_y_watch[N:])],
        eval_names=['train', 'valid'],
        eval_metric='l2',
        early_stopping_rounds=50,
        verbose=100)

Training until validation scores don't improve for 50 rounds
[100]	train's l2: 0.737305	valid's l2: 0.739856
[200]	train's l2: 0.728655	valid's l2: 0.732916
[300]	train's l2: 0.724002	valid's l2: 0.730225
[400]	train's l2: 0.7203	valid's l2: 0.728565
[500]	train's l2: 0.716907	valid's l2: 0.727219
[600]	train's l2: 0.714048	valid's l2: 0.726485
[700]	train's l2: 0.711258	valid's l2: 0.725752
[800]	train's l2: 0.708567	valid's l2: 0.725213
[900]	train's l2: 0.705982	valid's l2: 0.724745
[1000]	train's l2: 0.703452	valid's l2: 0.724298
[1100]	train's l2: 0.701014	valid's l2: 0.723876
[1200]	train's l2: 0.69868	valid's l2: 0.723541
[1300]	train's l2: 0.696393	valid's l2: 0.723286
[1400]	train's l2: 0.69421	valid's l2: 0.723108
[1500]	train's l2: 0.692045	valid's l2: 0.722854
[1600]	train's l2: 0.689855	valid's l2: 0.722637
[1700]	train's l2: 0.687686	valid's l2: 0.722483
[1800]	train's l2: 0.68566	valid's l2: 0.722351
[1900]	train's l2: 0.683576	valid's l2: 0.722203
[2000]	train's l2: 0.6

LGBMRegressor(boosting_type='gbdt', class_weight=None, colsample_bytree=0.85,
       importance_type='split', learning_rate=0.1, max_depth=-1,
       metric='None', min_child_samples=20, min_child_weight=0.001,
       min_split_gain=0.0, n_estimators=2000, n_jobs=4, num_leaves=63,
       objective=None, random_state=2021, reg_alpha=0.0, reg_lambda=0.0,
       silent=True, subsample=0.85, subsample_for_bin=200000,
       subsample_freq=0)

In [12]:
submit['watch_label'] = clf.predict(test_fea)

In [74]:
submit

Unnamed: 0,user_id,video_id,is_share,watch_label
0,1688013,32645,0.000343,0.351628
1,4502598,41270,0.006045,0.381432
2,5585629,16345,0.000006,0.367150
3,1635520,28149,0.000864,0.622225
4,4160191,40554,0.000602,0.257869
...,...,...,...,...
2822175,5019057,18766,0.000022,0.025173
2822176,5019057,12968,0.000436,-0.059981
2822177,4255762,21794,0.000218,0.629374
2822178,171497,21578,0.000096,0.180052


## 对watch_label进行后处理

In [89]:
train = pd.read_pickle("data/origin/train.pkl")
test = pd.read_pickle("data/origin/test.pkl")
user_info = pd.read_pickle("data/origin/user_info.pkl")
print(train.shape, test.shape, user_info.shape)

(80276856, 8) (2822180, 2) (5910800, 8)


In [49]:
# ## 查看测试集分布是否一致
# train_tmp = train[train['pt_d'] == 20210502]
# train_tmp = train_tmp.merge(user_info, how='left', on='user_id')

# test_tmp = test.merge(user_info, how='left', on='user_id')

# import seaborn as sns
# sns.distplot(test_tmp['age'].value_counts(), bins=10)

# train_tmp['city_level'].value_counts().values / sum(train_tmp['city_level'].value_counts().values)
# test_tmp['city_level'].value_counts().values / sum(test_tmp['city_level'].value_counts().values)

In [90]:
watch_percent = np.zeros(10)
dt_n = len(train['pt_d'].unique()[-7:])
for dt in sorted(train['pt_d'].unique()[-2:]):
    tmp = dict(train[train['pt_d'] == dt]['watch_label'].value_counts())
    tmp = sorted(tmp.items(), key=lambda x: x[0])
    tmp = np.array([x[1] for x in tmp])
    tmp = np.cumsum(tmp)
    tmp = tmp / tmp[-1]
    watch_percent += tmp / 2
    print(tmp[:5])
print(watch_percent)

[0.97519961 0.98136256 0.98485253 0.98733698 0.98931752]
[0.97381726 0.98003124 0.98368735 0.98633183 0.98846199]
[0.97450844 0.9806969  0.98426994 0.9868344  0.98888976 0.99061986
 0.99213893 0.99356421 0.99525122 1.        ]


In [None]:
7->3->2

In [91]:
# watch_percent = np.array([0.9729354 , 0.97987075, 0.9837808 , 0.98650887, 0.98865447,
#                           0.99043327, 0.99198936, 0.9934525 , 0.99517681, 1.        ])
watch_quantile = np.percentile(submit['watch_label'], watch_percent * 100)
def transform_y(x):
    for i in range(len(watch_quantile)):
        if x <= watch_quantile[i]:
            return int(i)
    return i

submit_final = submit[['user_id', 'video_id']]
submit_final['watch_label'] = submit['watch_label'].apply(lambda x: transform_y(x)).astype(int)
submit_final['is_share'] = submit['is_share']

In [92]:
submit_final.to_csv("data/submit/submission.csv", index=None)

In [86]:
pd.read_csv("data/submit/submission.csv")['watch_label'].value_counts()

0    2750548
1      18043
9      12688
2      10300
3       7268
4       5751
5       4822
8       4638
6       4193
7       3929
Name: watch_label, dtype: int64

In [87]:
submit_final['watch_label'].value_counts()

0    2751198
1      17609
9      12922
2      10077
3       7148
4       5685
5       4807
8       4627
6       4182
7       3925
Name: watch_label, dtype: int64

In [82]:
submit_0818.columns = ['user_id', 'video_id', 'watch_label', 'is_share']
submit_0818.to_csv("data/submit/submission.csv", index=None)

NameError: name 'submit_0818' is not defined