In [2]:
import numpy as np
from sklearn import metrics
import os
from sklearn.model_selection import train_test_split
from sklearn import metrics
from time import time
import pandas as pd
import xgboost as xgb
from collections import Counter
from imblearn.over_sampling import SMOTE
from imblearn.under_sampling import NearMiss, TomekLinks
import datatable as dt
import warnings
import json
import pickle

from tools import *

# 给定预测标签，计算AUC
使用OVR的策略计算每个类别的AUC
过程：
- 选择类别i作为正类，其他类别作为负类
- 将真实标签中不等于i的标记为0，等于i的标记为1
- 将预测标签中不等于i的标记为0，等于i的标记为1
- 计算混淆矩阵
- 计算(fpr, tpr)
- 计算AUC

# 加载数据
训练数据加载过程：
1. 分别加载处理好的用户特征和视频特征，以及整合的用户历史行为数据；
2. 从用户历史行为数据中筛掉在视频特征中没出现过的video_id；
3. 将行为数据中的user_id、video_id替换为对应用户/视频的特征
4. 根据不同的任务划分为`watch_label`、`is_share`的数据集

推断时，类似于上述过程拼接数据。

## 读取数据

In [3]:
base_dir = "../2021_3_data"
test_data_dir  = os.path.join(base_dir, "testdata")
train_data_dir = os.path.join(base_dir, "traindata")

### 基础特征与附加特征合并

In [4]:
video_status = dt.fread(os.path.join(train_data_dir, "video_features_data/video_status.csv"))
user_status = dt.fread(os.path.join(train_data_dir, "user_features_data/user_status.csv"))

In [282]:
tab_user = dt.fread(os.path.join(train_data_dir, "user_features_data/user_features.jay"))
tab_video = dt.fread(os.path.join(train_data_dir, "video_features_data/video_features.jay"))

In [283]:
video_status.key = 'video_id'
video_ws = tab_video[:, :, join(video_status)]

In [286]:
user_status.key = 'user_id'
user_ws = tab_user[:, :, join(user_status)]

In [288]:
video_ws.to_jay(os.path.join(train_data_dir, "video_features_data/video_features_with_status.jay"))

In [289]:
user_ws.to_jay(os.path.join(train_data_dir, "user_features_data/user_features_with_status.jay"))

In [287]:
user_ws

Unnamed: 0_level_0,user_id,age_0,age_1,age_2,age_3,age_4,age_5,age_6,age_7,gender_0,…,average_watch_label,sum_watch_times,sum_comment_times,sum_collect_times,sum_share_times
Unnamed: 0_level_1,▪▪▪▪▪▪▪▪,▪▪▪▪▪▪▪▪,▪▪▪▪▪▪▪▪,▪▪▪▪▪▪▪▪,▪▪▪▪▪▪▪▪,▪▪▪▪▪▪▪▪,▪▪▪▪▪▪▪▪,▪▪▪▪▪▪▪▪,▪▪▪▪▪▪▪▪,▪▪▪▪▪▪▪▪,Unnamed: 11_level_1,▪▪▪▪▪▪▪▪,▪▪▪▪▪▪▪▪,▪▪▪▪▪▪▪▪,▪▪▪▪▪▪▪▪,▪▪▪▪▪▪▪▪
0,1.757e+06,0,0,0,1,0,0,0,0,0,…,0,0,0,0,0
1,17938,1,0,0,0,0,0,0,0,1,…,0.0967742,3,0,0,0
2,4.26352e+06,0,1,0,0,0,0,0,0,1,…,0.204545,2,0,0,0
3,1.4116e+06,0,0,0,1,0,0,0,0,1,…,0,0,0,0,0
4,3.99224e+06,0,0,1,0,0,0,0,0,1,…,0,0,0,0,0
5,4.0116e+06,0,0,1,0,0,0,0,0,1,…,0,0,0,0,0
6,4.78556e+06,0,0,0,0,0,1,0,0,0,…,0,0,0,0,0
7,5.11036e+06,1,0,0,0,0,0,0,0,1,…,0,0,0,0,0
8,1.3212e+06,0,0,0,0,0,1,0,0,0,…,0,0,0,0,0
9,3.20698e+06,1,0,0,0,0,0,0,0,0,…,0,0,0,0,0


### 通过 .npz 读取数据

#### 单表读取后合并

In [20]:
%%time
# 单独读取每个文件再进行合并
user_df = read_npz_to_df(os.path.join(train_data_dir, "user_features_data/user_features.npz"), data_name='features', column_name='columns')
video_df = read_npz_to_df(os.path.join(train_data_dir, "video_features_data/video_features.npz"), data_name='features')
action_df = read_npz_to_df(os.path.join(train_data_dir, "all_actions.npz"), data_name='data')

CPU times: user 3.6 s, sys: 2.4 s, total: 6 s
Wall time: 12.1 s


In [797]:
# Saving strings into an .npz forces dtype=object, so every column comes back
# as object when the archive is read into a DataFrame; restore the real dtypes.
# `np.str` was only an alias of the builtin `str` (deprecated in NumPy 1.20,
# removed in 1.24), so the builtin is used directly.
dtypes = dict.fromkeys(video_df.columns, np.float32)
dtypes['video_name'] = str
video_df = video_df.astype(dtypes)

In [802]:
%%time
# 合并各个表
df_train = merge_user_video_action(user_df, video_df, action_df)
df_train.shape

CPU times: user 1min 29s, sys: 5.88 s, total: 1min 35s
Wall time: 40.1 s


(7353024, 76)

In [805]:
# Convert the merged frame to pandas once and reuse it: the original called
# to_pandas() twice, materialising the whole table a second time just to read
# the column names.
df_train_pd = df_train.to_pandas()
np.savez(os.path.join(train_data_dir, "train.npz"), data=df_train_pd.values, columns=df_train_pd.columns.tolist())
del df_train_pd  # release the temporary copy

In [777]:
%%time
test_df = load_table(os.path.join(test_data_dir, "test.csv"), ftype="csv")

CPU times: user 626 ms, sys: 0 ns, total: 626 ms
Wall time: 721 ms


In [779]:
%%time
df_test = merge_user_video_action(user_df, video_df, test_df)
df_test.shape

CPU times: user 3min 8s, sys: 12.8 s, total: 3min 20s
Wall time: 51.2 s


(2822180, 73)

#### 读取合并好的数据

In [806]:
%%time
# 直接读取保存好的合并后的训练数据
path = os.path.join(train_data_dir, "train.npz")
df_train = read_npz_to_df(path, data_name='data')
df_train.shape

CPU times: user 3min 2s, sys: 38.4 s, total: 3min 40s
Wall time: 3min 41s


(7353024, 76)

In [810]:
%%time
# 直接读取保存好的合并后的测试数据
path = os.path.join(test_data_dir, "test.npz")
df_test = read_npz_to_df(path, data_name='data')
df_test.shape

CPU times: user 1min 37s, sys: 36.9 s, total: 2min 14s
Wall time: 5min 49s


(2822180, 73)

### 通过 .jay 文件读取

#### 单表读取后合并

In [4]:
# Choose between the plain feature tables and the variants that were saved
# with the extra status columns joined in.
with_status = False
user_features_name = "user_features_with_status" if with_status else "user_features"
video_features_name = "video_features_with_status" if with_status else "video_features"

# Paths of the .jay files for the selected user / video feature tables.
p_user = os.path.join(train_data_dir, f"user_features_data/{user_features_name}.jay")
p_video = os.path.join(train_data_dir, f"video_features_data/{video_features_name}.jay")

In [5]:
%%time
## 使用datatable 加载训练数据
p_act = os.path.join(train_data_dir, "all_actions_with_status.jay")

df_train, others = load_train_test_data(None, pre_merged=False, return_others=True,
                           **{"p_user": p_user, "p_video": p_video, "p_action": p_act})
user_df = others['user']
video_df = others['video']
action_df = others['action']
df_train.shape

CPU times: user 26.2 s, sys: 1.35 s, total: 27.5 s
Wall time: 1.42 s


(7353024, 133)

In [6]:
%%time
# p_user = os.path.join(train_data_dir, "user_features_data/user_features.jay")
# p_video = os.path.join(train_data_dir, "video_features_data/video_features.jay")
p_act = os.path.join(test_data_dir, "test_with_status.jay")

#path = os.path.join(test_data_dir, "test.jay")
kwargs = {"p_user": p_user, "p_video": p_video, "p_action": p_act}

df_test, others = load_train_test_data(None, pre_merged=False, return_others=True, **kwargs)
test_df = others['action']
df_test.shape

CPU times: user 21.4 s, sys: 1.05 s, total: 22.4 s
Wall time: 1.16 s


(2822180, 130)

In [7]:
action_df = action_df.to_pandas()
user_df = user_df.to_pandas()
video_df = video_df.to_pandas()

In [8]:
test_df = test_df.to_pandas()

#### 读取合并好后的数据

In [14]:
df_test.to_jay(os.path.join(test_data_dir, "test_with_status.jay"))

In [15]:
df_train.to_jay(os.path.join(train_data_dir, "train_with_status.jay"))

In [468]:
%%time
# 直接读取保存好的合并后的训练数据
path = os.path.join(train_data_dir, "train.jay")
df_train = load_train_test_data(path, pre_merged=True)
df_train.shape

CPU times: user 0 ns, sys: 177 ms, total: 177 ms
Wall time: 184 ms


(7353024, 76)

In [454]:
%%time
# 直接读取保存好的合并后的测试数据
path = os.path.join(test_data_dir, "test.jay")
df_test = load_train_test_data(path, pre_merged=True)
df_test.shape

CPU times: user 649 µs, sys: 46 µs, total: 695 µs
Wall time: 684 µs


(2822180, 72)

## 处理训练数据
可在此做一些预处理：
- 从用户历史行为数据中筛掉在视频特征中没出现过的video_id
- 删除多余的列
- 调整列的顺序
- 改变列的数据类型


In [9]:
%%time
if isinstance(df_train, dt.Frame):
    df_train = df_train.to_pandas()
if isinstance(df_test, dt.Frame):
    df_test = df_test.to_pandas()

CPU times: user 26.8 s, sys: 3.57 s, total: 30.3 s
Wall time: 7.55 s


In [10]:
df_train.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 7353024 entries, 0 to 7353023
Columns: 133 entries, user_id to da_4
dtypes: float32(40), float64(63), int32(20), int64(9), object(1)
memory usage: 5.6+ GB


In [10]:
# Drop the video_name and is_watch columns.
# errors='ignore' keeps the cell re-runnable: without it a second execution
# raises KeyError because the columns have already been removed in place.
df_train.drop(columns=['video_name', 'is_watch'], inplace=True, errors='ignore')

In [11]:
# Drop the video_id and user_id columns.
# errors='ignore' keeps the cell re-runnable: without it a second execution
# raises KeyError because the columns have already been removed in place.
df_train.drop(columns=['user_id', 'video_id'], inplace=True, errors='ignore')

In [12]:
dataset = df_train
dataset.shape

(7353024, 129)

In [13]:
# Prepare the data: split the two target columns off the feature matrix.
# NOTE: pop() removes the column from `dataset` in place, so re-running this
# cell raises KeyError once the labels are gone. If `dataset` is an alias of
# df_train (it is assigned without a copy earlier in the notebook), df_train
# loses the label columns as well.
watch_label = dataset.pop('watch_label').astype(np.uint8)
is_share = dataset.pop('is_share').astype(np.uint8)
watch_label.shape, is_share.shape, dataset.shape

((7353024,), (7353024,), (7353024, 127))

## 处理测试数据

In [14]:
if 'test_df' not in dir():
    test_df = pd.read_csv(os.path.join(test_data_dir, "test.csv"))

In [17]:
test_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2822180 entries, 0 to 2822179
Data columns (total 59 columns):
 #   Column                 Dtype  
---  ------                 -----  
 0   user_id                int32  
 1   video_id               int32  
 2   v_avg_watch_label_1    float64
 3   v_sum_watch_times_1    float64
 4   v_sum_watch_overs_1    float64
 5   v_sum_comment_times_1  float64
 6   v_sum_collect_times_1  float64
 7   v_sum_share_times_1    float64
 8   v_sum_quit_times_1     float64
 9   v_sum_skip_times_1     float64
 10  v_sum_watch_days_1     float64
 11  v_avg_watch_label_3    float64
 12  v_sum_watch_times_3    float64
 13  v_sum_watch_overs_3    float64
 14  v_sum_comment_times_3  float64
 15  v_sum_collect_times_3  float64
 16  v_sum_share_times_3    float64
 17  v_sum_quit_times_3     float64
 18  v_sum_skip_times_3     float64
 19  v_sum_watch_days_3     float64
 20  v_avg_watch_label_7    float64
 21  v_sum_watch_times_7    float64
 22  v_sum_watch_overs_

In [18]:
df_test.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2822180 entries, 0 to 2822179
Columns: 130 entries, user_id to da_4
dtypes: float32(40), float64(83), int32(2), object(5)
memory usage: 2.3+ GB


In [15]:
# 删除 video_name 列
if 'video_name' in df_test.columns:
    df_test.drop('video_name', axis=1, inplace=True)

In [16]:
# Drop the video_id and user_id columns.
# errors='ignore' keeps the cell re-runnable: without it a second execution
# raises KeyError because the columns have already been removed in place.
df_test.drop(columns=['user_id', 'video_id'], inplace=True, errors='ignore')

In [17]:
inference_dataset = df_test
inference_dataset.shape

(2822180, 127)

# watch_label 预测

## 处理数据不均衡问题

In [18]:
# Class histogram of watch_label as (label, count) pairs, ordered by label.
items = sorted(Counter(watch_label).items(), key=lambda x: x[0])
print(items)

# Same table with the counts turned into fractions of the whole data set.
# `np.float` was an alias of the builtin `float` (deprecated in NumPy 1.20,
# removed in 1.24), so the builtin is used directly.
dist = np.array(items, dtype=float)
dist[:, 1] = dist[:, 1] / watch_label.shape[0]
print(dist)

[(0, 5176743), (1, 557421), (2, 314107), (3, 219188), (4, 172404), (5, 143001), (6, 125092), (7, 117749), (8, 138798), (9, 388521)]
[[0.         0.70402912]
 [1.         0.0758084 ]
 [2.         0.04271807]
 [3.         0.02980923]
 [4.         0.02344668]
 [5.         0.01944792]
 [6.         0.01701232]
 [7.         0.01601368]
 [8.         0.01887632]
 [9.         0.05283826]]


In [19]:
# Build the per-class target counts used for under- and over-sampling.
# `items` holds (label, count) pairs sorted by label, so row i is label i.
under_ss = np.array(items)
under_ss_thresh = under_ss[9, 1]  # upper bound on samples per class: the count in row 9
under_ss[:, 1] = np.clip(under_ss[:, 1], a_min=None, a_max=under_ss_thresh)

# The over-sampling targets start from the already-clipped counts.
over_ss = under_ss.copy()
over_ss_thresh = under_ss[2, 1]  # lower bound on samples per class: the count in row 2
over_ss[:, 1] = np.clip(over_ss[:, 1], a_min=over_ss_thresh, a_max=None)

# Convert the two-column arrays into {label: target_count} dicts.
under_ss = dict(under_ss)
over_ss = dict(over_ss)

In [20]:
under_ss, over_ss

({0: 388521,
  1: 388521,
  2: 314107,
  3: 219188,
  4: 172404,
  5: 143001,
  6: 125092,
  7: 117749,
  8: 138798,
  9: 388521},
 {0: 388521,
  1: 388521,
  2: 314107,
  3: 314107,
  4: 314107,
  5: 314107,
  6: 314107,
  7: 314107,
  8: 314107,
  9: 388521})

In [21]:
# Random under-sampling: for every class with more than under_ss_thresh rows,
# keep a random subset of exactly under_ss_thresh rows and collect the index
# labels of all other rows of that class in del_idxs.
rng = np.random.default_rng(0)  # fixed seed so the resampled set is reproducible across runs
del_idxs = pd.Index([], dtype=int)
for l, n in items:
    if n > under_ss_thresh:
        # Index labels of the rows whose watch_label equals l.
        t_idxs = watch_label[watch_label == l].index
        # replace=False samples without replacement, so no row is kept twice.
        t_left_idxs = rng.choice(t_idxs, under_ss_thresh, replace=False)
        t_del_idxs = t_idxs.difference(t_left_idxs)
        print(Counter(watch_label[t_del_idxs]))

        del_idxs = del_idxs.union(t_del_idxs)
del_idxs

Counter({0: 4788222})
Counter({1: 168900})


Int64Index([      1,       3,       8,       9,      11,      12,      13,
                 14,      15,      16,
            ...
            7353006, 7353007, 7353008, 7353010, 7353014, 7353016, 7353019,
            7353020, 7353022, 7353023],
           dtype='int64', length=4957122)

In [23]:
Counter(watch_label)

Counter({2: 314107,
         0: 5176743,
         5: 143001,
         4: 172404,
         1: 557421,
         9: 388521,
         3: 219188,
         8: 138798,
         7: 117749,
         6: 125092})

In [22]:
# Remove the rows selected for deletion from both the features and the labels.
# NOTE: np.delete treats del_idxs as positional row numbers, whereas del_idxs
# was collected from index labels. The two only coincide while the frame keeps
# its default 0..n-1 index — TODO confirm if the index is ever changed upstream.
resampled_data = np.delete(dataset.values, del_idxs, axis=0)
resampled_wl = np.delete(watch_label.values, del_idxs, axis=0)
resampled_data.shape, resampled_wl.shape

((2395902, 127), (2395902,))

In [64]:
Counter(resampled_wl)

Counter({2: 314107,
         5: 143001,
         4: 172404,
         0: 388521,
         9: 388521,
         3: 219188,
         1: 388521,
         8: 138798,
         7: 117749,
         6: 125092})

In [22]:
# Far too slow to be usable on this data set!
# Note that `nm` is a TomekLinks cleaner here, not the NearMiss sampler its
# name suggests.
nm  = TomekLinks()
smt = SMOTE(sampling_strategy=over_ss)  # over-sample up to the per-class targets in over_ss

In [1]:
%%time
X_r, y_r = nm.fit_resample(dataset, watch_label)

NameError: name 'nm' is not defined

In [1]:
%%time
X_r, y_r = smt.fit_resample(resampled_data, resampled_wl)

NameError: name 'smt' is not defined

In [23]:
# 将采样后的数据重装回 DataFrame
data = pd.DataFrame(resampled_data, columns=dataset.columns)
watch_label_res = pd.Series(resampled_wl)
data.shape, watch_label_res.shape

((2395902, 127), (2395902,))

In [24]:
test_rate = 0.15
train_idx, test_idx = train_test_split(data.index, test_size=test_rate, random_state=0)
train_idx.shape, test_idx.shape

((2036516,), (359386,))

## XGBoost

### 训练模型

In [25]:
X_train = data.iloc[train_idx]
X_test  = data.iloc[test_idx]

In [26]:
y_train = watch_label_res.iloc[train_idx]
y_test  = watch_label_res.iloc[test_idx]

In [27]:
X_train.shape, y_train.shape, X_test.shape, y_test.shape

((2036516, 127), (2036516,), (359386, 127), (359386,))

In [29]:
t0 = time()
xg_train = xgb.DMatrix(X_train.values, label=y_train.values, enable_categorical=True)
xg_test = xgb.DMatrix(X_test.values, label=y_test.values, enable_categorical=True)
print(f"Data preparing finished ...\t\t({time()-t0:.3f}s)")

Data preparing finished ...		(1.887s)


In [30]:
# setup parameters for xgboost
param = {
    'objective': 'multi:softmax',
    'eta': 0.1,
    'nthread': 8,
    'num_class': 10,
    'gpu_id': 0,
    'tree_method': 'gpu_hist',
    'eval_metric': ['mlogloss', 'auc', 'merror'],
    'max_depth': 9,
    'min_child_weight': 9,
    'gamma': 0.2,
    'subsample': 0.9,
    'colsample_bytree': 0.6,
    'reg_alpha': 0
}

watchlist = [(xg_train, 'train'), (xg_test, 'test')]

In [31]:
num_round = 300
t0 = time()
eval_result = {}
def decay_eta(nth):
    """Return the learning rate to use for boosting round ``nth`` (0-based).

    The rate steps down through a fixed schedule every 60 rounds and then
    stays at the final value. The previous modulo-based lookup wrapped
    around, so from round 240 onwards the rate jumped back up to 0.1 instead
    of continuing to decay.
    """
    etas = [.1, .05, .03, .01]
    # Clamp to the last entry instead of cycling through the list again.
    return etas[min(nth // 60, len(etas) - 1)]

wl_bst_sm = xgb.train(param, xg_train, num_round, watchlist, evals_result=eval_result, )
#                       callbacks=[xgb.callback.LearningRateScheduler(decay_eta)])
print(f"{num_round}-rounds Training finished ...\t\t({time()-t0:.3f}s)")

[0]	train-mlogloss:2.27344	train-auc:0.63151	train-merror:0.74003	test-mlogloss:2.27419	test-auc:0.62399	test-merror:0.74374
[1]	train-mlogloss:2.24973	train-auc:0.63802	train-merror:0.73721	test-mlogloss:2.25121	test-auc:0.62846	test-merror:0.74120
[2]	train-mlogloss:2.22957	train-auc:0.64220	train-merror:0.73564	test-mlogloss:2.23166	test-auc:0.63181	test-merror:0.73919
[3]	train-mlogloss:2.21234	train-auc:0.64447	train-merror:0.73462	test-mlogloss:2.21506	test-auc:0.63333	test-merror:0.73892
[4]	train-mlogloss:2.19784	train-auc:0.64638	train-merror:0.73406	test-mlogloss:2.20113	test-auc:0.63465	test-merror:0.73809
[5]	train-mlogloss:2.18420	train-auc:0.64777	train-merror:0.73361	test-mlogloss:2.18809	test-auc:0.63550	test-merror:0.73797
[6]	train-mlogloss:2.17214	train-auc:0.64898	train-merror:0.73315	test-mlogloss:2.17665	test-auc:0.63623	test-merror:0.73750
[7]	train-mlogloss:2.16168	train-auc:0.64985	train-merror:0.73295	test-mlogloss:2.16676	test-auc:0.63660	test-merror:0.73753


In [46]:
eval_result

{'train': OrderedDict([('mlogloss',
               [2.273391,
                2.249271,
                2.228938,
                2.211438,
                2.196323,
                2.183143,
                2.171514,
                2.16128,
                2.15201,
                2.14378,
                2.136391,
                2.129838,
                2.12384,
                2.118412,
                2.113522,
                2.109029,
                2.105014,
                2.101309,
                2.0979,
                2.094819,
                2.092041,
                2.089248,
                2.086848,
                2.084486,
                2.082175,
                2.080239,
                2.078318,
                2.076628,
                2.074873,
                2.073293,
                2.071798,
                2.070462,
                2.069056,
                2.067713,
                2.066565,
                2.065228,
                2.064006,
        

In [32]:
# get prediction
pred = wl_bst_sm.predict(xg_test)
# pred = pred.astype(np.uint8)
error_rate = np.sum(pred != y_test) / y_test.shape[0]
print('Test error using softmax = {}'.format(error_rate))

Test error using softmax = 0.7267812324353202


In [33]:
eval_s = wl_bst_sm.eval(xg_test)
# eval_dict = eval_str_2_dict(eval_s)
eval_s

'[0]\teval-mlogloss:2.064507\teval-auc:0.649186\teval-merror:0.726781'

In [34]:
weights = np.arange(0, 1, 0.1)
aucs = auc(y_test.astype(np.uint8), pred.astype(np.uint8), np.arange(param['num_class']))
# aucs[aucs == 0.5] = 0
w_auc = (aucs * weights).sum()
aucs, w_auc

(array([0.6676927 , 0.59549384, 0.51434806, 0.50466707, 0.5041683 ,
        0.50455988, 0.50658242, 0.50320763, 0.51051575, 0.63840191]),
 2.4069354911900125)

In [35]:
report = metrics.classification_report(list(y_test), list(pred))

In [36]:
print(report)

              precision    recall  f1-score   support

           0       0.36      0.51      0.42     58076
           1       0.26      0.43      0.32     58168
           2       0.18      0.10      0.13     46949
           3       0.17      0.02      0.03     33008
           4       0.19      0.01      0.02     26050
           5       0.20      0.01      0.02     21748
           6       0.23      0.02      0.03     18650
           7       0.19      0.01      0.02     17723
           8       0.24      0.03      0.05     20595
           9       0.26      0.63      0.37     58419

    accuracy                           0.27    359386
   macro avg       0.23      0.18      0.14    359386
weighted avg       0.24      0.27      0.21    359386



### 调参

In [15]:
from xgboost import XGBClassifier
from sklearn import svm, datasets
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import ParameterGrid
from collections.abc import Iterable
from sklearn.model_selection import StratifiedKFold

In [16]:
base_param = {  # 基本参数，不需要调参
    'objective': 'multi:softmax',
    'eta': 0.1,
    'nthread': 8,
    'num_class': 10,
    'gpu_id': 0,
    'tree_method': 'gpu_hist',
    'eval_metric': ['mlogloss', 'auc', 'merror']
} 
# 需要调参的参数
ps1 = {  
    'max_depth': list(range(5, 14, 2)),
    'min_child_weight': list(range(1, 10, 2)),
}

ps2 = {
    'gamma': [0.1, 0.2, 0.3, 0.4, 0.5, 0.6]
}

ps3 = {
    'subsample':  [i/10.0 for i in range(6,11,1)], 
    'colsample_bytree':  [i/10.0 for i in range(6,11,1)] 
}

ps4 = {'reg_alpha': [0, 0.1, 0.2, 0.5, 1, 1.5, 2, 4]}

In [529]:
t0 = time()
xg_train = xgb.DMatrix(X_train.values, label=y_train.values, enable_categorical=True)
xg_test = xgb.DMatrix(X_test.values, label=y_test.values, enable_categorical=True)
print(f"Data preparing finished ...\t\t({time()-t0:.3f}s)")

Data preparing finished ...		(5.092s)


In [70]:
def myproduct(*iterables):
    """Cartesian product of the given iterables, returned as a list of lists.

    Comparable to sklearn's ParameterGrid / itertools.product, except for the
    ordering: the FIRST iterable varies fastest and the last one slowest.

    Args:
        *iterables: any number of iterables (lists, tuples, ranges,
            generators, ...). They are never mutated.

    Returns:
        None when called without arguments (kept for backward compatibility),
        otherwise a list with one fresh list per combination. The result is
        empty if any input iterable is empty.
    """
    from itertools import product  # local import keeps this cell self-contained

    if not iterables:
        return None

    # Materialise each input so one-shot iterables work as well as lists.
    pools = [list(it) for it in iterables]

    # itertools.product varies its LAST argument fastest; feeding the pools in
    # reverse and flipping each tuple back makes the FIRST pool vary fastest.
    return [list(combo[::-1]) for combo in product(*pools[::-1])]

def compose_param_grid(grid, base):
    """Expand a parameter grid into a list of complete parameter dicts.

    Args:
        grid: dict mapping a parameter name to the list of values to try.
        base: dict of fixed parameters shared by every combination.

    Returns:
        A list with one dict per combination of grid values; each dict is a
        shallow copy of ``base`` updated with that combination.
    """
    names = list(grid.keys())
    candidate_values = [grid[name] for name in names]

    all_params = []
    for combo in myproduct(*candidate_values):
        merged = base.copy()
        merged.update(dict(zip(names, combo)))
        all_params.append(merged)
    return all_params

In [26]:
# Staged (coordinate-wise) grid search: each grid is searched with the best
# values from the earlier stages frozen into `base`.
base = base_param.copy()
base.update({'max_depth': 9, 'min_child_weight': 9})  # results of the ps1 stage
base.update({'gamma': .2})                            # result of the ps2 stage
grids = [ps3, ps4]

rets = []
for grid in grids:
    params = compose_param_grid(grid, base)
    print(f"开始搜索：{dict_2_str(grid)}\n待搜索的参数组合数量：{len(params)}")
    ret = gridsearch_cv_xgb(data.values, watch_label_res.values, params, n_round=200, verbose_eval=False, n_class=10)

    # One row per criterion, negated where smaller is better, so that argmax
    # picks the best candidate for every row.
    arr = np.array([[-e['eval-merror'] for e in ret],
                    [-e['eval-mlogloss'] for e in ret],
                    [e['eval-auc'] for e in ret],
                    [e['w_auc'] for e in ret]], dtype=np.float32)
    # De-duplicate while PRESERVING row order. The previous list(set(...)) had
    # no defined order, so opt_idxs[0] was not guaranteed to be the candidate
    # with the lowest merror.
    opt_idxs = list(dict.fromkeys(arr.argmax(axis=1).tolist()))
    for i in opt_idxs:
        print(dict_2_str(ret[i]['param']))
    opt_idx = opt_idxs[0]  # best by eval-merror, the first criterion
    opt_param = ret[opt_idx]['param']
    base.update(opt_param)
    rets.append(ret)

    ks = '-'.join(grid.keys())
    # Take the timestamp once so the .pkl and .md logs of a stage always share
    # the same file stem.
    stamp = int(time())
    with open(f"./logs/{ks}-watch_label-{stamp}.pkl", "wb") as f:
        pickle.dump(ret, f)
    with open(f"./logs/{ks}-watch_label-{stamp}.md", "w") as f:
        lines = []
        for e in ret:
            line = metric_2_str(e)
            lines.append(line)
            lines.append('\n')
        f.write("\n".join(lines))

print(f"找到的最棒的参数是：\n{dict_2_str(base)}")

开始搜索：colsample_bytree=[0.6, 0.7, 0.8, 0.9, 1.0], subsample=[0.6, 0.7, 0.8, 0.9, 1.0]
待搜索的参数组合数量：25
1 / 25: 200-rounds Training finished param={'objective': 'multi:softmax', 'eta': 0.1, 'nthread': 8, 'num_class': 10, 'gpu_id': 0, 'tree_method': 'gpu_hist', 'eval_metric': ['mlogloss', 'auc', 'merror'], 'max_depth': 9, 'min_child_weight': 9, 'gamma': 0.2, 'subsample': 0.6, 'colsample_bytree': 0.6} ...		(820.946s)
2 / 25: 200-rounds Training finished param={'objective': 'multi:softmax', 'eta': 0.1, 'nthread': 8, 'num_class': 10, 'gpu_id': 0, 'tree_method': 'gpu_hist', 'eval_metric': ['mlogloss', 'auc', 'merror'], 'max_depth': 9, 'min_child_weight': 9, 'gamma': 0.2, 'subsample': 0.7, 'colsample_bytree': 0.6} ...		(816.416s)
3 / 25: 200-rounds Training finished param={'objective': 'multi:softmax', 'eta': 0.1, 'nthread': 8, 'num_class': 10, 'gpu_id': 0, 'tree_method': 'gpu_hist', 'eval_metric': ['mlogloss', 'auc', 'merror'], 'max_depth': 9, 'min_child_weight': 9, 'gamma': 0.2, 'subsample': 0.

In [43]:
base

{'objective': 'multi:softmax',
 'eta': 0.1,
 'nthread': 8,
 'num_class': 10,
 'gpu_id': 0,
 'tree_method': 'gpu_hist',
 'eval_metric': ['mlogloss', 'auc', 'merror'],
 'max_depth': 9,
 'min_child_weight': 9,
 'gamma': 0.2,
 'subsample': 0.9,
 'colsample_bytree': 0.6,
 'reg_alpha': 0}

In [None]:
gridsearch_results = gridsearch_xgb(all_params, xg_train, xg_test)

In [None]:
gs_cv_results_sh = gridsearch_cv_xgb(data.values, watch_label_res.values, all_params, n_round=200, verbose_eval=False, n_class=10)

In [584]:
arr = np.array([[-e['test_error'] for e in gridsearch_results], [e['w_auc'] for e in gridsearch_results]], dtype=np.float32)
opt_idxs = arr.argmax(axis=1)
if opt_idxs[0] != opt_idxs[1]:
     warnings.warn(f"最小误差与最大AUC对应的模型不一致 : {opt_idxs}。选择误差最小的模型 : {opt_idxs[0]}")

opt_idx = opt_idxs[0]

ValueError: 最小误差与最大AUC对应的模型不一致 : [93 19]

In [583]:
gridsearch_results[93]

({'test_error': 0.7289070620796754,
  'aucs': array([0.57555597, 0.58567653, 0.50347593, 0.50017473, 0.50017414,
         0.5000983 , 0.50292636, 0.50040085, 0.50878295, 0.60767447]),
  'w_auc': 2.365403849959146,
  'report': '              precision    recall  f1-score   support\n\n           0       0.33      0.19      0.24     43749\n           1       0.29      0.68      0.41    111616\n           2       0.23      0.01      0.03     62829\n           3       0.13      0.00      0.00     43872\n           4       0.15      0.00      0.00     34228\n           5       0.18      0.00      0.00     28926\n           6       0.33      0.01      0.01     25061\n           7       0.16      0.00      0.00     23400\n           8       0.31      0.02      0.04     27750\n           9       0.24      0.56      0.34     77663\n\n    accuracy                           0.27    479094\n   macro avg       0.23      0.15      0.11    479094\nweighted avg       0.24      0.27      0.18    479094\

In [588]:
opt_idx = opt_idxs[0]
opt_param = all_params[opt_idx]
print(gridsearch_results[opt_idx]['report'])

              precision    recall  f1-score   support

           0       0.33      0.19      0.24     43749
           1       0.29      0.68      0.41    111616
           2       0.23      0.01      0.03     62829
           3       0.13      0.00      0.00     43872
           4       0.15      0.00      0.00     34228
           5       0.18      0.00      0.00     28926
           6       0.33      0.01      0.01     25061
           7       0.16      0.00      0.00     23400
           8       0.31      0.02      0.04     27750
           9       0.24      0.56      0.34     77663

    accuracy                           0.27    479094
   macro avg       0.23      0.15      0.11    479094
weighted avg       0.24      0.27      0.18    479094



In [557]:
y_test.shape

(479094,)

In [None]:
gridsearch_cv_xgb(data.values, watch_label_res, all_params)

In [496]:
# Parameters for the xgb.cv experiment (CPU: the GPU settings are disabled).
param = {}
# use softmax multi-class classification
param['objective'] = 'multi:softmax'
# learning rate (step-size shrinkage applied after each boosting round)
param['eta'] = 0.1
param['max_depth'] = 8
param['nthread'] = 8
param['num_class'] = 10
# param['gpu_id'] = 0
# param['tree_method'] = 'gpu_hist'

In [None]:
%%time
cv_res= xgb.cv(param, cv_data, num_boost_round=200,early_stopping_rounds=30,nfold=3, metrics='auc',show_stdv=True)
print(cv_res)

## Stacking

In [28]:
from sklearn import model_selection
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.naive_bayes import GaussianNB 
from mlxtend.classifier import StackingClassifier
from sklearn.ensemble import (RandomForestClassifier, AdaBoostClassifier, 
                              GradientBoostingClassifier, ExtraTreesClassifier)
from xgboost import XGBClassifier, XGBRFClassifier
from sklearn.svm import SVC

In [39]:
# 第一层模型参数
first_layer_params = {
#     LogisticRegression : {
#         'C' : 10,
#         'random_state': 0
#     },
    XGBRFClassifier : {
        'n_jobs': 4,
        'n_estimators': 200,
         #'max_features': 0.2,
        'colsample_bytree': 0.6,
        'max_depth': 8,
        'learning_rate': .1,
        'verbosity': 0,
        'gpu_id': 1,
    },
    ExtraTreesClassifier : {
        'n_jobs': -1,
        'n_estimators':200,
        #'max_features': 0.5,
        'max_depth': 8,
        'min_samples_leaf': 2,
        'verbose': 0
    },
    AdaBoostClassifier : {
        'n_estimators': 200,
        'learning_rate' : 0.75
    },
#     GradientBoostingClassifier : { # 太慢了
#         'n_estimators': 200,
#          #'max_features': 0.2,
#         'max_depth': 5,
#         'min_samples_leaf': 2,
#         'verbose': 0
#     },
#     SVC : { # 太慢了
#         'kernel' : 'rbf',
#         'C' : 0.025,
#         'probability': True
#     } 
}


second_layer_params = {
    XGBClassifier : {
        'objective': 'multi:softmax',
        'eta': 0.1,
        'nthread': 8,
        'num_class': 10,
        'gpu_id': 0,
        'tree_method': 'gpu_hist',
        'eval_metric': ['mlogloss', 'auc', 'merror'],
        'max_depth': 9,
        'min_child_weight': 9,
        'gamma': 0.2,
        'subsample': 0.9,
        'colsample_bytree': 0.6,
        'reg_alpha': 0,
        'n_estimators': 200,
    }
}

In [40]:
clfs = []
for c, p in first_layer_params.items():
    clfs.append(c(**p))

meta = XGBClassifier(**second_layer_params[XGBClassifier])
sclf = StackingClassifier(classifiers=clfs, 
                          meta_classifier=meta)

In [None]:
%%time
sclf.fit(X_train, y_train)



In [35]:
tmp = sclf.predict(X_test)

In [38]:
Counter(tmp), Counter(y_test)

(Counter({9: 156864,
          0: 97583,
          3: 1479,
          1: 86151,
          2: 14444,
          6: 829,
          7: 368,
          5: 400,
          4: 530,
          8: 738}),
 Counter({7: 17483,
          6: 18603,
          9: 58127,
          3: 32872,
          2: 47359,
          4: 26076,
          0: 58260,
          1: 58161,
          8: 20959,
          5: 21486}))

In [None]:
%%time
clf = XGBRFClassifier(**first_layer_params[XGBRFClassifier])
clf.fit(X_train, y_train)



In [None]:
clf.predict(X_test)

In [157]:
clf.predict_proba(X_test)

array([[0.10100084, 0.10091538, 0.10042646, 0.09997695, 0.09962477,
        0.09947018, 0.09939709, 0.09926751, 0.09943457, 0.10048625]])

# is_share 预测

## XGBoost

### 处理数据不均衡问题

In [37]:
# Class histogram of is_share as (label, count) pairs, ordered by label.
items = sorted(Counter(is_share).items(), key=lambda x: x[0])
print(items)

# Same table with the counts turned into fractions of the whole data set.
# `np.float` was an alias of the builtin `float` (deprecated in NumPy 1.20,
# removed in 1.24), so the builtin is used directly.
dist = np.array(items, dtype=float)
dist[:, 1] = dist[:, 1] / is_share.shape[0]
print(dist)

[(0, 7338705), (1, 14319)]
[[0.         0.99805264]
 [1.         0.00194736]]


In [38]:
under_ss = np.array(items)
under_ss_thresh = under_ss[1, 1]
under_ss[:, 1] = np.clip(under_ss[:, 1], a_min=None, a_max=under_ss_thresh)

over_ss = under_ss.copy()
over_ss_thresh = under_ss[1, 1]
over_ss[:, 1] = np.clip(over_ss[:, 1], a_min=over_ss_thresh, a_max=None)

under_ss = dict(under_ss)
over_ss = dict(over_ss)

In [39]:
under_ss, over_ss

({0: 14319, 1: 14319}, {0: 14319, 1: 14319})

In [40]:
# Index labels of the majority-class rows (is_share == 0). A plain boolean mask
# replaces the replace(False, NaN).dropna() round-trip, which built a
# full-length temporary Series just to discard the False entries.
idxs = is_share[is_share == 0].index
idxs.shape

(7338705,)

In [74]:
# Keep a random subset of under_ss_thresh majority-class rows and mark the rest
# for deletion. A seeded generator makes the selection reproducible.
rng_sh = np.random.default_rng(0)
left_idxs = rng_sh.choice(idxs, under_ss_thresh, replace=False)  # sampled without replacement
del_idxs = idxs.difference(left_idxs)
del_idxs.shape, left_idxs.shape

((7324386,), (14319,))

In [75]:
# Temporarily attach watch_label to the feature frame so that it becomes one
# more feature column of the is_share training matrix.
dataset['watch_label'] = watch_label
# Drop the rows chosen for deletion from the features and the labels.
# NOTE: np.delete treats del_idxs as positional row numbers; this matches the
# index labels only while the frame keeps its default 0..n-1 index.
resampled_data = np.delete(dataset.values, del_idxs, axis=0)
resampled_sh = np.delete(is_share.values, del_idxs, axis=0)
resampled_data.shape, resampled_sh.shape

((28638, 128), (28638,))

In [76]:
# Put the resampled data back into a DataFrame
data_sh = pd.DataFrame(resampled_data, columns=dataset.columns)
del dataset['watch_label']  # remove the watch_label column from the shared feature frame again
is_share_res = pd.Series(resampled_sh)
data_sh.shape, is_share_res.shape

((28638, 128), (28638,))

In [77]:
test_rate = .2
train_idx, test_idx = train_test_split(data_sh.index, test_size=test_rate, random_state=1)
train_idx.shape, test_idx.shape

((22910,), (5728,))

### 训练模型

In [78]:
X_train_sh = data_sh.iloc[train_idx]
X_test_sh  = data_sh.iloc[test_idx]

In [79]:
y_train_sh = is_share_res.iloc[train_idx]
y_test_sh  = is_share_res.iloc[test_idx]

In [80]:
t0 = time()
xg_train_sh = xgb.DMatrix(X_train_sh.values, label=y_train_sh.values, enable_categorical=True)
xg_test_sh = xgb.DMatrix(X_test_sh.values, label=y_test_sh.values, enable_categorical=True)
print(f"Data preparing finished ...\t\t({time()-t0:.3f}s)")

Data preparing finished ...		(0.020s)


In [81]:
# setup parameters for xgboost (binary is_share model)
# NOTE(review): 'binary:hinge' makes the booster emit hard 0/1 predictions
# rather than probabilities, so the 'logloss' and 'auc' metrics below are
# evaluated on labels, not scores — confirm this is intended.
param_sh = {
    'objective': 'binary:hinge',
    'eta': 0.1,
    'nthread': 8,
    'gpu_id': 0,
    'tree_method': 'gpu_hist',
    'eval_metric': ['logloss', 'auc', 'error'],
    'max_depth': 5,
    'min_child_weight': 1,
    'gamma': 0.1,
    'subsample': 0.6,
    'colsample_bytree': 0.6,
    'reg_alpha': 0
}

# Earlier hand-set configuration, kept for reference only:
# param_sh['objective'] = 'binary:hinge'
# (eta is the learning rate)
# param_sh['eta'] = 0.1
# param_sh['max_depth'] = 6
# param_sh['nthread'] = 4
# param_sh['gpu_id'] = 0
# param_sh['tree_method'] = 'gpu_hist'
# param_sh['min_child_weight'] = 7


watchlist = [(xg_train_sh, 'train'), (xg_test_sh, 'test')]

In [82]:
num_round = 600
t0 = time()
sh_bst_sm = xgb.train(param_sh, xg_train_sh, num_round, watchlist)
print(f"{num_round}-rounds Training finished ...\t\t({time()-t0:.3f}s)")

[0]	train-logloss:18.39978	train-auc:0.50000	train-error:0.49943	test-logloss:18.50429	test-auc:0.50000	test-error:0.50227
[1]	train-logloss:18.39978	train-auc:0.50000	train-error:0.49943	test-logloss:18.50429	test-auc:0.50000	test-error:0.50227
[2]	train-logloss:18.39978	train-auc:0.50000	train-error:0.49943	test-logloss:18.50429	test-auc:0.50000	test-error:0.50227
[3]	train-logloss:18.39978	train-auc:0.50000	train-error:0.49943	test-logloss:18.50429	test-auc:0.50000	test-error:0.50227
[4]	train-logloss:18.39978	train-auc:0.50000	train-error:0.49943	test-logloss:18.50429	test-auc:0.50000	test-error:0.50227
[5]	train-logloss:18.39978	train-auc:0.50000	train-error:0.49943	test-logloss:18.50429	test-auc:0.50000	test-error:0.50227
[6]	train-logloss:18.39978	train-auc:0.50000	train-error:0.49943	test-logloss:18.50429	test-auc:0.50000	test-error:0.50227
[7]	train-logloss:17.54910	train-auc:0.52312	train-error:0.47634	test-logloss:17.73891	test-auc:0.52066	test-error:0.48149
[8]	train-loglos

In [83]:
# get prediction
pred_sh = sh_bst_sm.predict(xg_test_sh)
error_rate = np.sum(pred_sh != y_test_sh) / y_test_sh.shape[0]
print('Test error using softmax = {}'.format(error_rate))

Test error using softmax = 0.33414804469273746


In [84]:
Counter(pred_sh), Counter(y_test_sh)

(Counter({0.0: 2647, 1.0: 3081}), Counter({1: 2851, 0: 2877}))

In [85]:
# Per-class precision / recall / F1 for the held-out is_share split; the text report is
# kept in report_sh because the run log written at the end of the notebook reuses it.
report_sh = metrics.classification_report(list(y_test_sh), list(pred_sh))
print(report_sh)

              precision    recall  f1-score   support

           0       0.68      0.63      0.65      2877
           1       0.65      0.70      0.68      2851

    accuracy                           0.67      5728
   macro avg       0.67      0.67      0.67      5728
weighted avg       0.67      0.67      0.67      5728



In [86]:
# AUC per class for labels [0, 1], computed from hard (uint8) labels rather than scores.
# NOTE(review): `auc` is presumably the one-vs-rest helper pulled in by
# `from tools import *` — confirm its signature there.
aucs_sh = auc(y_test_sh.astype(np.uint8), pred_sh.astype(np.uint8), [0, 1])
aucs_sh

array([0.66602734, 0.66602734])

### 调参

In [39]:
# Fixed settings shared by every candidate of the is_share search (never tuned).
base_param_sh = dict(
    objective='binary:hinge',
    eta=0.1,
    nthread=8,
    # num_class=10,  (inactive)
    gpu_id=0,
    tree_method='gpu_hist',
    eval_metric=['logloss', 'auc', 'error'],
)

# Parameter groups to tune; each dict maps a parameter name to its candidate values.
ps1 = dict(
    max_depth=list(range(5, 10)),
    min_child_weight=list(range(1, 10, 2)),
)

ps2 = dict(gamma=[tenth / 10 for tenth in range(1, 7)])

ps3 = {
    name: [tenth / 10.0 for tenth in range(6, 11)]
    for name in ('subsample', 'colsample_bytree')
}

ps4 = dict(reg_alpha=[0, 0.1, 0.2, 0.5, 1, 1.5, 2, 4])


# Inactive: full Cartesian-product search that used to build `all_params_sh`.
# com_ps_sh = list(ParameterGrid(ps_sh))


# all_params_sh = [base_param_sh.copy() for _ in range(len(com_ps_sh))] 
# for i in range(len(com_ps_sh)):
#     all_params_sh[i].update(com_ps_sh[i])

# # print(com_ps_sh)
# print(all_params_sh.__len__())

In [550]:
t0 = time()
# Rebuild the train / hold-out DMatrix pair from the current is_share splits.
xg_train_sh, xg_test_sh = (
    xgb.DMatrix(split_X.values, label=split_y.values, enable_categorical=True)
    for split_X, split_y in ((X_train_sh, y_train_sh), (X_test_sh, y_test_sh))
)
print(f"Data preparing finished ...\t\t({time()-t0:.3f}s)")

Data preparing finished ...		(24.924s)


In [41]:
# Coordinate-wise grid search for is_share: tune one parameter group at a time and carry
# the winner of each group into the base configuration used for the next group.
base_sh = base_param_sh.copy()
grids_sh = [ps1, ps2, ps3, ps4]

# The per-group result files are written below ./logs, so make sure it exists.
os.makedirs("./logs", exist_ok=True)

rets_sh = []
for grid in grids_sh:
    params = compose_param_grid(grid, base_sh)
    print(f"开始搜索：{dict_2_str(grid)}\n待搜索的参数组合数量：{len(params)}")
    ret = gridsearch_cv_xgb(data_sh.values, is_share_res.values, params, n_round=150, verbose_eval=False, n_class=2)
    # One row per criterion; "lower is better" metrics are negated so that argmax picks
    # the best candidate in every row. Row order doubles as the priority order.
    arr = np.array([[-e['eval-error'] for e in ret], 
                    [-e['eval-logloss'] for e in ret],
                    [e['eval-auc'] for e in ret], 
                    [e['w_auc'] for e in ret]], dtype=np.float32)
    # De-duplicate while preserving row order. The previous list(set(...)) ordered the
    # winners by index value, so "first" did not correspond to any particular metric.
    opt_idxs = list(dict.fromkeys(arr.argmax(axis=1).tolist()))
    for i in opt_idxs:
        print(dict_2_str(ret[i]['param']))
    opt_idx = opt_idxs[0]  # candidate with the lowest eval-error
    opt_param = ret[opt_idx]['param']
    base_sh.update(opt_param)
    rets_sh.append(ret)

    ks = '-'.join(grid.keys())
    # Take the timestamp once so the .pkl and .md files of a group always share a stem.
    log_stem = f"./logs/{ks}-is_share-{int(time())}"
    with open(f"{log_stem}.pkl", "wb") as f:
        pickle.dump(ret, f)
    with open(f"{log_stem}.md", "w", encoding="utf-8") as f:
        lines = []
        for e in ret:
            lines.append(metric_2_str(e))
            lines.append('\n')
        f.write("\n".join(lines))

print(f"找到的最棒的参数是：\n{dict_2_str(base_sh)}")

开始搜索：max_depth=[5, 6, 7, 8, 9], min_child_weight=[1, 3, 5, 7, 9]
待搜索的参数组合数量：25
1 / 25: 150-rounds Training finished param={'objective': 'binary:hinge', 'eta': 0.1, 'nthread': 8, 'gpu_id': 0, 'tree_method': 'gpu_hist', 'eval_metric': ['logloss', 'auc', 'error'], 'max_depth': 5, 'min_child_weight': 1} ...		(3.035s)
2 / 25: 150-rounds Training finished param={'objective': 'binary:hinge', 'eta': 0.1, 'nthread': 8, 'gpu_id': 0, 'tree_method': 'gpu_hist', 'eval_metric': ['logloss', 'auc', 'error'], 'max_depth': 6, 'min_child_weight': 1} ...		(4.271s)
3 / 25: 150-rounds Training finished param={'objective': 'binary:hinge', 'eta': 0.1, 'nthread': 8, 'gpu_id': 0, 'tree_method': 'gpu_hist', 'eval_metric': ['logloss', 'auc', 'error'], 'max_depth': 7, 'min_child_weight': 1} ...		(6.333s)
4 / 25: 150-rounds Training finished param={'objective': 'binary:hinge', 'eta': 0.1, 'nthread': 8, 'gpu_id': 0, 'tree_method': 'gpu_hist', 'eval_metric': ['logloss', 'auc', 'error'], 'max_depth': 8, 'min_child_wei

In [42]:
base_sh

{'objective': 'binary:hinge',
 'eta': 0.1,
 'nthread': 8,
 'gpu_id': 0,
 'tree_method': 'gpu_hist',
 'eval_metric': ['logloss', 'auc', 'error'],
 'max_depth': 5,
 'min_child_weight': 1,
 'gamma': 0.1,
 'subsample': 0.6,
 'colsample_bytree': 0.6,
 'reg_alpha': 0}

In [None]:
# Exhaustive search over every combination in all_params_sh on the fixed train/hold-out split.
# NOTE(review): in the visible part of this notebook all_params_sh is only built by
# commented-out code in the tuning-setup cell, so this cell relies on leftover kernel
# state — confirm before re-running.
gridsearch_results_sh = gridsearch_xgb(all_params_sh, xg_train_sh, xg_test_sh, num_round=150, n_class=2, verbose_eval=False)

In [None]:
# Cross-validated variant of the exhaustive search, run on the full is_share data.
# NOTE(review): depends on all_params_sh, which the visible notebook only defines in
# commented-out code — confirm it exists before running.
gs_cv_results_sh = gridsearch_cv_xgb(data_sh.values, is_share_res.values, all_params_sh, n_round=150, verbose_eval=False, n_class=2)

In [753]:
# Dump every entry of `performance` as a Markdown section: a heading with the parameter
# values (ret[0]) followed by one bullet per recorded metric (ret[1]).
# NOTE(review): `performance` is not defined in the visible part of this notebook —
# confirm where it comes from.
with open('greadsearch-cv-is_share.md', 'w', encoding='utf-8') as f:
    for ret in performance:
        f.write(f"# {', '.join([f'{k}={v}' for k, v in ret[0].items()])}\n")
        for k, v in ret[1].items():
            # Multi-line values (e.g. text reports) start on their own line. The check
            # must look at the value being written; it used to inspect an unrelated `df`.
            is_break = '\n' if '\n' in str(v) else ''
            f.write(f"- {k} :{is_break} {v}\n\n")
        f.write(f"{'-'*50}\n\n\n")

In [765]:
# Position (within `performance`) of the entry with the lowest mean test error.
mean_test_error = np.array([e[1]['mean_test_error'] for e in performance])
mean_test_error.argmin()

48

In [617]:
# Row 0 holds the negated hold-out error, row 1 the AUC of class 1; the per-row argmax is
# therefore the best model under each criterion.
arr = np.array(
    [
        [-res['test_error'] for res in gridsearch_results_sh],
        [res['aucs'][1] for res in gridsearch_results_sh],
    ],
    dtype=np.float32,
)
opt_idxs_sh = arr.argmax(axis=1)
if not opt_idxs_sh[0] == opt_idxs_sh[1]:
    warnings.warn(f"最小误差与最大AUC对应的模型不一致 : {opt_idxs_sh}。选择误差最小的模型 : {opt_idxs_sh[0]}")

# When the two criteria disagree, the lowest-error model is kept.
opt_idx_sh = opt_idxs_sh[0]
all_params_sh[opt_idx_sh]

{'objective': 'binary:hinge',
 'eta': 0.1,
 'nthread': 8,
 'gpu_id': 0,
 'tree_method': 'gpu_hist',
 'gamma': 0.3,
 'max_depth': 11,
 'min_child_weight': 9}

In [603]:
# Index of the lowest-error model (first entry of opt_idxs_sh), displayed for reference.
opt_idx_sh = opt_idxs_sh[0]
opt_idx_sh

146

In [619]:
print(gridsearch_results_sh[opt_idx_sh]['report'])

              precision    recall  f1-score   support

         0.0       0.54      0.44      0.49      3053
         1.0       0.50      0.60      0.54      2835

    accuracy                           0.52      5888
   macro avg       0.52      0.52      0.51      5888
weighted avg       0.52      0.52      0.51      5888



In [620]:
pd.DataFrame(gridsearch_results_sh)

Unnamed: 0,test_error,aucs,w_auc,report,model
0,0.503057,"[0.50379590202715, 0.50379590202715]",,precision recall f1-score ...,<xgboost.core.Booster object at 0x7f35ba907130>
1,0.496943,"[0.508734635779073, 0.508734635779073]",,precision recall f1-score ...,<xgboost.core.Booster object at 0x7f32e4009580>
2,0.501189,"[0.5048543919272165, 0.5048543919272165]",,precision recall f1-score ...,<xgboost.core.Booster object at 0x7f3583f4f4f0>
3,0.502717,"[0.502360357955947, 0.502360357955947]",,precision recall f1-score ...,<xgboost.core.Booster object at 0x7f3574279550>
4,0.499321,"[0.5061017844072763, 0.5061017844072763]",,precision recall f1-score ...,<xgboost.core.Booster object at 0x7f32e4082730>
...,...,...,...,...,...
115,0.498132,"[0.5057747576472328, 0.5057747576472328]",,precision recall f1-score ...,<xgboost.core.Booster object at 0x7f32468a4640>
116,0.497962,"[0.5046665869463117, 0.5046665869463118]",,precision recall f1-score ...,<xgboost.core.Booster object at 0x7f32468a4850>
117,0.499490,"[0.5026636996830249, 0.5026636996830249]",,precision recall f1-score ...,<xgboost.core.Booster object at 0x7f32468a46a0>
118,0.491678,"[0.5107513874519006, 0.5107513874519005]",,precision recall f1-score ...,<xgboost.core.Booster object at 0x7f32468a4670>


## Stacking

In [113]:
from sklearn import model_selection
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.naive_bayes import GaussianNB 
from mlxtend.classifier import StackingClassifier
from sklearn.ensemble import (RandomForestClassifier, AdaBoostClassifier, 
                              GradientBoostingClassifier, ExtraTreesClassifier)
from xgboost import XGBClassifier, XGBRFClassifier
from sklearn.svm import SVC

In [None]:
# 第一层模型参数
first_layer_params = {
    LogisticRegression : {
        'C' : 10,
        'random_state': 0
    },
    RandomForestClassifier : {
        'n_jobs': -1,
        'n_estimators': 200,
         'warm_start': True, 
         #'max_features': 0.2,
        'max_depth': 8,
        'min_samples_leaf': 2,
        'max_features' : 'sqrt',
        'verbose': 0
    },
    ExtraTreesClassifier : {
        'n_jobs': -1,
        'n_estimators':200,
        #'max_features': 0.5,
        'max_depth': 8,
        'min_samples_leaf': 2,
        'verbose': 0
    },
    AdaBoostClassifier : {
        'n_estimators': 200,
        'learning_rate' : 0.75
    },
#     GradientBoostingClassifier : { # 太慢了
#         'n_estimators': 200,
#          #'max_features': 0.2,
#         'max_depth': 5,
#         'min_samples_leaf': 2,
#         'verbose': 0
#     },
#     SVC : { # 太慢了
#         'kernel' : 'rbf',
#         'C' : 0.025,
#         'probability': True
#     } 
}


second_layer_params = {
    XGBClassifier : {
        'objective': 'multi:softmax',
        'eta': 0.1,
        'nthread': 8,
        'num_class': 10,
        'gpu_id': 0,
        'tree_method': 'gpu_hist',
        'eval_metric': ['mlogloss', 'auc', 'merror'],
        'max_depth': 9,
        'min_child_weight': 9,
        'gamma': 0.2,
        'subsample': 0.9,
        'colsample_bytree': 0.6,
        'reg_alpha': 0,
        'n_estimators': 200,
    }
}

In [124]:
# Instantiate every first-layer estimator from its (class -> kwargs) entry, in dict order.
clfs = [estimator_cls(**kwargs) for estimator_cls, kwargs in first_layer_params.items()]

# The meta learner is an XGBoost classifier configured from second_layer_params.
meta = XGBClassifier(**second_layer_params[XGBClassifier])
sclf = StackingClassifier(classifiers=clfs, meta_classifier=meta)

In [125]:
sclf.fit(X_train_sh, y_train_sh)



StackingClassifier(classifiers=[RandomForestClassifier(max_depth=8,
                                                       max_features='sqrt',
                                                       min_samples_leaf=2,
                                                       n_estimators=200,
                                                       n_jobs=-1,
                                                       warm_start=True),
                                ExtraTreesClassifier(max_depth=8,
                                                     min_samples_leaf=2,
                                                     n_estimators=200,
                                                     n_jobs=-1),
                                AdaBoostClassifier(learning_rate=0.75,
                                                   n_estimators=200),
                                GradientBoostingClassifier(max_depth=5,
                                                           min_samples_leaf=2,
  

In [126]:
sclf.score(X_test_sh, y_test_sh)

0.6674231843575419

In [120]:
%%time
clf = RandomForestClassifier(**first_layer_params[RandomForestClassifier])
clf.fit(X_train_sh, y_train_sh)

CPU times: user 13.1 s, sys: 112 ms, total: 13.2 s
Wall time: 987 ms


RandomForestClassifier(max_depth=8, max_features='sqrt', min_samples_leaf=2,
                       n_estimators=200, n_jobs=-1, warm_start=True)

In [121]:
# Hold-out predictions of the standalone random forest, for comparison with the stack.
tmp = clf.predict(X_test_sh)
print(metrics.classification_report(list(y_test_sh), list(tmp)))

              precision    recall  f1-score   support

           0       0.66      0.61      0.63      2845
           1       0.64      0.69      0.66      2883

    accuracy                           0.65      5728
   macro avg       0.65      0.65      0.65      5728
weighted avg       0.65      0.65      0.65      5728



In [123]:
X_test_sh

Unnamed: 0,v_avg_watch_label_1,v_sum_watch_times_1,v_sum_watch_overs_1,v_sum_comment_times_1,v_sum_collect_times_1,v_sum_share_times_1,v_sum_quit_times_1,v_sum_skip_times_1,v_sum_watch_days_1,v_avg_watch_label_3,...,class_6,class_7,class_8,class_9,da_0,da_1,da_2,da_3,da_4,watch_label
17211,0.000000,4.0,0.0,0.0,0.0,0.0,4.0,0.0,1.0,0.958333,...,0.041922,0.041925,0.041925,0.041923,0.084701,0.084811,0.661892,0.084126,0.084470,2.0
12479,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.000000,...,0.306212,0.037078,0.037070,0.037070,0.074563,0.075237,0.316654,0.074140,0.459407,0.0
2532,0.208611,1951.0,27.0,2.0,24.0,4.0,1866.0,0.0,1.0,0.213834,...,0.350094,0.041778,0.041778,0.041771,0.324958,0.083541,0.083541,0.083541,0.424418,0.0
28572,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.000000,...,0.041922,0.041925,0.041925,0.041923,0.084701,0.084811,0.661892,0.084126,0.084470,0.0
19280,0.509397,3565.0,106.0,2.0,55.0,35.0,3082.0,0.0,1.0,0.548684,...,0.050000,0.050003,0.050009,0.050000,0.600000,0.100000,0.100000,0.100000,0.100000,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
11302,1.333333,123.0,10.0,0.0,0.0,1.0,88.0,0.0,1.0,1.124863,...,0.270403,0.036901,0.036893,0.271428,0.073786,0.074915,0.078284,0.073786,0.699228,9.0
4584,0.902047,684.0,30.0,1.0,5.0,1.0,531.0,0.0,1.0,1.040830,...,0.212644,0.031771,0.031764,0.409137,0.065150,0.328750,0.068145,0.064165,0.473790,3.0
26296,0.674253,32513.0,388.0,11.0,245.0,55.0,22996.0,0.0,1.0,0.725334,...,0.037483,0.037487,0.395917,0.037484,0.075851,0.075966,0.388781,0.383735,0.075666,0.0
9816,1.617686,12903.0,1328.0,34.0,174.0,46.0,8865.0,0.0,1.0,1.619248,...,0.248009,0.036880,0.036881,0.270608,0.238563,0.073745,0.073745,0.073745,0.540203,0.0


# 预测

In [87]:
# Keep a DataFrame copy for the is_share pass (it later receives the predicted
# watch_label column) and wrap the raw inference features as a DMatrix for the
# watch_label model.
test_sh = inference_dataset.copy()
test = xgb.DMatrix(inference_dataset.values, enable_categorical=True)

In [88]:
inference_dataset.shape, test.num_col()

((2822180, 127), 127)

In [89]:
# Boosters used for inference: the directly trained models. The trailing comments show
# the alternative of taking the best grid-search model instead.
bst_wl = wl_bst_sm  # gridsearch_results[opt_idx]['model']  
bst_sh = sh_bst_sm  # gridsearch_results_sh[opt_idx_sh]['model']  

In [90]:
# Stage 1: predict watch_label for every inference row.
wl = bst_wl.predict(test)
# Stage 2: the is_share model takes the predicted watch_label as an additional feature.
# The frame is derived from inference_dataset with assign() instead of mutating test_sh
# and then rebinding it to a DMatrix, so re-running this cell no longer fails on a
# DMatrix that does not support column assignment. test_sh still ends up as the DMatrix.
test_sh = xgb.DMatrix(inference_dataset.assign(watch_label=wl).values, enable_categorical=True)
sh = bst_sh.predict(test_sh)
# Predicted label distributions of both tasks.
Counter(wl), Counter(sh)

(Counter({1.0: 519693,
          0.0: 1004985,
          9.0: 1032586,
          2.0: 216270,
          3.0: 20171,
          4.0: 6656,
          6.0: 3961,
          5.0: 4722,
          7.0: 3072,
          8.0: 10064}),
 Counter({1.0: 970873, 0.0: 1851307}))

In [91]:
# Attach both predictions to test_df as uint8 columns (the boosters return floats).
test_df['watch_label'] = wl.astype(np.uint8)
test_df['is_share'] = sh.astype(np.uint8)
test_df.shape

(2822180, 61)

In [92]:
# Submission table: the two id columns plus the two predicted targets, in that order.
submission = pd.DataFrame(test_df[['user_id', 'video_id', 'watch_label', 'is_share']])
submission

Unnamed: 0,user_id,video_id,watch_label,is_share
0,1688013,32645,1,1
1,4502598,41270,0,1
2,5585629,16345,9,0
3,1635520,28149,2,1
4,4160191,40554,1,1
...,...,...,...,...
2822175,5019057,18766,0,0
2822176,5019057,12968,0,0
2822177,4255762,21794,2,0
2822178,171497,21578,3,0


In [93]:
# The Unix timestamp in the file name keeps submissions from different runs apart.
# `fn` is reused by the run-log cell further down.
fn = f'../submission-{int(time())}.csv'
submission.to_csv(fn, index=False, sep=",")
print(f"new submission saved to {fn}")

new submission saved to ../submission-1629622213.csv


# 保存模型

In [94]:
# Persist both boosters under a shared version tag (bumped by hand for every run).
# NOTE(review): the file names carry no extension; which serialization format XGBoost
# picks for such a path depends on the installed version — TODO confirm.
version = 19
wl_model_name = f'wl_model_v{version}'
sh_model_name = f'sh_model_v{version}'
bst_wl.save_model(wl_model_name)
bst_sh.save_model(sh_model_name)

In [95]:
def write_log(log_name, info, log_path="./"):
    """Write a Markdown summary of the current training run.

    Besides ``info`` the function reads these notebook globals: ``dataset``,
    ``X_train``, ``X_test``, ``X_train_sh``, ``X_test_sh`` and, when resampling is
    reported, ``resampled_wl`` and ``resampled_sh``.

    Args:
        log_name: File name of the log inside ``log_path``.
        info: Run description. Required keys: 'model_name', 'model_save_path',
            'param_wl', 'param_sh', 'aucs', 'w_auc', 'aucs_sh', 'report', 'report_sh'.
            Optional keys: 'comment' (free text) and 'is_resample' (defaults to False).
        log_path: Directory that receives the log; created if it does not exist.

    Returns:
        None. An existing file with the same name is overwritten.

    Raises:
        KeyError: if a required key is missing from ``info``.
        NameError: if one of the notebook globals listed above is undefined.
    """
    import datetime

    # Read once so the "is resample" line and the distribution section cannot disagree.
    is_resample = info.get('is_resample', False)

    if log_path:
        os.makedirs(log_path, exist_ok=True)

    # Explicit UTF-8: the comment text may contain non-ASCII characters, which a
    # platform-default encoding is not guaranteed to handle.
    with open(os.path.join(log_path, log_name), 'w', encoding='utf-8') as log:
        log.write(f"# {datetime.datetime.now()}\n")
        if info.get('comment', False):
            log.write("\n## Comment: \n")
            log.write(f"{info['comment']}\n")

        log.write(f"\n## model name: {info['model_name']}\n")
        log.write(f"- model save path : {info['model_save_path']}\n")

        log.write("\n## Data setup\n")
        log.write(f"- dataset.shape : {dataset.shape}\n")
        log.write(f"- dataset.columns : {dataset.columns}\n")
        log.write(f"- is resample : {is_resample}\n")
        log.write(f"- Traing_Data.shape (watch_label)  : {X_train.shape}\n")
        log.write(f"- Testing_Data.shape (watch_label) : {X_test.shape}\n")
        log.write(f"- Traing_Data.shape (is_share)  : {X_train_sh.shape}\n")
        log.write(f"- Testing_Data.shape (is_share) : {X_test_sh.shape}\n")
        if is_resample:
            log.write(f"- Resampled class distribution (watch_label): \n{Counter(resampled_wl)}\n")
            log.write(f"- Resampled class distribution (is_share): \n{Counter(resampled_sh)}\n")

        log.write("\n## Model Params\n")
        log.write(f"- model params (watch_label) : \n{info['param_wl']}\n")
        log.write(f"- model params (is_share) : \n{info['param_sh']}\n")

        log.write("\n## Model's Performance\n")
        log.write(f"- Aucs (watch_label) : {info['aucs']}\n")
        log.write(f"- Weighted Aucs (watch_label) : {info['w_auc']}\n")
        log.write(f"- Aucs (is_share) : {info['aucs_sh']}\n")

        log.write(f"- Classification Report (watch_label) : \n\n{info['report']}\n")
        log.write(f"- Classification Report (is_share) : \n\n{info['report_sh']}\n")

In [96]:
# Values recorded in the run log. Most lines are self-assignments: they only assert that
# the name exists in the kernel, and the trailing comments show what to substitute when
# the grid-search winner is used instead of the directly trained model.
param_wl = param  # all_params[opt_idx]
param_sh = param_sh  # all_params[opt_idx_sh]

aucs = aucs  # gridsearch_results[opt_idx]['aucs']
w_auc = w_auc  # gridsearch_results[opt_idx]['w_auc']
aucs_sh = aucs_sh  # gridsearch_results_sh[opt_idx]['aucs']

report = report  # gridsearch_results[opt_idx]['report']
report_sh = report_sh  # gridsearch_results_sh[opt_idx]['report']

In [97]:
# Assemble the run description and write it to ./logs/log_v<version>.md.
# The free-text comment records the feature set, split ratios, round counts and the
# submission file of this run; the official score placeholder is filled in by hand.
log_name = f"log_v{version}.md"
info = {'is_resample': True, 'model_name': [wl_model_name, sh_model_name], 'model_save_path': os.getcwd(),
        'comment': f"特征：基础特征+用户和视频的统计量特征，除此之外，is_share的特征还加上了watch_label，训练集使用的真实的watch_label，测试集使用预测的watch_label。\n数据集划分：watch_label的测试集为.15，is_share的测试集为.2。\nwatch_label训练300rounds，is_share训练600rounds。\n此次生成的提交是：{fn}。官方测评得分：xxx😐",
        'param_wl': param_wl, 'param_sh': param_sh, 'aucs': aucs, 'w_auc': w_auc, 'aucs_sh': aucs_sh, 
        'report': report, 'report_sh': report_sh}
write_log(log_name, info, log_path="./logs")

# 服务器间同步文件

## 推向Digix服务器

In [87]:
!scp ./models.ipynb digix@49.123.120.71:/home/digix/digix/Models/models.ipynb 

ssh: connect to host 49.123.120.71 port 22: No route to host
lost connection


In [None]:
!scp ./ensemble.ipynb digix@49.123.120.71:/home/digix/digix/Models/ensemble_from_gzy.ipynb 

digix@49.123.120.71's password: 

In [599]:
!scp ./log_*.md digix@49.123.120.71:/home/digix/digix/Models/

ssh: connect to host 49.123.120.71 port 22: No route to host
lost connection


In [472]:
!scp ../explore-data.ipynb digix@49.123.120.71:/home/digix/digix/explore-data.ipynb 

explore-data.ipynb                            100%  306KB  10.6MB/s   00:00    


In [650]:
!scp ../2021_3_data/traindata/video_features_data/video_features.jay digix@49.123.120.71:/home/digix/digix/dataset/new_video_features.jay

video_features.jay                            100% 9035KB  11.1MB/s   00:00    


## 从Digix服务器拉数据

In [1]:
!scp  digix@49.123.120.71:/home/digix/digix/Models/LightGBM.ipynb ./LightGBM.ipynb

digix@49.123.120.71's password: 


In [2]:
!scp  digix@49.123.120.71:/home/digix/digix/Models/feature_engineering.ipynb ./feature_engineering.ipynb

scp: /home/digix/digix/Models/feature_engineering.ipynb: No such file or directory


In [3]:
!scp -r digix@49.123.120.71:/home/digix/digix/Models/Feature_Engineering/  ./

utils.py                                      100% 3860     2.6MB/s   00:00    
data_analysis.ipynb                           100% 6566KB  11.2MB/s   00:00    
__init__.py                                   100%    0     0.0KB/s   00:00    
__init__.cpython-36.pyc                       100%  139   128.3KB/s   00:00    
utils.cpython-36.pyc                          100% 4120     2.6MB/s   00:00    
video_data.ipynb                              100%   55KB   1.7MB/s   00:00    
user_data-checkpoint.ipynb                    100%  202KB  10.1MB/s   00:00    
data_analysis-checkpoint.ipynb                100% 6554KB  11.0MB/s   00:00    
utils-checkpoint.py                           100% 3860     2.4MB/s   00:00    
video_data-checkpoint.ipynb                   100%   17KB   1.4MB/s   00:00    
user_data.ipynb                               100%  202KB  10.3MB/s   00:00    


In [168]:
!scp -r digix@49.123.120.71:/home/digix/digix/dataset/traindata/video_features_data/video_status.csv ../2021_3_data/traindata/video_features_data/

video_status.csv                              100% 2008KB   9.1MB/s   00:00    
user_status.csv                               100%  138MB   9.3MB/s   00:14    


In [279]:
!scp -r digix@49.123.120.71:/home/digix/digix/dataset/traindata/user_features_data/user_status.csv ../2021_3_data/traindata/user_features_data/

user_status.csv                               100%  168MB  11.2MB/s   00:14    


In [32]:
!scp -r digix@49.123.120.71:/home/digix/digix/Models/MMoE/MMoe_DouLoss.ipynb  ./MMoe_DouLoss.ipynb

digix@49.123.120.71's password: 
