In [5]:
import numpy as np
from sklearn import metrics
import os
from sklearn.model_selection import train_test_split
from sklearn import metrics
from time import time
import pandas as pd
import xgboost as xgb
from collections import Counter
from imblearn.over_sampling import SMOTE
from imblearn.under_sampling import NearMiss
import datatable as dt
import warnings

from tools import *

# 给定预测标签，计算AUC
使用OVR的策略计算每个类别的AUC
过程：
- 选择类别i作为正类，其他类别作为负类
- 将真实标签中不等于i的标记为0，等于i的标记为1
- 将预测标签中不等于i的标记为0，等于ide标记为1
- 计算混淆矩阵
- 计算(fpr, tpr)
- 计算AUC

# 加载数据
训练数据加载过程：
1. 分别加载处理好的用户特征和视频特征，以及整合的用户历史行为数据；
2. 从用户历史行为数据中筛掉在视频特征中没出现过的video_id；
3. 将行为数据中的user_id、video_id替换为对应用户/视频的特征
4. 根据不同的任务划分为`watch_label`、`is_share`的数据集

推断时，类似于上述过程拼接数据。

## 读取数据

In [2]:
base_dir = "../2021_3_data"
test_data_dir  = os.path.join(base_dir, "testdata")
train_data_dir = os.path.join(base_dir, "traindata")

### 基础特征与附加特征合并

In [280]:
video_status = dt.fread(os.path.join(train_data_dir, "video_features_data/video_status.csv"))
user_status = dt.fread(os.path.join(train_data_dir, "user_features_data/user_status.csv"))

In [282]:
tab_user = dt.fread(os.path.join(train_data_dir, "user_features_data/user_features.jay"))
tab_video = dt.fread(os.path.join(train_data_dir, "video_features_data/video_features.jay"))

In [283]:
video_status.key = 'video_id'
video_ws = tab_video[:, :, join(video_status)]

In [286]:
user_status.key = 'user_id'
user_ws = tab_user[:, :, join(user_status)]

In [288]:
video_ws.to_jay(os.path.join(train_data_dir, "video_features_data/video_features_with_status.jay"))

In [289]:
user_ws.to_jay(os.path.join(train_data_dir, "user_features_data/user_features_with_status.jay"))

In [287]:
user_ws

Unnamed: 0_level_0,user_id,age_0,age_1,age_2,age_3,age_4,age_5,age_6,age_7,gender_0,…,average_watch_label,sum_watch_times,sum_comment_times,sum_collect_times,sum_share_times
Unnamed: 0_level_1,▪▪▪▪▪▪▪▪,▪▪▪▪▪▪▪▪,▪▪▪▪▪▪▪▪,▪▪▪▪▪▪▪▪,▪▪▪▪▪▪▪▪,▪▪▪▪▪▪▪▪,▪▪▪▪▪▪▪▪,▪▪▪▪▪▪▪▪,▪▪▪▪▪▪▪▪,▪▪▪▪▪▪▪▪,Unnamed: 11_level_1,▪▪▪▪▪▪▪▪,▪▪▪▪▪▪▪▪,▪▪▪▪▪▪▪▪,▪▪▪▪▪▪▪▪,▪▪▪▪▪▪▪▪
0,1.757e+06,0,0,0,1,0,0,0,0,0,…,0,0,0,0,0
1,17938,1,0,0,0,0,0,0,0,1,…,0.0967742,3,0,0,0
2,4.26352e+06,0,1,0,0,0,0,0,0,1,…,0.204545,2,0,0,0
3,1.4116e+06,0,0,0,1,0,0,0,0,1,…,0,0,0,0,0
4,3.99224e+06,0,0,1,0,0,0,0,0,1,…,0,0,0,0,0
5,4.0116e+06,0,0,1,0,0,0,0,0,1,…,0,0,0,0,0
6,4.78556e+06,0,0,0,0,0,1,0,0,0,…,0,0,0,0,0
7,5.11036e+06,1,0,0,0,0,0,0,0,1,…,0,0,0,0,0
8,1.3212e+06,0,0,0,0,0,1,0,0,0,…,0,0,0,0,0
9,3.20698e+06,1,0,0,0,0,0,0,0,0,…,0,0,0,0,0


### 通过 .npz 读取数据

#### 单表读取后合并

In [20]:
%%time
# 单独读取每个文件再进行合并
user_df = read_npz_to_df(os.path.join(train_data_dir, "user_features_data/user_features.npz"), data_name='features', column_name='columns')
video_df = read_npz_to_df(os.path.join(train_data_dir, "video_features_data/video_features.npz"), data_name='features')
action_df = read_npz_to_df(os.path.join(train_data_dir, "all_actions.npz"), data_name='data')

CPU times: user 3.6 s, sys: 2.4 s, total: 6 s
Wall time: 12.1 s


In [797]:
# 因为将字符串保存到 .npz时会使dtype为object，重新读回DataFrame时各个列的数据类型均为 object，所以先转换类型
dtypes = dict(zip(video_df.columns, [np.float32] * video_df.shape[1]))
dtypes.update({'video_name': np.str})
video_df = video_df.astype(dtypes)

In [802]:
%%time
# 合并各个表
df_train = merge_user_video_action(user_df, video_df, action_df)
df_train.shape

CPU times: user 1min 29s, sys: 5.88 s, total: 1min 35s
Wall time: 40.1 s


(7353024, 76)

In [805]:
np.savez(os.path.join(train_data_dir, "train.npz"), data=df_train.to_pandas().values, columns=df_train.to_pandas().columns.tolist())

In [777]:
%%time
test_df = load_table(os.path.join(test_data_dir, "test.csv"), ftype="csv")

CPU times: user 626 ms, sys: 0 ns, total: 626 ms
Wall time: 721 ms


In [779]:
%%time
df_test = merge_user_video_action(user_df, video_df, test_df)
df_test.shape

CPU times: user 3min 8s, sys: 12.8 s, total: 3min 20s
Wall time: 51.2 s


(2822180, 73)

#### 读取合并好的数据

In [806]:
%%time
# 直接读取保存好的合并后的训练数据
path = os.path.join(train_data_dir, "train.npz")
df_train = read_npz_to_df(path, data_name='data')
df_train.shape

CPU times: user 3min 2s, sys: 38.4 s, total: 3min 40s
Wall time: 3min 41s


(7353024, 76)

In [810]:
%%time
# 直接读取保存好的合并后的测试数据
path = os.path.join(test_data_dir, "test.npz")
df_test = read_npz_to_df(path, data_name='data')
df_test.shape

CPU times: user 1min 37s, sys: 36.9 s, total: 2min 14s
Wall time: 5min 49s


(2822180, 73)

### 通过 .jay 文件读取

#### 单表读取后合并

In [3]:
with_status = False
if with_status:
    user_features_name = "user_features_with_status"
    video_features_name = "video_features_with_status"
else:
    user_features_name = "user_features"
    video_features_name = "video_features"
    
p_user = os.path.join(train_data_dir, f"user_features_data/{user_features_name}.jay")
p_video = os.path.join(train_data_dir, f"video_features_data/{video_features_name}.jay")

In [4]:
%%time
## 使用datatable 加载训练数据
p_act = os.path.join(train_data_dir, "all_actions.jay")

df_train, others = load_train_test_data(None, pre_merged=False, return_others=True,
                           **{"p_user": p_user, "p_video": p_video, "p_action": p_act})
user_df = others['user']
video_df = others['video']
action_df = others['action']
df_train.shape

CPU times: user 2min 49s, sys: 12.5 s, total: 3min 1s
Wall time: 57.8 s


(7353024, 76)

In [466]:
tt = df_train.to_pandas()
np.savez(os.path.join(train_data_dir, "train"), data=tt.values, columns=tt.columns.tolist())

In [371]:
%%time
# p_user = os.path.join(train_data_dir, "user_features_data/user_features.jay")
# p_video = os.path.join(train_data_dir, "video_features_data/video_features.jay")
p_act = os.path.join(test_data_dir, "test.csv")

path = os.path.join(test_data_dir, "test.jay")
kwargs = {"p_user": p_user, "p_video": p_video, "p_action": p_act}

df_test, others = load_train_test_data(None, pre_merged=False, return_others=True, **kwargs)
test_df = others['action']
df_test.shape

CPU times: user 1min 9s, sys: 2.91 s, total: 1min 12s
Wall time: 1.84 s


(2822180, 73)

In [474]:
action_df = action_df.to_pandas()
user_df = user_df.to_pandas()
video_df = video_df.to_pandas()

In [None]:
test_df = test_df.to_pandas()

#### 读取合并好后的数据

In [468]:
%%time
# 直接读取保存好的合并后的训练数据
path = os.path.join(train_data_dir, "train.jay")
df_train = load_train_test_data(path, pre_merged=True)
df_train.shape

CPU times: user 0 ns, sys: 177 ms, total: 177 ms
Wall time: 184 ms


(7353024, 76)

In [454]:
%%time
# 直接读取保存好的合并后的测试数据
path = os.path.join(test_data_dir, "test.jay")
df_test = load_train_test_data(path, pre_merged=True)
df_test.shape

CPU times: user 649 µs, sys: 46 µs, total: 695 µs
Wall time: 684 µs


(2822180, 72)

## 处理训练数据
可在此做一些预处理：
- 从用户历史行为数据中筛掉在视频特征中没出现过的video_id
- 删除多余的列
- 调整列的顺序
- 改变列的数据类型


In [469]:
%%time
if isinstance(df_train, dt.Frame):
    df_train = df_train.to_pandas()
if isinstance(df_test, dt.Frame):
    df_test = df_test.to_pandas()

CPU times: user 29.8 s, sys: 9.82 s, total: 39.6 s
Wall time: 16.2 s


In [375]:
df_train.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 7353024 entries, 0 to 7353023
Data columns (total 76 columns):
 #   Column               Dtype  
---  ------               -----  
 0   user_id              int64  
 1   video_id             int64  
 2   is_watch             int64  
 3   is_share             int64  
 4   watch_label          int64  
 5   age_0                float64
 6   age_1                float64
 7   age_2                float64
 8   age_3                float64
 9   age_4                float64
 10  age_5                float64
 11  age_6                float64
 12  age_7                float64
 13  gender_0             float64
 14  gender_1             float64
 15  gender_2             float64
 16  gender_3             float64
 17  city_level_0         float64
 18  city_level_1         float64
 19  city_level_2         float64
 20  city_level_3         float64
 21  city_level_4         float64
 22  city_level_5         float64
 23  city_level_6         float64
 24

In [471]:
# 删除 video_name、is_watch 列
df_train.drop(['video_name', 'is_watch'], axis=1, inplace=True)

In [472]:
if 'action_df' not in dir():
    action_df = load_table(os.path.join(train_data_dir, "all_actions.jay")).to_pandas()
if 'video_df' not in dir():
    video_df = load_table(os.path.join(train_data_dir, "video_features_data/video_features.jay")).to_pandas()

In [476]:
action_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 7353024 entries, 0 to 7353023
Data columns (total 5 columns):
 #   Column       Dtype
---  ------       -----
 0   user_id      int64
 1   video_id     int64
 2   is_watch     int64
 3   is_share     int64
 4   watch_label  int64
dtypes: int64(5)
memory usage: 280.5 MB


In [477]:
# 从用户历史行为数据中筛掉在视频特征中没出现过的video_id
idx1 = pd.Index(action_df['video_id'].unique())
idx2 = pd.Index(video_df['video_id'])
not_exists = idx1.difference(idx2)
not_exists

Int64Index([], dtype='int64')

In [478]:
%%time
# 将训练数据中未出现的视频剔除
n = 0
for vid in not_exists:
    tn = (df_train['video_id'] == vid).sum()
    df_train['video_id'].replace(vid, np.nan, inplace=True)
    n += tn

if n > 0:
    df_train.dropna(axis=0, inplace=True)
    df_train.reset_index(drop=True, inplace=True)
print(n)

0
CPU times: user 1.25 ms, sys: 0 ns, total: 1.25 ms
Wall time: 1.07 ms


In [479]:
# 删除 video_id、user_id列
df_train.drop(['user_id', 'video_id'], axis=1, inplace=True)

In [480]:
dataset = df_train
dataset.shape

(7353024, 72)

In [481]:
# 准备数据
watch_label = dataset.pop('watch_label').astype(np.uint8)
is_share = dataset.pop('is_share').astype(np.uint8)
watch_label.shape, is_share.shape, dataset.shape

((7353024,), (7353024,), (7353024, 70))

## 处理测试数据

In [482]:
if 'test_df' not in dir():
    test_df = pd.read_csv(os.path.join(test_data_dir, "test.csv"))

In [483]:
test_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2822180 entries, 0 to 2822179
Data columns (total 4 columns):
 #   Column       Dtype
---  ------       -----
 0   user_id      int32
 1   video_id     int32
 2   watch_label  uint8
 3   is_share     uint8
dtypes: int32(2), uint8(2)
memory usage: 26.9 MB


In [484]:
df_test.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2822180 entries, 0 to 2822179
Data columns (total 72 columns):
 #   Column               Dtype  
---  ------               -----  
 0   user_id              float32
 1   video_id             float32
 2   age_0                float32
 3   age_1                float32
 4   age_2                float32
 5   age_3                float32
 6   age_4                float32
 7   age_5                float32
 8   age_6                float32
 9   age_7                float32
 10  gender_0             float32
 11  gender_1             float32
 12  gender_2             float32
 13  gender_3             float32
 14  city_level_0         float32
 15  city_level_1         float32
 16  city_level_2         float32
 17  city_level_3         float32
 18  city_level_4         float32
 19  city_level_5         float32
 20  city_level_6         float32
 21  city_level_7         float32
 22  device_name_0        float32
 23  device_name_1        float32
 24

In [485]:
# 删除 video_name 列
if 'video_name' in df_test.columns:
    df_test.drop('video_name', axis=1, inplace=True)

In [486]:
# 测试数据集中存在video_id没有在视频特征中出现
idx1 = pd.Index(test_df['video_id'].unique())
idx2 = pd.Index(video_df['video_id'].unique())
non_exists = idx1.difference(idx2)
non_exists

Int64Index([], dtype='int64')

In [487]:
%%time
t0 = time()
n = 0
for vid in not_exists:
    tn = (test_df['video_id'] == vid).sum()
#     df_test = action_df[action_df['video_id'] != vid]
    n += tn

print(f"在视频特征中不存在的video_id在测试数据集中出现的次数 = {n}\t\t(cost {time() - t0:.3f}s)")

在视频特征中不存在的video_id在测试数据集中出现的次数 = 0		(cost 0.000s)
CPU times: user 0 ns, sys: 430 µs, total: 430 µs
Wall time: 390 µs


In [488]:
# 删除 video_id、user_id 列
df_test.drop(['user_id', 'video_id'], axis=1, inplace=True)

In [489]:
inference_dataset = df_test
inference_dataset.shape

(2822180, 70)

# watch_label 预测

## XGBoost

### 处理数据不均衡问题

In [490]:
items = list(Counter(watch_label).items())
items.sort(key=lambda x: x[0])
print(items)

dist = np.array(items, dtype=np.float)
dist[:, 1] = dist[:, 1] / watch_label.shape[0]
print(dist)

[(0, 5176743), (1, 557421), (2, 314107), (3, 219188), (4, 172404), (5, 143001), (6, 125092), (7, 117749), (8, 138798), (9, 388521)]
[[0.         0.70402912]
 [1.         0.0758084 ]
 [2.         0.04271807]
 [3.         0.02980923]
 [4.         0.02344668]
 [5.         0.01944792]
 [6.         0.01701232]
 [7.         0.01601368]
 [8.         0.01887632]
 [9.         0.05283826]]


In [491]:
under_ss = np.array(items)
under_ss_thresh = under_ss[3, 1]  # 设置每个类别样本数目的上限
under_ss[:, 1] = np.clip(under_ss[:, 1], a_min=None, a_max=under_ss_thresh)

over_ss = under_ss.copy()
over_ss_thresh = under_ss[2, 1]  # 设置每个类别样本数据的下限
over_ss[:, 1] = np.clip(over_ss[:, 1], a_min=over_ss_thresh, a_max=None)

under_ss = dict(under_ss)
over_ss = dict(over_ss)

In [492]:
under_ss, over_ss

({0: 219188,
  1: 219188,
  2: 219188,
  3: 219188,
  4: 172404,
  5: 143001,
  6: 125092,
  7: 117749,
  8: 138798,
  9: 219188},
 {0: 219188,
  1: 219188,
  2: 219188,
  3: 219188,
  4: 219188,
  5: 219188,
  6: 219188,
  7: 219188,
  8: 219188,
  9: 219188})

In [493]:
idxs = watch_label == 0
idxs = idxs.replace(False, np.nan).dropna().index  # 保留watch_label=0的行索引
idxs.shape

(5176743,)

In [494]:
left_idxs = np.random.choice(idxs, under_ss_thresh, replace=False)  # 选择一部分保留，注意replace参数，为True时会重复采样
del_idxs = idxs.difference(left_idxs)
del_idxs.shape, left_idxs.shape

((4957555,), (219188,))

In [495]:
Counter(watch_label)

Counter({2: 314107,
         0: 5176743,
         5: 143001,
         4: 172404,
         1: 557421,
         9: 388521,
         3: 219188,
         8: 138798,
         7: 117749,
         6: 125092})

In [496]:
resampled_data = np.delete(dataset.values, del_idxs, axis=0)
resampled_wl = np.delete(watch_label.values, del_idxs, axis=0)
resampled_data.shape, resampled_wl.shape

((2395469, 70), (2395469,))

In [497]:
Counter(resampled_wl)

Counter({2: 314107,
         5: 143001,
         4: 172404,
         1: 557421,
         0: 219188,
         9: 388521,
         3: 219188,
         8: 138798,
         7: 117749,
         6: 125092})

In [273]:
# 速度太慢，难以忍受！
nm  = TomekLinks(sampling_strategy=under_ss)
smt = SMOTE(sampling_strategy=over_ss)

In [None]:
t0 = time()
X_r, y_r = nm.fit_resample(resampled_data, pd.Series(resampled_wl))
print(f"Under Sampling finished ...\t\t({time()-t0:.3f}s)")

In [None]:
X_r, y_r = smt.fit_resample(X_r, y_r)

In [498]:
# 将采样后的数据重装回 DataFrame
data = pd.DataFrame(resampled_data, columns=dataset.columns)
watch_label_res = pd.Series(resampled_wl)
data.shape, watch_label.shape

((2395469, 70), (7353024,))

In [524]:
train_idx, test_idx = train_test_split(data.index, test_size=0.2, random_state=0)
train_idx.shape, test_idx.shape

((1916375,), (479094,))

### 训练模型

In [525]:
X_train = data.iloc[train_idx]
X_test  = data.iloc[test_idx]

In [526]:
y_train = watch_label_res.iloc[train_idx]
y_test  = watch_label_res.iloc[test_idx]

In [527]:
X_train.shape, y_train.shape, X_test.shape, y_test.shape

((1916375, 70), (1916375,), (479094, 70), (479094,))

In [405]:
t0 = time()
xg_train = xgb.DMatrix(X_train.values, label=y_train.values, enable_categorical=True)
xg_test = xgb.DMatrix(X_test.values, label=y_test.values, enable_categorical=True)
print(f"Data preparing finished ...\t\t({time()-t0:.3f}s)")

Data preparing finished ...		(1.521s)


In [406]:
# setup parameters for xgboost
param = {}
# use softmax multi-class classification
param['objective'] = 'multi:softmax'
# scale weight of positive examples
param['eta'] = 0.1
param['max_depth'] = 11
param['min_child_weight'] = 7
param['nthread'] = 8
param['num_class'] = 10
param['gamma'] = .4
param['gpu_id'] = 0
param['tree_method'] = 'gpu_hist'
# param['scale_pos_weight'] = 2
watchlist = [(xg_train, 'train'), (xg_test, 'test')]

In [407]:
num_round = 200
t0 = time()
wl_bst_sm = xgb.train(param, xg_train, num_round, watchlist)
print(f"{num_round}-rounds Training finished ...\t\t({time()-t0:.3f}s)")

[0]	train-mlogloss:2.27322	test-mlogloss:2.27402
[1]	train-mlogloss:2.24908	test-mlogloss:2.25062
[2]	train-mlogloss:2.22869	test-mlogloss:2.23093
[3]	train-mlogloss:2.21114	test-mlogloss:2.21408
[4]	train-mlogloss:2.19593	test-mlogloss:2.19955
[5]	train-mlogloss:2.18259	test-mlogloss:2.18689
[6]	train-mlogloss:2.17102	test-mlogloss:2.17597
[7]	train-mlogloss:2.16074	test-mlogloss:2.16635
[8]	train-mlogloss:2.15146	test-mlogloss:2.15772
[9]	train-mlogloss:2.14330	test-mlogloss:2.15018
[10]	train-mlogloss:2.13587	test-mlogloss:2.14341
[11]	train-mlogloss:2.12925	test-mlogloss:2.13743
[12]	train-mlogloss:2.12326	test-mlogloss:2.13210
[13]	train-mlogloss:2.11784	test-mlogloss:2.12727
[14]	train-mlogloss:2.11293	test-mlogloss:2.12299
[15]	train-mlogloss:2.10846	test-mlogloss:2.11918
[16]	train-mlogloss:2.10435	test-mlogloss:2.11573
[17]	train-mlogloss:2.10067	test-mlogloss:2.11267
[18]	train-mlogloss:2.09729	test-mlogloss:2.10991
[19]	train-mlogloss:2.09423	test-mlogloss:2.10746
[20]	train

In [408]:
# get prediction
pred = wl_bst_sm.predict(xg_test)
# pred = pred.astype(np.uint8)
error_rate = np.sum(pred != y_test) / y_test.shape[0]
print('Test error using softmax = {}'.format(error_rate))

Test error using softmax = 0.7301364659127436


In [409]:
weights = np.arange(0, 1, 0.1)
aucs = auc(y_test.astype(np.uint8), pred.astype(np.uint8), np.arange(param['num_class']))
# aucs[aucs == 0.5] = 0
w_aucs = (aucs * weights).sum()
aucs, w_aucs

(array([0.57430505, 0.58606509, 0.50357122, 0.50041816, 0.50033007,
        0.50013865, 0.50273403, 0.50042983, 0.50980292, 0.60638754]),
 2.3651799768885837)

In [336]:
report = metrics.classification_report(list(y_test), list(pred))

In [410]:
print(report)

              precision    recall  f1-score   support

           0       0.87      0.54      0.67     43867
           1       0.41      0.73      0.52    111843
           2       0.41      0.16      0.23     62804
           3       0.46      0.13      0.20     43875
           4       0.45      0.09      0.15     34453
           5       0.55      0.10      0.17     28592
           6       0.46      0.05      0.08     24837
           7       0.56      0.07      0.12     23675
           8       0.39      0.07      0.12     27616
           9       0.31      0.75      0.43     77532

    accuracy                           0.40    479094
   macro avg       0.49      0.27      0.27    479094
weighted avg       0.46      0.40      0.34    479094



### 调参

In [643]:
from xgboost import XGBClassifier
from sklearn import svm, datasets
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import ParameterGrid
from collections.abc import Iterable
from sklearn.model_selection import StratifiedKFold

In [523]:
base_param = {  # 基本参数，不需要调参
    'objective': 'multi:softmax',
    'eta': 0.1,
    'nthread': 8,
    'num_class': 10,
    'gpu_id': 0,
    'tree_method': 'gpu_hist'
} 
ps = {  # 需要调参的参数
    'max_depth': list(range(5, 14, 2)),
    'min_child_weight': list(range(2, 12, 2)),
    'gamma': [0.1, 0.2, 0.3, 0.4, 0.5, 0.6],
}

ps2 = {
    'subsample':  [i/10.0 for i in range(6,11,1)], 
    'colsample_bytree':  [i/10.0 for i in range(6,11,1)] 
}

ps3 = {'reg_alpha': [0.1, 0.2, 0.5, 1, 1.5, 2, 4]}

# 将需要调参的参数进行组合，即笛卡尔乘积。类似于sklearn中的 ParameterGrid
items = list(ps.items())
iterables = [item[1] for item in items]
keys = [item[0] for item in items]

ret = myproduct(*iterables)
com_ps = [dict(zip(keys, e)) for e in ret]


all_params = [base_param.copy() for _ in range(len(com_ps))] 
for i in range(len(com_ps)):
    all_params[i].update(com_ps[i])
    
print(com_ps.__len__())    
# print(all_params)

150


In [529]:
t0 = time()
xg_train = xgb.DMatrix(X_train.values, label=y_train.values, enable_categorical=True)
xg_test = xgb.DMatrix(X_test.values, label=y_test.values, enable_categorical=True)
print(f"Data preparing finished ...\t\t({time()-t0:.3f}s)")

Data preparing finished ...		(5.092s)


In [None]:
gridsearch_results = gridsearch_xgb(all_params, xg_train, xg_test)

In [None]:
gs_cv_results_sh = gridsearch_cv_xgb(data.values, watch_label_res.values, all_params, n_round=200, verbose_eval=False, n_class=10)

In [584]:
arr = np.array([[-e['test_error'] for e in gridsearch_results], [e['w_auc'] for e in gridsearch_results]], dtype=np.float32)
opt_idxs = arr.argmax(axis=1)
if opt_idxs[0] != opt_idxs[1]:
     warnings.warn(f"最小误差与最大AUC对应的模型不一致 : {opt_idxs}。选择误差最小的模型 : {opt_idxs[0]}")

opt_idx = opt_idxs[0]

ValueError: 最小误差与最大AUC对应的模型不一致 : [93 19]

In [583]:
gridsearch_results[93]

({'test_error': 0.7289070620796754,
  'aucs': array([0.57555597, 0.58567653, 0.50347593, 0.50017473, 0.50017414,
         0.5000983 , 0.50292636, 0.50040085, 0.50878295, 0.60767447]),
  'w_auc': 2.365403849959146,
  'report': '              precision    recall  f1-score   support\n\n           0       0.33      0.19      0.24     43749\n           1       0.29      0.68      0.41    111616\n           2       0.23      0.01      0.03     62829\n           3       0.13      0.00      0.00     43872\n           4       0.15      0.00      0.00     34228\n           5       0.18      0.00      0.00     28926\n           6       0.33      0.01      0.01     25061\n           7       0.16      0.00      0.00     23400\n           8       0.31      0.02      0.04     27750\n           9       0.24      0.56      0.34     77663\n\n    accuracy                           0.27    479094\n   macro avg       0.23      0.15      0.11    479094\nweighted avg       0.24      0.27      0.18    479094\

In [588]:
opt_idx = opt_idxs[0]
opt_param = all_params[opt_idx]
print(gridsearch_results[opt_idx]['report'])

              precision    recall  f1-score   support

           0       0.33      0.19      0.24     43749
           1       0.29      0.68      0.41    111616
           2       0.23      0.01      0.03     62829
           3       0.13      0.00      0.00     43872
           4       0.15      0.00      0.00     34228
           5       0.18      0.00      0.00     28926
           6       0.33      0.01      0.01     25061
           7       0.16      0.00      0.00     23400
           8       0.31      0.02      0.04     27750
           9       0.24      0.56      0.34     77663

    accuracy                           0.27    479094
   macro avg       0.23      0.15      0.11    479094
weighted avg       0.24      0.27      0.18    479094



In [557]:
y_test.shape

(479094,)

In [None]:
gridsearch_cv_xgb(data.values, watch_label_res, all_params)

In [496]:
param = {}
# use softmax multi-class classification
param['objective'] = 'multi:softmax'
# scale weight of positive examples
param['eta'] = 0.1
param['max_depth'] = 8
param['nthread'] = 8
param['num_class'] = 10
# param['gpu_id'] = 0
# param['tree_method'] = 'gpu_hist'

In [None]:
%%time
cv_res= xgb.cv(param, cv_data, num_boost_round=200,early_stopping_rounds=30,nfold=3, metrics='auc',show_stdv=True)
print(cv_res)

# is_share 预测

## XGBoost

### 处理数据不均衡问题

In [541]:
items = list(Counter(is_share).items())
items.sort(key=lambda x: x[0])
print(items)

dist = np.array(items, dtype=np.float)
dist[:, 1] = dist[:, 1] / is_share.shape[0]
print(dist)

[(0, 7338705), (1, 14319)]
[[0.         0.99805264]
 [1.         0.00194736]]


In [542]:
under_ss = np.array(items)
under_ss_thresh = under_ss[1, 1] + 800
under_ss[:, 1] = np.clip(under_ss[:, 1], a_min=None, a_max=under_ss_thresh)

over_ss = under_ss.copy()
over_ss_thresh = under_ss[1, 1]
over_ss[:, 1] = np.clip(over_ss[:, 1], a_min=over_ss_thresh, a_max=None)

under_ss = dict(under_ss)
over_ss = dict(over_ss)

In [543]:
under_ss, over_ss

({0: 15119, 1: 14319}, {0: 15119, 1: 14319})

In [544]:
idxs = is_share == 0
idxs = idxs.replace(False, np.nan).dropna().index  # 保留watch_label=0的行索引
idxs.shape

(7338705,)

In [545]:
left_idxs = np.random.choice(idxs, under_ss_thresh, replace=False)  # 选择一部分保留
del_idxs = idxs.difference(left_idxs)
del_idxs.shape, left_idxs.shape

((7323586,), (15119,))

In [546]:
resampled_data = np.delete(dataset.values, del_idxs, axis=0)
resampled_sh = np.delete(is_share.values, del_idxs, axis=0)
resampled_data.shape, resampled_sh.shape

((29438, 70), (29438,))

In [547]:
# 将采样后的数据重装会DataFrame
data_sh = pd.DataFrame(resampled_data, columns=dataset.columns)
is_share_res = pd.Series(resampled_sh)
data_sh.shape, is_share_res.shape

((2395469, 70), (7353024,))

In [548]:
train_idx, test_idx = train_test_split(data_sh.index, test_size=0.2, random_state=1)
train_idx.shape, test_idx.shape

((23550,), (5888,))

### 训练模型

In [549]:
X_train_sh = data_sh.iloc[train_idx]
X_test_sh  = data_sh.iloc[test_idx]

In [431]:
y_train_sh = is_share_res.iloc[train_idx]
y_test_sh  = is_share_res.iloc[test_idx]

In [432]:
t0 = time()
xg_train_sh = xgb.DMatrix(X_train_sh.values, label=y_train_sh.values, enable_categorical=True)
xg_test_sh = xgb.DMatrix(X_test_sh.values, label=y_test_sh.values, enable_categorical=True)
print(f"Data preparing finished ...\t\t({time()-t0:.3f}s)")

Data preparing finished ...		(0.023s)


In [433]:
# setup parameters for xgboost
param_sh = {}
# use softmax multi-class classification
param_sh['objective'] = 'binary:hinge'
# scale weight of positive examples
param_sh['eta'] = 0.1
param_sh['max_depth'] = 6
param_sh['nthread'] = 4
param_sh['gpu_id'] = 0
param_sh['tree_method'] = 'gpu_hist'
# param_sh['min_child_weight'] = 7


watchlist = [(xg_train_sh, 'train'), (xg_test_sh, 'test')]

In [434]:
num_round = 200
t0 = time()
sh_bst_sm = xgb.train(param_sh, xg_train_sh, num_round, watchlist)
print(f"{num_round}-rounds Training finished ...\t\t({time()-t0:.3f}s)")

[0]	train-error:0.51236	test-error:0.51851
[1]	train-error:0.51236	test-error:0.51851
[2]	train-error:0.51236	test-error:0.51851
[3]	train-error:0.51236	test-error:0.51851
[4]	train-error:0.51236	test-error:0.51851
[5]	train-error:0.51134	test-error:0.51783
[6]	train-error:0.50297	test-error:0.51172
[7]	train-error:0.50157	test-error:0.51087
[8]	train-error:0.50110	test-error:0.50934
[9]	train-error:0.50081	test-error:0.50917
[10]	train-error:0.49983	test-error:0.50968
[11]	train-error:0.48858	test-error:0.49966
[12]	train-error:0.48726	test-error:0.49796
[13]	train-error:0.47091	test-error:0.48403
[14]	train-error:0.45427	test-error:0.47130
[15]	train-error:0.44832	test-error:0.46365
[16]	train-error:0.44484	test-error:0.46128
[17]	train-error:0.43257	test-error:0.44837
[18]	train-error:0.42514	test-error:0.43886
[19]	train-error:0.42064	test-error:0.43886
[20]	train-error:0.41677	test-error:0.43393
[21]	train-error:0.41350	test-error:0.43376
[22]	train-error:0.41028	test-error:0.4330

In [435]:
# get prediction
pred_sh = sh_bst_sm.predict(xg_test_sh)
error_rate = np.sum(pred_sh != y_test_sh) / y_test_sh.shape[0]
print('Test error using softmax = {}'.format(error_rate))

Test error using softmax = 0.3882472826086957


In [436]:
Counter(pred_sh), Counter(y_test_sh)

(Counter({1.0: 3043, 0.0: 2845}), Counter({1: 2835, 0: 3053}))

In [437]:
report_sh = metrics.classification_report(list(y_test_sh), list(pred_sh))
print(report_sh)

              precision    recall  f1-score   support

           0       0.63      0.59      0.61      3053
           1       0.59      0.63      0.61      2835

    accuracy                           0.61      5888
   macro avg       0.61      0.61      0.61      5888
weighted avg       0.61      0.61      0.61      5888



In [438]:
aucs_sh = auc(y_test_sh.astype(np.uint8), pred_sh.astype(np.uint8), [0, 1])
aucs_sh

array([0.6125295, 0.6125295])

### 调参

In [1]:
from tools import *

In [613]:
base_param_sh = {  # 基本参数，不需要调参
    'objective': 'binary:hinge',
    'eta': 0.1,
    'nthread': 8,
#     'num_class': 10,
    'gpu_id': 0,
    'tree_method': 'gpu_hist'
} 
ps_sh = {  # 需要调参的参数
    'max_depth': list(range(5, 10, 2)),
    'min_child_weight': list(range(1, 10, 2)),
    'gamma': [0.1, 0.2, 0.3, 0.4, 0.5, 0.6],
}

com_ps_sh = list(ParameterGrid(ps_sh))


all_params_sh = [base_param_sh.copy() for _ in range(len(com_ps_sh))] 
for i in range(len(com_ps_sh)):
    all_params_sh[i].update(com_ps_sh[i])

# print(com_ps_sh)
print(all_params_sh.__len__())

120


In [550]:
t0 = time()
xg_train_sh = xgb.DMatrix(X_train_sh.values, label=y_train_sh.values, enable_categorical=True)
xg_test_sh = xgb.DMatrix(X_test_sh.values, label=y_test_sh.values, enable_categorical=True)
print(f"Data preparing finished ...\t\t({time()-t0:.3f}s)")

Data preparing finished ...		(24.924s)


In [None]:
gridsearch_results_sh = gridsearch_xgb(all_params_sh, xg_train_sh, xg_test_sh, num_round=150, n_class=2, verbose_eval=False)

In [None]:
gs_cv_results_sh = gridsearch_cv_xgb(data_sh.values, is_share_res.values, all_params_sh, n_round=150, verbose_eval=False, n_class=2)

In [747]:
performance = []
classes = [0, 1]
for ret in gs_cv_results_sh:
    err = sum([e['test_error'] for e in ret[1:]]) / len(ret[1:])
    aucs = np.array([e['aucs'] for e in ret[1:]])
    aucs = np.sum(aucs, axis=0) / len(ret[1:])
    
    df = None
    for e in ret[1:]:   
        t_df = report_2_df(e['report'])
        if df is not None:
            df += t_df
        else:
            df = t_df
        
    df /= len(ret[1:])
    performance.append((
        ret[0],
        {'mean_test_error': err, 'mean_auc': aucs[:2], 'report': df},
    ))

performance

[({'objective': 'binary:hinge',
   'eta': 0.1,
   'nthread': 8,
   'gpu_id': 0,
   'tree_method': 'gpu_hist',
   'gamma': 0.1,
   'max_depth': 5,
   'min_child_weight': 1},
  {'mean_test_error': 0.38277045475107274,
   'mean_auc': array([0.61766193, 0.61766193]),
   'report':               precision    recall  f1-score  support
   0              0.634445  0.601760  0.617489   3023.8
   1              0.601125  0.633564  0.616760   2863.8
   accuracy            NaN       NaN  0.617230   5887.6
   macro avg      0.617785  0.617662  0.617125   5887.6
   weighted avg   0.618238  0.617230  0.617134   5887.6}),
 ({'objective': 'binary:hinge',
   'eta': 0.1,
   'nthread': 8,
   'gpu_id': 0,
   'tree_method': 'gpu_hist',
   'gamma': 0.1,
   'max_depth': 5,
   'min_child_weight': 3},
  {'mean_test_error': 0.38178523884609417,
   'mean_auc': array([0.61874936, 0.61874936]),
   'report':               precision    recall  f1-score  support
   0              0.636618  0.599047  0.617017   3023.8
 

In [753]:
with open('greadsearch-cv-is_share.md', 'w') as f:
    for ret in performance:
        f.write(f"# {', '.join([f'{k}={v}' for k, v in ret[0].items()])}\n")
        for k, v in ret[1].items():
            is_break = '\n' if k == 'report' else ''
            f.write(f"- {k} :{is_break} {v}\n\n")
        f.write(f"{'-'*50}\n\n\n")

In [765]:
mean_test_error = np.array([e[1]['mean_test_error'] for e in performance])
mean_test_error.argmin()

48

In [766]:
performance[48]

({'objective': 'binary:hinge',
  'eta': 0.1,
  'nthread': 8,
  'gpu_id': 0,
  'tree_method': 'gpu_hist',
  'gamma': 0.3,
  'max_depth': 7,
  'min_child_weight': 7},
 {'mean_test_error': 0.3759764341197628,
  'mean_auc': array([0.62465696, 0.62465696]),
  'report':               precision    recall  f1-score  support
  0              0.643353  0.601364  0.621551   3023.8
  1              0.606309  0.647950  0.626353   2863.8
  accuracy            NaN       NaN  0.624024   5887.6
  macro avg      0.624831  0.624657  0.623952   5887.6
  weighted avg   0.625335  0.624024  0.623887   5887.6})

In [617]:
arr = np.array([[-e['test_error'] for e in gridsearch_results_sh], [e['aucs'][1] for e in gridsearch_results_sh]], dtype=np.float32)
opt_idxs_sh = arr.argmax(axis=1)
if opt_idxs_sh[0] != opt_idxs_sh[1]:
    warnings.warn(f"最小误差与最大AUC对应的模型不一致 : {opt_idxs_sh}。选择误差最小的模型 : {opt_idxs_sh[0]}")

opt_idx_sh = opt_idxs_sh[0]
all_params_sh[opt_idx_sh]

{'objective': 'binary:hinge',
 'eta': 0.1,
 'nthread': 8,
 'gpu_id': 0,
 'tree_method': 'gpu_hist',
 'gamma': 0.3,
 'max_depth': 11,
 'min_child_weight': 9}

In [603]:
opt_idx_sh = opt_idxs_sh[0]
opt_idx_sh

146

In [619]:
print(gridsearch_results_sh[opt_idx_sh]['report'])

              precision    recall  f1-score   support

         0.0       0.54      0.44      0.49      3053
         1.0       0.50      0.60      0.54      2835

    accuracy                           0.52      5888
   macro avg       0.52      0.52      0.51      5888
weighted avg       0.52      0.52      0.51      5888



In [620]:
pd.DataFrame(gridsearch_results_sh)

Unnamed: 0,test_error,aucs,w_auc,report,model
0,0.503057,"[0.50379590202715, 0.50379590202715]",,precision recall f1-score ...,<xgboost.core.Booster object at 0x7f35ba907130>
1,0.496943,"[0.508734635779073, 0.508734635779073]",,precision recall f1-score ...,<xgboost.core.Booster object at 0x7f32e4009580>
2,0.501189,"[0.5048543919272165, 0.5048543919272165]",,precision recall f1-score ...,<xgboost.core.Booster object at 0x7f3583f4f4f0>
3,0.502717,"[0.502360357955947, 0.502360357955947]",,precision recall f1-score ...,<xgboost.core.Booster object at 0x7f3574279550>
4,0.499321,"[0.5061017844072763, 0.5061017844072763]",,precision recall f1-score ...,<xgboost.core.Booster object at 0x7f32e4082730>
...,...,...,...,...,...
115,0.498132,"[0.5057747576472328, 0.5057747576472328]",,precision recall f1-score ...,<xgboost.core.Booster object at 0x7f32468a4640>
116,0.497962,"[0.5046665869463117, 0.5046665869463118]",,precision recall f1-score ...,<xgboost.core.Booster object at 0x7f32468a4850>
117,0.499490,"[0.5026636996830249, 0.5026636996830249]",,precision recall f1-score ...,<xgboost.core.Booster object at 0x7f32468a46a0>
118,0.491678,"[0.5107513874519006, 0.5107513874519005]",,precision recall f1-score ...,<xgboost.core.Booster object at 0x7f32468a4670>


## LR

In [621]:
from sklearn.linear_model import LogisticRegression

In [624]:
clf = LogisticRegression(random_state=0)
clf.fit(X_train_sh, y_train_sh)
clf.score(X_test_sh, y_test_sh)

0.5185122282608695

# 预测

In [582]:
test = inference_dataset
test = xgb.DMatrix(test.values, enable_categorical=True)

In [591]:
inference_dataset.shape, test.num_col()

((2822180, 70), 70)

In [590]:
bst_wl = gridsearch_results[opt_idx]['model']  # wl_bst_sm
bst_sh = gridsearch_results_sh[opt_idx_sh]['model']  # sh_bst_sm

In [592]:
wl = bst_wl.predict(test)
sh = bst_sh.predict(test)
Counter(wl), Counter(Counter(sh))

(Counter({1.0: 1909048,
          0.0: 371933,
          9.0: 510899,
          8.0: 449,
          2.0: 25453,
          4.0: 286,
          3.0: 2141,
          6.0: 1804,
          5.0: 151,
          7.0: 16}),
 Counter({1.0: 1041974, 0.0: 1780206}))

In [594]:
test_df['watch_label'] = wl.astype(np.uint8)
test_df['is_share'] = sh.astype(np.uint8)
test_df.shape

(2822180, 4)

In [665]:
fn = f'../submission-{int(time())}.csv'
test_df.to_csv(fn, index=False, sep=",")
print(f"new submission saved to {fn}")

new submission saved to ../submission-1625568173.csv


In [593]:
tdf = pd.read_csv('../submission-1625404864.csv')
tdf.shape

(2822180, 4)

In [663]:
widx = test_df['watch_label'] != tdf['watch_label']
sidx = test_df['is_share'] != tdf['is_share']

In [664]:
widx.sum(), sidx.sum()

(913388, 0)

# 保存模型

In [625]:
wl_model_name = 'wl_model_v10'
sh_model_name = 'sh_model_v10'
bst_wl.save_model(wl_model_name)
bst_sh.save_model(sh_model_name)

In [636]:
def write_log(log_name, info, log_path="./"):
    import datetime
    with open(os.path.join(log_path, log_name), 'w') as log:
        log.write(f"# {datetime.datetime.now().__str__()}\n")
        if info.get('comment', False):
            log.write(f"\n## Comment: \n")
            log.write(f"{info['comment']}\n")
            
        log.write(f"\n## model name: {info['model_name']}\n")
        log.write(f"- model save path : {info['model_save_path']}\n")
        
        log.write(f"\n## Data setup\n")
        log.write(f"- dataset.shape : {dataset.shape}\n")
        log.write(f"- dataset.columns : {dataset.columns}\n")
        log.write(f"- is resample : {info['is_resample']}\n")
        log.write(f"- Traing_Data.shape (watch_label)  : {X_train.shape}\n")
        log.write(f"- Testing_Data.shape (watch_label) : {X_test.shape}\n")
        log.write(f"- Traing_Data.shape (is_share)  : {X_train_sh.shape}\n")
        log.write(f"- Testing_Data.shape (is_share) : {X_test_sh.shape}\n")
        if info.get('is_resample', False):
            log.write(f"- Resampled class distribution (watch_label): \n{Counter(resampled_wl)}\n")
            log.write(f"- Resampled class distribution (is_share): \n{Counter(resampled_sh)}\n")
            
        log.write(f"\n## Model Params\n")
        log.write(f"- model params (watch_label) : \n{info['param_wl']}\n")
        log.write(f"- model params (is_share) : \n{info['param_sh']}\n")
        
        log.write(f"\n## Model's Performance\n")
        log.write(f"- Aucs (watch_label) : {info['aucs']}\n")
        log.write(f"- Weighted Aucs (watch_label) : {info['w_auc']}\n")
        log.write(f"- Aucs (is_share) : {info['aucs_sh']}\n")
        
        log.write(f"- Classification Report (watch_label) : \n\n{info['report']}\n")
        log.write(f"- Classification Report (is_share) : \n\n{info['report_sh']}\n")
        
        log.flush()
        
    pass

In [637]:
param_wl = all_params[opt_idx]
param_sh = all_params[opt_idx_sh]

aucs = gridsearch_results[opt_idx]['aucs']
w_auc = gridsearch_results[opt_idx]['w_auc']
aucs_sh = gridsearch_results_sh[opt_idx]['aucs']

report = gridsearch_results[opt_idx]['report']
report_sh = gridsearch_results_sh[opt_idx]['report']

In [666]:
log_name = "log_v10.2.md"
info = {'is_resample': True, 'model_name': [wl_model_name, sh_model_name], 'model_save_path': os.getcwd(),
        'comment': "将submission-1625555134.csv中的is_share替换为submission-1625404864.csv中的is_share，保留watch_label。\nwatch_label的预测：基础特征，is_share的预测：基础特征。\n此次生成的提交是：submission-1625568173.csv。官方测评得分：xxx😐",
        'param_wl': param_wl, 'param_sh': param_sh, 'aucs': aucs, 'w_auc': w_auc, 'aucs_sh': aucs_sh, 
        'report': report, 'report_sh': report_sh}
write_log(log_name, info)

# 服务器间同步文件

## 推向Digix服务器

In [648]:
!scp ./models.ipynb digix@49.123.120.71:/home/digix/digix/Models/models.ipynb 

models.ipynb                                  100%  132KB   9.2MB/s   00:00    


In [599]:
!scp ./log_*.md digix@49.123.120.71:/home/digix/digix/Models/

ssh: connect to host 49.123.120.71 port 22: No route to host
lost connection


In [472]:
!scp ../explore-data.ipynb digix@49.123.120.71:/home/digix/digix/explore-data.ipynb 

explore-data.ipynb                            100%  306KB  10.6MB/s   00:00    


In [650]:
!scp ../2021_3_data/traindata/video_features_data/video_features.jay digix@49.123.120.71:/home/digix/digix/dataset/new_video_features.jay

video_features.jay                            100% 9035KB  11.1MB/s   00:00    


## 从Digix服务器拉数据

In [1]:
!scp  digix@49.123.120.71:/home/digix/digix/Models/LightGBM.ipynb ./LightGBM.ipynb

LightGBM.ipynb                                100%   71KB   2.2MB/s   00:00    


In [2]:
!scp  digix@49.123.120.71:/home/digix/digix/Models/feature_engineering.ipynb ./feature_engineering.ipynb

scp: /home/digix/digix/Models/feature_engineering.ipynb: No such file or directory


In [3]:
!scp -r digix@49.123.120.71:/home/digix/digix/Models/Feature_Engineering/  ./

utils.py                                      100% 3860     2.6MB/s   00:00    
data_analysis.ipynb                           100% 6566KB  11.2MB/s   00:00    
__init__.py                                   100%    0     0.0KB/s   00:00    
__init__.cpython-36.pyc                       100%  139   128.3KB/s   00:00    
utils.cpython-36.pyc                          100% 4120     2.6MB/s   00:00    
video_data.ipynb                              100%   55KB   1.7MB/s   00:00    
user_data-checkpoint.ipynb                    100%  202KB  10.1MB/s   00:00    
data_analysis-checkpoint.ipynb                100% 6554KB  11.0MB/s   00:00    
utils-checkpoint.py                           100% 3860     2.4MB/s   00:00    
video_data-checkpoint.ipynb                   100%   17KB   1.4MB/s   00:00    
user_data.ipynb                               100%  202KB  10.3MB/s   00:00    


In [168]:
!scp -r digix@49.123.120.71:/home/digix/digix/dataset/traindata/video_features_data/video_status.csv ../2021_3_data/traindata/video_features_data/

video_status.csv                              100% 2008KB   9.1MB/s   00:00    
user_status.csv                               100%  138MB   9.3MB/s   00:14    


In [279]:
!scp -r digix@49.123.120.71:/home/digix/digix/dataset/traindata/user_features_data/user_status.csv ../2021_3_data/traindata/user_features_data/

user_status.csv                               100%  168MB  11.2MB/s   00:14    
