In [267]:
import numpy as np
from sklearn import metrics
import os
from sklearn.model_selection import train_test_split
from sklearn import metrics
from time import time
import pandas as pd
import xgboost as xgb
from collections import Counter
from imblearn.over_sampling import SMOTE
from imblearn.under_sampling import NearMiss
import datatable as dt

# 给定预测标签，计算AUC
使用OVR的策略计算每个类别的AUC
过程：
- 选择类别i作为正类，其他类别作为负类
- 将真实标签中不等于i的标记为0，等于i的标记为1
- 将预测标签中不等于i的标记为0，等于ide标记为1
- 计算混淆矩阵
- 计算(fpr, tpr)
- 计算AUC

In [205]:
y = np.random.randint(0, 10, 100)
p = np.random.randint(0, 10, 100)

In [30]:
def confusion_matrix(label, predict, n):
    """
    计算混淆矩阵
    :param label: 标签，np.array类型。形状可以是(n_sample,) 或者 (n_sample, n_classes)，当为第二种形状时可以表示多标签分类的情况
    :param predict: 预测值，与 `label` 同理
    :param n: 类别数目
    :return: 混淆矩阵，np.array类型。shape 为 (n, n)。$cm_{ij}$表示真实标签为 $i$，预测标签为 $j$ 的样本个数
    """
    k = (label >= 0) & (label < n)
    # bincount()函数用于统计数组内每个非负整数的个数
    # 详见 https://docs.scipy.org/doc/numpy/reference/generated/numpy.bincount.html
    return np.bincount(n * label[k].astype(int) + predict[k], minlength=n ** 2).reshape(n, n)


def auc(y, p, classes):
    """
    给定真实标签和预测标签，计算每个类别的auc值。实际只算出了roc曲线上一个点，即一个(fpr, tpr)，再并上(0, 0)和(1, 1)来计算auc
    :param y: 标签，np.array类型
    :param p: 预测标签，np.array类型
    :param classes: 类别，list-like，表示有哪些类别
    """
    all_aucs = np.zeros(len(classes))
    for i, c in enumerate(classes):
        _y = np.zeros_like(y)
        _y[y==c] = 1
        _y[y!=c] = 0
        _p = np.zeros_like(p)
        _p[p==c] = 1
        _p[p!=c] = 0
#         print(_y, _p)
        cm = confusion_matrix(_y, _p, 2)
#         print(cm)
        tpr = (cm[0, 0] / (cm[0, 0] + cm[0, 1])) if (cm[0, 0] + cm[0, 1]) != 0 else 0
        fpr = (cm[1, 0] / (cm[1, 0] + cm[1, 1])) if (cm[1, 0] + cm[1, 1]) != 0 else 0
        tpr = [0, tpr, 1]
        fpr = [0, fpr, 1]
        auc = metrics.auc(fpr, tpr)
        all_aucs[i] = auc
        if _y.sum() == 0 or _p.sum() == 0:
            all_aucs[i] = 0
    return all_aucs

In [12]:
classes = list(range(10))
weights = np.arange(0, 1, 0.1)
all_aucs = auc(y, p, classes)

weighted_auc = (all_aucs * weights).sum()
print(f"{all_aucs}\n{weighted_auc}")

[0.5298913  0.65555556 0.52304147 0.50747508 0.52445652 0.58219623
 0.57264957 0.46842105 0.53379416 0.50795756]
2.3789687141650595


In [13]:
classes = list(range(2))
y = np.array([0, 0, 1, 1])
p = np.array([0, 1, 0, 1])
all_aucs = auc(y, p, classes)

print(f"{all_aucs}")

[0.5 0.5]


# 加载数据
训练数据加载过程：
1. 分别加载处理好的用户特征和视频特征，以及整合的用户历史行为数据；
2. 从用户历史行为数据中筛掉在视频特征中没出现过的video_id；
3. 将行为数据中的user_id、video_id替换为对应用户/视频的特征
4. 根据不同的任务划分为`watch_label`、`is_share`的数据集

推断时，类似于上述过程拼接数据。

In [799]:
from datatable import join

def load_npz(path):
    npz = np.load(path, allow_pickle=True)
    return npz


def load_table(path, ftype="csv", data_name="data", column_name="columns"):
    if ftype == "npz":
        npz = load_npz(path)
        tab = pd.DataFrame(npz[data_name], columns=column_name)
    elif ftype == "jay":
        tab = dt.fread(path)
    elif ftype == "csv":
        tab = pd.read_csv(path)
        
    return tab

        
def merge_user_video_action(user, video, action, return_others=False):
    """
    将用户特征矩阵、视频特征矩阵、行为拼接起来
    """
    tab_user = dt.fread(user) if isinstance(user, str) else dt.Frame(user)
    tab_video = dt.fread(video) if isinstance(video, str) else dt.Frame(video)
    tab_act = dt.fread(action) if isinstance(action, str) else dt.Frame(action)
    
    tab_user.key = 'user_id'
    tab_act_user = tab_act[:, :, join(tab_user)]
    tab_video.key = 'video_id'
    tab_act_user_video = tab_act_user[:, :, join(tab_video)]
    
    if not return_others:
        return tab_act_user_video 
    else:
        return tab_act_user_video, {"user": tab_user, "video": tab_video, "action": tab_act}



def load_train_test_data(path=None, pre_merged=True, return_others=False, **kwargs):
    """
    读取保存的训练数据
    """ 
    if pre_merged:
        assert path is not None
        tab = dt.fread(path)
#     del tab[:, ['video_id', 'user_id']]
        return tab
    else:
        p_user = kwargs.get('p_user')
        p_video = kwargs.get('p_video')
        p_action = kwargs.get('p_action')
        
        if return_others:
            tab, others = merge_user_video_action(p_user, p_video, p_action, return_others=True)
            return tab, others
        else:
            tab = merge_user_video_action(p_user, p_video, p_action)
            return tab


def read_npz_to_df(path, data_name='data', column_name='columns'):
    npz = np.load(path, allow_pickle=True)
    df = pd.DataFrame(npz[data_name], columns=npz[column_name])
    
    return df

## 读取数据

In [2]:
base_dir = "../2021_3_data"
test_data_dir  = os.path.join(base_dir, "testdata")
train_data_dir = os.path.join(base_dir, "traindata")

### 通过 .npz 读取数据

#### 单表读取后合并

In [795]:
%%time
# 单独读取每个文件再进行合并
user_df = read_npz_to_df(os.path.join(train_data_dir, "user_features_data/user_features.npz"), data_name='features', column_name='columns')
video_df = read_npz_to_df(os.path.join(train_data_dir, "video_features_data/video_features.npz"), data_name='features')
action_df = read_npz_to_df(os.path.join(train_data_dir, "all_actions.npz"), data_name='data')
action_df

CPU times: user 3.41 s, sys: 3.35 s, total: 6.76 s
Wall time: 12.2 s


Unnamed: 0,user_id,video_id,is_watch,is_share,watch_label
0,4239342,28149,1,0,2
1,3577036,115,1,0,0
2,5527504,3636,1,0,5
3,1117889,12968,1,0,0
4,1117889,860,1,0,4
...,...,...,...,...,...
7353019,1073806,39040,1,0,0
7353020,1073806,30641,1,0,0
7353021,1073806,29794,1,0,8
7353022,1673305,12968,1,0,0


In [797]:
# 因为将字符串保存到 .npz时会使dtype为object，重新读回DataFrame时各个列的数据类型均为 object，所以先转换类型
dtypes = dict(zip(video_df.columns, [np.float32] * video_df.shape[1]))
dtypes.update({'video_name': np.str})
video_df = video_df.astype(dtypes)

In [802]:
%%time
# 合并各个表
df_train = merge_user_video_action(user_df, video_df, action_df)
df_train.shape

CPU times: user 1min 29s, sys: 5.88 s, total: 1min 35s
Wall time: 40.1 s


(7353024, 76)

In [805]:
np.savez(os.path.join(train_data_dir, "train.npz"), data=df_train.to_pandas().values, columns=df_train.to_pandas().columns.tolist())

In [777]:
%%time
test_df = load_table(os.path.join(test_data_dir, "test.csv"), ftype="csv")

CPU times: user 626 ms, sys: 0 ns, total: 626 ms
Wall time: 721 ms


In [779]:
%%time
df_test = merge_user_video_action(user_df, video_df, test_df)
df_test.shape

CPU times: user 3min 8s, sys: 12.8 s, total: 3min 20s
Wall time: 51.2 s


(2822180, 73)

#### 读取合并好的数据

In [806]:
%%time
# 直接读取保存好的合并后的训练数据
path = os.path.join(train_data_dir, "train.npz")
df_train = read_npz_to_df(path, data_name='data')
df_train.shape

CPU times: user 3min 2s, sys: 38.4 s, total: 3min 40s
Wall time: 3min 41s


(7353024, 76)

In [810]:
%%time
# 直接读取保存好的合并后的测试数据
path = os.path.join(test_data_dir, "test.npz")
df_test = read_npz_to_df(path, data_name='data')
df_test.shape

CPU times: user 1min 37s, sys: 36.9 s, total: 2min 14s
Wall time: 5min 49s


(2822180, 73)

### 通过 .jay 文件读取

#### 单表读取后合并

In [811]:
%%time
## 使用datatable 加载训练数据
p_user = os.path.join(train_data_dir, "user_features_data/user_features.jay")
p_video = os.path.join(train_data_dir, "video_features_data/video_features.jay")
p_act = os.path.join(train_data_dir, "all_actions.jay")

df_train, others = load_train_test_data(None, pre_merged=False, return_others=True,
                           **{"p_user": p_user, "p_video": p_video, "p_action": p_act})
user_df = others['user']
video_df = others['video']
action_df = others['action']
df_train.shape

CPU times: user 1min 9s, sys: 8.9 s, total: 1min 18s
Wall time: 33.7 s


(7353024, 76)

In [812]:
%%time
p_user = os.path.join(train_data_dir, "user_features_data/user_features.jay")
p_video = os.path.join(train_data_dir, "video_features_data/video_features.jay")
p_act = os.path.join(test_data_dir, "test.csv")

path = os.path.join(test_data_dir, "test.jay")
kwargs = {"p_user": p_user, "p_video": p_video, "p_action": p_act}

df_test, others = load_train_test_data(None, pre_merged=False, return_others=True, **kwargs)
test_df = others['action']
df_test.shape

CPU times: user 1min 36s, sys: 9.11 s, total: 1min 45s
Wall time: 26 s


(2822180, 73)

In [814]:
action_df = action_df.to_pandas()
user_df = user_df.to_pandas()
video_df = video_df.to_pandas()
test_df = test_df.to_pandas()

#### 读取合并好后的数据

In [None]:
%%time
# 直接读取保存好的合并后的训练数据
path = os.path.join(train_data_dir, "train.jay")
tab_train = load_train_test_data(path, pre_merged=True)
tab_train.shape

In [None]:
%%time
# 直接读取保存好的合并后的测试数据
path = os.path.join(test_data_dir, "test.jay")
tab_train = load_train_test_data(path, pre_merged=True)
tab_test.shape

## 处理训练数据
可在此做一些预处理：
- 从用户历史行为数据中筛掉在视频特征中没出现过的video_id
- 删除多余的列
- 调整列的顺序
- 改变列的数据类型


In [817]:
if isinstance(df_train, dt.Frame):
    df_train = df_train.to_pandas()
if isinstance(df_test, dt.Frame):
    df_test = df_test.to_pandas()

In [823]:
df_train.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 7353024 entries, 0 to 7353023
Data columns (total 76 columns):
 #   Column               Dtype  
---  ------               -----  
 0   user_id              int64  
 1   video_id             int64  
 2   is_watch             int64  
 3   is_share             int64  
 4   watch_label          int64  
 5   age_0                float64
 6   age_1                float64
 7   age_2                float64
 8   age_3                float64
 9   age_4                float64
 10  age_5                float64
 11  age_6                float64
 12  age_7                float64
 13  gender_0             float64
 14  gender_1             float64
 15  gender_2             float64
 16  gender_3             float64
 17  city_level_0         float64
 18  city_level_1         float64
 19  city_level_2         float64
 20  city_level_3         float64
 21  city_level_4         float64
 22  city_level_5         float64
 23  city_level_6         float64
 24

In [825]:
# 删除 video_name、is_watch 列
df_train.drop(['video_name', 'is_watch'], axis=1, inplace=True)

In [826]:
# 删除 video_id、user_id列
user_video_action.drop(['user_id', 'video_id'], axis=1, inplace=True)

In [815]:
action_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 7353024 entries, 0 to 7353023
Data columns (total 5 columns):
 #   Column       Dtype
---  ------       -----
 0   user_id      int64
 1   video_id     int64
 2   is_watch     int64
 3   is_share     int64
 4   watch_label  int64
dtypes: int64(5)
memory usage: 280.5 MB


In [816]:
# 从用户历史行为数据中筛掉在视频特征中没出现过的video_id
idx1 = pd.Index(action_df['video_id'].unique())
idx2 = pd.Index(video_df['video_id'])
not_exists = idx1.difference(idx2)
not_exists

Int64Index([   15,   144,   428,   497,   876,  1174,  2127,  2199,  2334,
             3153,
            ...
            48069, 48269, 48343, 48626, 49103, 49241, 49404, 49419, 49793,
            50337],
           dtype='int64', length=243)

In [834]:
%%time
n = 0
for vid in not_exists:
    tn = (df_train['video_id'] == vid).sum()
    #     action_df = action_df[action_df['video_id'] != vid]
#     action_df['video_id'].replace(vid, np.nan, inplace=True)
    n += tn
print(n)

45006
CPU times: user 3.58 s, sys: 0 ns, total: 3.58 s
Wall time: 3.58 s


In [19]:
%%time
t0 = time()
n = 0
for vid in not_exists:
    tn = (action_df['video_id'] == vid).sum()
#     action_df = action_df[action_df['video_id'] != vid]
    action_df['video_id'].replace(vid, np.nan, inplace=True)
    n += tn
action_df.dropna(axis=0, inplace=True)
print(f"在视频特征中不存在的video_id在行为数据集中出现的次数 = {n}\t\t(cost {time() - t0:.3f}s)")

在视频特征中不存在的video_id在行为数据集中出现的次数 = 45006		(cost 10.650s)


In [827]:
dataset = df_train

## 处理测试数据

In [830]:
if 'test_df' not in dir():
    test_df = pd.read_csv(os.path.join(test_data_dir, "test.csv"))

In [831]:
test_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2822180 entries, 0 to 2822179
Data columns (total 2 columns):
 #   Column    Dtype
---  ------    -----
 0   user_id   int32
 1   video_id  int32
dtypes: int32(2)
memory usage: 21.5 MB


In [835]:
df_test.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2822180 entries, 0 to 2822179
Data columns (total 73 columns):
 #   Column               Dtype  
---  ------               -----  
 0   user_id              int32  
 1   video_id             int32  
 2   age_0                float64
 3   age_1                float64
 4   age_2                float64
 5   age_3                float64
 6   age_4                float64
 7   age_5                float64
 8   age_6                float64
 9   age_7                float64
 10  gender_0             float64
 11  gender_1             float64
 12  gender_2             float64
 13  gender_3             float64
 14  city_level_0         float64
 15  city_level_1         float64
 16  city_level_2         float64
 17  city_level_3         float64
 18  city_level_4         float64
 19  city_level_5         float64
 20  city_level_6         float64
 21  city_level_7         float64
 22  device_name_0        float64
 23  device_name_1        float64
 24

In [836]:
# 删除 video_name 列
df_test.drop('video_name', axis=1, inplace=True)

In [838]:
# 删除 video_id、user_id 列
df_test.drop(['user_id', 'video_id'], axis=1, inplace=True)

In [832]:
# 测试数据集中存在video_id没有在视频特征中出现
idx1 = pd.Index(test_df['video_id'].unique())
idx2 = pd.Index(video_df['video_id'].unique())
non_exists = idx1.difference(idx2)
non_exists

Int64Index([   15,   144,   428,   497,   876,  1174,  1589,  1906,  2127,
             2199,
            ...
            47945, 48069, 48269, 48343, 48626, 49241, 49404, 49419, 49793,
            50337],
           dtype='int64', length=276)

In [833]:
%%time
t0 = time()
n = 0
for vid in not_exists:
    tn = (test_df['video_id'] == vid).sum()
#     action_df = action_df[action_df['video_id'] != vid]
    n += tn

print(f"在视频特征中不存在的video_id在测试数据集中出现的次数 = {n}\t\t(cost {time() - t0:.3f}s)")

在视频特征中不存在的video_id在测试数据集中出现的次数 = 22038		(cost 0.828s)
CPU times: user 373 ms, sys: 457 ms, total: 830 ms
Wall time: 829 ms


In [839]:
inference_dataset = df_test

In [840]:
inference_dataset.shape

(2822180, 70)

# XGBoost

In [None]:
def train_xgb(X_train, y_train, params):
    xg_train = xgb.DMatrix(X_train.values, label=y_train.values, enable_categorical=True)
    xg_test = xgb.DMatrix(X_test.values, label=y_test.values, enable_categorical=True)
    
    watchlist = [(xg_train, 'train'), (xg_test, 'test')]
    num_round = 5
    # train xgb
    bst = xgb.train(_param, xg_train, num_round, watchlist)
    # get prediction
    pred = bst.predict(xg_test)
    error_rate = np.sum(pred != y_test) / test_y.shape[0]
    print('Test error using softmax = {}'.format(error_rate))

    # do the same thing again, but output probabilities
    param['objective'] = 'multi:softprob'
    bst = xgb.train(param, xg_train, num_round, watchlist)
    # Note: this convention has been changed since xgboost-unity
    # get prediction, this is in 1D array, need reshape to (ndata, nclass)
    pred_prob = bst.predict(xg_test).reshape(y_test.shape[0], 6)
    pred_label = np.argmax(pred_prob, axis=1)
    error_rate = np.sum(pred_label != y_test) / y_test.shape[0]
    print('Test error using softprob = {}'.format(error_rate))


In [841]:
# 准备数据
watch_label = dataset.pop('watch_label').astype(np.uint8)
is_share = dataset.pop('is_share').astype(np.uint8)

## watch_label 预测

### 处理数据不均衡问题

In [842]:
items = list(Counter(watch_label).items())
items.sort(key=lambda x: x[0])
print(items)

dist = np.array(items, dtype=np.float)
dist[:, 1] = dist[:, 1] / watch_label.shape[0]
print(dist)

[(0, 5176743), (1, 557421), (2, 314107), (3, 219188), (4, 172404), (5, 143001), (6, 125092), (7, 117749), (8, 138798), (9, 388521)]
[[0.         0.70402912]
 [1.         0.0758084 ]
 [2.         0.04271807]
 [3.         0.02980923]
 [4.         0.02344668]
 [5.         0.01944792]
 [6.         0.01701232]
 [7.         0.01601368]
 [8.         0.01887632]
 [9.         0.05283826]]


In [843]:
under_ss = np.array(items)
under_ss_thresh = under_ss[3, 1]  # 设置每个类别样本数目的上限
under_ss[:, 1] = np.clip(under_ss[:, 1], a_min=None, a_max=under_ss_thresh)

over_ss = under_ss.copy()
over_ss_thresh = under_ss[2, 1]  # 设置每个类别样本数据的下限
over_ss[:, 1] = np.clip(over_ss[:, 1], a_min=over_ss_thresh, a_max=None)

under_ss = dict(under_ss)
over_ss = dict(over_ss)

In [844]:
under_ss, over_ss

({0: 219188,
  1: 219188,
  2: 219188,
  3: 219188,
  4: 172404,
  5: 143001,
  6: 125092,
  7: 117749,
  8: 138798,
  9: 219188},
 {0: 219188,
  1: 219188,
  2: 219188,
  3: 219188,
  4: 219188,
  5: 219188,
  6: 219188,
  7: 219188,
  8: 219188,
  9: 219188})

In [845]:
idxs = watch_label == 0
idxs = idxs.replace(False, np.nan).dropna().index  # 保留watch_label=0的行索引
idxs.shape

(5176743,)

In [846]:
left_idxs = np.random.choice(idxs, under_ss_thresh, replace=False)  # 选择一部分保留，注意replace参数，为True时会重复采样
del_idxs = idxs.difference(left_idxs)
del_idxs.shape, left_idxs.shape

((4957555,), (219188,))

In [445]:
Counter(watch_label)

Counter({2: 312266,
         0: 5146219,
         5: 142012,
         4: 171292,
         1: 554320,
         9: 385082,
         3: 217820,
         8: 137834,
         7: 116928,
         6: 124245})

In [847]:
resampled_data = np.delete(dataset.values, del_idxs, axis=0)
resampled_wl = np.delete(watch_label.values, del_idxs, axis=0)
resampled_data.shape, resampled_wl.shape

((2395469, 72), (2395469,))

In [447]:
Counter(resampled_wl)

Counter({2: 312266,
         5: 142012,
         4: 171292,
         1: 554320,
         9: 385082,
         3: 217820,
         0: 385082,
         8: 137834,
         7: 116928,
         6: 124245})

In [273]:
# 速度太慢，难以忍受！
nm  = TomekLinks(sampling_strategy=under_ss)
smt = SMOTE(sampling_strategy=over_ss)

In [None]:
t0 = time()
X_r, y_r = nm.fit_resample(resampled_data, pd.Series(resampled_wl))
print(f"Under Sampling finished ...\t\t({time()-t0:.3f}s)")

In [None]:
X_r, y_r = smt.fit_resample(X_r, y_r)

In [848]:
# 将采样后的数据重装回 DataFrame
data = pd.DataFrame(resampled_data, columns=dataset.columns)
watch_label_res = pd.Series(resampled_wl)
data.shape, watch_label.shape

((2395469, 72), (7353024,))

In [849]:
train_idx, test_idx = train_test_split(data.index, test_size=0.2, random_state=0)
train_idx.shape, test_idx.shape

((1916375,), (479094,))

### 训练模型

In [850]:
X_train = data.iloc[train_idx]
X_test  = data.iloc[test_idx]

In [851]:
y_train = watch_label_res.iloc[train_idx]
y_test  = watch_label_res.iloc[test_idx]

In [852]:
X_train.shape, y_train.shape, X_test.shape, y_test.shape

((1916375, 72), (1916375,), (479094, 72), (479094,))

In [853]:
t0 = time()
xg_train = xgb.DMatrix(X_train.values, label=y_train.values, enable_categorical=True)
xg_test = xgb.DMatrix(X_test.values, label=y_test.values, enable_categorical=True)
print(f"Data preparing finished ...\t\t({time()-t0:.3f}s)")

Data preparing finished ...		(1.896s)


In [854]:
# setup parameters for xgboost
param = {}
# use softmax multi-class classification
param['objective'] = 'multi:softmax'
# scale weight of positive examples
param['eta'] = 0.1
param['max_depth'] = 11
param['min_child_weight'] = 7
param['nthread'] = 8
param['num_class'] = 10
param['gpu_id'] = 0
param['tree_method'] = 'gpu_hist'
# param['scale_pos_weight'] = 2
watchlist = [(xg_train, 'train'), (xg_test, 'test')]

In [855]:
num_round = 200
t0 = time()
wl_bst_sm = xgb.train(param, xg_train, num_round, watchlist)
print(f"{num_round}-rounds Training finished ...\t\t({time()-t0:.3f}s)")

[0]	train-mlogloss:2.27337	test-mlogloss:2.27422
[1]	train-mlogloss:2.24917	test-mlogloss:2.25083
[2]	train-mlogloss:2.22867	test-mlogloss:2.23111
[3]	train-mlogloss:2.21100	test-mlogloss:2.21427
[4]	train-mlogloss:2.19564	test-mlogloss:2.19971
[5]	train-mlogloss:2.18217	test-mlogloss:2.18702
[6]	train-mlogloss:2.17038	test-mlogloss:2.17598
[7]	train-mlogloss:2.16001	test-mlogloss:2.16637
[8]	train-mlogloss:2.15074	test-mlogloss:2.15787
[9]	train-mlogloss:2.14235	test-mlogloss:2.15026
[10]	train-mlogloss:2.13481	test-mlogloss:2.14346
[11]	train-mlogloss:2.12806	test-mlogloss:2.13745
[12]	train-mlogloss:2.12209	test-mlogloss:2.13222
[13]	train-mlogloss:2.11661	test-mlogloss:2.12746
[14]	train-mlogloss:2.11156	test-mlogloss:2.12319
[15]	train-mlogloss:2.10697	test-mlogloss:2.11935
[16]	train-mlogloss:2.10281	test-mlogloss:2.11591
[17]	train-mlogloss:2.09906	test-mlogloss:2.11288
[18]	train-mlogloss:2.09562	test-mlogloss:2.11014
[19]	train-mlogloss:2.09247	test-mlogloss:2.10772
[20]	train

In [856]:
# get prediction
pred = wl_bst_sm.predict(xg_test)
# pred = pred.astype(np.uint8)
error_rate = np.sum(pred != y_test) / y_test.shape[0]
print('Test error using softmax = {}'.format(error_rate))

Test error using softmax = 0.7300216658943757


In [857]:
weights = np.arange(0, 1, 0.1)
aucs = auc(y_test.astype(np.uint8), pred.astype(np.uint8), np.arange(param['num_class']))
# aucs[aucs == 0.5] = 0
w_aucs = (aucs * weights).sum()
aucs, w_aucs

(array([0.57749827, 0.58572151, 0.50378435, 0.50053127, 0.50040545,
        0.50016001, 0.50280881, 0.50046443, 0.50977498, 0.60535577]),
 2.364381155534087)

In [858]:
report = metrics.classification_report(list(y_test), list(pred))

In [859]:
print(report)

              precision    recall  f1-score   support

           0       0.33      0.20      0.25     43915
           1       0.29      0.67      0.40    111680
           2       0.19      0.02      0.04     62504
           3       0.12      0.00      0.01     43621
           4       0.12      0.00      0.00     34605
           5       0.11      0.00      0.00     28676
           6       0.27      0.01      0.01     25168
           7       0.12      0.00      0.00     23377
           8       0.29      0.02      0.04     27486
           9       0.24      0.56      0.33     78062

    accuracy                           0.27    479094
   macro avg       0.21      0.15      0.11    479094
weighted avg       0.22      0.27      0.18    479094



### 调参

In [563]:
from xgboost import XGBClassifier
from sklearn import svm, datasets
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import ParameterGrid

In [549]:
base_param = {  # 基本参数，不需要调参
    'objective': 'multi:softmax',
    'eta': 0.1,
    'nthread': 8,
    'num_class': 10,
    'gpu_id': 0,
    'tree_method': 'gpu_hist'
} 
ps = {  # 需要调参的参数
    'max_depth': list(range(5, 14, 2)),
    'min_child_weight': list(range(1, 10, 2))
}

tmp = []
keys = list(ps.keys())
k = keys[0]
tmp.extend([[e] for e in ps[k].copy()])

# 将需要调参的参数进行组合，即笛卡尔乘积。类似于sklearn中的 ParameterGrid
for k in keys[1:]:
    v = ps[k].copy()
    l = len(tmp)
    tmp = [tmp[i%l].copy() for i in range(len(v) * len(tmp))]
    for i, e in enumerate(tmp):
        e.append(v[i // l])
        
com_ps = [dict(zip(keys, e)) for e in tmp]
# print(com_ps)
all_params = [base_param.copy() for _ in range(len(com_ps))] 
for i in range(len(com_ps)):
    all_params[i].update(com_ps[i])
    
print(all_params)

[{'objective': 'multi:softmax', 'eta': 0.1, 'max_depth': 5, 'nthread': 8, 'num_class': 10, 'gpu_id': 0, 'tree_method': 'gpu_hist', 'min_child_weight': 1}, {'objective': 'multi:softmax', 'eta': 0.1, 'max_depth': 7, 'nthread': 8, 'num_class': 10, 'gpu_id': 0, 'tree_method': 'gpu_hist', 'min_child_weight': 1}, {'objective': 'multi:softmax', 'eta': 0.1, 'max_depth': 9, 'nthread': 8, 'num_class': 10, 'gpu_id': 0, 'tree_method': 'gpu_hist', 'min_child_weight': 1}, {'objective': 'multi:softmax', 'eta': 0.1, 'max_depth': 11, 'nthread': 8, 'num_class': 10, 'gpu_id': 0, 'tree_method': 'gpu_hist', 'min_child_weight': 1}, {'objective': 'multi:softmax', 'eta': 0.1, 'max_depth': 5, 'nthread': 8, 'num_class': 10, 'gpu_id': 0, 'tree_method': 'gpu_hist', 'min_child_weight': 3}, {'objective': 'multi:softmax', 'eta': 0.1, 'max_depth': 7, 'nthread': 8, 'num_class': 10, 'gpu_id': 0, 'tree_method': 'gpu_hist', 'min_child_weight': 3}, {'objective': 'multi:softmax', 'eta': 0.1, 'max_depth': 9, 'nthread': 8, '

In [None]:
results = []
for i, p in enumerate(all_params):
    num_round = 250
    t0 = time()
    bst = xgb.train(p, xg_train, num_round, watchlist)
    print(f"{num_round}-rounds Training finished ...\t\t({time()-t0:.3f}s)")
    
    # get prediction
    pred = bst.predict(xg_test)
    # pred = pred.astype(np.uint8)
    error_rate = np.sum(pred != y_test) / y_test.shape[0]
    print('Test error using softmax = {}'.format(error_rate))
    
    weights = np.arange(0, 1, 0.1)
    aucs = auc(y_test.astype(np.uint8), pred.astype(np.uint8), np.arange(param['num_class']))
    # aucs[aucs == 0.5] = 0
    w_aucs = (aucs * weights).sum()
    aucs, w_aucs
    
    rep = metrics.classification_report(list(y_test), list(pred))
    results.append({
        'test_error': error_rate, 
        'aucs': aucs,
        'w_auc': w_aucs,
        'report': rep
        'model': bst
    })
    
    print(f"{i} : {num_round}-rounds Training finished ...\t\t({time()-t0:.3f}s)")

In [601]:
arr = np.array([[-e['test_error'] for e in results], [e['w_auc'] for e in results]], dtype=np.float32)
opt_idxs = arr.argmax(axis=1)
if opt_idxs[0] != opt_idxs[1]:
    raise ValueError(f"最小误差与最大AUC对应的模型不一致 : {opt_idxs}")
else:
    opt_idx = opt_idxs[0]

In [533]:
param_test1 = {
    'max_depth':range(3,10,2),
    'min_child_weight':range(1,6,2), 
}
param_test2 = {
'max_depth':[4,5,6],
'min_child_weight':[4,5,6]
}

clf = XGBClassifier(learning_rate =0.1, n_estimators=200, max_depth=5, min_child_weight=1, gamma=0, subsample=0.8,
        colsample_bytree=0.8, objective= 'multi:logistic', nthread=8, gpu_id=0, seed=27)

In [None]:
%%time
gsearch1 = GridSearchCV(estimator=clf, param_grid = param_test1, scoring='roc_auc_ovr', n_jobs=8, cv=5, verbose=3)
gsearch1.fit(data, watch_label_res)

In [540]:
gs1 = gsearch1

In [542]:
bst_clf = gs1.best_estimator_

In [545]:
wl_pred = bst_clf.predict(inference_dataset)

In [494]:
cv_data = xgb.DMatrix(data.values, label=watch_label_res.values, enable_categorical=True)

In [496]:
param = {}
# use softmax multi-class classification
param['objective'] = 'multi:softmax'
# scale weight of positive examples
param['eta'] = 0.1
param['max_depth'] = 8
param['nthread'] = 8
param['num_class'] = 10
# param['gpu_id'] = 0
# param['tree_method'] = 'gpu_hist'

In [None]:
%%time
cv_res= xgb.cv(param, cv_data, num_boost_round=200,early_stopping_rounds=30,nfold=3, metrics='auc',show_stdv=True)
print(cv_res)

## is_share 预测

### 处理数据不均衡问题

In [900]:
items = list(Counter(is_share).items())
items.sort(key=lambda x: x[0])
print(items)

dist = np.array(items, dtype=np.float)
dist[:, 1] = dist[:, 1] / is_share.shape[0]
print(dist)

[(0, 7338705), (1, 14319)]
[[0.         0.99805264]
 [1.         0.00194736]]


In [948]:
under_ss = np.array(items)
under_ss_thresh = under_ss[1, 1] + 800
under_ss[:, 1] = np.clip(under_ss[:, 1], a_min=None, a_max=under_ss_thresh)

over_ss = under_ss.copy()
over_ss_thresh = under_ss[1, 1]
over_ss[:, 1] = np.clip(over_ss[:, 1], a_min=over_ss_thresh, a_max=None)

under_ss = dict(under_ss)
over_ss = dict(over_ss)

In [949]:
under_ss, over_ss

({0: 15119, 1: 14319}, {0: 15119, 1: 14319})

In [950]:
idxs = is_share == 0
idxs = idxs.replace(False, np.nan).dropna().index  # 保留watch_label=0的行索引
idxs.shape

(7338705,)

In [951]:
left_idxs = np.random.choice(idxs, under_ss_thresh, replace=False)  # 选择一部分保留
del_idxs = idxs.difference(left_idxs)
del_idxs.shape, left_idxs.shape

((7323586,), (15119,))

In [952]:
resampled_data = np.delete(dataset.values, del_idxs, axis=0)
resampled_sh = np.delete(is_share.values, del_idxs, axis=0)
resampled_data.shape, resampled_sh.shape

((29438, 72), (29438,))

In [953]:
# 将采样后的数据重装会DataFrame
data_sh = pd.DataFrame(resampled_data, columns=dataset.columns)
is_share_res = pd.Series(resampled_sh)
data.shape, is_share.shape

((2395469, 72), (7353024,))

In [954]:
train_idx, test_idx = train_test_split(data_sh.index, test_size=0.2, random_state=1)
train_idx.shape, test_idx.shape

((23550,), (5888,))

### 训练模型

In [955]:
X_train_sh = data_sh.iloc[train_idx]
X_test_sh  = data_sh.iloc[test_idx]

In [956]:
y_train_sh = is_share_res.iloc[train_idx]
y_test_sh  = is_share_res.iloc[test_idx]

In [957]:
t0 = time()
xg_train_sh = xgb.DMatrix(X_train_sh.values, label=y_train_sh.values, enable_categorical=True)
xg_test_sh = xgb.DMatrix(X_test_sh.values, label=y_test_sh.values, enable_categorical=True)
print(f"Data preparing finished ...\t\t({time()-t0:.3f}s)")

Data preparing finished ...		(0.022s)


In [977]:
# setup parameters for xgboost
param_sh = {}
# use softmax multi-class classification
param_sh['objective'] = 'binary:hinge'
# scale weight of positive examples
param_sh['eta'] = 0.1
param_sh['max_depth'] = 6
param_sh['nthread'] = 4
param_sh['gpu_id'] = 0
param_sh['tree_method'] = 'gpu_hist'
# param_sh['min_child_weight'] = 7


watchlist = [(xg_train_sh, 'train'), (xg_test_sh, 'test')]

In [None]:
num_round = 300
t0 = time()
sh_bst_sm = xgb.train(param_sh, xg_train_sh, num_round, watchlist)
print(f"{num_round}-rounds Training finished ...\t\t({time()-t0:.3f}s)")

In [979]:
# get prediction
pred_sh = sh_bst_sm.predict(xg_test_sh)
error_rate = np.sum(pred_sh != y_test_sh) / y_test_sh.shape[0]
print('Test error using softmax = {}'.format(error_rate))

Test error using softmax = 0.37058423913043476


In [980]:
Counter(pred_sh), Counter(y_test_sh)

(Counter({0.0: 2882, 1.0: 3006}), Counter({0: 3058, 1: 2830}))

In [981]:
report_sh = metrics.classification_report(list(y_test_sh), list(pred_sh))
print(report_sh)

              precision    recall  f1-score   support

           0       0.65      0.61      0.63      3058
           1       0.61      0.65      0.63      2830

    accuracy                           0.63      5888
   macro avg       0.63      0.63      0.63      5888
weighted avg       0.63      0.63      0.63      5888



In [982]:
aucs_sh = auc(y_test_sh.astype(np.uint8), pred_sh.astype(np.uint8), [0, 1])
aucs_sh

array([0.63001847, 0.63001847])

### 调参

In [567]:
base_param_sh = {  # 基本参数，不需要调参
    'objective': 'binary:hinge',
    'eta': 0.1,
    'nthread': 8,
#     'num_class': 10,
    'gpu_id': 0,
    'tree_method': 'gpu_hist'
} 
ps_sh = {  # 需要调参的参数
    'max_depth': list(range(5, 13, 2)),
    'min_child_weight': list(range(1, 10, 2))
}

com_ps_sh = list(ParameterGrid(ps_sh))
print(com_ps_sh)

all_params_sh = [base_param_sh.copy() for _ in range(len(com_ps_sh))] 
for i in range(len(com_ps_sh)):
    all_params_sh[i].update(com_ps_sh[i])
    
print(all_params_sh)

[{'max_depth': 5, 'min_child_weight': 1}, {'max_depth': 5, 'min_child_weight': 3}, {'max_depth': 5, 'min_child_weight': 5}, {'max_depth': 5, 'min_child_weight': 7}, {'max_depth': 7, 'min_child_weight': 1}, {'max_depth': 7, 'min_child_weight': 3}, {'max_depth': 7, 'min_child_weight': 5}, {'max_depth': 7, 'min_child_weight': 7}, {'max_depth': 9, 'min_child_weight': 1}, {'max_depth': 9, 'min_child_weight': 3}, {'max_depth': 9, 'min_child_weight': 5}, {'max_depth': 9, 'min_child_weight': 7}, {'max_depth': 11, 'min_child_weight': 1}, {'max_depth': 11, 'min_child_weight': 3}, {'max_depth': 11, 'min_child_weight': 5}, {'max_depth': 11, 'min_child_weight': 7}]
[{'objective': 'binary:hinge', 'eta': 0.1, 'max_depth': 5, 'nthread': 8, 'gpu_id': 0, 'tree_method': 'gpu_hist', 'min_child_weight': 1}, {'objective': 'binary:hinge', 'eta': 0.1, 'max_depth': 5, 'nthread': 8, 'gpu_id': 0, 'tree_method': 'gpu_hist', 'min_child_weight': 3}, {'objective': 'binary:hinge', 'eta': 0.1, 'max_depth': 5, 'nthread

In [None]:
results_sh = []
num_round = 250
for i, p in enumerate(all_params):
    t0 = time()
    bst = xgb.train(p, xg_train_sh, num_round, watchlist)
    
    # get prediction
    pred = bst.predict(xg_test_sh)
    # pred = pred.astype(np.uint8)
    error_rate = np.sum(pred != y_test_sh) / y_test_sh.shape[0]
    
    aucs = auc(y_test_sh.astype(np.uint8), pred.astype(np.uint8), np.arange(2))
    
    rep = metrics.classification_report(list(y_test_sh), list(pred))
    results_sh.append({
        'test_error': error_rate, 
        'aucs': aucs,
        'report': rep,
        'model': bst
    })
    
    print(f"{i} : {num_round}-rounds Training finished error_rate={error_rate}  aucs={aucs}...\t\t({time()-t0:.3f}s)")

In [876]:
arr = np.array([[-e['test_error'] for e in results], [e['w_auc'] for e in results]], dtype=np.float32)
opt_idxs_sh = arr.argmax(axis=1)
if opt_idxs_sh[0] != opt_idxs_sh[1]:
    raise ValueError(f"最小误差与最大AUC对应的模型不一致 : {opt_idxs_sh}")
else:
    opt_idx_sh = opt_idxs_sh[0]
all_params_sh[opt_idx_sh]

{'objective': 'binary:hinge',
 'eta': 0.1,
 'max_depth': 11,
 'nthread': 8,
 'gpu_id': 0,
 'tree_method': 'gpu_hist',
 'min_child_weight': 7}

15

In [575]:
pd.DataFrame(results_sh)['aucs']

0       [0.621498838085251, 0.621498838085251]
1     [0.6191077482441057, 0.6191077482441057]
2     [0.6077097920740165, 0.6077097920740165]
3     [0.6092416694226004, 0.6092416694226004]
4     [0.6235535033609287, 0.6235535033609287]
5     [0.6109579445337427, 0.6109579445337427]
6      [0.609377418608228, 0.6093774186082281]
7     [0.6066783439631294, 0.6066783439631293]
8     [0.6226984063414158, 0.6226984063414157]
9     [0.6128707795516763, 0.6128707795516763]
10    [0.6130186908815368, 0.6130186908815368]
11    [0.6000360933128845, 0.6000360933128845]
12    [0.6218615525382517, 0.6218615525382518]
13    [0.6121879181504891, 0.6121879181504892]
14    [0.6121575127899075, 0.6121575127899075]
15    [0.6054038372425004, 0.6054038372425004]
Name: aucs, dtype: object

## 预测

In [985]:
test = inference_dataset
test = xgb.DMatrix(test.values, enable_categorical=True)

In [986]:
inference_dataset.shape, test.num_col()

((2822180, 70), 70)

In [988]:
bst_wl = wl_bst_sm
bst_sh = sh_bst_sm  # results_sh[4]['model']

wl = bst_wl.predict(test)
sh = bst_sh.predict(test)
Counter(wl), Counter(Counter(sh))

(Counter({0.0: 1859022,
          2.0: 339419,
          1.0: 301301,
          3.0: 271395,
          4.0: 42236,
          9.0: 8785,
          5.0: 18,
          7.0: 4}),
 Counter({0.0: 1739411, 1.0: 1082769}))

In [462]:
wl = wl_bst_sm.predict(test)
sh = sh_bst_sm.predict(test)
Counter(wl), Counter(Counter(sh))

(Counter({1.0: 1311788,
          0.0: 788117,
          9.0: 681228,
          2.0: 23596,
          4.0: 741,
          8.0: 9749,
          3.0: 3041,
          6.0: 2609,
          7.0: 910,
          5.0: 401}),
 Counter({1.0: 1131623, 0.0: 1690557}))

In [989]:
test_df['watch_label'] = wl.astype(np.uint8)
test_df['is_share'] = sh.astype(np.uint8)
test_df.shape

(2822180, 4)

In [993]:
fn = f'../submission-{int(time())}.csv'
test_df.to_csv(fn, index=False, sep=",")
print(f"new submission saved to {fn}")

new submission saved to ../submission-1625145233.csv


In [990]:
tdf = pd.read_csv('../submission-1625116622.csv')
tdf.shape

(2822180, 4)

In [991]:
widx = test_df['watch_label'] != tdf['watch_label']
sidx = test_df['is_share'] != tdf['is_share']

In [992]:
widx.sum(), sidx.sum()

(2104451, 1253446)

## 保存模型

In [995]:
bst_wl.save_model('wl_model_v6')
bst_sh.save_model('sh_model_v6')

In [996]:
def write_log(log_name, info, log_path="./"):
    import datetime
    with open(os.path.join(log_path, log_name), 'w') as log:
        log.write(f"# {datetime.datetime.now().__str__()}\n")
        log.write(f"\n## model name: {info['model_name']}\n")
        log.write(f"- model save path : {info['model_save_path']}\n")
        
        log.write(f"\n## Data setup\n")
        log.write(f"- dataset.shape : {dataset.shape}\n")
        log.write(f"- dataset.columns : {dataset.columns}\n")
        log.write(f"- is resample : {info['is_resample']}\n")
        log.write(f"- Traing_Data.shape (watch_label)  : {X_train.shape}\n")
        log.write(f"- Testing_Data.shape (watch_label) : {X_test.shape}\n")
        log.write(f"- Traing_Data.shape (is_share)  : {X_train_sh.shape}\n")
        log.write(f"- Testing_Data.shape (is_share) : {X_test_sh.shape}\n")
        if info.get('is_resample', False):
            log.write(f"- Resampled class distribution (watch_label): \n{Counter(resampled_wl)}\n")
            log.write(f"- Resampled class distribution (is_share): \n{Counter(resampled_sh)}\n")
            
        log.write(f"\n## Model Params\n")
        log.write(f"- model params (watch_label) : \n{param}\n")
        log.write(f"- model params (is_share) : \n{param_sh}\n")
        
        log.write(f"\n## Model's Performance\n")
        log.write(f"- Aucs (watch_label) : {aucs}\n")
        log.write(f"- Weighted Aucs (watch_label) : {w_aucs}\n")
        log.write(f"- Aucs (is_share) : {aucs_sh}\n")
        
        log.write(f"- Classification Report (watch_label) : \n\n{report}\n")
        log.write(f"- Classification Report (is_share) : \n\n{report_sh}\n")
        
        log.flush()
        
    pass

In [997]:
log_name = "log_v6.md"
info = {'is_resample': True, 'model_name': ['wl_model_v6', 'sh_model_v6'], 'model_save_path': os.getcwd()}
write_log(log_name, info)

# 服务器间同步文件

## 推向Digix服务器

In [994]:
!scp ./models.ipynb digix@49.123.120.71:/home/digix/digix/Models/models.ipynb 

models.ipynb                                  100%  115KB   9.8MB/s   00:00    


In [472]:
!scp ../explore-data.ipynb digix@49.123.120.71:/home/digix/digix/explore-data.ipynb 

explore-data.ipynb                            100%  306KB  10.6MB/s   00:00    


## 从Digix服务器拉数据

In [583]:
!scp  digix@49.123.120.71:/home/digix/digix/Models/LightGBM.ipynb ./LightGBM.ipynb

LightGBM.ipynb                                100%   71KB   9.0MB/s   00:00    


In [584]:
!scp  digix@49.123.120.71:/home/digix/digix/Models/feature_engineering.ipynb ./feature_engineering.ipynb

scp: /home/digix/digix/Models/feature_engineering.ipynb: No such file or directory


In [585]:
!scp -r digix@49.123.120.71:/home/digix/digix/Models/Feature_Engineering/  ./

data_analysis.ipynb                           100% 6493KB  11.2MB/s   00:00    
data_analysis-checkpoint.ipynb                100% 6493KB  11.2MB/s   00:00    
