In [267]:
import numpy as np
from sklearn import metrics
import os
from sklearn.model_selection import train_test_split
from sklearn import metrics
from time import time
import pandas as pd
import xgboost as xgb
from collections import Counter
from imblearn.over_sampling import SMOTE
from imblearn.under_sampling import NearMiss
import datatable as dt

# 给定预测标签，计算AUC
使用OVR的策略计算每个类别的AUC
过程：
- 选择类别i作为正类，其他类别作为负类
- 将真实标签中不等于i的标记为0，等于i的标记为1
- 将预测标签中不等于i的标记为0，等于ide标记为1
- 计算混淆矩阵
- 计算(fpr, tpr)
- 计算AUC

In [205]:
y = np.random.randint(0, 10, 100)
p = np.random.randint(0, 10, 100)

In [30]:
def confusion_matrix(label, predict, n):
    """
    计算混淆矩阵
    :param label: 标签，np.array类型。形状可以是(n_sample,) 或者 (n_sample, n_classes)，当为第二种形状时可以表示多标签分类的情况
    :param predict: 预测值，与 `label` 同理
    :param n: 类别数目
    :return: 混淆矩阵，np.array类型。shape 为 (n, n)。$cm_{ij}$表示真实标签为 $i$，预测标签为 $j$ 的样本个数
    """
    k = (label >= 0) & (label < n)
    # bincount()函数用于统计数组内每个非负整数的个数
    # 详见 https://docs.scipy.org/doc/numpy/reference/generated/numpy.bincount.html
    return np.bincount(n * label[k].astype(int) + predict[k], minlength=n ** 2).reshape(n, n)


def auc(y, p, classes):
    """
    给定真实标签和预测标签，计算每个类别的auc值。实际只算出了roc曲线上一个点，即一个(fpr, tpr)，再并上(0, 0)和(1, 1)来计算auc
    :param y: 标签，np.array类型
    :param p: 预测标签，np.array类型
    :param classes: 类别，list-like，表示有哪些类别
    """
    all_aucs = np.zeros(len(classes))
    for i, c in enumerate(classes):
        _y = np.zeros_like(y)
        _y[y==c] = 1
        _y[y!=c] = 0
        _p = np.zeros_like(p)
        _p[p==c] = 1
        _p[p!=c] = 0
#         print(_y, _p)
        cm = confusion_matrix(_y, _p, 2)
#         print(cm)
        tpr = (cm[0, 0] / (cm[0, 0] + cm[0, 1])) if (cm[0, 0] + cm[0, 1]) != 0 else 0
        fpr = (cm[1, 0] / (cm[1, 0] + cm[1, 1])) if (cm[1, 0] + cm[1, 1]) != 0 else 0
        tpr = [0, tpr, 1]
        fpr = [0, fpr, 1]
        auc = metrics.auc(fpr, tpr)
        all_aucs[i] = auc
        if _y.sum() == 0 or _p.sum() == 0:
            all_aucs[i] = 0
    return all_aucs

In [12]:
classes = list(range(10))
weights = np.arange(0, 1, 0.1)
all_aucs = auc(y, p, classes)

weighted_auc = (all_aucs * weights).sum()
print(f"{all_aucs}\n{weighted_auc}")

[0.5298913  0.65555556 0.52304147 0.50747508 0.52445652 0.58219623
 0.57264957 0.46842105 0.53379416 0.50795756]
2.3789687141650595


In [13]:
classes = list(range(2))
y = np.array([0, 0, 1, 1])
p = np.array([0, 1, 0, 1])
all_aucs = auc(y, p, classes)

print(f"{all_aucs}")

[0.5 0.5]


# 加载数据
训练数据加载过程：
1. 分别加载处理好的用户特征和视频特征，以及整合的用户历史行为数据；
2. 从用户历史行为数据中筛掉在视频特征中没出现过的video_id；
3. 将行为数据中的user_id、video_id替换为对应用户/视频的特征
4. 根据不同的任务划分为`watch_label`、`is_share`的数据集

推断时，类似于上述过程拼接数据。

In [2]:
base_dir = "../2021_3_data"
test_data_dir  = os.path.join(base_dir, "testdata")
train_data_dir = os.path.join(base_dir, "traindata")

In [3]:
user_   = np.load(os.path.join(train_data_dir, "user_features_data/user_features.npz"), allow_pickle=True)
video_  = np.load(os.path.join(train_data_dir, "video_features_data/video_features.npz"), allow_pickle=True)

In [4]:
user_feats, user_cols = user_['features'], user_['columns']
video_feats, video_cols = video_['features'], video_['columns']

In [5]:
user_df = pd.DataFrame(user_feats, columns=user_cols)
video_df = pd.DataFrame(video_feats, columns=video_cols)

## 加载训练数据

In [14]:
action_  = np.load(os.path.join(train_data_dir, "all_actions.npz"), allow_pickle=True) 

In [15]:
action_data, action_cols = action_['data'], action_['columns']

In [16]:
action_df = pd.DataFrame(action_data, columns=action_cols)

In [17]:
action_df.shape

(7353024, 5)

In [18]:
# 从用户历史行为数据中筛掉在视频特征中没出现过的video_id
idx1 = pd.Index(action_df['video_id'].unique())
idx2 = pd.Index(video_df['video_id'])
not_exists = idx1.difference(idx2)
not_exists

Int64Index([   15,   144,   428,   497,   876,  1174,  2127,  2199,  2334,
             3153,
            ...
            48069, 48269, 48343, 48626, 49103, 49241, 49404, 49419, 49793,
            50337],
           dtype='int64', length=243)

In [19]:
t0 = time()
n = 0
for vid in not_exists:
    tn = (action_df['video_id'] == vid).sum()
#     action_df = action_df[action_df['video_id'] != vid]
    action_df['video_id'].replace(vid, np.nan, inplace=True)
    n += tn
action_df.dropna(axis=0, inplace=True)
print(f"在视频特征中不存在的video_id在行为数据集中出现的次数 = {n}\t\t(cost {time() - t0:.3f}s)")

在视频特征中不存在的video_id在行为数据集中出现的次数 = 45006		(cost 10.650s)


In [21]:
action_df.shape

(7308018, 5)

In [20]:
video_action = video_df.merge(action_df, how='right', left_on='video_id', right_on='video_id')

In [28]:
# print(video_action.columns)
video_action.head()

Unnamed: 0,video_id,video_name,video_score,video_duration,video_release_year,video_release_month,video_release_day,desc_0,desc_1,desc_2,...,class_9,da_0,da_1,da_2,da_3,da_4,user_id,is_watch,is_share,watch_label
0,28149,人潮汹涌,0.8,265,2021.0,2.0,12.0,0.277669,0.591679,0.016331,...,0.037483,0.075849,0.075966,0.388902,0.383616,0.075666,4239342,1,0,2
1,115,数码宝贝：最后的进化,0.81,194,2020.0,10.0,30.0,0.026072,0.026047,0.026042,...,0.626961,0.084394,0.361578,0.082888,0.082888,0.388253,3577036,1,0,0
2,3636,东海异闻录,0.77,3807,2021.0,4.0,24.0,0.018332,0.351056,0.018328,...,0.041772,0.324848,0.083544,0.083544,0.083544,0.424519,5527504,1,0,5
3,12968,刺杀小说家,0.79,7791,2021.0,2.0,12.0,0.022649,0.022656,0.022645,...,0.034261,0.50428,0.072258,0.070551,0.069007,0.283903,1117889,1,0,0
4,860,飞驰人生,0.84,5889,2019.0,2.0,5.0,0.345659,0.029107,0.029101,...,0.037924,0.076261,0.39165,0.077193,0.075847,0.379048,1117889,1,0,4


In [29]:
user_video_action = user_df.merge(video_action, how='right', left_on='user_id', right_on='user_id')

In [30]:
user_video_action.shape, user_video_action.columns

((7308018, 76),
 Index(['user_id', 'age_0', 'age_1', 'age_2', 'age_3', 'age_4', 'age_5',
        'age_6', 'age_7', 'gender_0', 'gender_1', 'gender_2', 'gender_3',
        'city_level_0', 'city_level_1', 'city_level_2', 'city_level_3',
        'city_level_4', 'city_level_5', 'city_level_6', 'city_level_7',
        'device_name_0', 'device_name_1', 'device_name_2', 'device_name_3',
        'device_name_4', 'device_name_5', 'device_name_6', 'device_name_7',
        'device_name_8', 'device_name_9', 'video_id', 'video_name',
        'video_score', 'video_duration', 'video_release_year',
        'video_release_month', 'video_release_day', 'desc_0', 'desc_1',
        'desc_2', 'desc_3', 'desc_4', 'desc_5', 'desc_6', 'desc_7', 'desc_8',
        'desc_9', 'tags_0', 'tags_1', 'tags_2', 'tags_3', 'tags_4', 'tags_5',
        'tags_6', 'tags_7', 'tags_8', 'tags_9', 'class_0', 'class_1', 'class_2',
        'class_3', 'class_4', 'class_5', 'class_6', 'class_7', 'class_8',
        'class_9', 'da_0', 

In [31]:
# 删除 video_name 列，调整video_id列的顺序
user_video_action.drop(['video_name', 'is_watch'], axis=1, inplace=True)
user_video_action.insert(1, 'video_id', user_video_action.pop('video_id'))
user_video_action.head()

Unnamed: 0,user_id,video_id,age_0,age_1,age_2,age_3,age_4,age_5,age_6,age_7,...,class_7,class_8,class_9,da_0,da_1,da_2,da_3,da_4,is_share,watch_label
0,4239342.0,28149,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.037486,0.395894,0.037483,0.075849,0.075966,0.388902,0.383616,0.075666,0,2
1,3577036.0,115,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.041444,0.041444,0.626961,0.084394,0.361578,0.082888,0.082888,0.388253,0,0
2,5527504.0,3636,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.041779,0.04178,0.041772,0.324848,0.083544,0.083544,0.083544,0.424519,0,5
3,1117889.0,12968,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,...,0.034266,0.034261,0.034261,0.50428,0.072258,0.070551,0.069007,0.283903,0,0
4,1117889.0,860,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,...,0.037929,0.037924,0.037924,0.076261,0.39165,0.077193,0.075847,0.379048,0,4


In [34]:
# 保存数据以便下次直接读取
np.savez(os.path.join(train_data_dir, "train"), data=user_video_action.values, columns=user_video_action.columns.tolist())

In [161]:
npz = np.load(os.path.join(train_data_dir, "train.npz"), allow_pickle=True)

In [188]:
npz['data'].dtype

dtype('O')

In [200]:
t0 = time()
user_video_action = pd.DataFrame(npz['data'], columns=npz['columns'], dtype=np.float32)  # 注意加载类型
print(f"Data loading costs {time()-t0:.3f}s ...")
user_video_action.shape, user_video_action.info()

Data loading costs 176.350s ...


(7308018, 74)

In [208]:
# 将 pandas.DataFrame 保存未 .jay 文件
tab = dt.Frame(user_video_action)
tab.to_jay(os.path.join(train_data_dir, "train.jay"))

In [384]:
tab = dt.fread(os.path.join(train_data_dir, "train.jay"))
del tab[:, ['video_id', 'user_id']]

In [385]:
user_video_action = tab.to_pandas()

In [None]:
# 删除 video_id、user_id列
user_video_action.drop(['user_id', 'video_id'], axis=1, inplace=True)

In [387]:
user_video_action.shape, user_video_action.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 7308018 entries, 0 to 7308017
Data columns (total 72 columns):
 #   Column               Dtype  
---  ------               -----  
 0   age_0                float32
 1   age_1                float32
 2   age_2                float32
 3   age_3                float32
 4   age_4                float32
 5   age_5                float32
 6   age_6                float32
 7   age_7                float32
 8   gender_0             float32
 9   gender_1             float32
 10  gender_2             float32
 11  gender_3             float32
 12  city_level_0         float32
 13  city_level_1         float32
 14  city_level_2         float32
 15  city_level_3         float32
 16  city_level_4         float32
 17  city_level_5         float32
 18  city_level_6         float32
 19  city_level_7         float32
 20  device_name_0        float32
 21  device_name_1        float32
 22  device_name_2        float32
 23  device_name_3        float32
 24

((7308018, 72), None)

In [388]:
dataset = user_video_action

## 加载测试数据

In [146]:
test_df = pd.read_csv(os.path.join(test_data_dir, "test.csv"))

In [147]:
test_df.columns

Index(['user_id', 'video_id'], dtype='object')

In [90]:
# 测试数据集中存在video_id没有在视频特征中出现
idx1 = pd.Index(test_df['video_id'].unique())
idx2 = pd.Index(video_df['video_id'].unique())
non_exists = idx1.difference(idx2)
non_exists

Int64Index([   15,   144,   428,   497,   876,  1174,  1589,  1906,  2127,
             2199,
            ...
            47945, 48069, 48269, 48343, 48626, 49241, 49404, 49419, 49793,
            50337],
           dtype='int64', length=276)

In [265]:
video_test = video_df.merge(test_df, how='right', left_on='video_id', right_on='video_id')
user_video_test = user_df.merge(video_test, how='right', left_on='user_id', right_on='user_id')

In [266]:
user_video_test.insert(1, 'video_id', user_video_test.pop('video_id'))

(2822180, 75)

In [270]:
user_video_test.drop('video_name', axis=1, inplace=True)

In [96]:
# 保存数据以便下次直接读取
np.savez(os.path.join(test_data_dir, "test"), data=user_video_test.values, columns=user_video_test.columns.tolist())

In [233]:
npz = np.load(os.path.join(test_data_dir, "test.npz"), allow_pickle=True)

In [None]:
npz['data'].shape

In [235]:
t0 = time()
user_video_test = pd.DataFrame(npz['data'], columns=npz['columns'])
print(f"Data loading costs {time()-t0:.3f}s ...")
user_video_test.shape, user_video_test.info()

Data loading costs 61.867s ...
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2822180 entries, 0 to 2822179
Data columns (total 73 columns):
 #   Column               Dtype 
---  ------               ----- 
 0   user_id              object
 1   video_id             object
 2   age_0                object
 3   age_1                object
 4   age_2                object
 5   age_3                object
 6   age_4                object
 7   age_5                object
 8   age_6                object
 9   age_7                object
 10  gender_0             object
 11  gender_1             object
 12  gender_2             object
 13  gender_3             object
 14  city_level_0         object
 15  city_level_1         object
 16  city_level_2         object
 17  city_level_3         object
 18  city_level_4         object
 19  city_level_5         object
 20  city_level_6         object
 21  city_level_7         object
 22  device_name_0        object
 23  device_name_1        objec

((2822180, 73), None)

In [245]:
user_video_test.drop('video_name', axis=1, inplace=True)
user_video_test = user_video_test.astype(np.float32)

In [246]:
# 将 pandas.DataFrame 保存未 .jay 文件
tab = dt.Frame(user_video_test)
tab.to_jay(os.path.join(test_data_dir, "test.jay"))

In [248]:
tab = dt.fread(os.path.join(test_data_dir, "test.jay"))
del tab[:, ['user_id', 'video_id']]

In [380]:
user_video_test = tab.to_pandas()
user_video_test.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2822180 entries, 0 to 2822179
Data columns (total 70 columns):
 #   Column               Dtype  
---  ------               -----  
 0   age_0                float32
 1   age_1                float32
 2   age_2                float32
 3   age_3                float32
 4   age_4                float32
 5   age_5                float32
 6   age_6                float32
 7   age_7                float32
 8   gender_0             float32
 9   gender_1             float32
 10  gender_2             float32
 11  gender_3             float32
 12  city_level_0         float32
 13  city_level_1         float32
 14  city_level_2         float32
 15  city_level_3         float32
 16  city_level_4         float32
 17  city_level_5         float32
 18  city_level_6         float32
 19  city_level_7         float32
 20  device_name_0        float32
 21  device_name_1        float32
 22  device_name_2        float32
 23  device_name_3        float32
 24

In [138]:
# 删除 video_id、user_id 列
user_video_test.drop(['user_id', 'video_id'], axis=1, inplace=True)

In [251]:
inference_dataset = user_video_test

In [252]:
inference_dataset.shape

(2822180, 70)

# XGBoost

In [None]:
def train_xgb(X_train, y_train, params):
    xg_train = xgb.DMatrix(X_train.values, label=y_train.values, enable_categorical=True)
    xg_test = xgb.DMatrix(X_test.values, label=y_test.values, enable_categorical=True)
    
    watchlist = [(xg_train, 'train'), (xg_test, 'test')]
    num_round = 5
    # train xgb
    bst = xgb.train(_param, xg_train, num_round, watchlist)
    # get prediction
    pred = bst.predict(xg_test)
    error_rate = np.sum(pred != y_test) / test_y.shape[0]
    print('Test error using softmax = {}'.format(error_rate))

    # do the same thing again, but output probabilities
    param['objective'] = 'multi:softprob'
    bst = xgb.train(param, xg_train, num_round, watchlist)
    # Note: this convention has been changed since xgboost-unity
    # get prediction, this is in 1D array, need reshape to (ndata, nclass)
    pred_prob = bst.predict(xg_test).reshape(y_test.shape[0], 6)
    pred_label = np.argmax(pred_prob, axis=1)
    error_rate = np.sum(pred_label != y_test) / y_test.shape[0]
    print('Test error using softprob = {}'.format(error_rate))


In [389]:
# 准备数据
watch_label = dataset.pop('watch_label').astype(np.uint8)
is_share = dataset.pop('is_share').astype(np.uint8)

In [None]:
train_idx, test_idx = train_test_split(dataset.index, test_size=0.2, random_state=0)

In [281]:
X_train = dataset.iloc[train_idx]
X_test  = dataset.iloc[test_idx]

In [283]:
X_train.shape

(5846414, 70)

## watch_label 预测

### 处理数据不均衡问题

In [440]:
items = list(Counter(watch_label).items())
items.sort(key=lambda x: x[0])
print(items)

dist = np.array(items, dtype=np.float)
dist[:, 1] = dist[:, 1] / watch_label.shape[0]
print(dist)

[(0, 5146219), (1, 554320), (2, 312266), (3, 217820), (4, 171292), (5, 142012), (6, 124245), (7, 116928), (8, 137834), (9, 385082)]
[[0.         0.70418806]
 [1.         0.07585094]
 [2.         0.04272923]
 [3.         0.02980562]
 [4.         0.02343891]
 [5.         0.01943235]
 [6.         0.01700119]
 [7.         0.01599996]
 [8.         0.01886065]
 [9.         0.05269308]]


In [441]:
under_ss = np.array(items)
under_ss_thresh = under_ss[9, 1]  # 设置每个类别样本数目的上限
under_ss[:, 1] = np.clip(under_ss[:, 1], a_min=None, a_max=under_ss_thresh)

over_ss = under_ss.copy()
over_ss_thresh = under_ss[2, 1]  # 设置每个类别样本数据的下限
over_ss[:, 1] = np.clip(over_ss[:, 1], a_min=over_ss_thresh, a_max=None)

under_ss = dict(under_ss)
over_ss = dict(over_ss)

In [442]:
under_ss, over_ss

({0: 385082,
  1: 385082,
  2: 312266,
  3: 217820,
  4: 171292,
  5: 142012,
  6: 124245,
  7: 116928,
  8: 137834,
  9: 385082},
 {0: 385082,
  1: 385082,
  2: 312266,
  3: 312266,
  4: 312266,
  5: 312266,
  6: 312266,
  7: 312266,
  8: 312266,
  9: 385082})

In [443]:
idxs = watch_label == 0
idxs = idxs.replace(False, np.nan).dropna().index  # 保留watch_label=0的行索引
idxs.shape

(5146219,)

In [444]:
left_idxs = np.random.choice(idxs, under_ss_thresh, replace=False)  # 选择一部分保留，注意replace参数，为True时会重复采样
del_idxs = idxs.difference(left_idxs)
del_idxs.shape, left_idxs.shape

((4761137,), (385082,))

In [445]:
Counter(watch_label)

Counter({2: 312266,
         0: 5146219,
         5: 142012,
         4: 171292,
         1: 554320,
         9: 385082,
         3: 217820,
         8: 137834,
         7: 116928,
         6: 124245})

In [446]:
resampled_data = np.delete(dataset.values, del_idxs, axis=0)
resampled_wl = np.delete(watch_label.values, del_idxs, axis=0)
resampled_data.shape, resampled_wl.shape

((2546881, 70), (2546881,))

In [447]:
Counter(resampled_wl)

Counter({2: 312266,
         5: 142012,
         4: 171292,
         1: 554320,
         9: 385082,
         3: 217820,
         0: 385082,
         8: 137834,
         7: 116928,
         6: 124245})

In [273]:
# 速度太慢，难以忍受！
nm  = TomekLinks(sampling_strategy=under_ss)
smt = SMOTE(sampling_strategy=over_ss)

In [None]:
t0 = time()
X_r, y_r = nm.fit_resample(resampled_data, pd.Series(resampled_wl))
print(f"Under Sampling finished ...\t\t({time()-t0:.3f}s)")

In [None]:
X_r, y_r = smt.fit_resample(X_r, y_r)

In [448]:
# 将采样后的数据重装回 DataFrame
data = pd.DataFrame(resampled_data, columns=dataset.columns)
watch_label_res = pd.Series(resampled_wl)
data.shape, watch_label.shape

((2546881, 70), (7308018,))

In [449]:
train_idx, test_idx = train_test_split(data.index, test_size=0.2, random_state=0)
train_idx.shape, test_idx.shape

((2037504,), (509377,))

### 训练模型

In [450]:
X_train = data.iloc[train_idx]
X_test  = data.iloc[test_idx]

In [451]:
y_train = watch_label_res.iloc[train_idx]
y_test  = watch_label_res.iloc[test_idx]

In [452]:
X_train.shape, y_train.shape, X_test.shape, y_test.shape

((2037504, 70), (2037504,), (509377, 70), (509377,))

In [453]:
t0 = time()
xg_train = xgb.DMatrix(X_train.values, label=y_train.values, enable_categorical=True)
xg_test = xgb.DMatrix(X_test.values, label=y_test.values, enable_categorical=True)
print(f"Data preparing finished ...\t\t({time()-t0:.3f}s)")

Data preparing finished ...		(1.798s)


In [454]:
# setup parameters for xgboost
param = {}
# use softmax multi-class classification
param['objective'] = 'multi:softmax'
# scale weight of positive examples
param['eta'] = 0.1
param['max_depth'] = 8
param['nthread'] = 8
param['num_class'] = 10
param['gpu_id'] = 0
param['tree_method'] = 'gpu_hist'
# param['scale_pos_weight'] = 2
watchlist = [(xg_train, 'train'), (xg_test, 'test')]

In [455]:
num_round = 250
t0 = time()
wl_bst_sm = xgb.train(param, xg_train, num_round, watchlist)
print(f"{num_round}-rounds Training finished ...\t\t({time()-t0:.3f}s)")

[0]	train-mlogloss:2.27475	test-mlogloss:2.27492
[1]	train-mlogloss:2.25202	test-mlogloss:2.25229
[2]	train-mlogloss:2.23241	test-mlogloss:2.23282
[3]	train-mlogloss:2.21584	test-mlogloss:2.21638
[4]	train-mlogloss:2.20143	test-mlogloss:2.20211
[5]	train-mlogloss:2.18868	test-mlogloss:2.18952
[6]	train-mlogloss:2.17747	test-mlogloss:2.17848
[7]	train-mlogloss:2.16754	test-mlogloss:2.16868
[8]	train-mlogloss:2.15873	test-mlogloss:2.16002
[9]	train-mlogloss:2.15075	test-mlogloss:2.15222
[10]	train-mlogloss:2.14374	test-mlogloss:2.14535
[11]	train-mlogloss:2.13735	test-mlogloss:2.13912
[12]	train-mlogloss:2.13172	test-mlogloss:2.13366
[13]	train-mlogloss:2.12663	test-mlogloss:2.12874
[14]	train-mlogloss:2.12199	test-mlogloss:2.12429
[15]	train-mlogloss:2.11789	test-mlogloss:2.12036
[16]	train-mlogloss:2.11397	test-mlogloss:2.11662
[17]	train-mlogloss:2.11039	test-mlogloss:2.11324
[18]	train-mlogloss:2.10729	test-mlogloss:2.11032
[19]	train-mlogloss:2.10433	test-mlogloss:2.10756
[20]	train

In [456]:
# get prediction
pred = wl_bst_sm.predict(xg_test)
# pred = pred.astype(np.uint8)
error_rate = np.sum(pred != y_test) / y_test.shape[0]
print('Test error using softmax = {}'.format(error_rate))

Test error using softmax = 0.7282386130508445


In [457]:
weights = np.arange(0, 1, 0.1)
aucs = auc(y_test.astype(np.uint8), pred.astype(np.uint8), np.arange(param['num_class']))
# aucs[aucs == 0.5] = 0
w_aucs = (aucs * weights).sum()
aucs, w_aucs

(array([0.62004039, 0.58891123, 0.50362896, 0.50026904, 0.50024575,
        0.50008881, 0.50302352, 0.50024341, 0.50804977, 0.59886517]),
 2.357243306851747)

In [458]:
report = metrics.classification_report(list(y_test), list(pred))

In [459]:
print(report)

              precision    recall  f1-score   support

           0       0.32      0.38      0.35     77054
           1       0.28      0.63      0.39    110718
           2       0.28      0.01      0.02     62540
           3       0.15      0.00      0.00     43797
           4       0.21      0.00      0.00     33993
           5       0.14      0.00      0.00     28208
           6       0.30      0.01      0.01     24911
           7       0.12      0.00      0.00     23415
           8       0.28      0.02      0.04     27510
           9       0.23      0.48      0.31     77231

    accuracy                           0.27    509377
   macro avg       0.23      0.15      0.11    509377
weighted avg       0.25      0.27      0.19    509377



### 调参

In [481]:
from xgboost import XGBClassifier
from sklearn import svm, datasets
from sklearn.model_selection import GridSearchCV

In [532]:
base_param = {  # 基本参数，不需要调参
    'objective': 'multi:softmax',
    'eta': 0.1,
    'max_depth': 8,
    'nthread': 8,
    'num_class': 10
} 
ps = {  # 需要调参的参数
    'max_depth': list(range(5, 12, 2)),
    'min_child_weight': list(range(1, 8, 2))
}

tmp = []
keys = list(ps.keys())
k = keys[0]
tmp.extend([[e] for e in ps[k].copy()])

# 将需要调参的参数进行组合，即笛卡尔乘积
for k in keys[1:]:
    v = ps[k].copy()
    l = len(tmp)
    tmp = [tmp[i%l].copy() for i in range(len(v) * len(tmp))]
    for i, e in enumerate(tmp):
        e.append(v[i // l])
        
com_ps = [dict(zip(keys, e)) for e in tmp]
# print(com_ps)
all_params = [base_param.copy() for _ in range(len(com_ps))] 
for i in range(len(com_ps)):
    all_params[i].update(com_ps[i])
    
print(all_params)

[{'max_depth': 5, 'min_child_weight': 1}, {'max_depth': 7, 'min_child_weight': 1}, {'max_depth': 9, 'min_child_weight': 1}, {'max_depth': 11, 'min_child_weight': 1}, {'max_depth': 5, 'min_child_weight': 3}, {'max_depth': 7, 'min_child_weight': 3}, {'max_depth': 9, 'min_child_weight': 3}, {'max_depth': 11, 'min_child_weight': 3}, {'max_depth': 5, 'min_child_weight': 5}, {'max_depth': 7, 'min_child_weight': 5}, {'max_depth': 9, 'min_child_weight': 5}, {'max_depth': 11, 'min_child_weight': 5}, {'max_depth': 5, 'min_child_weight': 7}, {'max_depth': 7, 'min_child_weight': 7}, {'max_depth': 9, 'min_child_weight': 7}, {'max_depth': 11, 'min_child_weight': 7}]
[{'objective': 'multi:softmax', 'eta': 0.1, 'max_depth': 5, 'nthread': 8, 'num_class': 10, 'min_child_weight': 1}, {'objective': 'multi:softmax', 'eta': 0.1, 'max_depth': 7, 'nthread': 8, 'num_class': 10, 'min_child_weight': 1}, {'objective': 'multi:softmax', 'eta': 0.1, 'max_depth': 9, 'nthread': 8, 'num_class': 10, 'min_child_weight': 

In [490]:
param_test1 = {
    'max_depth':range(3,10,2),
    'min_child_weight':range(1,6,2), 
}
param_test2 = {
'max_depth':[4,5,6],
'min_child_weight':[4,5,6]
}

clf = XGBClassifier(learning_rate =0.1, n_estimators=200, max_depth=5, min_child_weight=1, gamma=0, subsample=0.8,
        colsample_bytree=0.8, objective= 'multi:logistic', nthread=8, gpu_id=0, seed=27)

In [None]:
%%time
gsearch1 = GridSearchCV(estimator=clf, param_grid = param_test1, scoring='roc_auc_ovr', n_jobs=8, cv=5, verbose=3)
gsearch1.fit(data, watch_label_res)
gsearch1.grid_scores_, gsearch1.best_params_,gsearch1.best_score_

In [494]:
cv_data = xgb.DMatrix(data.values, label=watch_label_res.values, enable_categorical=True)

In [496]:
param = {}
# use softmax multi-class classification
param['objective'] = 'multi:softmax'
# scale weight of positive examples
param['eta'] = 0.1
param['max_depth'] = 8
param['nthread'] = 8
param['num_class'] = 10
# param['gpu_id'] = 0
# param['tree_method'] = 'gpu_hist'

In [500]:
%%time
cv_res= xgb.cv(param, cv_data, num_boost_round=200,early_stopping_rounds=30,nfold=3, metrics='auc',show_stdv=True)
print(cv_res)

XGBoostError: [22:52:21] ../src/tree/updater_gpu_hist.cu:793: Exception in gpu_hist: [22:52:21] ../src/c_api/../data/../common/device_helpers.cuh:414: Memory allocation error on worker 0: [22:52:21] ../src/c_api/../data/../common/common.h:45: ../src/common/device_helpers.cuh: 433: out of memory
Stack trace:
  [bt] (0) /home/gzy/.local/lib/python3.8/site-packages/xgboost/lib/libxgboost.so(+0x9133f) [0x7f77e0ef233f]
  [bt] (1) /home/gzy/.local/lib/python3.8/site-packages/xgboost/lib/libxgboost.so(+0x328d14) [0x7f77e1189d14]
  [bt] (2) /home/gzy/.local/lib/python3.8/site-packages/xgboost/lib/libxgboost.so(+0x384451) [0x7f77e11e5451]
  [bt] (3) /home/gzy/.local/lib/python3.8/site-packages/xgboost/lib/libxgboost.so(+0x384c2e) [0x7f77e11e5c2e]
  [bt] (4) /home/gzy/.local/lib/python3.8/site-packages/xgboost/lib/libxgboost.so(+0x37b9f0) [0x7f77e11dc9f0]
  [bt] (5) /home/gzy/.local/lib/python3.8/site-packages/xgboost/lib/libxgboost.so(+0x3368d0) [0x7f77e11978d0]
  [bt] (6) /home/gzy/.local/lib/python3.8/site-packages/xgboost/lib/libxgboost.so(+0x33822d) [0x7f77e119922d]
  [bt] (7) /home/gzy/.local/lib/python3.8/site-packages/xgboost/lib/libxgboost.so(+0x39132b) [0x7f77e11f232b]
  [bt] (8) /home/gzy/.local/lib/python3.8/site-packages/xgboost/lib/libxgboost.so(+0x39186e) [0x7f77e11f286e]


- Free memory: 17367040
- Requested memory: 16057440

Stack trace:
  [bt] (0) /home/gzy/.local/lib/python3.8/site-packages/xgboost/lib/libxgboost.so(+0x9133f) [0x7f77e0ef233f]
  [bt] (1) /home/gzy/.local/lib/python3.8/site-packages/xgboost/lib/libxgboost.so(+0x329275) [0x7f77e118a275]
  [bt] (2) /home/gzy/.local/lib/python3.8/site-packages/xgboost/lib/libxgboost.so(+0x38487b) [0x7f77e11e587b]
  [bt] (3) /home/gzy/.local/lib/python3.8/site-packages/xgboost/lib/libxgboost.so(+0x384c2e) [0x7f77e11e5c2e]
  [bt] (4) /home/gzy/.local/lib/python3.8/site-packages/xgboost/lib/libxgboost.so(+0x37b9f0) [0x7f77e11dc9f0]
  [bt] (5) /home/gzy/.local/lib/python3.8/site-packages/xgboost/lib/libxgboost.so(+0x3368d0) [0x7f77e11978d0]
  [bt] (6) /home/gzy/.local/lib/python3.8/site-packages/xgboost/lib/libxgboost.so(+0x33822d) [0x7f77e119922d]
  [bt] (7) /home/gzy/.local/lib/python3.8/site-packages/xgboost/lib/libxgboost.so(+0x39132b) [0x7f77e11f232b]
  [bt] (8) /home/gzy/.local/lib/python3.8/site-packages/xgboost/lib/libxgboost.so(+0x39186e) [0x7f77e11f286e]



Stack trace:
  [bt] (0) /home/gzy/.local/lib/python3.8/site-packages/xgboost/lib/libxgboost.so(+0x9133f) [0x7f77e0ef233f]
  [bt] (1) /home/gzy/.local/lib/python3.8/site-packages/xgboost/lib/libxgboost.so(+0x4e9a88) [0x7f77e134aa88]
  [bt] (2) /home/gzy/.local/lib/python3.8/site-packages/xgboost/lib/libxgboost.so(+0x18c862) [0x7f77e0fed862]
  [bt] (3) /home/gzy/.local/lib/python3.8/site-packages/xgboost/lib/libxgboost.so(+0x18ead8) [0x7f77e0fefad8]
  [bt] (4) /home/gzy/.local/lib/python3.8/site-packages/xgboost/lib/libxgboost.so(+0x1b9b93) [0x7f77e101ab93]
  [bt] (5) /home/gzy/.local/lib/python3.8/site-packages/xgboost/lib/libxgboost.so(XGBoosterUpdateOneIter+0x50) [0x7f77e0ee1ed0]
  [bt] (6) /lib/x86_64-linux-gnu/libffi.so.7(+0x6ff5) [0x7f79e555aff5]
  [bt] (7) /lib/x86_64-linux-gnu/libffi.so.7(+0x640a) [0x7f79e555a40a]
  [bt] (8) /usr/lib/python3.8/lib-dynload/_ctypes.cpython-38-x86_64-linux-gnu.so(_ctypes_callproc+0x58c) [0x7f79e460629c]



## is_share 预测

### 处理数据不均衡问题

In [422]:
items = list(Counter(is_share).items())
items.sort(key=lambda x: x[0])
print(items)

dist = np.array(items, dtype=np.float)
dist[:, 1] = dist[:, 1] / is_share.shape[0]
print(dist)

[(0, 7293752), (1, 14266)]
[[0.        0.9980479]
 [1.        0.0019521]]


In [423]:
under_ss = np.array(items)
under_ss_thresh = under_ss[1, 1]
under_ss[:, 1] = np.clip(under_ss[:, 1], a_min=None, a_max=under_ss_thresh)

over_ss = under_ss.copy()
over_ss_thresh = under_ss[1, 1]
over_ss[:, 1] = np.clip(over_ss[:, 1], a_min=over_ss_thresh, a_max=None)

under_ss = dict(under_ss)
over_ss = dict(over_ss)

In [424]:
under_ss, over_ss

({0: 14266, 1: 14266}, {0: 14266, 1: 14266})

In [425]:
idxs = is_share == 0
idxs = idxs.replace(False, np.nan).dropna().index  # 保留watch_label=0的行索引
idxs.shape

(7293752,)

In [426]:
left_idxs = np.random.choice(idxs, under_ss_thresh, replace=False)  # 选择一部分保留
del_idxs = idxs.difference(left_idxs)
del_idxs.shape, left_idxs.shape

((7279486,), (14266,))

In [482]:
resampled_data = np.delete(dataset.values, del_idxs, axis=0)
resampled_sh = np.delete(is_share.values, del_idxs, axis=0)
resampled_data.shape, resampled_sh.shape

((2546881, 70), (2546881,))

In [428]:
# 将采样后的数据重装会DataFrame
data_sh = pd.DataFrame(resampled_data, columns=dataset.columns)
is_share_res = pd.Series(resampled_sh)
data.shape, is_share.shape

((28532, 70), (7308018,))

In [429]:
train_idx, test_idx = train_test_split(data_sh.index, test_size=0.2, random_state=1)
train_idx.shape, test_idx.shape

((22825,), (5707,))

### 训练模型

In [430]:
X_train_sh = data_sh.iloc[train_idx]
X_test_sh  = data_sh.iloc[test_idx]

In [431]:
y_train_sh = is_share_res.iloc[train_idx]
y_test_sh  = is_share_res.iloc[test_idx]

In [432]:
t0 = time()
xg_train_sh = xgb.DMatrix(X_train_sh.values, label=y_train_sh.values, enable_categorical=True)
xg_test_sh = xgb.DMatrix(X_test_sh.values, label=y_test_sh.values, enable_categorical=True)
print(f"Data preparing finished ...\t\t({time()-t0:.3f}s)")

Data preparing finished ...		(0.014s)


In [433]:
xg_train_sh.num_col()

70

In [434]:
# setup parameters for xgboost
param_sh = {}
# use softmax multi-class classification
param_sh['objective'] = 'binary:hinge'
# scale weight of positive examples
param_sh['eta'] = 0.1
param_sh['max_depth'] = 7
param_sh['nthread'] = 4
# param['num_class'] = 2
param_sh['gpu_id'] = 0
param_sh['tree_method'] = 'gpu_hist'
# param_sh['scale_pos_weight'] = 2

watchlist = [(xg_train_sh, 'train'), (xg_test_sh, 'test')]

In [435]:
num_round = 250
t0 = time()
sh_bst_sm = xgb.train(param_sh, xg_train_sh, num_round, watchlist)
print(f"{num_round}-rounds Training finished ...\t\t({time()-t0:.3f}s)")

[0]	train-error:0.50217	test-error:0.49133
[1]	train-error:0.50217	test-error:0.49133
[2]	train-error:0.50217	test-error:0.49133
[3]	train-error:0.50217	test-error:0.49133
[4]	train-error:0.50217	test-error:0.49133
[5]	train-error:0.49774	test-error:0.48992
[6]	train-error:0.49424	test-error:0.48975
[7]	train-error:0.49249	test-error:0.48940
[8]	train-error:0.47820	test-error:0.47608
[9]	train-error:0.46287	test-error:0.46785
[10]	train-error:0.45932	test-error:0.46697
[11]	train-error:0.45196	test-error:0.46434
[12]	train-error:0.44565	test-error:0.46101
[13]	train-error:0.43579	test-error:0.45453
[14]	train-error:0.42988	test-error:0.45138
[15]	train-error:0.42361	test-error:0.44787
[16]	train-error:0.41963	test-error:0.44367
[17]	train-error:0.41073	test-error:0.43841
[18]	train-error:0.40022	test-error:0.42334
[19]	train-error:0.39689	test-error:0.42299
[20]	train-error:0.39531	test-error:0.42106
[21]	train-error:0.39194	test-error:0.41931
[22]	train-error:0.38778	test-error:0.4177

In [436]:
# get prediction
pred_sh = sh_bst_sm.predict(xg_test_sh)
error_rate = np.sum(pred_sh != y_test_sh) / y_test_sh.shape[0]
print('Test error using softmax = {}'.format(error_rate))

Test error using softmax = 0.3777816716313299


In [437]:
Counter(pred_sh), Counter(y_test_sh)

(Counter({0.0: 2640, 1.0: 3067}), Counter({1: 2903, 0: 2804}))

In [438]:
report_sh = metrics.classification_report(list(y_test_sh), list(pred_sh))
print(report_sh)

              precision    recall  f1-score   support

           0       0.62      0.59      0.60      2804
           1       0.62      0.66      0.64      2903

    accuracy                           0.62      5707
   macro avg       0.62      0.62      0.62      5707
weighted avg       0.62      0.62      0.62      5707



In [439]:
aucs_sh = auc(y_test_sh.astype(np.uint8), pred_sh.astype(np.uint8), [0, 1])
aucs_sh

array([0.62160596, 0.62160596])

## 保存模型

In [468]:
wl_bst_sm.save_model('wl_model_v4')
sh_bst_sm.save_model('sh_model_v4')

In [469]:
def write_log(log_name, info, log_path="./"):
    import datetime
    with open(os.path.join(log_path, log_name), 'w') as log:
        log.write(f"# {datetime.datetime.now().__str__()}\n")
        log.write(f"\n## model name: {info['model_name']}\n")
        log.write(f"- model save path : {info['model_save_path']}\n")
        
        log.write(f"\n## Data setup\n")
        log.write(f"- dataset.shape : {dataset.shape}\n")
        log.write(f"- dataset.columns : {dataset.columns}\n")
        log.write(f"- is resample : {info['is_resample']}\n")
        log.write(f"- Traing_Data.shape (watch_label)  : {X_train.shape}\n")
        log.write(f"- Testing_Data.shape (watch_label) : {X_test.shape}\n")
        log.write(f"- Traing_Data.shape (is_share)  : {X_train_sh.shape}\n")
        log.write(f"- Testing_Data.shape (is_share) : {X_test_sh.shape}\n")
        if info.get('is_resample', False):
            log.write(f"- Resampled class distribution (watch_label): \n{Counter(resampled_wl)}\n")
            log.write(f"- Resampled class distribution (is_share): \n{Counter(resampled_sh)}\n")
            
        log.write(f"\n## Model Params\n")
        log.write(f"- model params (watch_label) : \n{param}\n")
        log.write(f"- model params (is_share) : \n{param_sh}\n")
        
        log.write(f"\n## Model's Performance\n")
        log.write(f"- Aucs (watch_label) : {aucs}\n")
        log.write(f"- Weighted Aucs (watch_label) : {w_aucs}\n")
        log.write(f"- Aucs (is_share) : {aucs_sh}\n")
        
        log.write(f"- Classification Report (watch_label) : \n\n{report}\n")
        log.write(f"- Classification Report (is_share) : \n\n{report_sh}\n")
        
        log.flush()
        
    pass

In [109]:
os.getcwd()

'/home/gzy/jupyter-lab/multi-objects-video-recommendation/Models'

In [470]:
log_name = "log_v4.md"
info = {'is_resample': True, 'model_name': ['wl_model_v3', 'sh_model_v3'], 'model_save_path': os.getcwd()}
write_log(log_name, info)

## 预测

In [460]:
test = inference_dataset
test = xgb.DMatrix(test.values, enable_categorical=True)

In [461]:
inference_dataset.shape, test.num_col()

((2822180, 70), 70)

In [462]:
wl = wl_bst_sm.predict(test)
sh = sh_bst_sm.predict(test)
Counter(wl), Counter(Counter(sh))

(Counter({1.0: 1311788,
          0.0: 788117,
          9.0: 681228,
          2.0: 23596,
          4.0: 741,
          8.0: 9749,
          3.0: 3041,
          6.0: 2609,
          7.0: 910,
          5.0: 401}),
 Counter({1.0: 1131623, 0.0: 1690557}))

In [463]:
test_df['watch_label'] = wl.astype(np.uint8)
test_df['is_share'] = sh.astype(np.uint8)
test_df.head()

Unnamed: 0,user_id,video_id,watch_label,is_share
0,1688013,32645,1,1
1,4502598,41270,0,1
2,5585629,16345,1,0
3,1635520,28149,1,1
4,4160191,40554,1,0


In [467]:
test_df.to_csv(f'../submission-{int(time())}.csv', index=False, sep=",")

In [464]:
tdf = pd.read_csv('../submission-1624972040.csv')
tdf.head()

Unnamed: 0,user_id,video_id,watch_label,is_share
0,1688013,32645,1,0
1,4502598,41270,0,1
2,5585629,16345,1,0
3,1635520,28149,1,1
4,4160191,40554,1,0


In [465]:
widx = test_df['watch_label'] != tdf['watch_label']
sidx = test_df['is_share'] != tdf['is_share']

In [477]:
widx.sum(), sidx.sum()

(465361, 551398)

# 服务器间同步文件

## 推向Digix服务器

In [471]:
!scp ./models.ipynb digix@49.123.120.71:/home/digix/digix/Models/models.ipynb 

models.ipynb                                  100%  110KB   9.9MB/s   00:00    


In [472]:
!scp ../explore-data.ipynb digix@49.123.120.71:/home/digix/digix/explore-data.ipynb 

explore-data.ipynb                            100%  306KB  10.6MB/s   00:00    


## 从Digix服务器拉数据

In [473]:
!scp  digix@49.123.120.71:/home/digix/digix/Models/LightGBM.ipynb ./LightGBM.ipynb

LightGBM.ipynb                                100%   71KB   9.5MB/s   00:00    


In [474]:
!scp  digix@49.123.120.71:/home/digix/digix/Models/feature_engineering.ipynb ./feature_engineering.ipynb

scp: /home/digix/digix/Models/feature_engineering.ipynb: No such file or directory


In [475]:
!scp -r digix@49.123.120.71:/home/digix/digix/Models/Feature_Engineering/  ./

data_analysis.ipynb                           100% 4303KB  11.2MB/s   00:00    
data_analysis-checkpoint.ipynb                100% 4303KB  11.1MB/s   00:00    
