In [3]:
import os
from time import time
import datatable as dt
from datatable import f, join, by
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from datatable import f, by
from sklearn.feature_selection import VarianceThreshold, SelectKBest

In [3]:
base_dir = "./2021_3_data"
test_data_dir  = os.path.join(base_dir, "testdata")
train_data_dir = os.path.join(base_dir, "traindata")

train_dirs = os.listdir(train_data_dir)
test_dirs  = os.listdir(test_data_dir)

action_dirs = os.listdir(os.path.join(train_data_dir, "history_behavior_data"))
action_files = {e: os.listdir(os.path.join(train_data_dir, f"history_behavior_data/{e}"))[0] for e in action_dirs}
print(train_dirs,'\n', test_dirs)

['train.jay', 'all_actions.npz', 'train.npz', 'all_actions.jay', 'user_features_data', 'history_behavior_data', 'video_features_data'] 
 ['test.npz', 'test.jay', 'test.csv', '.ipynb_checkpoints']


In [4]:
user_df = dt.fread(os.path.join(train_data_dir, "user_features_data/user_features_data.csv"), sep='\t')

# random walk
- 通过用户的观看记录的随机游走，生成视频的嵌入
- 通过视频被用户观看的记录的随机游走，生成用户的嵌入

需要考虑以下问题：
1. 随机游走序列的时间跨度是十四天还是时间窗口的形式
2. 随机游走序列能否覆盖所有用户或所有视频
3. 

In [5]:
%%time
all_df = None  # pd.DataFrame([], columns=['user_id', 'video', 'watch_labe', 'is_share'])

for i, d in enumerate(action_dirs):
    df = dt.fread(os.path.join(train_data_dir, f"history_behavior_data/{d}/{action_files[d]}"), sep="\t", )
#                      columns=['user_id', 'video_id', 'is_watch', 'is_sollect', 'is_comment', 'is_share', 'watch_label', 'pt_d'])
    del df[:, 'watch_start_time']
    if all_df is None:
        all_df = df
    else:
        all_df.rbind(df, bynames=True)
print(f"all history data read ...")

all history data read ...
CPU times: user 2min 13s, sys: 12.3 s, total: 2min 25s
Wall time: 28.3 s


In [6]:
all_df.shape

(80276856, 8)

In [30]:
dt.unique(all_df['user_id']).shape, dt.unique(all_df['video_id']).shape

((3953209, 1), (34218, 1))

In [54]:
# 并不是所有的用户都有行为数据，3953209用户有行为数据，每个用户平均有20.03068个视频记录
all_df[:, dt.count(), by("user_id")][:, dt.mean(f.count)]

Unnamed: 0_level_0,count
Unnamed: 0_level_1,▪▪▪▪▪▪▪▪
0,20.3068


In [57]:
# 视频记录次数最少的用户只有 1 次，最多的有 76 次
all_df[:, dt.count(), by("user_id")][:,dt.max(f.count), dt.min(f.count)]

Unnamed: 0_level_0,count,count.0
Unnamed: 0_level_1,▪▪▪▪▪▪▪▪,▪▪▪▪▪▪▪▪
0,1,76


In [61]:
# 出现的视频总数 34218，其中还包括没有在视频特征中出现的视频，每个视频平均出现的次数是 2346.04，最多出现了1932565次，最少出现了 963 次
all_df[:, dt.count(), by("video_id")][:, dt.min(f.count), dt.max(f.count)]

Unnamed: 0_level_0,count,count.0
Unnamed: 0_level_1,▪▪▪▪▪▪▪▪,▪▪▪▪▪▪▪▪
0,1932565,963


# 用户上下文
为每个用户每天生成一个上下文向量。
对于用户$u$，其第$t$天的上下文向量$c_{u, t}$由$u$的$t-w, t-1$范围内的交互行为产生，$w$是时间窗口大小。

**不能简单的使用视频的特征作为视频的表征。基础特征里有一些特征不适合进行算术运算，有些特征进行算术运算后就失去了原本的含义**
目前方法：
- 选择一部分列作为视频的表征，基于方差选择
- 选择大部分列作为视频特征再进行降维作为视频表征
- 深度学习

In [174]:
# 每个strategy将一个行为向量转化为一个未归一化的参数
# 每个行为向量是一个向量
def mean_strategy(action):
    assert len(action) > 0
    return sum(action) / len(action)


def weight_strategy(action, weight=None):
    assert len(action) > 0
    if weight is None:
        weight = np.array([1/len(action)] * len(action))
    
    assert len(weight) == len(action)
    act = np.array(action)
    w = act * weight
    
    return sum(w)
    

def cal_context(actions, embeds, strategy, **kwargs):
    """
    根据交互行为，生成上下文向量
    """
    context = None
    w = list(map(lambda x: strategy(x, **kwargs), actions))
    z = sum(w)
    
    embs = np.array(embeds)
    w = np.array(w)
    w = w.reshape((len(w), 1))
    w_embs = np.multiply(embs, w).sum(axis=0)
    if z != 0:
        w_embs = w_embs / z
    
    return w_embs

behaviors = ['is_watch', 'is_share', 'is_collect', 'is_comment', 'watch_label']
# emb_cols = ['desc_0', 'desc_3', 'desc_4', 'desc_8', 'desc_9', 'tags_0', 'tags_3', 'tags_4', 'tags_6', 'tags_8', 
#             'class_0', 'class_2', 'class_5', 'class_8', 'da_0', 'da_1', 'da_2', 'da_3', 'da_4']
behavior_weights = [0.05, 0.3, 0.3, 0.2, .15]
def apply_one_user_one_day(df, video_features, emb_cols=None, strategy=mean_strategy):
    actions = df[behaviors].astype(np.uint8).values
    vids = df['video_id']
    
    assert video_features.key and video_features.key[0] == 'video_id'
    if emb_cols is None:
        emb_cols = video_features.names
    embs = video_features[vids, emb_cols]
#     print(actions)
    context = cal_context(actions, embs, weight_strategy, weight=behavior_weights)
#     print(context)
    return context


def select_emb_cols_by_variance(video_features, threshold=0.015):
    exclude_cols = ['video_id', 'video_name', 'video_release_year', 'video_release_month', 'video_release_day', 'video_duration']
    stds = list(video_features.sd().to_pandas().to_dict(orient='records')[0].items())
    
    stds = list(filter(lambda x: x[0] not in exclude_cols, stds))
    varis = list(map(lambda x: (x[0], x[1]**2), stds))

    emb_cols = list(filter(lambda x: x[1] > threshold, varis))
    emb_cols = [e[0] for e in emb_cols]
    return emb_cols

In [131]:
actions = [[0, 1, 0, 1], [0, 1, 0, 1]]
embeds = [[1,2,3,4], [1, 2,1, 1]]
kwargs = {'weight': [0.1, 0.2, 0.3, 0.4]}
cal_context(actions, embeds, weight_strategy, **kwargs)

[[0.6]
 [0.6]]


array([1. , 2. , 2. , 2.5])

In [4]:
action_files

{'20210502': 'part-00000-d412c602-2f4a-4649-a81f-e56435dd49fd-c000.csv',
 '20210423': 'part-00000-9809d73a-a55f-4ac2-a59b-9b83cbc5028e-c000.csv',
 '20210424': 'part-00000-225e55dc-4504-4c14-b289-322312355b2b-c000.csv',
 '20210421': 'part-00000-c15f29da-6b1e-48c0-b7d0-2cd560998c3f-c000.csv',
 '20210430': 'part-00000-2da4c3a0-2fcc-422d-8b7c-48940da315ad-c000.csv',
 '20210427': 'part-00000-9132ab46-51c3-4cc3-97de-e7ad5312b852-c000.csv',
 '20210419': 'part-00000-236b99d5-456a-42b2-bd8d-3cbd61d21cc6-c000.csv',
 '20210428': 'part-00000-fc8c8ca1-e655-4a45-b179-c8d9e2dd804c-c000.csv',
 '20210429': 'part-00000-c5dbd994-54d7-4734-adea-0f22d75b23d3-c000.csv',
 '20210422': 'part-00000-3d97d0f8-2572-45e6-bb60-f367c97e7870-c000.csv',
 '20210425': 'part-00000-9d23862b-6bbf-48c6-a598-572df1359737-c000.csv',
 '20210501': 'part-00000-56b6f0ff-57b8-41ac-96ec-c20e3306297e-c000.csv',
 '20210426': 'part-00000-0d315342-3ba7-4727-b4a2-123a1a004786-c000.csv',
 '20210420': 'part-00000-aad75aa4-b60b-4f5b-8def-c4

In [31]:
i = action_dirs[0]
tab1 = dt.fread(os.path.join(train_data_dir, f"history_behavior_data/{i}/{action_files[i]}"), sep="\t")

In [32]:
del tab1[:, 'watch_start_time']

In [33]:
af1 = tab1.to_pandas()

In [28]:
g = af1.groupby('user_id')

In [60]:
video = dt.fread(os.path.join(train_data_dir, "video_features_data/video_features.jay"))

In [144]:
video.key = 'video_id'

In [175]:
emb_cols = select_emb_cols_by_variance(video, threshold=0.015)
emb_cols

['desc_0',
 'desc_3',
 'desc_4',
 'desc_8',
 'desc_9',
 'tags_0',
 'tags_3',
 'tags_4',
 'tags_6',
 'tags_8',
 'class_0',
 'class_2',
 'class_5',
 'class_8',
 'da_0',
 'da_1',
 'da_2',
 'da_3',
 'da_4']

In [176]:
%%time
context1 = g.apply(lambda x: apply_one_user_one_day(x, video, emb_cols=emb_cols))

CPU times: user 1d 20h 16min 35s, sys: 3min 39s, total: 1d 20h 20min 15s
Wall time: 1h 58min 19s


In [246]:
np.array(context1.values.tolist()).shape

(664594, 19)

In [205]:
uids = context1.index
contexts = context1.values
contexts = np.array(contexts.tolist())

In [214]:
df = pd.DataFrame(contexts, columns=emb_cols)
df['user_id'] = uids
df

Unnamed: 0,desc_0,desc_3,desc_4,desc_8,desc_9,tags_0,tags_3,tags_4,tags_6,tags_8,class_0,class_2,class_5,class_8,da_0,da_1,da_2,da_3,da_4,user_id
0,0.029417,0.029415,0.029416,0.029418,0.735253,0.100000,0.100000,0.100000,0.100000,0.100000,0.315723,0.041771,0.041771,0.041778,0.324958,0.083541,0.083541,0.083541,0.424418,2
1,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,15
2,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,19
3,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,38
4,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,43
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
664589,0.796570,0.016869,0.016871,0.068302,0.016957,0.037248,0.189083,0.037245,0.239687,0.037245,0.037486,0.037501,0.304189,0.395917,0.075851,0.075966,0.388781,0.383735,0.075666,5910767
664590,0.796570,0.016869,0.016871,0.068302,0.016957,0.037248,0.189083,0.037245,0.239687,0.037245,0.037486,0.037501,0.304189,0.395917,0.075851,0.075966,0.388781,0.383735,0.075666,5910778
664591,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,5910789
664592,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,5910794


In [216]:
action_dirs

['20210502',
 '20210423',
 '20210424',
 '20210421',
 '20210430',
 '20210427',
 '20210419',
 '20210428',
 '20210429',
 '20210422',
 '20210425',
 '20210501',
 '20210426',
 '20210420']

The history saving thread hit an unexpected error (OperationalError('database is locked')).History will not be written to the database.


In [247]:
%%time
for d in action_dirs[1:]:
    t0 = time()
    f = action_files[d]
    tab = dt.fread(os.path.join(train_data_dir, f"history_behavior_data/{d}/{f}"), sep="\t")
    df = tab.to_pandas()
    g = df.groupby('user_id')
    context = g.apply(lambda x: apply_one_user_one_day(x, video, emb_cols=emb_cols))
    uids = context.index.tolist()
    data = context.values
    data = np.array(data.tolist())
    df = pd.DataFrame(data, columns=emb_cols)
    df['user_id'] = uids
    tab = dt.Frame(df)
    tab.to_jay(os.path.join(train_data_dir, f"user_context/{d}.jay"))
    print(f"{d}/{f} processed...\t\t({time() - t0})")
    t0 = time()

20210423/part-00000-9809d73a-a55f-4ac2-a59b-9b83cbc5028e-c000.csv processed...		(9990.520050764084)
20210424/part-00000-225e55dc-4504-4c14-b289-322312355b2b-c000.csv processed...		(6234.936888933182)
20210421/part-00000-c15f29da-6b1e-48c0-b7d0-2cd560998c3f-c000.csv processed...		(1962.8847260475159)
20210430/part-00000-2da4c3a0-2fcc-422d-8b7c-48940da315ad-c000.csv processed...		(1743.763418674469)
20210427/part-00000-9132ab46-51c3-4cc3-97de-e7ad5312b852-c000.csv processed...		(1212.925900220871)
20210419/part-00000-236b99d5-456a-42b2-bd8d-3cbd61d21cc6-c000.csv processed...		(1260.5789487361908)
20210428/part-00000-fc8c8ca1-e655-4a45-b179-c8d9e2dd804c-c000.csv processed...		(2153.1079547405243)
20210429/part-00000-c5dbd994-54d7-4734-adea-0f22d75b23d3-c000.csv processed...		(4843.6583869457245)
20210422/part-00000-3d97d0f8-2572-45e6-bb60-f367c97e7870-c000.csv processed...		(4087.833338737488)
20210425/part-00000-9d23862b-6bbf-48c6-a598-572df1359737-c000.csv processed...		(6606.4776954650

In [238]:
context.shape

(664594, 19)

In [236]:
tdf

Unnamed: 0,user_id,video_id,is_watch,is_share,is_collect,is_comment,watch_start_time,watch_label,pt_d
0,3382908,44786,False,False,False,False,,0,20210423
1,3382908,16813,False,False,False,False,,0,20210423
2,5155013,36965,False,False,False,False,,0,20210423
3,5155013,12968,False,False,False,False,,0,20210423
4,5155013,9742,False,False,False,False,,0,20210423
...,...,...,...,...,...,...,...,...,...
5900280,1197208,10806,True,False,False,False,2021-04-23,0,20210423
5900281,1197208,31020,True,False,False,False,2021-04-23,0,20210423
5900282,1197208,11319,True,False,False,False,2021-04-23,3,20210423
5900283,1197208,6343,True,False,False,False,2021-04-23,0,20210423


In [248]:
all_action = dt.fread(os.path.join(train_data_dir, "all_actions_with_ptd.jay"))
all_action

Unnamed: 0_level_0,user_id,video_id,is_watch,is_share,watch_label,pt_d
Unnamed: 0_level_1,▪▪▪▪▪▪▪▪,▪▪▪▪▪▪▪▪,▪▪▪▪▪▪▪▪,▪▪▪▪▪▪▪▪,▪▪▪▪▪▪▪▪,▪▪▪▪▪▪▪▪
0,4239342,28149,1,0,2,20210502
1,3577036,115,1,0,0,20210502
2,5527504,3636,1,0,5,20210502
3,1117889,12968,1,0,0,20210502
4,1117889,860,1,0,4,20210502
5,1117889,39046,1,0,1,20210502
6,1117889,6693,1,0,0,20210502
7,3463198,30796,1,0,5,20210502
8,3463198,13511,1,0,0,20210502
9,807748,28149,1,0,0,20210502


In [266]:
dt.unique(all_action[dt.f.pt_d == 20210419, 'user_id'])

Unnamed: 0_level_0,user_id
Unnamed: 0_level_1,▪▪▪▪▪▪▪▪
0,2
1,10
2,30
3,43
4,51
5,93
6,114
7,115
8,120
9,124


In [275]:
tab.cbind(dt.Frame({'pt_d': [20210420] * tab.shape[0]}))

In [277]:
tab.key = ('user_id', 'pt_d')

In [282]:
tmp = all_action[:, :, dt.join(tab)]

In [284]:
tmp[dt.f.pt_d == 20210420, :]

Unnamed: 0_level_0,user_id,video_id,is_watch,is_share,watch_label,pt_d,desc_0,desc_3,desc_4,desc_8,…,da_0,da_1,da_2,da_3,da_4
Unnamed: 0_level_1,▪▪▪▪▪▪▪▪,▪▪▪▪▪▪▪▪,▪▪▪▪▪▪▪▪,▪▪▪▪▪▪▪▪,▪▪▪▪▪▪▪▪,▪▪▪▪▪▪▪▪,▪▪▪▪▪▪▪▪,▪▪▪▪▪▪▪▪,▪▪▪▪▪▪▪▪,▪▪▪▪▪▪▪▪,Unnamed: 11_level_1,▪▪▪▪▪▪▪▪,▪▪▪▪▪▪▪▪,▪▪▪▪▪▪▪▪,▪▪▪▪▪▪▪▪,▪▪▪▪▪▪▪▪
0,1628547,6693,1,0,0,20210420,0.0324835,0.0324818,0.0324878,0.0324872,…,0.238563,0.0737447,0.0737447,0.0737447,0.540203
1,1163144,17311,1,0,0,20210420,0.0274978,0.0275037,0.0275049,0.507654,…,0.649882,0.0841381,0.0873212,0.0946125,0.0840458
2,1163144,22598,1,0,9,20210420,0.0274978,0.0275037,0.0275049,0.507654,…,0.649882,0.0841381,0.0873212,0.0946125,0.0840458
3,376022,9994,1,0,1,20210420,0.0266068,0.0265962,0.0266032,0.026605,…,0.0863924,0.086406,0.089773,0.651886,0.0855428
4,5897052,12968,1,0,0,20210420,0.350774,0.0226458,0.0226486,0.022647,…,0.504303,0.0723026,0.0705481,0.0690034,0.283843
5,5182275,27138,1,0,2,20210420,0.0256326,0.0256264,0.0256326,0.769342,…,0.0845231,0.0850983,0.661279,0.0845425,0.0845573
6,3457796,12968,1,0,0,20210420,0.350774,0.0226458,0.0226486,0.022647,…,0.504303,0.0723026,0.0705481,0.0690034,0.283843
7,5568235,12968,1,0,0,20210420,0.350774,0.0226458,0.0226486,0.022647,…,0.504303,0.0723026,0.0705481,0.0690034,0.283843
8,5081645,5239,1,0,5,20210420,0.212675,0.0687802,0.490904,0.028394,…,0.163256,0.0753263,0.145666,0.0739688,0.541782
9,5081645,27010,1,0,2,20210420,0.212675,0.0687802,0.490904,0.028394,…,0.163256,0.0753263,0.145666,0.0739688,0.541782


In [292]:
dt.unique(all_action[:, dt.f.pt_d, dt.sort()]).to_pandas()['pt_d'].tolist()

[20210419,
 20210420,
 20210421,
 20210422,
 20210423,
 20210424,
 20210425,
 20210426,
 20210427,
 20210428,
 20210429,
 20210430,
 20210501,
 20210502]

In [293]:
pt_d = [20210419, 20210420, 20210421, 20210422, 20210423, 20210424, 20210425, 20210426, 20210427, 20210428, 20210429, 20210430, 20210501, 20210502]

In [294]:
context_data_dir = os.path.join(train_data_dir, 'user_context')
window_size = 3
weight = [.2, .3, .5]
for i, date in enumerate(pt_d):
    if i < window_size:
        start = 0
    else:
        start = i - window_size
    
    if i == 0:
        process_cold_start(all_action[dt.f.pt_d == date])
        continue
    end = i - 1
    tabs = None
    for j, d in enumerate(pt[start: end+1]):
        t_tab = dt.fread(os.path.join(context_data_dir, f"{d}.jay"))
        

14