In [38]:
import pandas as pd 
import os
import time 
import gc
from tqdm import tqdm
from utils import reduce_mem, uAUC, ProNE, HyperParam, get_logger
from sklearn.metrics import *
import torch
import logging
from gensim.models import word2vec
from sklearn.decomposition import PCA, TruncatedSVD, SparsePCA
import networkx as nx
from sklearn.preprocessing import LabelEncoder,OneHotEncoder, StandardScaler, MinMaxScaler
from sklearn.feature_extraction.text import CountVectorizer,TfidfVectorizer,HashingVectorizer
import warnings

pd.set_option('display.max_columns', None)
warnings.filterwarnings("ignore")

## 训练集采样，减少样本

In [2]:
train = pd.read_pickle("data/train.pkl")
train['index'] = train.index
test = pd.read_pickle("data/test.pkl")
user_info = pd.read_pickle("data/user_info.pkl")
video_info = pd.read_pickle("data/video_info.pkl")
print(train.shape, test.shape, user_info.shape, video_info.shape)

(80276856, 9) (2822180, 2) (5910800, 8) (49731, 10)


In [3]:
train_pos = train[train['is_watch'] == 1]
train_neg = train[train['is_watch'] == 0]
print("正样本个数 {}, 负样本个数 {}".format(train_pos.shape[0], train_neg.shape[0]))

test_userid = test['user_id'].unique()
train_neg1 = train_neg[train_neg['user_id'].isin(test_userid)]
train_neg2 = train_neg[~train_neg['user_id'].isin(test_userid)]
train_neg2 = train_neg2.sample(frac=0.1, random_state=2021)
print("负样本个数：", train_neg1.shape, train_neg2.shape)


train = pd.concat([train_pos, train_neg1, train_neg2], ignore_index=True)
print(train.shape)

train.sort_values(by='index', inplace=True)
train['date_'] = train['pt_d'].map(dict(zip(list(train['pt_d'].unique()), range(1, 15))))
del train['index'], train['pt_d']

正样本个数 7353024, 负样本个数 72923832
负样本个数： (18726628, 9) (5419720, 9)
(31499372, 9)


In [4]:
train = train.reset_index(drop=True)
test['date_'] = 15
print(train.shape, test.shape)

(31499372, 8) (2822180, 3)


In [5]:
print("训练集用户数: {}, 测试集用户数: {}".format(train['user_id'].nunique(), test['user_id'].nunique()))
print("训练集和测试集用户重合数: {}".format(len(set(train['user_id'].unique()) & set( test['user_id'].unique()))))

print("训练集视频数: {}, 测试集视频数: {}".format(train['video_id'].nunique(), test['video_id'].nunique()))
print("训练集和测试集视频重合数: {}".format(len(set(train['video_id'].unique()) & set( test['video_id'].unique()))))

训练集用户数: 3170872, 测试集用户数: 492174
训练集和测试集用户重合数: 422143
训练集视频数: 30540, 测试集视频数: 23686
训练集和测试集视频重合数: 23364


## 视频信息表的处理

In [6]:
video_info = pd.read_pickle("data/video_info.pkl")
video_info['video_release_date'] = pd.to_datetime(video_info['video_release_date'])
## 发布的年份
video_info['video_release_year'] = video_info['video_release_date'].apply(lambda x: x.year)
## 发布至今有多少天了
video_info['video_release_ndays'] = pd.to_datetime('20210501') - video_info['video_release_date']
video_info['video_release_ndays'] = video_info['video_release_ndays'].apply(lambda x: x.days)

In [7]:
video_info = video_info[['video_id', 'video_score', 'video_second_class', 'video_duration',
                         'video_release_year', 'video_release_ndays']]
video_info['video_class'] = video_info['video_second_class'].fillna('剧情').apply(lambda x: x.split(',')[0].strip())
del video_info['video_second_class']

In [8]:
### 保存

train.to_pickle("data/features/train.pkl")
test.to_pickle("data/features/test.pkl")
video_info.to_pickle("data/features/video_info.pkl")
user_info.to_pickle("data/features/user_info.pkl")

## 特征工程

### word2vec特征

In [62]:
df = pd.concat([train, test], ignore_index=True)
print(df.shape)

del train, test
gc.collect()

df.columns = ['userid', 'feedid', 'is_watch', 'is_share', 'is_collect', 'is_comment', 'watch_label', 'date_']

(34321552, 8)


0

In [79]:
df['feedid'] = df['feedid'].astype(str)
user_dict = df.groupby('userid')['feedid'].agg(list)
user_fid_list = user_dict.values.tolist()
print("序列的个数: {}".format(len(user_fid_list)))

## 训练word2vec 
logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s', level=logging.INFO)

序列的个数: 3240903


In [81]:
model = word2vec.Word2Vec(user_fid_list,
                          min_count=1, window=20, size=32, sg=1, workers=4, iter=5) 

2021-08-16 14:31:29,380 : INFO : collecting all words and their counts
2021-08-16 14:31:29,382 : INFO : PROGRESS: at sentence #0, processed 0 words, keeping 0 word types
2021-08-16 14:31:29,420 : INFO : PROGRESS: at sentence #10000, processed 105406 words, keeping 11622 word types
2021-08-16 14:31:29,471 : INFO : PROGRESS: at sentence #20000, processed 214615 words, keeping 14858 word types
2021-08-16 14:31:29,511 : INFO : PROGRESS: at sentence #30000, processed 317769 words, keeping 16570 word types
2021-08-16 14:31:29,554 : INFO : PROGRESS: at sentence #40000, processed 426139 words, keeping 17671 word types
2021-08-16 14:31:29,590 : INFO : PROGRESS: at sentence #50000, processed 530209 words, keeping 18500 word types
2021-08-16 14:31:29,629 : INFO : PROGRESS: at sentence #60000, processed 637890 words, keeping 19388 word types
2021-08-16 14:31:29,670 : INFO : PROGRESS: at sentence #70000, processed 739524 words, keeping 19820 word types
2021-08-16 14:31:29,705 : INFO : PROGRESS: at 

2021-08-16 14:31:32,198 : INFO : PROGRESS: at sentence #720000, processed 7590215 words, keeping 26238 word types
2021-08-16 14:31:32,240 : INFO : PROGRESS: at sentence #730000, processed 7692417 words, keeping 26276 word types
2021-08-16 14:31:32,282 : INFO : PROGRESS: at sentence #740000, processed 7802620 words, keeping 26320 word types
2021-08-16 14:31:32,324 : INFO : PROGRESS: at sentence #750000, processed 7907550 words, keeping 26352 word types
2021-08-16 14:31:32,358 : INFO : PROGRESS: at sentence #760000, processed 8015258 words, keeping 26399 word types
2021-08-16 14:31:32,393 : INFO : PROGRESS: at sentence #770000, processed 8118069 words, keeping 26441 word types
2021-08-16 14:31:32,432 : INFO : PROGRESS: at sentence #780000, processed 8223761 words, keeping 26483 word types
2021-08-16 14:31:32,468 : INFO : PROGRESS: at sentence #790000, processed 8323479 words, keeping 26527 word types
2021-08-16 14:31:32,504 : INFO : PROGRESS: at sentence #800000, processed 8428878 words,

2021-08-16 14:31:34,929 : INFO : PROGRESS: at sentence #1440000, processed 15216128 words, keeping 28361 word types
2021-08-16 14:31:34,969 : INFO : PROGRESS: at sentence #1450000, processed 15323234 words, keeping 28385 word types
2021-08-16 14:31:35,004 : INFO : PROGRESS: at sentence #1460000, processed 15428451 words, keeping 28406 word types
2021-08-16 14:31:35,043 : INFO : PROGRESS: at sentence #1470000, processed 15536980 words, keeping 28421 word types
2021-08-16 14:31:35,078 : INFO : PROGRESS: at sentence #1480000, processed 15644727 words, keeping 28439 word types
2021-08-16 14:31:35,115 : INFO : PROGRESS: at sentence #1490000, processed 15752985 words, keeping 28462 word types
2021-08-16 14:31:35,148 : INFO : PROGRESS: at sentence #1500000, processed 15854513 words, keeping 28481 word types
2021-08-16 14:31:35,190 : INFO : PROGRESS: at sentence #1510000, processed 15974920 words, keeping 28510 word types
2021-08-16 14:31:35,231 : INFO : PROGRESS: at sentence #1520000, process

2021-08-16 14:31:37,672 : INFO : PROGRESS: at sentence #2150000, processed 22811897 words, keeping 29568 word types
2021-08-16 14:31:37,707 : INFO : PROGRESS: at sentence #2160000, processed 22926771 words, keeping 29587 word types
2021-08-16 14:31:37,748 : INFO : PROGRESS: at sentence #2170000, processed 23028868 words, keeping 29602 word types
2021-08-16 14:31:37,784 : INFO : PROGRESS: at sentence #2180000, processed 23137388 words, keeping 29610 word types
2021-08-16 14:31:37,824 : INFO : PROGRESS: at sentence #2190000, processed 23241133 words, keeping 29625 word types
2021-08-16 14:31:37,860 : INFO : PROGRESS: at sentence #2200000, processed 23343598 words, keeping 29627 word types
2021-08-16 14:31:37,901 : INFO : PROGRESS: at sentence #2210000, processed 23446369 words, keeping 29644 word types
2021-08-16 14:31:37,938 : INFO : PROGRESS: at sentence #2220000, processed 23555504 words, keeping 29660 word types
2021-08-16 14:31:37,975 : INFO : PROGRESS: at sentence #2230000, process

2021-08-16 14:32:07,137 : INFO : EPOCH 1 - PROGRESS: at 18.97% examples, 239807 words/s, in_qsize 7, out_qsize 0
2021-08-16 14:32:08,173 : INFO : EPOCH 1 - PROGRESS: at 19.67% examples, 238950 words/s, in_qsize 7, out_qsize 0
2021-08-16 14:32:09,248 : INFO : EPOCH 1 - PROGRESS: at 20.48% examples, 239031 words/s, in_qsize 7, out_qsize 0
2021-08-16 14:32:10,257 : INFO : EPOCH 1 - PROGRESS: at 21.38% examples, 240468 words/s, in_qsize 7, out_qsize 0
2021-08-16 14:32:11,267 : INFO : EPOCH 1 - PROGRESS: at 22.12% examples, 239885 words/s, in_qsize 7, out_qsize 0
2021-08-16 14:32:12,295 : INFO : EPOCH 1 - PROGRESS: at 22.94% examples, 240111 words/s, in_qsize 7, out_qsize 0
2021-08-16 14:32:13,381 : INFO : EPOCH 1 - PROGRESS: at 23.78% examples, 240165 words/s, in_qsize 8, out_qsize 0
2021-08-16 14:32:14,400 : INFO : EPOCH 1 - PROGRESS: at 24.67% examples, 240967 words/s, in_qsize 7, out_qsize 0
2021-08-16 14:32:15,412 : INFO : EPOCH 1 - PROGRESS: at 25.53% examples, 241482 words/s, in_qsiz

2021-08-16 14:33:22,193 : INFO : EPOCH 1 - PROGRESS: at 82.92% examples, 260568 words/s, in_qsize 7, out_qsize 0
2021-08-16 14:33:23,248 : INFO : EPOCH 1 - PROGRESS: at 83.90% examples, 260695 words/s, in_qsize 7, out_qsize 0
2021-08-16 14:33:24,252 : INFO : EPOCH 1 - PROGRESS: at 84.78% examples, 260762 words/s, in_qsize 7, out_qsize 0
2021-08-16 14:33:25,288 : INFO : EPOCH 1 - PROGRESS: at 85.63% examples, 260679 words/s, in_qsize 7, out_qsize 0
2021-08-16 14:33:26,298 : INFO : EPOCH 1 - PROGRESS: at 86.58% examples, 260922 words/s, in_qsize 7, out_qsize 0
2021-08-16 14:33:27,340 : INFO : EPOCH 1 - PROGRESS: at 87.49% examples, 261174 words/s, in_qsize 7, out_qsize 0
2021-08-16 14:33:28,345 : INFO : EPOCH 1 - PROGRESS: at 88.36% examples, 261329 words/s, in_qsize 7, out_qsize 0
2021-08-16 14:33:29,368 : INFO : EPOCH 1 - PROGRESS: at 89.17% examples, 261276 words/s, in_qsize 6, out_qsize 1
2021-08-16 14:33:30,398 : INFO : EPOCH 1 - PROGRESS: at 90.05% examples, 261297 words/s, in_qsiz

2021-08-16 14:34:33,639 : INFO : EPOCH 2 - PROGRESS: at 50.54% examples, 300577 words/s, in_qsize 7, out_qsize 0
2021-08-16 14:34:34,701 : INFO : EPOCH 2 - PROGRESS: at 51.56% examples, 300711 words/s, in_qsize 7, out_qsize 0
2021-08-16 14:34:35,730 : INFO : EPOCH 2 - PROGRESS: at 52.62% examples, 300974 words/s, in_qsize 7, out_qsize 0
2021-08-16 14:34:36,735 : INFO : EPOCH 2 - PROGRESS: at 53.65% examples, 301188 words/s, in_qsize 7, out_qsize 0
2021-08-16 14:34:37,738 : INFO : EPOCH 2 - PROGRESS: at 54.63% examples, 301282 words/s, in_qsize 7, out_qsize 0
2021-08-16 14:34:38,744 : INFO : EPOCH 2 - PROGRESS: at 55.60% examples, 301360 words/s, in_qsize 7, out_qsize 0
2021-08-16 14:34:39,767 : INFO : EPOCH 2 - PROGRESS: at 56.57% examples, 301324 words/s, in_qsize 7, out_qsize 0
2021-08-16 14:34:40,779 : INFO : EPOCH 2 - PROGRESS: at 57.54% examples, 301537 words/s, in_qsize 7, out_qsize 0
2021-08-16 14:34:41,803 : INFO : EPOCH 2 - PROGRESS: at 58.49% examples, 301377 words/s, in_qsiz

2021-08-16 14:35:44,604 : INFO : EPOCH 3 - PROGRESS: at 20.10% examples, 303764 words/s, in_qsize 8, out_qsize 0
2021-08-16 14:35:45,630 : INFO : EPOCH 3 - PROGRESS: at 21.09% examples, 303856 words/s, in_qsize 7, out_qsize 0
2021-08-16 14:35:46,667 : INFO : EPOCH 3 - PROGRESS: at 22.12% examples, 303932 words/s, in_qsize 7, out_qsize 0
2021-08-16 14:35:47,675 : INFO : EPOCH 3 - PROGRESS: at 23.17% examples, 304725 words/s, in_qsize 7, out_qsize 0
2021-08-16 14:35:48,679 : INFO : EPOCH 3 - PROGRESS: at 24.13% examples, 304407 words/s, in_qsize 7, out_qsize 0
2021-08-16 14:35:49,710 : INFO : EPOCH 3 - PROGRESS: at 25.21% examples, 304870 words/s, in_qsize 7, out_qsize 0
2021-08-16 14:35:50,736 : INFO : EPOCH 3 - PROGRESS: at 26.21% examples, 304975 words/s, in_qsize 7, out_qsize 0
2021-08-16 14:35:51,742 : INFO : EPOCH 3 - PROGRESS: at 27.22% examples, 305355 words/s, in_qsize 7, out_qsize 0
2021-08-16 14:35:52,757 : INFO : EPOCH 3 - PROGRESS: at 28.28% examples, 305573 words/s, in_qsiz

2021-08-16 14:36:59,357 : INFO : EPOCH 3 - PROGRESS: at 92.28% examples, 302907 words/s, in_qsize 7, out_qsize 0
2021-08-16 14:37:00,375 : INFO : EPOCH 3 - PROGRESS: at 93.26% examples, 303005 words/s, in_qsize 7, out_qsize 0
2021-08-16 14:37:01,410 : INFO : EPOCH 3 - PROGRESS: at 94.30% examples, 303125 words/s, in_qsize 7, out_qsize 0
2021-08-16 14:37:02,424 : INFO : EPOCH 3 - PROGRESS: at 95.34% examples, 303208 words/s, in_qsize 7, out_qsize 0
2021-08-16 14:37:03,462 : INFO : EPOCH 3 - PROGRESS: at 96.34% examples, 303131 words/s, in_qsize 7, out_qsize 0
2021-08-16 14:37:04,464 : INFO : EPOCH 3 - PROGRESS: at 97.35% examples, 303222 words/s, in_qsize 7, out_qsize 0
2021-08-16 14:37:05,487 : INFO : EPOCH 3 - PROGRESS: at 98.31% examples, 303101 words/s, in_qsize 7, out_qsize 0
2021-08-16 14:37:06,525 : INFO : EPOCH 3 - PROGRESS: at 99.37% examples, 303199 words/s, in_qsize 7, out_qsize 0
2021-08-16 14:37:07,039 : INFO : worker thread finished; awaiting finish of 3 more threads
2021-

2021-08-16 14:38:11,249 : INFO : EPOCH 4 - PROGRESS: at 61.00% examples, 298212 words/s, in_qsize 7, out_qsize 0
2021-08-16 14:38:12,254 : INFO : EPOCH 4 - PROGRESS: at 61.93% examples, 298179 words/s, in_qsize 7, out_qsize 0
2021-08-16 14:38:13,275 : INFO : EPOCH 4 - PROGRESS: at 63.00% examples, 298473 words/s, in_qsize 7, out_qsize 0
2021-08-16 14:38:14,300 : INFO : EPOCH 4 - PROGRESS: at 63.98% examples, 298499 words/s, in_qsize 8, out_qsize 0
2021-08-16 14:38:15,318 : INFO : EPOCH 4 - PROGRESS: at 64.87% examples, 298252 words/s, in_qsize 7, out_qsize 0
2021-08-16 14:38:16,355 : INFO : EPOCH 4 - PROGRESS: at 65.69% examples, 297690 words/s, in_qsize 7, out_qsize 0
2021-08-16 14:38:17,380 : INFO : EPOCH 4 - PROGRESS: at 66.60% examples, 297477 words/s, in_qsize 7, out_qsize 0
2021-08-16 14:38:18,401 : INFO : EPOCH 4 - PROGRESS: at 67.55% examples, 297283 words/s, in_qsize 7, out_qsize 0
2021-08-16 14:38:19,413 : INFO : EPOCH 4 - PROGRESS: at 68.46% examples, 297009 words/s, in_qsiz

2021-08-16 14:39:22,592 : INFO : EPOCH 5 - PROGRESS: at 25.99% examples, 290975 words/s, in_qsize 7, out_qsize 2
2021-08-16 14:39:23,675 : INFO : EPOCH 5 - PROGRESS: at 27.01% examples, 291369 words/s, in_qsize 7, out_qsize 0
2021-08-16 14:39:24,733 : INFO : EPOCH 5 - PROGRESS: at 28.10% examples, 291929 words/s, in_qsize 7, out_qsize 0
2021-08-16 14:39:25,779 : INFO : EPOCH 5 - PROGRESS: at 29.13% examples, 292483 words/s, in_qsize 7, out_qsize 0
2021-08-16 14:39:26,781 : INFO : EPOCH 5 - PROGRESS: at 30.11% examples, 292928 words/s, in_qsize 7, out_qsize 0
2021-08-16 14:39:27,782 : INFO : EPOCH 5 - PROGRESS: at 31.10% examples, 293305 words/s, in_qsize 7, out_qsize 0
2021-08-16 14:39:28,881 : INFO : EPOCH 5 - PROGRESS: at 32.17% examples, 293131 words/s, in_qsize 7, out_qsize 0
2021-08-16 14:39:29,916 : INFO : EPOCH 5 - PROGRESS: at 33.08% examples, 292724 words/s, in_qsize 6, out_qsize 1
2021-08-16 14:39:30,926 : INFO : EPOCH 5 - PROGRESS: at 34.00% examples, 292533 words/s, in_qsiz

2021-08-16 14:40:37,810 : INFO : EPOCH 5 - PROGRESS: at 95.51% examples, 290294 words/s, in_qsize 7, out_qsize 0
2021-08-16 14:40:38,890 : INFO : EPOCH 5 - PROGRESS: at 96.56% examples, 290398 words/s, in_qsize 7, out_qsize 0
2021-08-16 14:40:39,950 : INFO : EPOCH 5 - PROGRESS: at 97.60% examples, 290540 words/s, in_qsize 7, out_qsize 0
2021-08-16 14:40:40,958 : INFO : EPOCH 5 - PROGRESS: at 98.59% examples, 290674 words/s, in_qsize 7, out_qsize 0
2021-08-16 14:40:41,965 : INFO : EPOCH 5 - PROGRESS: at 99.62% examples, 290795 words/s, in_qsize 7, out_qsize 0
2021-08-16 14:40:42,260 : INFO : worker thread finished; awaiting finish of 3 more threads
2021-08-16 14:40:42,341 : INFO : worker thread finished; awaiting finish of 2 more threads
2021-08-16 14:40:42,361 : INFO : worker thread finished; awaiting finish of 1 more threads
2021-08-16 14:40:42,382 : INFO : worker thread finished; awaiting finish of 0 more threads
2021-08-16 14:40:42,383 : INFO : EPOCH - 5 : training on 34321552 raw w

In [92]:
## 将每个feedid的向量保存为pickle
emb_size = 32
feed_emb = pd.DataFrame({'feedid': df['feedid'].unique()})
print(feed_emb.shape)

w2v_fid_mat = []
null_cnt = 0
for fid in feed_emb.feedid.values:
    try:
        emb = model.wv[fid]
    except:
        emb = np.zeros(emb_size)
        null_cnt += 1
    w2v_fid_mat.append(emb)
w2v_fid_mat = np.array(w2v_fid_mat, dtype=np.float32)
print("unknown nums: {}".format(null_cnt))

fid_w2v_emb = pd.concat([feed_emb, pd.DataFrame(w2v_fid_mat, 
                                                columns=['fid_w2v_emb{}'.format(i) for i in range(emb_size)])], 
                        axis=1)
fid_w2v_emb['feedid'] = fid_w2v_emb['feedid'].astype(int)
fid_w2v_emb.to_pickle("data/features/fid_w2v_emb.pkl")

(30862, 1)
unknown nums: 0


### 行为序列的tfidf-svd特征

In [100]:
df['feedid'] = df['feedid'].astype(int)

In [101]:
def tfidf_svd(df, f1, f2, n_components=32):
    tmp     = df.groupby(f1, as_index=False)[f2].agg({'list': lambda x: ' '.join(list(x.astype('str')))})
    tfidf   = TfidfVectorizer(max_df=0.95, min_df=3, sublinear_tf=True)
    res     = tfidf.fit_transform(tmp['list']) 
    print('svd start')
    svd     = TruncatedSVD(n_components=n_components, random_state=2021)
    svd_res = svd.fit_transform(res)
    print('svd finished')
    for i in (range(n_components)):
        tmp['{}_{}_tfidf_svd_{}'.format(f1, f2, i)] = svd_res[:, i]
        tmp['{}_{}_tfidf_svd_{}'.format(f1, f2, i)] = tmp['{}_{}_tfidf_svd_{}'.format(f1, f2, i)].astype(np.float32)
    del tmp['list']
    return tmp

In [103]:
%%time

fid_tfidf_svd_emb = tfidf_svd(df, 'feedid', 'userid')
uid_tfidf_svd_emb = tfidf_svd(df, 'userid', 'feedid')

fid_tfidf_svd_emb.to_pickle("data/features/fid_tfidf_svd_emb.pkl")
uid_tfidf_svd_emb.to_pickle("data/features/uid_tfidf_svd_emb.pkl")

svd start
svd finished
svd start
svd finished
CPU times: user 9min 59s, sys: 15.5 s, total: 10min 15s
Wall time: 9min 17s


### 统计特征和CTR特征

In [46]:
train = pd.read_pickle("data/features/train.pkl")
test = pd.read_pickle("data/features/test.pkl")
user_info = pd.read_pickle("data/features/user_info.pkl")
video_info = pd.read_pickle("data/features/video_info.pkl")

df = pd.concat([train, test], ignore_index=True)
df.rename(columns={'user_id': 'userid', 'video_id': 'feedid'}, inplace=True)

del train, test
gc.collect()
print(df.shape)

user_info = reduce_mem(user_info, user_info.columns)
user_info.rename(columns={'user_id': 'userid'}, inplace=True)


video_info['video_score'] = video_info['video_score'].fillna(video_info['video_score'].mean())
video_info['video_release_year'] = video_info['video_release_year'].fillna(video_info['video_release_year'].mean())
video_info['video_release_ndays'] = video_info['video_release_ndays'].fillna(video_info['video_release_ndays'].mean())
video_info['video_class'] = LabelEncoder().fit_transform(video_info['video_class'])
video_info = reduce_mem(video_info, video_info.columns)
video_info.rename(columns={'video_id': 'feedid'}, inplace=True)


df = df.merge(user_info, how='left', on='userid')
df = df.merge(video_info, how='left', on='feedid')


y_list = ['is_watch', 'is_share', 'is_collect', 'is_comment', 'watch_label']

df = reduce_mem(df, cols=[col for col in df.columns if col not in y_list])

  0%|          | 0/8 [00:00<?, ?it/s]

(34321552, 8)


100%|██████████| 8/8 [00:01<00:00,  7.87it/s]
100%|██████████| 6/6 [00:00<00:00, 699.38it/s]


360.77 Mb, 73.28 Mb (79.69 %)
2.28 Mb, 0.62 Mb (72.91 %)


100%|██████████| 15/15 [00:02<00:00,  5.95it/s]

3371.35 Mb, 2487.60 Mb (26.21 %)





In [47]:
df['video_class'] = df['video_class'].fillna(14.0)
df['video_class'] = df['video_class'].astype(np.int32)

for col in ['video_score', 'video_duration', 'video_release_year', 'video_release_ndays']:
    df[col] = df[col].fillna(df[col].astype(np.float32).mean())

In [48]:
df.head()

Unnamed: 0,userid,feedid,is_watch,is_share,is_collect,is_comment,watch_label,date_,age,gender,country,province,city,city_level,device_name,video_score,video_duration,video_release_year,video_release_ndays,video_class
0,214949,23241,0.0,0.0,0.0,0.0,0.0,1,5,0,0,5,54,0,219,7.699219,5376.0,2021.0,23.0,27
1,3364496,23047,0.0,0.0,0.0,0.0,0.0,1,2,0,0,8,163,1,249,6.601562,4960.0,2016.0,1673.0,16
2,3364496,45506,0.0,0.0,0.0,0.0,0.0,1,2,0,0,8,163,1,249,6.621094,4244.0,1008.5,3144.0,14
3,3364496,23116,0.0,0.0,0.0,0.0,0.0,1,2,0,0,8,163,1,249,7.398438,6432.0,2017.0,1359.0,15
4,3364496,43058,0.0,0.0,0.0,0.0,0.0,1,2,0,0,8,163,1,249,7.199219,1185.0,2001.0,7428.0,14


In [49]:
## 统计历史5天的曝光、转化、视频观看等情况（此处的转化率统计其实就是target encoding）
n_day = 5
max_day = 15
start_time = time.time()


for stat_cols in ([
    ['userid'], ['feedid'], ['age'], ['gender'], ['city'], ['city_level'], ['video_class'],
    ['feedid', 'age'],  ['feedid', 'gender'], ['feedid', 'city']]):
    
    f = '_'.join(stat_cols)
    print('======== ' + f + ' =========')
    stat_df = pd.DataFrame()
    for target_day in range(2, max_day + 1):
        left, right = max(target_day - n_day, 1), target_day - 1
        
        tmp = df[((df['date_'] >= left) & (df['date_'] <= right))].reset_index(drop=True)
        tmp['date_'] = target_day
        tmp['{}_{}day_count'.format(f, n_day)] = tmp.groupby(stat_cols)['date_'].transform('count')
        
        g = tmp.groupby(stat_cols)
        
        # 特征列
        feats = ['{}_{}day_count'.format(f, n_day)]

        for y in y_list:
            tmp['{}_{}day_{}_sum'.format(f, n_day, y)] = g[y].transform('sum')
            tmp['{}_{}day_{}_mean'.format(f, n_day, y)] = g[y].transform('mean')
            feats.extend(['{}_{}day_{}_sum'.format(f, n_day, y), '{}_{}day_{}_mean'.format(f, n_day, y)])
        
        tmp = tmp[stat_cols + feats + ['date_']].drop_duplicates(stat_cols + ['date_']).reset_index(drop=True)
        stat_df = pd.concat([stat_df, tmp], axis=0, ignore_index=True)
        del g, tmp
    
    stat_df = reduce_mem(stat_df, [f for f in stat_df.columns if f not in stat_cols + ['date_'] + y_list])
    df = df.merge(stat_df, on=stat_cols + ['date_'], how='left')
    del stat_df
    gc.collect()
    print("time costed: {}(s)".format(round(time.time() - start_time, 2)))



100%|██████████| 11/11 [00:06<00:00,  1.80it/s]


1873.23 Mb, 636.90 Mb (66.00 %)
time costed: 125.56


100%|██████████| 11/11 [00:00<00:00, 134.37it/s]


35.23 Mb, 14.09 Mb (60.00 %)
time costed: 211.8


100%|██████████| 11/11 [00:00<00:00, 1551.51it/s]


0.01 Mb, 0.00 Mb (61.03 %)
time costed: 307.14


100%|██████████| 11/11 [00:00<00:00, 1326.20it/s]


0.00 Mb, 0.00 Mb (59.97 %)
time costed: 435.22


100%|██████████| 11/11 [00:00<00:00, 1160.10it/s]


0.44 Mb, 0.16 Mb (63.25 %)
time costed: 594.14


100%|██████████| 11/11 [00:00<00:00, 2287.88it/s]


0.01 Mb, 0.00 Mb (61.03 %)
time costed: 799.68


100%|██████████| 11/11 [00:00<00:00, 1236.53it/s]


0.11 Mb, 0.04 Mb (59.93 %)
time costed: 1049.29


100%|██████████| 11/11 [00:00<00:00, 27.42it/s]


181.03 Mb, 73.49 Mb (59.41 %)
time costed: 1378.6


100%|██████████| 11/11 [00:00<00:00, 67.18it/s]


77.24 Mb, 31.35 Mb (59.41 %)
time costed: 1764.02


100%|██████████| 11/11 [00:08<00:00,  1.35it/s]


2434.13 Mb, 859.10 Mb (64.71 %)
time costed: 2328.71


In [51]:
df.shape

(34321552, 130)

In [52]:
df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 34321552 entries, 0 to 34321551
Columns: 130 entries, userid to feedid_city_5day_watch_label_mean
dtypes: float16(89), float32(15), float64(15), int16(2), int32(3), int8(6)
memory usage: 12.4 GB


In [55]:
# df.to_pickle("data/features/df_v1.pkl")

In [54]:
df.to_feather("data/features/df_v1.feather")