In [2]:
from utils import *



# 数据处理

## 用户特征处理  
> (1)对部分特征编码  
>- age：作为有序多分类特征，保持原样。
>- gender: 无序多分类特征，采用one-hot编码
>- country：无序多分类特征，且分类特别少，采用one-hot编码
>- provience：无序多分类特征，分类数较多，采用Frequency编码
>- city：无序多分类特征，分类数较多，采用target编码
>- city_level：有序多分类特征，保持。
>- device_name：无序多分类特征，分类数较多，采用target编码。   

> (2)追加部分统计特征(以1,3,7,14天为时间窗口)  
>- 期限内用户观看视频次数
>- 期限内用户观看视频部数
>- 用户评论视频次数
>- 用户收藏视频次数
>- 用户分享视频数量

> (3)对追加的统计特征做数据平滑,以减小统计的误差。

In [2]:
user_df = load_user()

In [3]:
user_df

Unnamed: 0,user_id,age,gender,country,province,city,city_level,device_name
0,1757005,3,1,0,9,6,3,327
1,17938,0,0,0,4,22,3,327
2,4263520,1,0,0,19,1,5,327
3,1411600,3,0,0,5,138,1,327
4,3992242,2,0,0,0,142,0,327
...,...,...,...,...,...,...,...,...
5910795,3223427,4,0,0,3,3,3,28
5910796,4707826,4,0,0,17,249,1,28
5910797,5907653,0,0,0,11,65,0,28
5910798,3633224,3,0,0,2,57,1,28


### 特征编码

#### age
>原数据为分段编码，有序多分类，保持原样。

#### gender
>性别，采用one-hot编码

In [4]:
col_name = 'gender'
df = user_df[col_name]
new_cols = pd.get_dummies(df)
new_col_names = new_cols.columns.values
new_cols.columns = [f"{col_name}_{col}" for col in new_col_names]
user_df = pd.concat([user_df, new_cols], axis=1)
user_df.drop([col_name], axis=1, inplace=True)
user_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 5910800 entries, 0 to 5910799
Data columns (total 11 columns):
 #   Column       Dtype
---  ------       -----
 0   user_id      int32
 1   age          int32
 2   country      int32
 3   province     int32
 4   city         int32
 5   city_level   int32
 6   device_name  int32
 7   gender_0     uint8
 8   gender_1     uint8
 9   gender_2     uint8
 10  gender_3     uint8
dtypes: int32(7), uint8(4)
memory usage: 180.4 MB


#### country
>无序多分类特征，且分类只有3个，采用one-hot编码

In [5]:
col_name = 'country'
df = user_df[col_name]
new_cols = pd.get_dummies(df)
new_col_names = new_cols.columns.values
new_cols.columns = [f"{col_name}_{col}" for col in new_col_names]
user_df = pd.concat([user_df, new_cols], axis=1)
user_df.drop([col_name], axis=1, inplace=True)
user_df.head()

Unnamed: 0,user_id,age,province,city,city_level,device_name,gender_0,gender_1,gender_2,gender_3,country_0,country_1,country_2
0,1757005,3,9,6,3,327,0,1,0,0,1,0,0
1,17938,0,4,22,3,327,1,0,0,0,1,0,0
2,4263520,1,19,1,5,327,1,0,0,0,1,0,0
3,1411600,3,5,138,1,327,1,0,0,0,1,0,0
4,3992242,2,0,142,0,327,1,0,0,0,1,0,0


In [6]:
user_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 5910800 entries, 0 to 5910799
Data columns (total 13 columns):
 #   Column       Dtype
---  ------       -----
 0   user_id      int32
 1   age          int32
 2   province     int32
 3   city         int32
 4   city_level   int32
 5   device_name  int32
 6   gender_0     uint8
 7   gender_1     uint8
 8   gender_2     uint8
 9   gender_3     uint8
 10  country_0    uint8
 11  country_1    uint8
 12  country_2    uint8
dtypes: int32(6), uint8(7)
memory usage: 174.7 MB


#### provience
>无需多分类特征，分类数较多，所以采用频率编码，再降维,Frequency编码通过计算特征变量中每个值的出现次数来表示该特征的信息。

In [7]:
user_df

Unnamed: 0,user_id,age,province,city,city_level,device_name,gender_0,gender_1,gender_2,gender_3,country_0,country_1,country_2
0,1757005,3,9,6,3,327,0,1,0,0,1,0,0
1,17938,0,4,22,3,327,1,0,0,0,1,0,0
2,4263520,1,19,1,5,327,1,0,0,0,1,0,0
3,1411600,3,5,138,1,327,1,0,0,0,1,0,0
4,3992242,2,0,142,0,327,1,0,0,0,1,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...
5910795,3223427,4,3,3,3,28,1,0,0,0,1,0,0
5910796,4707826,4,17,249,1,28,1,0,0,0,1,0,0
5910797,5907653,0,11,65,0,28,1,0,0,0,1,0,0
5910798,3633224,3,2,57,1,28,1,0,0,0,1,0,0


In [8]:
col_name = 'province'
user_df[col_name] = user_df[col_name].map(user_df[col_name].value_counts())

In [9]:
user_df.head()

Unnamed: 0,user_id,age,province,city,city_level,device_name,gender_0,gender_1,gender_2,gender_3,country_0,country_1,country_2
0,1757005,3,229863,6,3,327,0,1,0,0,1,0,0
1,17938,0,341554,22,3,327,1,0,0,0,1,0,0
2,4263520,1,120423,1,5,327,1,0,0,0,1,0,0
3,1411600,3,314573,138,1,327,1,0,0,0,1,0,0
4,3992242,2,630065,142,0,327,1,0,0,0,1,0,0


#### city
>无序多分类特征，分类数达339，采用frequency

In [10]:
col_name = 'city'
user_df[col_name] = user_df[col_name].map(user_df[col_name].value_counts())
user_df.head()

Unnamed: 0,user_id,age,province,city,city_level,device_name,gender_0,gender_1,gender_2,gender_3,country_0,country_1,country_2
0,1757005,3,229863,96162,3,327,0,1,0,0,1,0,0
1,17938,0,341554,45486,3,327,1,0,0,0,1,0,0
2,4263520,1,120423,120423,5,327,1,0,0,0,1,0,0
3,1411600,3,314573,14566,1,327,1,0,0,0,1,0,0
4,3992242,2,630065,14379,0,327,1,0,0,0,1,0,0


#### city_level
>有序多分类变量，保持

#### device_name
>无序多分类变量，类别基数大，采用target编码:LeaveOneOutEncoder  
参考资料：https://axk51013.medium.com/kaggle-categorical-encoding-3%E5%A4%A7%E7%B5%95%E6%8B%9B-589780119470

- 需采用交叉验证，目前用Frequency代替
col_name = 'device_name'
loo = LeaveOneOutEncoder()
loo.fit_transform(df_tr['color'], df_tr['label'])

In [11]:
user_df

Unnamed: 0,user_id,age,province,city,city_level,device_name,gender_0,gender_1,gender_2,gender_3,country_0,country_1,country_2
0,1757005,3,229863,96162,3,327,0,1,0,0,1,0,0
1,17938,0,341554,45486,3,327,1,0,0,0,1,0,0
2,4263520,1,120423,120423,5,327,1,0,0,0,1,0,0
3,1411600,3,314573,14566,1,327,1,0,0,0,1,0,0
4,3992242,2,630065,14379,0,327,1,0,0,0,1,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...
5910795,3223427,4,343991,113444,3,28,1,0,0,0,1,0,0
5910796,4707826,4,138629,6908,1,28,1,0,0,0,1,0,0
5910797,5907653,0,206316,23276,0,28,1,0,0,0,1,0,0
5910798,3633224,3,437480,25884,1,28,1,0,0,0,1,0,0


使用Frequency方法代替Leave_one_out对device_name编码。

In [12]:
col_name = 'device_name'
user_df[col_name] = user_df[col_name].map(user_df[col_name].value_counts())
user_df.head()

Unnamed: 0,user_id,age,province,city,city_level,device_name,gender_0,gender_1,gender_2,gender_3,country_0,country_1,country_2
0,1757005,3,229863,96162,3,1022,0,1,0,0,1,0,0
1,17938,0,341554,45486,3,1022,1,0,0,0,1,0,0
2,4263520,1,120423,120423,5,1022,1,0,0,0,1,0,0
3,1411600,3,314573,14566,1,1022,1,0,0,0,1,0,0
4,3992242,2,630065,14379,0,1022,1,0,0,0,1,0,0


In [13]:
# save_user_data(user_df, "user_data_v2", "jay")

## 统计特征追加 

In [1]:
from utils import * 



In [2]:
%%time
actions_df = load_actions(all_features = True)

CPU times: user 25.3 s, sys: 1.71 s, total: 27 s
Wall time: 7.94 s


In [3]:
actions_df.head()

Unnamed: 0,user_id,video_id,is_watch,is_share,is_collect,is_comment,watch_start_time,watch_label,pt_d
0,3672407,38350,False,False,False,False,,0,20210427
1,3080901,11907,False,False,False,False,,0,20210427
2,3528503,28411,False,False,False,False,,0,20210427
3,3528503,15070,False,False,False,False,,0,20210427
4,3528503,38350,False,False,False,False,,0,20210427


### 以用户分类

In [7]:
# 加载用户原表
user_df = load_user_modified()

In [8]:
user_groups = actions_df.groupby(['user_id'])

#### 用户14天平均watch_label

In [9]:
average_watch_label = user_groups['watch_label'].agg(['mean'])

In [10]:
average_watch_label = pd.DataFrame(average_watch_label)

In [11]:
average_watch_label = renameCol(average_watch_label, 'mean', 'average_watch_label')

In [12]:
average_watch_label

Unnamed: 0_level_0,average_watch_label
user_id,Unnamed: 1_level_1
2,0.368421
4,0.538462
5,0.000000
6,0.000000
7,0.269231
...,...
5910793,0.000000
5910794,0.029412
5910795,0.000000
5910797,0.000000


In [14]:
user_df = pd.merge(user_df, average_watch_label, on='user_id', how='left')
user_df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 5910800 entries, 0 to 5910799
Data columns (total 14 columns):
 #   Column               Dtype  
---  ------               -----  
 0   user_id              int32  
 1   age                  int32  
 2   province             int32  
 3   city                 int32  
 4   city_level           int32  
 5   device_name          int32  
 6   gender_0             bool   
 7   gender_1             bool   
 8   gender_2             bool   
 9   gender_3             bool   
 10  country_0            bool   
 11  country_1            bool   
 12  country_2            bool   
 13  average_watch_label  float64
dtypes: bool(7), float64(1), int32(6)
memory usage: 264.9 MB


#### 用户14天观看视频次数

In [15]:
sum_watch_times = user_groups['is_watch'].agg(['sum'])

In [16]:
sum_watch_times = pd.DataFrame(sum_watch_times)

In [17]:
sum_watch_times = renameCol(sum_watch_times, 'sum', 'sum_watch_times')

In [18]:
sum_watch_times

Unnamed: 0_level_0,sum_watch_times
user_id,Unnamed: 1_level_1
2,15
4,1
5,0
6,0
7,2
...,...
5910793,1
5910794,2
5910795,0
5910797,1


In [19]:
user_df = pd.merge(user_df, sum_watch_times, on='user_id', how='left')
user_df.info()


<class 'pandas.core.frame.DataFrame'>
Int64Index: 5910800 entries, 0 to 5910799
Data columns (total 15 columns):
 #   Column               Dtype  
---  ------               -----  
 0   user_id              int32  
 1   age                  int32  
 2   province             int32  
 3   city                 int32  
 4   city_level           int32  
 5   device_name          int32  
 6   gender_0             bool   
 7   gender_1             bool   
 8   gender_2             bool   
 9   gender_3             bool   
 10  country_0            bool   
 11  country_1            bool   
 12  country_2            bool   
 13  average_watch_label  float64
 14  sum_watch_times      float64
dtypes: bool(7), float64(2), int32(6)
memory usage: 310.0 MB


In [20]:
user_df.astype('float64')

Unnamed: 0,user_id,age,province,city,city_level,device_name,gender_0,gender_1,gender_2,gender_3,country_0,country_1,country_2,average_watch_label,sum_watch_times
0,1757005.0,3.0,229863.0,96162.0,3.0,1022.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,,
1,17938.0,0.0,341554.0,45486.0,3.0,1022.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.096774,3.0
2,4263520.0,1.0,120423.0,120423.0,5.0,1022.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.204545,2.0
3,1411600.0,3.0,314573.0,14566.0,1.0,1022.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,,
4,3992242.0,2.0,630065.0,14379.0,0.0,1022.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.000000,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
5910795,3223427.0,4.0,343991.0,113444.0,3.0,53566.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,,
5910796,4707826.0,4.0,138629.0,6908.0,1.0,53566.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.142857,3.0
5910797,5907653.0,0.0,206316.0,23276.0,0.0,53566.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.000000,2.0
5910798,3633224.0,3.0,437480.0,25884.0,1.0,53566.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,,


#### 用户14天完播视频次数
> 完播：watch_label>=8，播放80%视频时间。  
! 太慢了，先不做

In [None]:
user_groups['watch_label'].apply(lambda x: sum(x >= 8))

In [22]:
sum(user_groups['watch_label'][:]>=8)

IndexError: Column(s) watch_label already selected

In [None]:
user_groups['watch_label'].apply(lambda x: sum(x >= 8))

#### 用户14天跳过视频次数
> watch_label = 0

In [74]:
user_groups['watch_label'].apply(lambda x: sum(x == 0))

KeyboardInterrupt: 

#### 用户14天评论视频次数

In [21]:
sum_comment_times = user_groups['is_comment'].agg(['sum'])

In [22]:
sum_comment_times = pd.DataFrame(sum_comment_times)

In [23]:
sum_comment_times = renameCol(sum_comment_times, 'sum', 'sum_comment_times')

In [24]:
sum_comment_times

Unnamed: 0_level_0,sum_comment_times
user_id,Unnamed: 1_level_1
2,0
4,0
5,0
6,0
7,0
...,...
5910793,0
5910794,0
5910795,0
5910797,0


In [25]:
user_df = pd.merge(user_df, sum_comment_times, on='user_id', how='left')
user_df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 5910800 entries, 0 to 5910799
Data columns (total 16 columns):
 #   Column               Dtype  
---  ------               -----  
 0   user_id              int32  
 1   age                  int32  
 2   province             int32  
 3   city                 int32  
 4   city_level           int32  
 5   device_name          int32  
 6   gender_0             bool   
 7   gender_1             bool   
 8   gender_2             bool   
 9   gender_3             bool   
 10  country_0            bool   
 11  country_1            bool   
 12  country_2            bool   
 13  average_watch_label  float64
 14  sum_watch_times      float64
 15  sum_comment_times    float64
dtypes: bool(7), float64(3), int32(6)
memory usage: 355.1 MB


In [26]:
user_df = user_df.astype('float64')

#### 用户14天收藏视频次数

In [27]:
sum_collect_times = user_groups['is_collect'].agg(['sum'])

In [28]:
sum_collect_times = pd.DataFrame(sum_collect_times)

In [29]:
sum_collect_times = renameCol(sum_collect_times, 'sum', 'sum_collect_times')

In [31]:
sum_collect_times

Unnamed: 0_level_0,sum_collect_times
user_id,Unnamed: 1_level_1
2,0
4,0
5,0
6,0
7,0
...,...
5910793,0
5910794,0
5910795,0
5910797,0


In [32]:
user_df = pd.merge(user_df, sum_collect_times, on='user_id', how='left')
user_df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 5910800 entries, 0 to 5910799
Data columns (total 17 columns):
 #   Column               Dtype  
---  ------               -----  
 0   user_id              float64
 1   age                  float64
 2   province             float64
 3   city                 float64
 4   city_level           float64
 5   device_name          float64
 6   gender_0             float64
 7   gender_1             float64
 8   gender_2             float64
 9   gender_3             float64
 10  country_0            float64
 11  country_1            float64
 12  country_2            float64
 13  average_watch_label  float64
 14  sum_watch_times      float64
 15  sum_comment_times    float64
 16  sum_collect_times    float64
dtypes: float64(17)
memory usage: 811.7 MB


#### 用户14天分享视频次数

In [33]:
sum_share_times = user_groups['is_share'].agg(['sum'])

In [34]:
sum_share_times = pd.DataFrame(sum_share_times)

In [35]:
sum_share_times = renameCol(sum_share_times, 'sum', 'sum_share_times')

In [36]:
sum_share_times

Unnamed: 0_level_0,sum_share_times
user_id,Unnamed: 1_level_1
2,0
4,0
5,0
6,0
7,0
...,...
5910793,0
5910794,0
5910795,0
5910797,0


In [37]:
user_df = pd.merge(user_df, sum_share_times, on='user_id', how='left')
user_df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 5910800 entries, 0 to 5910799
Data columns (total 18 columns):
 #   Column               Dtype  
---  ------               -----  
 0   user_id              float64
 1   age                  float64
 2   province             float64
 3   city                 float64
 4   city_level           float64
 5   device_name          float64
 6   gender_0             float64
 7   gender_1             float64
 8   gender_2             float64
 9   gender_3             float64
 10  country_0            float64
 11  country_1            float64
 12  country_2            float64
 13  average_watch_label  float64
 14  sum_watch_times      float64
 15  sum_comment_times    float64
 16  sum_collect_times    float64
 17  sum_share_times      float64
dtypes: float64(18)
memory usage: 856.8 MB


In [38]:
user_df.describe()

Unnamed: 0,user_id,age,province,city,city_level,device_name,gender_0,gender_1,gender_2,gender_3,country_0,country_1,country_2,average_watch_label,sum_watch_times,sum_comment_times,sum_collect_times,sum_share_times
count,5910800.0,5910800.0,5910800.0,5910800.0,5910800.0,5910800.0,5910800.0,5910800.0,5910800.0,5910800.0,5910800.0,5910800.0,5910800.0,3953209.0,3953209.0,3953209.0,3953209.0,3953209.0
mean,2955400.0,1.886208,287473.5,39529.89,2.056737,44937.2,0.726119,0.2633371,0.009718312,0.0008256074,0.9991373,0.0008415105,2.114773e-05,0.1216434,1.860014,0.003147317,0.02330233,0.003781738
std,1706301.0,1.564212,163845.7,34885.89,1.6619,34348.39,0.4459487,0.4404438,0.09810132,0.02872152,0.02935837,0.02899659,0.004598618,0.4361378,4.998845,0.1449028,0.2710387,0.07829157
min,0.0,0.0,4880.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
25%,1477700.0,0.0,157164.0,14627.0,1.0,19179.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
50%,2955400.0,2.0,250318.0,25434.0,2.0,38348.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0
75%,4433099.0,3.0,437480.0,51242.0,3.0,64300.0,1.0,1.0,0.0,0.0,1.0,0.0,0.0,0.01785714,2.0,0.0,0.0,0.0
max,5910799.0,7.0,630065.0,129843.0,7.0,161778.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,9.0,1970.0,25.0,71.0,41.0


#### 保存

In [40]:
save_user_data(user_df, 'user_data_with_status')

### 以视频分类

In [4]:
video_df = load_video_modifiled()

In [5]:
video_groups = actions_df.groupby(['video_id'])

#### 视频14天平均watch_label

In [6]:
average_watch_label = video_groups['watch_label'].agg('mean')

In [7]:
average_watch_label = couerGroupToDF(average_watch_label, 'mean', 'average_watch_label')

In [8]:
average_watch_label

Unnamed: 0_level_0,average_watch_label
video_id,Unnamed: 1_level_1
0,0.034268
2,0.119097
3,0.000000
4,0.040000
5,0.000000
...,...
50344,0.061881
50345,0.000000
50347,0.000000
50351,0.059197


In [9]:
video_df = pd.merge(video_df, average_watch_label, on='video_id', how='left')
video_df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 50355 entries, 0 to 50354
Columns: 135 entries, video_id to average_watch_label
dtypes: float64(133), int32(2)
memory usage: 51.9 MB


In [10]:
video_df.head()

Unnamed: 0,video_id,video_score,video_duration,video_name_0,video_name_1,video_name_2,video_name_3,video_name_4,video_name_5,video_name_6,...,video_second_class_7,video_second_class_8,video_second_class_9,video_second_class_10,video_second_class_11,video_second_class_12,video_second_class_13,video_second_class_14,video_second_class_15,average_watch_label
0,3460,7.4,5913,0.043062,-0.07379,0.111853,-0.029774,0.02415,-0.035227,0.032603,...,0.015625,0.015625,0.015625,0.015625,0.015625,0.015625,0.265625,0.265625,0.015625,0.155878
1,14553,5.6,6217,0.041421,-0.012366,0.102273,-0.083679,-0.017905,-0.002874,0.029637,...,0.03125,0.03125,0.03125,0.03125,0.03125,0.03125,0.03125,0.53125,0.03125,0.153834
2,1214,6.8,5963,0.127346,-0.13249,0.105639,0.009054,0.09711,-0.035699,0.022208,...,0.0125,0.0125,0.0125,0.0125,0.0125,0.0125,0.2125,0.0125,0.0125,0.103516
3,30639,,17371,0.062892,-0.053055,0.049159,-0.029343,0.010458,-0.009849,-0.001803,...,0.020833,0.020833,0.020833,0.020833,0.020833,0.020833,0.020833,0.020833,0.020833,
4,38522,7.7,10608,0.071006,-0.093382,0.105073,-0.030376,0.05178,0.00491,0.018744,...,0.015625,0.015625,0.515625,0.015625,0.015625,0.015625,0.015625,0.265625,0.015625,0.0


#### 视频14天被观看次数

In [11]:
sum_watch_times = video_groups['is_watch'].apply(lambda x: sum(x == 1))

In [12]:
sum_watch_times = couerGroupToDF(sum_watch_times, 'sum', 'sum_watch_times')

In [13]:
video_df = pd.merge(video_df, sum_watch_times, on='video_id', how='left')
video_df.head()

Unnamed: 0,video_id,video_score,video_duration,video_name_0,video_name_1,video_name_2,video_name_3,video_name_4,video_name_5,video_name_6,...,video_second_class_8,video_second_class_9,video_second_class_10,video_second_class_11,video_second_class_12,video_second_class_13,video_second_class_14,video_second_class_15,average_watch_label,sum_watch_times
0,3460,7.4,5913,0.043062,-0.07379,0.111853,-0.029774,0.02415,-0.035227,0.032603,...,0.015625,0.015625,0.015625,0.015625,0.015625,0.265625,0.265625,0.015625,0.155878,52.0
1,14553,5.6,6217,0.041421,-0.012366,0.102273,-0.083679,-0.017905,-0.002874,0.029637,...,0.03125,0.03125,0.03125,0.03125,0.03125,0.03125,0.53125,0.03125,0.153834,633.0
2,1214,6.8,5963,0.127346,-0.13249,0.105639,0.009054,0.09711,-0.035699,0.022208,...,0.0125,0.0125,0.0125,0.0125,0.0125,0.2125,0.0125,0.0125,0.103516,63.0
3,30639,,17371,0.062892,-0.053055,0.049159,-0.029343,0.010458,-0.009849,-0.001803,...,0.020833,0.020833,0.020833,0.020833,0.020833,0.020833,0.020833,0.020833,,
4,38522,7.7,10608,0.071006,-0.093382,0.105073,-0.030376,0.05178,0.00491,0.018744,...,0.015625,0.515625,0.015625,0.015625,0.015625,0.015625,0.265625,0.015625,0.0,0.0


#### 视频14天完播数
> 完播次数/总播放次数
> 完播放（watch_label >= 8）

In [14]:
watch_over_times = video_groups['watch_label'].apply(lambda x: sum(x >= 8))

In [16]:
watch_over_times = couerGroupToDF(watch_over_times, 'sum', 'watch_over_times')

In [18]:
watch_over_times

Unnamed: 0_level_0,watch_over_times
video_id,Unnamed: 1_level_1
0,2
2,5
3,0
4,0
5,0
...,...
50344,4
50345,0
50347,0
50351,1


In [19]:
video_df = pd.merge(video_df, watch_over_times, on='video_id', how='left')
video_df.head()

Unnamed: 0,video_id,video_score,video_duration,video_name_0,video_name_1,video_name_2,video_name_3,video_name_4,video_name_5,video_name_6,...,video_second_class_9,video_second_class_10,video_second_class_11,video_second_class_12,video_second_class_13,video_second_class_14,video_second_class_15,average_watch_label,sum_watch_times,watch_over_times
0,3460,7.4,5913,0.043062,-0.07379,0.111853,-0.029774,0.02415,-0.035227,0.032603,...,0.015625,0.015625,0.015625,0.015625,0.265625,0.265625,0.015625,0.155878,52.0,5.0
1,14553,5.6,6217,0.041421,-0.012366,0.102273,-0.083679,-0.017905,-0.002874,0.029637,...,0.03125,0.03125,0.03125,0.03125,0.03125,0.53125,0.03125,0.153834,633.0,50.0
2,1214,6.8,5963,0.127346,-0.13249,0.105639,0.009054,0.09711,-0.035699,0.022208,...,0.0125,0.0125,0.0125,0.0125,0.2125,0.0125,0.0125,0.103516,63.0,2.0
3,30639,,17371,0.062892,-0.053055,0.049159,-0.029343,0.010458,-0.009849,-0.001803,...,0.020833,0.020833,0.020833,0.020833,0.020833,0.020833,0.020833,,,
4,38522,7.7,10608,0.071006,-0.093382,0.105073,-0.030376,0.05178,0.00491,0.018744,...,0.515625,0.015625,0.015625,0.015625,0.015625,0.265625,0.015625,0.0,0.0,0.0


#### 视频14天跳过数
> 跳过（watch_label = 0）

In [20]:
sum_skip_times = video_groups['watch_label'].apply(lambda x: sum(x == 0))

In [21]:
sum_skip_times = couerGroupToDF(sum_skip_times, 'sum_skip_times','sum_skip_times')

In [22]:
sum_skip_times

Unnamed: 0_level_0,sum_skip_times
video_id,Unnamed: 1_level_1
0,957
2,478
3,34
4,124
5,27
...,...
50344,794
50345,1
50347,8
50351,463


In [23]:
video_df = pd.merge(video_df, sum_skip_times, on='video_id', how='left')
video_df.head()

Unnamed: 0,video_id,video_score,video_duration,video_name_0,video_name_1,video_name_2,video_name_3,video_name_4,video_name_5,video_name_6,...,video_second_class_10,video_second_class_11,video_second_class_12,video_second_class_13,video_second_class_14,video_second_class_15,average_watch_label,sum_watch_times,watch_over_times,sum_skip_times
0,3460,7.4,5913,0.043062,-0.07379,0.111853,-0.029774,0.02415,-0.035227,0.032603,...,0.015625,0.015625,0.015625,0.265625,0.265625,0.015625,0.155878,52.0,5.0,725.0
1,14553,5.6,6217,0.041421,-0.012366,0.102273,-0.083679,-0.017905,-0.002874,0.029637,...,0.03125,0.03125,0.03125,0.03125,0.53125,0.03125,0.153834,633.0,50.0,6261.0
2,1214,6.8,5963,0.127346,-0.13249,0.105639,0.009054,0.09711,-0.035699,0.022208,...,0.0125,0.0125,0.0125,0.2125,0.0125,0.0125,0.103516,63.0,2.0,494.0
3,30639,,17371,0.062892,-0.053055,0.049159,-0.029343,0.010458,-0.009849,-0.001803,...,0.020833,0.020833,0.020833,0.020833,0.020833,0.020833,,,,
4,38522,7.7,10608,0.071006,-0.093382,0.105073,-0.030376,0.05178,0.00491,0.018744,...,0.015625,0.015625,0.015625,0.015625,0.265625,0.015625,0.0,0.0,0.0,37.0


#### 视频14天评论数

In [24]:
comments_times = video_groups['is_comment'].agg('sum')

In [25]:
couerGroupToDF = couerGroupToDF(comments_times, 'couerGroupToDF', 'couerGroupToDF')

In [26]:
video_df = pd.merge(video_df, comments_times, on='video_id', how='left')
video_df.head()

Unnamed: 0,video_id,video_score,video_duration,video_name_0,video_name_1,video_name_2,video_name_3,video_name_4,video_name_5,video_name_6,...,video_second_class_11,video_second_class_12,video_second_class_13,video_second_class_14,video_second_class_15,average_watch_label,sum_watch_times,watch_over_times,sum_skip_times,is_comment
0,3460,7.4,5913,0.043062,-0.07379,0.111853,-0.029774,0.02415,-0.035227,0.032603,...,0.015625,0.015625,0.265625,0.265625,0.015625,0.155878,52.0,5.0,725.0,0.0
1,14553,5.6,6217,0.041421,-0.012366,0.102273,-0.083679,-0.017905,-0.002874,0.029637,...,0.03125,0.03125,0.03125,0.53125,0.03125,0.153834,633.0,50.0,6261.0,0.0
2,1214,6.8,5963,0.127346,-0.13249,0.105639,0.009054,0.09711,-0.035699,0.022208,...,0.0125,0.0125,0.2125,0.0125,0.0125,0.103516,63.0,2.0,494.0,0.0
3,30639,,17371,0.062892,-0.053055,0.049159,-0.029343,0.010458,-0.009849,-0.001803,...,0.020833,0.020833,0.020833,0.020833,0.020833,,,,,
4,38522,7.7,10608,0.071006,-0.093382,0.105073,-0.030376,0.05178,0.00491,0.018744,...,0.015625,0.015625,0.015625,0.265625,0.015625,0.0,0.0,0.0,37.0,0.0


#### 视频14天收藏数

In [40]:
from utils import *

In [44]:
collect_times = video_groups['is_collect'].agg('sum')

In [45]:
collect_times = couerGroupToDF(collect_times, 'collect_times', 'collect_times')

In [46]:
collect_times

Unnamed: 0_level_0,collect_times
video_id,Unnamed: 1_level_1
0,0
2,0
3,0
4,0
5,0
...,...
50344,0
50345,0
50347,0
50351,2


In [47]:
video_df = pd.merge(video_df, collect_times, on='video_id', how='left')
video_df.head()

Unnamed: 0,video_id,video_score,video_duration,video_name_0,video_name_1,video_name_2,video_name_3,video_name_4,video_name_5,video_name_6,...,video_second_class_14,video_second_class_15,average_watch_label,sum_watch_times,watch_over_times,sum_skip_times,is_comment,couerGroupToDF_x,couerGroupToDF_y,collect_times
0,3460,7.4,5913,0.043062,-0.07379,0.111853,-0.029774,0.02415,-0.035227,0.032603,...,0.265625,0.015625,0.155878,52.0,5.0,725.0,0.0,0.0,0.0,0.0
1,14553,5.6,6217,0.041421,-0.012366,0.102273,-0.083679,-0.017905,-0.002874,0.029637,...,0.53125,0.03125,0.153834,633.0,50.0,6261.0,0.0,1.0,1.0,1.0
2,1214,6.8,5963,0.127346,-0.13249,0.105639,0.009054,0.09711,-0.035699,0.022208,...,0.0125,0.0125,0.103516,63.0,2.0,494.0,0.0,0.0,0.0,0.0
3,30639,,17371,0.062892,-0.053055,0.049159,-0.029343,0.010458,-0.009849,-0.001803,...,0.020833,0.020833,,,,,,,,
4,38522,7.7,10608,0.071006,-0.093382,0.105073,-0.030376,0.05178,0.00491,0.018744,...,0.265625,0.015625,0.0,0.0,0.0,37.0,0.0,0.0,0.0,0.0


#### 视频14天分享数

In [53]:
share_times = video_groups['is_share'].apply(lambda x: sum(x==1))

In [56]:
share_times = couerGroupToDF(share_times,'share_times','share_times')

In [57]:
share_times

Unnamed: 0_level_0,share_times
video_id,Unnamed: 1_level_1
0,0
2,0
3,0
4,0
5,0
...,...
50344,0
50345,0
50347,0
50351,0


In [58]:
video_df = pd.merge(video_df, share_times, on='video_id', how='left')
video_df.head()

Unnamed: 0,video_id,video_score,video_duration,video_name_0,video_name_1,video_name_2,video_name_3,video_name_4,video_name_5,video_name_6,...,video_second_class_13,video_second_class_14,video_second_class_15,average_watch_label,sum_watch_times,watch_over_times,sum_skip_times,is_comment,collect_times,share_times
0,3460,7.4,5913,0.043062,-0.07379,0.111853,-0.029774,0.02415,-0.035227,0.032603,...,0.265625,0.265625,0.015625,0.155878,52.0,5.0,725.0,0.0,0.0,0.0
1,14553,5.6,6217,0.041421,-0.012366,0.102273,-0.083679,-0.017905,-0.002874,0.029637,...,0.03125,0.53125,0.03125,0.153834,633.0,50.0,6261.0,0.0,1.0,0.0
2,1214,6.8,5963,0.127346,-0.13249,0.105639,0.009054,0.09711,-0.035699,0.022208,...,0.2125,0.0125,0.0125,0.103516,63.0,2.0,494.0,0.0,0.0,0.0
3,30639,,17371,0.062892,-0.053055,0.049159,-0.029343,0.010458,-0.009849,-0.001803,...,0.020833,0.020833,0.020833,,,,,,,
4,38522,7.7,10608,0.071006,-0.093382,0.105073,-0.030376,0.05178,0.00491,0.018744,...,0.015625,0.265625,0.015625,0.0,0.0,0.0,37.0,0.0,0.0,0.0


#### 保存

In [60]:
save_video_temp_data(video_df, 'video_data_with_status')

保存成功
保存路径为： ../../dataset/traindata/video_features_data/video_data_with_status.csv
