In [1]:
import pandas as pd
import pickle
import datetime
import numpy as np
from collections import Counter
from itertools import chain 
import re
pd.set_option("display.max_columns",5000)
import matplotlib.pyplot as plt

In [2]:
# Candidate metadata column names (not referenced by the cells visible below).
col = [
    'aid', 'title', 'year', 'nation', 'language',
    'types', 'tags', 'actors', 'directors',
]

# Yesterday, both as a datetime and as a YYYYMMDD string.
date = datetime.datetime.today() - datetime.timedelta(days=1)
yes_date = (datetime.datetime.today() - datetime.timedelta(days=1)).strftime("%Y%m%d")

media = pd.read_parquet("/data/gangyanyuan/data_rr/doubaninfo_detail_merged_rr_v2")

# Keep rows of the '少儿' channel whose operate_status is '1' ...
ldd = media.loc[(media['channel_name'] == '少儿') & (media['operate_status'] == '1')]
print(len(ldd))

# ... then only those whose vendor_list contains "310".
ldd = ldd.loc[ldd['vendor_list'].apply(lambda vendors: "310" in vendors)]
print(len(ldd))

# Distinct aid count per content_type for the remaining rows.
ldd.groupby("content_type")['aid'].nunique()

20236
12081


content_type
1    12081
Name: aid, dtype: int64

# get data

In [3]:
# Read a single day of the click/exposure log (partition 2023-08-11) and show
# the first two rows to inspect its columns.
log = pd.read_parquet(
    "/data/gangyanyuan/data_rr/user_click_exposure_video_raw_ldd_rr/year=2023/month=8/day=11"
)
log.head(2)

Unnamed: 0,dnum,activeName,baseCid,cid,dateTime,position,columnsName,algo_type
0,414159675,click,TZvnQgWOjc1665211858040,YxZosVLdrc1612689195477,2023-08-11 11:37:40,"{0,2}",【猜你喜欢↓↓↓】,trex
1,545185226,show,HgCggrKDkP1629110445085,EcjztyVhJg1689303456414,2023-08-11 15:47:36,"{0,2}",热播风向标：和全球小伙伴一起看,


In [4]:
def get_dnum_eps_ndays(date_range):
    """Build per-dnum daily click/show counts and split them by the cold-start flag.

    For each of the ``date_range`` days ending yesterday, the matching partition
    of the click/exposure log is read, restricted to rows whose ``baseCid`` is
    an ``aid`` of the notebook-level ``media`` frame and whose ``algo_type`` is
    not the empty string, and ``cid`` is counted per (dnum, algo_type) for
    ``activeName == 'click'`` and ``activeName == 'show'``.

    Parameters
    ----------
    date_range : int
        Number of days to load, counted back from yesterday. Must be >= 1.

    Returns
    -------
    tuple of (pandas.DataFrame, pandas.DataFrame)
        ``(cold_start_users, non_cold_start_users)``. Both frames have the
        columns ``dnum``, ``algo_type``, ``clk_pv``, ``show_pv``, ``date``
        (``YYYYMMDD`` string) and ``is_cold_start``. ``clk_pv`` / ``show_pv``
        are NaN when a (dnum, algo_type) pair has no rows of that kind that day.

    Raises
    ------
    ValueError
        If ``date_range`` is smaller than 1.
    """
    if date_range < 1:
        raise ValueError("date_range must be at least 1")

    # Read the clock once so a run that crosses midnight cannot skip or repeat a day.
    today = datetime.datetime.today()
    # Membership test instead of a merge: a duplicated aid in `media` would
    # otherwise multiply the matching log rows and inflate every count.
    known_aids = media['aid']

    data_dnum = []
    for i in range(date_range):
        date = today - datetime.timedelta(i + 1)
        log = pd.read_parquet(f"/data/gangyanyuan/data_rr/user_click_exposure_video_raw_ldd_rr/year={date.year}/month={date.month}/day={date.day}")
        log = log[
            log.baseCid.isin(known_aids)
            & (log.algo_type != '')
            & log.activeName.isin(['click', 'show'])
        ]
        # Reindexing pins the two count columns (and their order) even when a
        # day has no clicks or no shows, so the rename below cannot misalign.
        dnum = (
            log.groupby(["dnum", "algo_type", "activeName"])["cid"]
            .count()
            .unstack("activeName")
            .reindex(columns=["click", "show"])
            .reset_index()
        )
        dnum.columns = ['dnum', 'algo_type', 'clk_pv', 'show_pv']
        dnum['date'] = date.strftime("%Y%m%d")
        data_dnum.append(dnum)

    # Fresh 0..n-1 index: the per-day frames all start at 0, and duplicate
    # labels make later index-aligned operations ambiguous.
    data_dnum = pd.concat(data_dnum, ignore_index=True)

    # Flag cold-start rows.
    # NOTE(review): as written, a row is flagged when its date lies 1 to 7 days
    # before that dnum's latest date in the loaded window; the latest day itself
    # (and any dnum seen on a single day) is never flagged. That looks inverted
    # relative to "not seen in the previous 7 days" - confirm the intended rule.
    day = pd.to_datetime(data_dnum['date'], format="%Y%m%d")
    last_day = day.groupby(data_dnum['dnum']).transform('max')
    data_dnum['is_cold_start'] = (day - last_day).dt.days.between(-7, -1)

    # Split into flagged and unflagged rows.
    cold_start_users = data_dnum[data_dnum['is_cold_start']].copy()
    non_cold_start_users = data_dnum[~data_dnum['is_cold_start']].copy()

    return cold_start_users, non_cold_start_users

In [None]:
# Nothing above assigns `data_dnum` at notebook level, so on a fresh kernel this
# cell raised NameError. Rebuild it from both halves returned by
# get_dnum_eps_ndays, using the same 8-day window as the get_pay_data(8) call below.
cold_start_users, non_cold_start_users = get_dnum_eps_ndays(8)
data_dnum = pd.concat([cold_start_users, non_cold_start_users], ignore_index=True)
data_dnum.head(10)

In [None]:
# Bucket = last character of dnum ('' for an empty dnum). The slice [-1:] never
# raises on empty strings, and casting first also covers non-string dnum values,
# on which the previous len(x) check raised TypeError.
data_dnum['bucket'] = data_dnum.dnum.astype(str).str[-1:]

In [None]:
def get_pay_data(date_range):
    """Load the daily pay-detail partitions for the last ``date_range`` days.

    Parameters
    ----------
    date_range : int
        Number of days to load, counted back from yesterday.

    Returns
    -------
    pandas.DataFrame
        All partitions concatenated, with two added columns: ``bucket`` (last
        character of ``str(dnum)``, or ``''`` when that string is empty) and
        ``date`` (the partition day as a ``YYYYMMDD`` string).

    Raises
    ------
    ValueError
        Raised by ``pd.concat`` when ``date_range`` yields no partitions.
    """
    # Read the clock once so a run that crosses midnight cannot skip or repeat a day.
    today = datetime.datetime.today()

    pay_data = []
    for i in range(date_range):
        date = today - datetime.timedelta(i + 1)
        path  = f'/data/gangyanyuan/data_rr/vod_ldd_ecpm_daily_details_rr/year={date.year}/month={date.month}/day={date.day}'
        data = pd.read_parquet(path)
        # [-1:] instead of [-1]: an empty dnum yields '' rather than IndexError,
        # matching how the bucket is derived for data_dnum.
        data['bucket'] = data.dnum.astype(str).str[-1:]
        data['date'] = date.strftime("%Y%m%d")
        pay_data.append(data)
    return pd.concat(pay_data)

In [None]:
# Load the pay-detail partitions for the 8 days ending yesterday.
pay_data = get_pay_data(8)

# 7天冷启动和非冷启动的ctr和ecpm

In [None]:
# Row-level CTR stays on data_dnum; a missing click count means zero clicks.
data_dnum['clk_pv'] = data_dnum.clk_pv.fillna(0)
data_dnum['ctr'] = data_dnum['clk_pv'] / data_dnum['show_pv']

# Income has to be matched to exposure rows by (dnum, date). Dividing the two
# frames directly aligns on their unrelated row indexes, and the result used to
# land on pay_data while the plot read `ecpm` from data_dnum.
# Keys are cast to str on both sides - presumably dnum identifies the same
# entity in both tables; verify the formats really match.
income_by_dnum_day = (
    pay_data.assign(dnum=pay_data['dnum'].astype(str))
    .groupby(['dnum', 'date'], as_index=False)['sd_income']
    .sum()
)
dnum_with_income = data_dnum.assign(dnum=data_dnum['dnum'].astype(str)).merge(
    income_by_dnum_day, on=['dnum', 'date'], how='left'
)

# One point per (algo_type, is_cold_start, date): ratios of daily sums rather
# than one point per dnum row.
# NOTE(review): a dnum exposed to several algo_types on one day has its income
# counted under each of them - confirm this attribution is acceptable.
daily_metrics = dnum_with_income.groupby(
    ['algo_type', 'is_cold_start', 'date'], as_index=False
)[['clk_pv', 'show_pv', 'sd_income']].sum()
daily_metrics['ctr'] = daily_metrics['clk_pv'] / daily_metrics['show_pv']
# Same formula as before (income per show); NOTE(review): no x1000 factor is
# applied - confirm the intended eCPM unit.
daily_metrics['ecpm'] = daily_metrics['sd_income'] / daily_metrics['show_pv']

for metric in ['ctr', 'ecpm']:
    for is_cold_start in [True, False]:
        plt.figure(figsize=(40,4))
        for algo_type in ['replace','trex']:
            tmp = daily_metrics[
                (daily_metrics.algo_type == algo_type)
                & (daily_metrics.is_cold_start == is_cold_start)
            ].sort_values("date")
            plt.plot(tmp.date, tmp[metric], label=algo_type+'_'+str(is_cold_start))
        plt.title(metric)
        plt.legend()
        plt.show()

# 当天用户中，前7天未出现过的（冷启动）用户占比

In [None]:
# Total show_pv per (algo_type, date), split by the is_cold_start flag.
# `pd.data_dnum` does not exist; pivot_table is the aggregation that fits the
# arguments. aggfunc='sum' because a share of show_pv needs totals, not the
# default mean. The reindex guarantees both flag columns, in False/True order,
# so the rename below always lines up.
df_date_pv = (
    pd.pivot_table(
        data_dnum,
        index=['algo_type', 'date'],
        columns='is_cold_start',
        values='show_pv',
        aggfunc='sum',
        fill_value=0,
    )
    .reindex(columns=[False, True], fill_value=0)
    .reset_index()
)
df_date_pv.columns=['algo_type','date','show_pv_活跃','show_pv_冷启']
df_date_pv['show_pv_冷启占比'] = df_date_pv['show_pv_冷启']/(df_date_pv['show_pv_活跃']+df_date_pv['show_pv_冷启'])*100

plt.figure(figsize=(20,4))
for algo_type in ['replace','trex']:
    tmp = df_date_pv[(df_date_pv.algo_type==algo_type)].sort_values("date")
    plt.plot(tmp.date, tmp['show_pv_冷启占比'],label=algo_type)
plt.title("show_pv_冷启占比%")
plt.legend()
plt.show()

In [None]:
def get_dnum_per_not_appear_in_last_ndays(data, target_date,n):
    """Per bucket, find the target-day dnums that are absent from the previous ``n`` days.

    Parameters
    ----------
    data : pandas.DataFrame
        Must provide the columns ``dnum``, ``bucket`` and ``date``; ``date`` is
        compared with ``target_date`` as a ``YYYYMMDD`` string. The frame is
        not modified.
    target_date : str
        Day to analyse, formatted ``YYYYMMDD``.
    n : int
        Length of the look-back window in days. The history window is
        ``[target_date - n days, target_date)``.

    Returns
    -------
    pandas.DataFrame
        One row per bucket seen on the target day or inside the window, sorted
        by bucket, with the columns ``bucket``, ``历史N天用户`` (distinct dnums
        in the window), ``当天用户`` (distinct dnums on the target day),
        ``历史N天用户数量``, ``当天用户数量``, ``当天冷启动用户`` (set of
        target-day dnums not in the window), ``当天冷启动用户数量`` and
        ``当天冷启动用户占比`` (percentage of target-day dnums). A bucket missing
        on one side gets an empty array and a count of 0 for that side.
    """
    target_day = datetime.datetime.strptime(target_date, "%Y%m%d")
    date_start = (target_day - datetime.timedelta(n)).strftime("%Y%m%d")

    def _dnums_by_bucket(rows):
        # {bucket: array of distinct dnum} for the given slice of `data`.
        return {bucket: dnums.unique() for bucket, dnums in rows.groupby('bucket')['dnum']}

    # Boolean masks replace the helper column that used to be written onto the
    # caller's frame.
    dates = data['date']
    today_by_bucket = _dnums_by_bucket(data[dates == target_date])
    history_by_bucket = _dnums_by_bucket(data[(dates >= date_start) & (dates < target_date)])

    # Stand-in for a bucket with no rows on one side; keeps the dnum dtype.
    no_dnums = data['dnum'].iloc[:0].unique()

    records = []
    for bucket in sorted(set(today_by_bucket) | set(history_by_bucket)):
        history = history_by_bucket.get(bucket, no_dnums)
        today = today_by_bucket.get(bucket, no_dnums)
        cold_start = set(today) - set(history)
        records.append({
            'bucket': bucket,
            '历史N天用户': history,
            '当天用户': today,
            '历史N天用户数量': len(history),
            '当天用户数量': len(today),
            '当天冷启动用户': cold_start,
            '当天冷启动用户数量': len(cold_start),
        })

    data_bucket = pd.DataFrame(records, columns=[
        'bucket', '历史N天用户', '当天用户', '历史N天用户数量', '当天用户数量',
        '当天冷启动用户', '当天冷启动用户数量',
    ])
    # The 1e-9 term avoids division by zero for buckets with no target-day dnums.
    data_bucket['当天冷启动用户占比'] = (
        data_bucket['当天冷启动用户数量'] / (data_bucket['当天用户数量'] + 1e-9) * 100
    )
    return data_bucket

In [None]:
# Per-bucket cold-start breakdown for 2023-08-16 against the preceding 7 days.
# NOTE(review): the target date is hardcoded while the loaders above read days
# relative to today - confirm '20230816' still falls inside the loaded window.
dnum_bucket = get_dnum_per_not_appear_in_last_ndays(data_dnum, '20230816',7)

In [None]:
# Display the per-bucket result table.
dnum_bucket