In [2]:
import pandas as pd 
import numpy as np
from tqdm import tqdm 
import datetime, time, gc 
# from utils import distance, haversine, standard, pad_seq 
from scipy.stats import skew, kurtosis
from zipfile import ZipFile
from collections import Counter
from sklearn.metrics import roc_auc_score as auc
import lightgbm as lgb
from sklearn.model_selection import KFold
from sklearn.model_selection import GroupKFold
from sklearn.preprocessing import LabelEncoder

pd.set_option('display.max_rows', 500)
pd.set_option('display.max_columns', 500)

In [16]:
def reduce_mem_usage(df):
    start_mem = df.memory_usage().sum() / 1024 ** 2
    for col in tqdm(df.columns):
        col_type = df[col].dtypes
        if col_type != object:
            c_min = df[col].min()
            c_max = df[col].max()
            if str(col_type)[:3] == 'int':
                if c_min > np.iinfo(np.int8).min and c_max < np.iinfo(np.int8).max:
                    df[col] = df[col].astype(np.int8)
                elif c_min > np.iinfo(np.int16).min and c_max < np.iinfo(np.int16).max:
                    df[col] = df[col].astype(np.int16)
                elif c_min > np.iinfo(np.int32).min and c_max < np.iinfo(np.int32).max:
                    df[col] = df[col].astype(np.int32)
                elif c_min > np.iinfo(np.int64).min and c_max < np.iinfo(np.int64).max:
                    df[col] = df[col].astype(np.int64)
            else:
                if c_min > np.finfo(np.float16).min and c_max < np.finfo(np.float16).max:
                    df[col] = df[col].astype(np.float16)
                elif c_min > np.finfo(np.float32).min and c_max < np.finfo(np.float32).max:
                    df[col] = df[col].astype(np.float32)
                else:
                    df[col] = df[col].astype(np.float64)
    end_mem = df.memory_usage().sum() / 1024 ** 2
    print('Mem. usage decreased to {:5.2f} Mb ({:.1f}% reduction)'.format(end_mem,
                                                                          100 * (start_mem - end_mem) / start_mem))
    return df 

In [3]:
%%time 

train = pd.read_hdf("data/train_1kw.hdf", key='df')
test = pd.read_hdf("data/test.hdf", key='df')

# train = pd.read_pickle("data/train67.pkl")
# test = pd.read_pickle("data/test.pkl")

ValueError: cannot set WRITEABLE flag to True of this array

In [3]:
print(train.shape, test.shape)

(10000000, 36) (1000000, 36)


# 属性分析

- 数值变量：'age', 'device_size', 

- 用户相关：
          'uid': "用户ID",  
          'age': "用户年龄", 'city': "用户所在城市", 'city_rank': "用户居住城市级别",
          'device_name': "用户手机名字", 'device_size': "用户手机大小", 'career': "用户职业", 
          'gender': "用户性别", 'net_type': "行为发生时网络状态",
          'residence': "用户居住省份", 'his_app_size': "APP存储大小", 'his_on_shelf_time': "APP发布时间", 
          'app_score': "APP评分", 'emui_dev': "emui版本", 'list_time': "手机发布时间", 
          'device_price': "手机价格", 'up_life_duration': "华为ID期限",
          'up_membership_grade':"服务会员资格级别", 'membership_life_duration': "会员资格期限", 
          'consume_purchase':"用户付款标签", 'communication_onlinerate':"手机在线时间", 
          'communication_avgonline_30d': "手机日常活跃时间", 
          'pt_d': "行为发生时日期"
          
- 广告相关：
          'task_id': "广告ID", 
          'adv_id': "广告材料ID", 'creat_type_cd': "广告创新类型ID", 'adv_prim_id': "广告商ID",
          'dev_id': "广告开发者ID", 'inter_type_cd': "广告材料展示形式", 'slot_id':"广告位ID", 
          'spread_app_id': "广告APP-ID", 'tags': "广告APP标签",
          'app_first_class': "广告APP一级类别", 'app_second_class': "广告APP二级类别", 
          'indu_name': "广告行业信息",

In [5]:
# col2definition = {'label': "是否点击", 'uid': "用户ID", 'task_id': "广告ID", 
#                  'adv_id': "广告材料ID", 'creat_type_cd': "广告创新类型ID", 'adv_prim_id': "广告商ID",
#                  'dev_id': "广告开发者ID", 'inter_type_cd': "广告材料展示形式", 'slot_id':"广告位ID", 
#                  'spread_app_id': "广告APP-ID", 'tags': "广告APP标签",
#                  'app_first_class': "广告APP一级类别", 'app_second_class': "广告APP二级类别", 
#                  'age': "用户年龄", 'city': "用户所在城市", 'city_rank': "用户居住城市级别",
#                  'device_name': "用户手机名字", 'device_size': "用户手机大小", 'career': "用户职业", 
#                  'gender': "用户性别", 'net_type': "行为发生时网络状态",
#                  'residence': "用户居住省份", 'his_app_size': "APP存储大小", 'his_on_shelf_time': "APP发布时间", 
#                  'app_score': "APP评分", 'emui_dev': "emui版本", 'list_time': "手机发布时间", 
#                  'device_price': "手机价格", 'up_life_duration': "华为ID期限",
#                  'up_membership_grade':"服务会员资格级别", 'membership_life_duration': "会员资格期限", 
#                  'consume_purchase':"用户付款标签", 'communication_onlinerate':"手机在线时间", 
#                  'communication_avgonline_30d': "手机日常活跃时间", 'indu_name': "广告行业信息",
#                  'pt_d': "行为发生时日期", 
#                  'id': "行号"}

# for col in (set(train.columns) & set(test.columns)):
#     print(col, col2definition[col])
#     print(train[col].nunique(), test[col].nunique(), len(set(train[col].unique()) & set(test[col].unique())))

In [4]:
test['label'] = -1
all_data = pd.concat([train, test], ignore_index=True)

all_data['communication_hours'] = all_data['communication_onlinerate'].apply(lambda x: len(x.split('^')))

of pandas will change to not sort by default.

To accept the future behavior, pass 'sort=True'.


  


In [5]:
for col in all_data.columns:
#     if all_data[col].nunique() > 15:
    print(col, all_data[col].nunique())

adv_id 3643
adv_prim_id 91
age 8
app_first_class 3
app_score 2
app_second_class 17
career 9
city 344
city_rank 4
communication_avgonline_30d 14
communication_onlinerate 24
consume_purchase 9
creat_type_cd 8
dev_id 44
device_name 89
device_price 7
device_size 187
emui_dev 16
gender 3
his_app_size 21
his_on_shelf_time 3
indu_name 35
inter_type_cd 4
label 3
list_time 16
membership_life_duration 20
net_type 5
pt_d 3
residence 36
slot_id 12
spread_app_id 60
tags 22
task_id 2765
uid 911825
up_life_duration 21
up_membership_grade 4


# labelencoder 

In [5]:
cate_cols = [
       'uid', 'task_id', 'adv_id', 'creat_type_cd', 'adv_prim_id',
       'dev_id', 'inter_type_cd', 'slot_id', 'spread_app_id', 'tags',
       'app_first_class', 'app_second_class', 'city',
       'device_name', 'career', 'gender', 'net_type',
       'residence', 'his_on_shelf_time', "communication_onlinerate", 
       'up_membership_grade', 'consume_purchase', 'indu_name',
]

for col in tqdm(cate_cols):
#     print(col)
    le = LabelEncoder()
    all_data[col] = le.fit_transform(all_data[col])
    all_data[col] = all_data[col].astype(np.int32)

100%|██████████| 23/23 [00:27<00:00,  1.20s/it]


# 统计特征

In [6]:
%%time

def group_fea(df,key,target):
    tmp = df.groupby(key, as_index=False)[target].agg({
        key+target + '_nunique': 'nunique',
    }).reset_index()
    del tmp['index']
    # print("**************************{}**************************".format(target))
    return tmp

feature_key = ['uid','age', 'career','city']
feature_target = ['task_id','adv_id','dev_id','slot_id', 'adv_prim_id', 'spread_app_id','indu_name']

for key in tqdm(feature_key):
    for target in feature_target:
        tmp = group_fea(all_data, key, target)
        all_data = all_data.merge(tmp, on=key, how='left')


# 全局特征
# print("======统计全局特征======")
# for col in tqdm(['task_id', 'adv_id', 'spread_app_id']):
# #     print(col)
#     all_data['uid_{}_nunique'.format(col)] = all_data.groupby(['uid'])[col].transform('nunique')
#     all_data['uid_{}_count'.format(col)] = all_data.groupby(['uid'])[col].transform('count')
#     all_data['uid_{}_rate'.format(col)] = all_data['uid_{}_nunique'.format(col)] / all_data['uid_{}_count'.format(col)]
    
#     if col in ['task_id', 'adv_id']:
#         all_data['{}_uid_nunique'.format(col)] = all_data.groupby([col])['uid'].transform('nunique')
#         all_data['{}_uid_count'.format(col)] = all_data.groupby([col])['uid'].transform('count')


# 计数特征
print("======计数特征======")
for col in tqdm(['adv_id', 'adv_prim_id', 'task_id', 'uid', 'city']):
#     print(col)
    col_name = '{}_count_fea'.format(col)
    all_data[col_name] = all_data[col].map(all_data[col].value_counts())
    all_data[col_name] = (all_data[col_name]-all_data[col_name].min()) / (all_data[col_name].max()-all_data[col_name].min())
    all_data[col_name] = all_data[col_name].astype(np.float32)

100%|██████████| 4/4 [06:54<00:00, 103.64s/it]
  0%|          | 0/5 [00:00<?, ?it/s]



100%|██████████| 5/5 [00:12<00:00,  2.46s/it]

CPU times: user 7min 54s, sys: 2min 59s, total: 10min 53s
Wall time: 7min 6s





In [7]:
# 交叉特征计数特征
usr_cols = ['uid', 'age', 'gender', 'city', 'career'] 
adv_cols = ['task_id', 'adv_id', 'spread_app_id', 'adv_prim_id', 'dev_id', 'app_second_class']
all_data['cnt'] = 1


# 组合特征点击次数
for i in tqdm(range(len(usr_cols))):
    for j in range(len(adv_cols)):
        col_name = "cnt_click_of_"+usr_cols[i]+"_and_"+adv_cols[j]
        s = time.time()
        se = all_data.groupby([usr_cols[i], adv_cols[j]])['cnt'].sum()
        dt = all_data[[usr_cols[i], adv_cols[j]]]
        se = (pd.merge(dt, se.reset_index(), how='left',
                        on=[usr_cols[i], adv_cols[j]]).sort_index()['cnt'].fillna(value=0)).astype(np.int32)
        semax = se.max()
        semin = se.min()
        all_data[col_name] = ((se-se.min())/(se.max()-se.min())*100).fillna(value=0).astype(np.int32).values

100%|██████████| 5/5 [01:50<00:00, 22.02s/it]


##  保存计数特征

In [19]:
cnt_click = ['cnt_click_of_uid_and_task_id',
       'cnt_click_of_uid_and_adv_id', 'cnt_click_of_uid_and_spread_app_id',
       'cnt_click_of_uid_and_adv_prim_id', 'cnt_click_of_uid_and_dev_id',
       'cnt_click_of_uid_and_app_second_class', 'cnt_click_of_age_and_task_id',
       'cnt_click_of_age_and_adv_id', 'cnt_click_of_age_and_spread_app_id',
       'cnt_click_of_age_and_adv_prim_id', 'cnt_click_of_age_and_dev_id',
       'cnt_click_of_age_and_app_second_class',
       'cnt_click_of_gender_and_task_id', 'cnt_click_of_gender_and_adv_id',
       'cnt_click_of_gender_and_spread_app_id',
       'cnt_click_of_gender_and_adv_prim_id', 'cnt_click_of_gender_and_dev_id',
       'cnt_click_of_gender_and_app_second_class',
       'cnt_click_of_city_and_task_id', 'cnt_click_of_city_and_adv_id',
       'cnt_click_of_city_and_spread_app_id',
       'cnt_click_of_city_and_adv_prim_id', 'cnt_click_of_city_and_dev_id',
       'cnt_click_of_city_and_app_second_class',
       'cnt_click_of_career_and_task_id', 'cnt_click_of_career_and_adv_id',
       'cnt_click_of_career_and_spread_app_id',
       'cnt_click_of_career_and_adv_prim_id', 'cnt_click_of_career_and_dev_id',
       'cnt_click_of_career_and_app_second_class']
all_data[cnt_click].to_hdf("data/cnt_click_quchong.hdf", 'df')

all_data.drop(cnt_click, axis=1, inplace=True)

## 计算ctr特征

In [8]:
from utils import HyperParam
random_sector = np.random.randint(1, 6, size=(all_data.shape[0])).astype(np.int32)  # 1,2,3,4,5
all_data['random_sector'] = random_sector
all_data.loc[all_data.label == -1, 'random_sector'] = 0
print(all_data['random_sector'].value_counts())

3    2002113
5    2001070
2    2000315
1    1999757
4    1996745
0    1000000
Name: random_sector, dtype: int64


In [10]:
# 计算单特征的转换率

since = time.time()
sec_size = 5
frac_size = 0.8
convert_feature = ['uid', 'task_id', 'adv_id', 'city', 'spread_app_id']
for index, feature in enumerate(convert_feature):
    print('正在计算' + feature + '转换率')
    for sec in range(sec_size + 1):  # 0, 1, 2, 3, 4, 5  #0 is test， 1 is valid
        print(sec, '折')
        if sec == 1 or sec == 0:
            temp = all_data[(all_data.label != -1)&(all_data.random_sector != sec)][[feature, 'label']]
        else:
            temp = all_data[(all_data.label != -1)&(all_data.random_sector != sec)&(all_data.random_sector != 1)][[feature, 'label']]
            #temp = temp.sample(frac = frac_size, random_state = 2019).reset_index(drop = True)
        
        temp[feature + '_all_count'] = temp.groupby(feature).label.transform('count')
        temp[feature + '_label_count'] = temp.groupby(feature).label.transform('sum')
        # 贝叶斯平滑
        HP = HyperParam(1, 1)
        HP.update_from_data_by_moment(temp[feature + '_all_count'].values, temp[feature + '_label_count'].values)
        temp[feature + '_ctr'] = (temp[feature + '_label_count'] + HP.alpha) / (temp[feature + '_all_count'] + HP.alpha + HP.beta)
        print('temp before shape:', temp.shape)
        temp = temp[[feature, feature + '_ctr']].drop_duplicates()
        print('temp after shape:', temp.shape)

        sec_data = all_data[all_data.random_sector == sec][[feature]]
        all_data.loc[all_data.random_sector == sec, feature + '_ctr'] = pd.merge(sec_data, temp, on=feature,
                                                                                     how='left')[feature+'_ctr'].values
        del temp, sec_data
        gc.collect()
    all_data[feature + '_ctr'] = all_data[feature + '_ctr'].astype(np.float32)
time_elapsed = time.time() - since
print('complete in {:.0f}m {:.0f}s'.format(time_elapsed // 60, time_elapsed % 60))

正在计算uid转换率
0 折
temp before shape: (10000000, 5)
temp after shape: (920630, 2)
1 折
temp before shape: (8000243, 5)
temp after shape: (879798, 2)
2 折
temp before shape: (5999928, 5)
temp after shape: (823627, 2)
3 折
temp before shape: (5998130, 5)
temp after shape: (823592, 2)
4 折
temp before shape: (6003498, 5)
temp after shape: (823198, 2)
5 折
temp before shape: (5999173, 5)
temp after shape: (823692, 2)
正在计算task_id转换率
0 折
temp before shape: (10000000, 5)
temp after shape: (4147, 2)
1 折
temp before shape: (8000243, 5)
temp after shape: (4096, 2)
2 折
temp before shape: (5999928, 5)
temp after shape: (4051, 2)
3 折
temp before shape: (5998130, 5)
temp after shape: (4057, 2)
4 折
temp before shape: (6003498, 5)
temp after shape: (4044, 2)
5 折
temp before shape: (5999173, 5)
temp after shape: (4058, 2)
正在计算adv_id转换率
0 折
temp before shape: (10000000, 5)
temp after shape: (5096, 2)
1 折
temp before shape: (8000243, 5)
temp after shape: (5020, 2)
2 折
temp before shape: (5999928, 5)
temp after sh

In [11]:
print('==========计算交叉转换率==========')
since = time.time()
sec_size = 5
frac_size = 0.5  # dropout

fea_usr = ['age', 'city', 'career']
fea_adv = ['task_id', 'adv_id', 'spread_app_id', 'adv_prim_id', 'slot_id']
for first_feature in fea_usr:
    for second_feature in fea_adv:
        print('正在计算' + first_feature + '和' + second_feature + '的转换率')
        ctr_fea = "{}_{}_ctr".format(first_feature, second_feature)
        
        for sec in range(sec_size + 1):
            print(sec, '折')
            if sec == 1 or sec == 0:
                temp = all_data[(all_data.label != -1)&(all_data.random_sector != sec)][[first_feature, second_feature, 'label']]
            else: 
                temp = all_data[(all_data.label != -1)&(all_data.random_sector != sec)&(all_data.random_sector != 1)][[first_feature, second_feature, 'label']]
                # temp = temp.sample(frac = frac_size, random_state = 2019).reset_index(drop = True)
            
            temp['query_title_all_count'] = temp.groupby([first_feature, second_feature]).label.transform('count')
            temp['query_title_label_count'] = temp.groupby([first_feature, second_feature]).label.transform('sum')
            
            HP = HyperParam(1, 1)
            HP.update_from_data_by_moment(temp['query_title_all_count'].values, temp['query_title_label_count'].values)
            
            temp['query_title_convert']=(temp['query_title_label_count']+HP.alpha)/(temp['query_title_all_count']+HP.alpha+HP.beta)
            print('temp before shape:', temp.shape)
            temp = temp[[first_feature, second_feature, 'query_title_convert']].drop_duplicates()
            print('temp after shape:', temp.shape)
            sec_data = all_data[all_data.random_sector == sec][[first_feature, second_feature]]
            
            all_data.loc[all_data.random_sector == sec, ctr_fea]=pd.merge(sec_data, temp, 
                                                                          on=[first_feature, second_feature],
                                                                          how='left')['query_title_convert'].values
            del temp, sec_data
            gc.collect()
        
        all_data[ctr_fea] = all_data[ctr_fea].astype(np.float32)
        time_elapsed = time.time() - since
        print('complete in {:.0f}m {:.0f}s'.format(time_elapsed // 60, time_elapsed % 60))

正在计算age和task_id的转换率
0 折
temp before shape: (10000000, 6)
temp after shape: (25536, 3)
1 折
temp before shape: (8000243, 6)
temp after shape: (25025, 3)
2 折
temp before shape: (5999928, 6)
temp after shape: (24340, 3)
3 折
temp before shape: (5998130, 6)
temp after shape: (24306, 3)
4 折
temp before shape: (6003498, 6)
temp after shape: (24289, 3)
5 折
temp before shape: (5999173, 6)
temp after shape: (24306, 3)
complete in 2m 9s
正在计算age和adv_id的转换率
0 折
temp before shape: (10000000, 6)
temp after shape: (28965, 3)
1 折
temp before shape: (8000243, 6)
temp after shape: (28199, 3)
2 折
temp before shape: (5999928, 6)
temp after shape: (27244, 3)
3 折
temp before shape: (5998130, 6)
temp after shape: (27160, 3)
4 折
temp before shape: (6003498, 6)
temp after shape: (27163, 3)
5 折
temp before shape: (5999173, 6)
temp after shape: (27184, 3)
complete in 4m 19s
正在计算age和spread_app_id的转换率
0 折
temp before shape: (10000000, 6)
temp after shape: (543, 3)
1 折
temp before shape: (8000243, 6)
temp after shape

In [12]:
all_data.shape 

(11000000, 123)

In [13]:
all_data.head()

Unnamed: 0,adv_id,adv_prim_id,age,app_first_class,app_score,app_second_class,career,city,city_rank,communication_avgonline_30d,communication_onlinerate,consume_purchase,creat_type_cd,dev_id,device_name,device_price,device_size,emui_dev,gender,his_app_size,his_on_shelf_time,id,indu_name,inter_type_cd,label,list_time,membership_life_duration,net_type,pt_d,residence,slot_id,spread_app_id,tags,task_id,uid,up_life_duration,up_membership_grade,communication_hours,uidtask_id_nunique,uidadv_id_nunique,uiddev_id_nunique,uidslot_id_nunique,uidadv_prim_id_nunique,uidspread_app_id_nunique,uidindu_name_nunique,agetask_id_nunique,ageadv_id_nunique,agedev_id_nunique,ageslot_id_nunique,ageadv_prim_id_nunique,agespread_app_id_nunique,ageindu_name_nunique,careertask_id_nunique,careeradv_id_nunique,careerdev_id_nunique,careerslot_id_nunique,careeradv_prim_id_nunique,careerspread_app_id_nunique,careerindu_name_nunique,citytask_id_nunique,cityadv_id_nunique,citydev_id_nunique,cityslot_id_nunique,cityadv_prim_id_nunique,cityspread_app_id_nunique,cityindu_name_nunique,adv_id_count_fea,adv_prim_id_count_fea,task_id_count_fea,uid_count_fea,city_count_fea,cnt,cnt_click_of_uid_and_task_id,cnt_click_of_uid_and_adv_id,cnt_click_of_uid_and_spread_app_id,cnt_click_of_uid_and_adv_prim_id,cnt_click_of_uid_and_dev_id,cnt_click_of_uid_and_app_second_class,cnt_click_of_age_and_task_id,cnt_click_of_age_and_adv_id,cnt_click_of_age_and_spread_app_id,cnt_click_of_age_and_adv_prim_id,cnt_click_of_age_and_dev_id,cnt_click_of_age_and_app_second_class,cnt_click_of_gender_and_task_id,cnt_click_of_gender_and_adv_id,cnt_click_of_gender_and_spread_app_id,cnt_click_of_gender_and_adv_prim_id,cnt_click_of_gender_and_dev_id,cnt_click_of_gender_and_app_second_class,cnt_click_of_city_and_task_id,cnt_click_of_city_and_adv_id,cnt_click_of_city_and_spread_app_id,cnt_click_of_city_and_adv_prim_id,cnt_click_of_city_and_dev_id,cnt_click_of_city_and_app_second_class,cnt_click_of_career_and_task_id,cnt_click_of_career_and_adv_id,cnt_click_of_career_and_spread_app_id,cnt_click_of_career_and_adv_prim_id,cnt_click_of_career_and_dev_id,cnt_click_of_career_and_app_second_class,random_sector,uid_ctr,task_id_ctr,adv_id_ctr,city_ctr,spread_app_id_ctr,age_task_id_ctr,age_adv_id_ctr,age_spread_app_id_ctr,age_adv_prim_id_ctr,age_slot_id_ctr,city_task_id_ctr,city_adv_id_ctr,city_spread_app_id_ctr,city_adv_prim_id_ctr,city_slot_id_ctr,career_task_id_ctr,career_adv_id_ctr,career_spread_app_id_ctr,career_adv_prim_id_ctr,career_slot_id_ctr
0,2967,53,5,2,2,10,2,148,3,10,2116,0,5,40,69,5,162,20,0,14,3,,6,3,0,4,-1,0,3,8,6,44,24,188,355482,18,0,18,14,14,6,4,8,6,6,4101,4975,56,12,107,74,38,4235,5145,56,12,107,74,38,1782,1824,45,12,87,62,30,0.142194,0.68349,0.142194,0.041463,0.032494,1,0,0,1,1,0,1,8,8,52,70,42,100,14,14,69,69,47,100,0,0,1,2,1,4,10,10,52,64,43,93,3,0.003432,0.042314,0.042314,0.034136,0.035233,0.024919,0.024918,0.025629,0.025629,0.02673,0.013557,0.013347,0.033933,0.033919,0.032522,0.031771,0.031771,0.026887,0.026887,0.028738
1,3444,30,1,0,1,12,7,239,3,12,1258,0,4,44,60,4,141,27,0,-1,0,,32,1,0,17,-1,1,1,32,6,62,11,1987,489326,18,0,23,3,3,1,3,2,2,2,2655,2846,51,12,96,66,34,4252,5007,56,12,107,73,38,1583,1616,45,12,87,59,33,0.444458,0.461616,0.444458,0.007317,0.020179,1,1,1,1,1,1,0,0,0,2,1,5,1,49,49,99,48,100,34,0,0,0,0,1,0,24,24,48,31,66,20,5,0.305896,0.01685,0.01685,0.042766,0.018847,0.017661,0.017632,0.022707,0.021713,0.041081,0.018475,0.01825,0.018046,0.030243,0.026495,0.019222,0.019221,0.020545,0.020146,0.032264
2,1271,40,5,2,2,15,7,48,5,13,1258,0,1,21,44,4,141,20,0,5,3,,29,3,0,8,-1,0,3,3,6,64,28,160,944468,-1,0,23,35,35,10,4,15,12,9,4101,4975,56,12,107,74,38,4252,5007,56,12,107,73,38,3169,3501,55,12,102,72,35,0.222653,0.868907,0.222653,0.139024,0.314422,1,1,1,4,4,3,3,13,13,65,86,27,41,28,28,88,86,32,39,3,3,16,31,8,14,18,18,78,93,33,40,1,0.00126,0.020031,0.020031,0.032301,0.027695,0.017352,0.017351,0.02045,0.020494,0.026763,0.016495,0.016481,0.025058,0.02514,0.029594,0.022377,0.022376,0.03358,0.033673,0.03227
3,4757,67,6,2,2,10,2,244,3,12,1339,0,5,14,44,4,141,20,0,18,2,,16,3,0,8,-1,0,5,31,0,1,24,4112,294551,20,0,22,22,22,7,4,13,10,10,4069,4694,56,12,107,73,38,4235,5145,56,12,107,74,38,1726,1773,49,12,89,65,31,0.134819,0.078397,0.134819,0.068293,0.025363,1,0,0,0,0,0,0,8,8,5,7,3,91,13,13,8,8,3,100,0,0,0,0,0,3,8,8,5,6,2,93,4,0.0027,0.052542,0.052542,0.03591,0.039564,0.03994,0.039941,0.036945,0.036946,0.021542,0.105286,0.106136,0.051552,0.05364,0.022838,0.044503,0.044505,0.033896,0.033895,0.020446
4,4346,46,5,2,2,10,2,336,5,11,1737,0,5,52,80,4,193,17,0,13,2,,5,2,0,14,-1,1,1,17,1,48,12,2729,578333,20,1,19,13,13,6,5,7,6,7,4101,4975,56,12,107,74,38,4235,5145,56,12,107,74,38,3529,4072,55,12,104,72,36,0.003847,0.005264,0.003847,0.036585,0.843078,1,0,0,0,0,0,0,0,0,5,0,2,100,0,0,6,0,3,100,0,0,3,0,2,87,0,0,4,0,2,93,1,0.004722,0.016613,0.016607,0.030427,0.014275,0.00839,0.008337,0.013517,0.011984,0.046155,0.017359,0.017136,0.012432,0.022401,0.060622,0.015476,0.015439,0.013674,0.016656,0.050521


In [14]:
all_data = all_data.drop(['cnt', 'random_sector'], axis=1)

In [17]:
all_data = reduce_mem_usage(all_data)

100%|██████████| 121/121 [00:41<00:00,  2.93it/s]

Mem. usage decreased to 1814.84 Mb (70.2% reduction)





In [19]:
train_df = all_data.iloc[:-1000000]
test_df = all_data.iloc[-1000000:].reset_index(drop=True)

In [25]:
##########################target_enc feature#######################
from sklearn.model_selection import StratifiedKFold
skf = StratifiedKFold(n_splits=5, shuffle=True, random_state=2020)
enc_list = ['net_type', 'task_id', 'adv_id', 'adv_prim_id', 'age', 'app_first_class',
            'app_second_class', 'career', 'city', 'consume_purchase', 
            'uid', 'dev_id', 'tags', 'slot_id']

for f in tqdm(enc_list):
    train_df[f + '_target_enc'] = 0
    test_df[f + '_target_enc'] = 0
    for i, (trn_idx, val_idx) in enumerate(skf.split(train_df, train_df['label'])):
        trn_x = train_df[[f, 'label']].iloc[trn_idx].reset_index(drop=True)
        val_x = train_df[[f]].iloc[val_idx].reset_index(drop=True)
        enc_df = trn_x.groupby(f, as_index=False)['label'].agg({f + '_target_enc': 'mean'})
        val_x = val_x.merge(enc_df, on=f, how='left')
        test_x = test_df[[f]].merge(enc_df, on=f, how='left')
        val_x[f + '_target_enc'] = val_x[f + '_target_enc'].fillna(train_df['label'].mean())
        test_x[f + '_target_enc'] = test_x[f + '_target_enc'].fillna(train_df['label'].mean())
        train_df.loc[val_idx, f + '_target_enc'] = val_x[f + '_target_enc'].values
        test_df[f + '_target_enc'] += test_x[f + '_target_enc'].values / skf.n_splits

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  if __name__ == '__main__':
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  self.obj[item] = s
100%|██████████| 14/14 [01:25<00:00,  6.11s/it]


In [34]:
train_df.shape, test_df.shape 

((10000000, 135), (1000000, 134))

In [29]:
del test_df['label']

In [32]:
train_df = reduce_mem_usage(train_df)
test_df = reduce_mem_usage(test_df)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  if sys.path[0] == '':
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  # Remove the CWD from sys.path while we load stuff.
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.o

Mem. usage decreased to 2275.03 Mb (25.1% reduction)


100%|██████████| 134/134 [00:02<00:00, 56.75it/s]

Mem. usage decreased to 183.11 Mb (30.4% reduction)





In [35]:
train_df.to_hdf("data/train_1kw_fea.hdf", 'df')
test_df.to_hdf("data/test_1kw_fea.hdf", 'df')

In [25]:
ctr = ['uid_ctr', 'task_id_ctr',
       'adv_id_ctr', 'age_task_id_ctr', 'age_adv_id_ctr',
       'age_spread_app_id_ctr', 'age_adv_prim_id_ctr', 'age_dev_id_ctr',
       'age_app_second_class_ctr', 'gender_task_id_ctr', 'gender_adv_id_ctr',
       'gender_spread_app_id_ctr', 'gender_adv_prim_id_ctr',
       'gender_dev_id_ctr', 'gender_app_second_class_ctr', 'city_task_id_ctr',
       'city_adv_id_ctr', 'city_spread_app_id_ctr', 'city_adv_prim_id_ctr',
       'city_dev_id_ctr', 'city_app_second_class_ctr', 'career_task_id_ctr',
       'career_adv_id_ctr', 'career_spread_app_id_ctr',
       'career_adv_prim_id_ctr', 'career_dev_id_ctr',
       'career_app_second_class_ctr']
all_data[ctr].to_hdf("data/ctr_quchong.hdf", 'df')

In [27]:
all_data.drop(ctr, axis=1, inplace=True)

In [28]:
all_data.columns

Index(['adv_id', 'adv_prim_id', 'age', 'app_first_class', 'app_score',
       'app_second_class', 'career', 'city', 'city_rank',
       'communication_avgonline_30d', 'communication_onlinerate',
       'consume_purchase', 'creat_type_cd', 'dev_id', 'device_name',
       'device_price', 'device_size', 'emui_dev', 'gender', 'his_app_size',
       'his_on_shelf_time', 'indu_name', 'inter_type_cd', 'label', 'list_time',
       'net_type', 'pt_d', 'residence', 'slot_id', 'spread_app_id', 'tags',
       'task_id', 'uid', 'up_life_duration', 'up_membership_grade',
       'uid_task_id_nunique', 'uid_task_id_count', 'uid_task_id_rate',
       'task_id_uid_nunique', 'task_id_uid_count', 'uid_adv_id_nunique',
       'uid_adv_id_count', 'uid_adv_id_rate', 'adv_id_uid_nunique',
       'adv_id_uid_count', 'uid_spread_app_id_nunique',
       'uid_spread_app_id_count', 'uid_spread_app_id_rate', 'adv_id_count_fea',
       'adv_prim_id_count_fea', 'task_id_count_fea', 'uid_count_fea',
       'city_count

In [29]:
all_data.drop(['cnt', 'random_sector'], axis=1, inplace=True)

In [30]:
all_data.to_hdf("data/all_data_fea_quchong.hdf", 'df')

In [31]:
all_data[:-1000000].shape 

(35056562, 53)

In [22]:
print(all_data[['label', 'uid_ctr', 'task_id_ctr', 'adv_id_ctr',]][:-1000000].count())
print(all_data[['label', 'uid_ctr', 'task_id_ctr', 'adv_id_ctr']][-1000000:].count())

label          41907133
uid_ctr        41766322
task_id_ctr    41906943
adv_id_ctr     41906919
dtype: int64
label          1000000
uid_ctr         788460
task_id_ctr     873268
adv_id_ctr      873268
dtype: int64


In [16]:
all_data[['uid_ctr', 'task_id_ctr', 'adv_id_ctr']].to_pickle("ctr_fea_67.pkl")

In [17]:
all_data.head()

Unnamed: 0,adv_id,adv_prim_id,age,app_first_class,app_score,app_second_class,career,city,city_rank,communication_avgonline_30d,communication_onlinerate,consume_purchase,creat_type_cd,dev_id,device_name,device_price,device_size,emui_dev,gender,his_app_size,his_on_shelf_time,indu_name,inter_type_cd,label,list_time,membership_life_duration,net_type,pt_d,residence,slot_id,spread_app_id,tags,task_id,uid,up_life_duration,up_membership_grade,uid_task_id_nunique,uid_task_id_count,uid_task_id_rate,task_id_uid_nunique,task_id_uid_count,uid_adv_id_nunique,uid_adv_id_count,uid_adv_id_rate,adv_id_uid_nunique,adv_id_uid_count,uid_spread_app_id_nunique,uid_spread_app_id_count,uid_spread_app_id_rate,adv_id_count_fea,adv_prim_id_count_fea,task_id_count_fea,uid_count_fea,cnt,cnt_click_of_uid_and_task_id,cnt_click_of_uid_and_adv_id,cnt_click_of_uid_and_spread_app_id,cnt_click_of_uid_and_adv_prim_id,cnt_click_of_age_and_task_id,cnt_click_of_age_and_adv_id,cnt_click_of_age_and_spread_app_id,cnt_click_of_age_and_adv_prim_id,cnt_click_of_gender_and_task_id,cnt_click_of_gender_and_adv_id,cnt_click_of_gender_and_spread_app_id,cnt_click_of_gender_and_adv_prim_id,cnt_click_of_city_and_task_id,cnt_click_of_city_and_adv_id,cnt_click_of_city_and_spread_app_id,cnt_click_of_city_and_adv_prim_id,cnt_click_of_career_and_task_id,cnt_click_of_career_and_adv_id,cnt_click_of_career_and_spread_app_id,cnt_click_of_career_and_adv_prim_id,random_sector,uid_ctr,task_id_ctr,adv_id_ctr
0,1167,20,4,2,2,2,7,20,3,9,16,0,4,7,52,2,141,14,2,7,2,29,3,0,10,-1,0,6,23,7,33,10,2736,857104,20,1,9,14,0.642857,6718,10104,9,14,0.642857,6718,10104,3,14,0.214286,0.049379,0.056679,0.049379,0.023636,1,3,3,1,2,2,2,7,5,2,2,4,3,1,1,1,2,2,2,4,4,3,0.008933,0.021071,0.021071
1,3323,16,5,2,2,16,5,256,4,13,22,0,5,6,88,5,141,27,0,14,2,3,3,0,14,-1,0,6,2,5,32,3,1761,591845,20,1,18,61,0.295082,12757,27448,22,61,0.360656,12757,27448,6,61,0.098361,0.134148,0.049966,0.134148,0.109091,1,7,7,5,8,10,10,5,4,13,13,7,5,1,1,0,0,0,0,0,0,5,0.023795,0.025882,0.025882
2,646,33,5,2,2,13,4,86,4,7,13,0,1,18,8,4,193,20,0,5,2,28,3,0,15,-1,1,6,26,1,50,20,382,899985,18,0,12,53,0.226415,44127,95220,12,53,0.226415,44127,95220,4,53,0.075472,0.465386,0.713506,0.465386,0.094545,1,2,2,7,11,26,26,64,70,59,59,69,68,4,4,7,12,2,2,3,3,5,0.001947,0.018652,0.018652
3,1901,33,7,2,2,13,2,210,3,12,24,3,1,18,44,2,193,14,0,5,2,28,3,1,13,-1,1,6,28,1,50,20,1023,811208,18,0,11,18,0.611111,21719,36791,11,18,0.611111,21719,36791,3,18,0.166667,0.179813,0.713506,0.179813,0.030909,1,5,5,6,9,6,6,41,46,22,22,69,68,0,0,1,3,13,13,65,66,3,0.228888,0.015826,0.015825
4,111,33,4,2,2,13,2,120,5,13,24,0,3,18,29,4,193,20,2,5,2,28,3,0,15,-1,0,6,1,7,50,20,45,812082,16,0,35,47,0.744681,27008,69620,35,47,0.744681,19428,36209,13,47,0.276596,0.176968,0.713506,0.340265,0.083636,1,0,0,4,5,23,12,45,47,43,22,27,25,6,3,12,20,30,15,65,66,4,0.002682,0.016241,0.015375


In [20]:
# all_data.dtypes 

In [18]:
# del train, test
gc.collect()

0

In [15]:
del all_data['pt_d']
gc.collect()
# del all_data['random_sector']

7

In [15]:
# %%time

for col in tqdm(all_data.columns):
    if all_data[col].dtype == 'int64':
        all_data[col] = all_data[col].astype(np.int32)
    elif all_data[col].dtype == 'float64':
        all_data[col] = all_data[col].astype(np.float32)

100%|██████████| 85/85 [00:06<00:00, 13.93it/s]


In [22]:
print(all_data.shape)

(13027161, 77)


In [24]:
train_fea = all_data[all_data['label'] != -1].reset_index(drop=True)
test_fea = all_data[all_data['label'] == -1].reset_index(drop=True)
del test_fea['label']
print(train_fea.shape, test_fea.shape)

(12027161, 77) (1000000, 76)


In [25]:
train_fea.to_hdf("train_fea67_0724.hdf", 'df')
test_fea.to_hdf("test_fea67_0724.hdf", 'df')

In [30]:
print(train_fea[train_fea['random_sector'] != 1][['label', 'uid_ctr', 'task_id_ctr', 'adv_id_ctr']].count())

print(train_fea[train_fea['random_sector'] == 1][['label', 'uid_ctr', 'task_id_ctr', 'adv_id_ctr']].count())

print(test_fea[['uid_ctr', 'task_id_ctr', 'adv_id_ctr']].count())

label          9625702
uid_ctr        9449917
task_id_ctr    9625509
adv_id_ctr     9625353
dtype: int64
label          2401459
uid_ctr        2372867
task_id_ctr    2401432
adv_id_ctr     2401423
dtype: int64
uid_ctr        755178
task_id_ctr    796451
adv_id_ctr     796432
dtype: int64


In [28]:
# train_fea.to_pickle("train_fea0724.pkl")
# test_fea.to_pickle("test_fea0724.pkl")