In [2]:
cd tencent_algo_2020/

/home/sayhi/workspaces/tencent_algo_2020


In [4]:
import pandas as pd
import numpy as np
from tqdm import tqdm
from sklearn.model_selection import KFold

In [5]:
# Load the three raw training tables; the literal "\N" marks a missing value in the CSVs.
train_ad = pd.read_csv("dataset/train/ad.csv", na_values="\\N")
train_click = pd.read_csv("dataset/train/click_log.csv", na_values="\\N")
train_user = pd.read_csv("dataset/train/user.csv", na_values="\\N")

# Click log enriched with ad attributes (via creative_id) and user labels (via user_id).
# Missing values become 0, and the ID-like columns are stored as object so they are
# treated as categories rather than numbers.
train_data = (
    train_click
    .merge(train_ad, how="left", on="creative_id")
    .merge(train_user, how="left", on="user_id")
    .fillna(0)
    .astype(dict.fromkeys(
        ['creative_id', 'ad_id', 'product_id', 'advertiser_id', 'industry'],
        'object',
    ))
)

In [6]:
# Target encoding
def target_encode(X, cols, target_feature):
    """Target-encode categorical columns using statistics over the whole frame.

    For every column in ``cols`` the rows of ``X`` are grouped by that column and
    the mean, median and standard deviation of ``target_feature`` per category
    are mapped back onto each row.

    Parameters
    ----------
    X : pandas.DataFrame
        Must contain ``user_id``, ``target_feature`` and every column in ``cols``.
    cols : iterable of str
        Categorical columns to encode.
    target_feature : str
        Column whose per-category statistics are computed.

    Returns
    -------
    pandas.DataFrame
        ``user_id`` plus ``<col>_target_encoded_mean``, ``..._median`` and
        ``..._std`` for each encoded column, aligned to the index of ``X``.

    Notes
    -----
    The statistics are computed on the same rows they are mapped onto, so each
    row's own target value contributes to its encoding.
    """
    encoded = pd.DataFrame()
    encoded['user_id'] = X['user_id']
    stat_names = ('mean', 'median', 'std')
    for col in tqdm(cols):
        print('Target Encoding: {}'.format(col))
        per_category = X.groupby([col])[target_feature]
        for stat in stat_names:
            # category value -> statistic of the target within that category
            lookup = dict(getattr(per_category, stat)())
            encoded[col + '_target_encoded_' + stat] = X[col].map(lookup)
    return encoded

In [7]:
# Categorical ad attributes that will be target-encoded.
feature = ['creative_id', 'ad_id', 'product_id', 'advertiser_id', 'industry']
# Encoder input: the categorical columns plus both targets and the user key.
target_input = train_data.loc[:, [*feature, 'age', 'gender', 'user_id']]

In [8]:
# Target-encode against gender, then reduce the click-level rows to one row per
# user by taking the median of every encoded column.
train_target_gender = (
    target_encode(target_input, feature, 'gender')
    .groupby('user_id')
    .median()
)

  0%|          | 0/5 [00:00<?, ?it/s]

Target Encoding: creative_id


 20%|██        | 1/5 [02:49<11:18, 169.71s/it]

Target Encoding: ad_id


 40%|████      | 2/5 [05:25<08:16, 165.46s/it]

Target Encoding: product_id


 60%|██████    | 3/5 [05:36<03:58, 119.30s/it]

Target Encoding: advertiser_id


 80%|████████  | 4/5 [05:49<01:27, 87.24s/it] 

Target Encoding: industry


100%|██████████| 5/5 [06:14<00:00, 74.98s/it]


In [9]:
# Target-encode against age, then reduce the click-level rows to one row per
# user by taking the median of every encoded column.
train_target_age = (
    target_encode(target_input, feature, 'age')
    .groupby('user_id')
    .median()
)

  0%|          | 0/5 [00:00<?, ?it/s]

Target Encoding: creative_id


KeyboardInterrupt: 

In [10]:
# Preview the first five rows of the per-user gender encodings.
train_target_gender.head(n=5)

Unnamed: 0_level_0,creative_id_target_encoded_mean,creative_id_target_encoded_median,creative_id_target_encoded_std,ad_id_target_encoded_mean,ad_id_target_encoded_median,ad_id_target_encoded_std,product_id_target_encoded_mean,product_id_target_encoded_median,product_id_target_encoded_std,advertiser_id_target_encoded_mean,advertiser_id_target_encoded_median,advertiser_id_target_encoded_std,industry_target_encoded_mean,industry_target_encoded_median,industry_target_encoded_std
user_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1
1,1.210084,1.0,0.409091,1.210084,1.0,0.409091,1.329477,1.0,0.470023,1.346047,1.0,0.475709,1.41282,1.0,0.492344
2,1.289554,1.0,0.454747,1.287927,1.0,0.453575,1.315832,1.0,0.464849,1.306313,1.0,0.460453,1.311114,1.0,0.46295
3,1.746366,2.0,0.444694,1.701449,2.0,0.445747,1.37107,1.0,0.470023,1.510396,2.0,0.481895,1.378124,1.0,0.484919
4,1.333333,1.0,0.463301,1.333333,1.0,0.466519,1.315371,1.0,0.464367,1.328038,1.0,0.463159,1.3431,1.0,0.474745
5,1.196023,1.0,0.397551,1.196023,1.0,0.397551,1.329477,1.0,0.470023,1.305297,1.0,0.460561,1.3431,1.0,0.474745


In [31]:
# K折目标编码
kf = KFold(n_splits = 8, shuffle = False, random_state=2019)
def target_encode_kflod(df, cols, target_feature, kfold=None):
    """Out-of-fold target encoding of categorical columns.

    The rows of ``df`` are split into folds. For each fold, the per-category
    mean, median and standard deviation of ``target_feature`` are computed on
    the remaining folds only and mapped onto the held-out rows, so a row's own
    target value never contributes to its encoding.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain ``user_id``, ``target_feature`` and every column in ``cols``.
    cols : iterable of str
        Categorical columns to encode.
    target_feature : str
        Column whose per-category statistics are computed.
    kfold : splitter, optional
        Object exposing ``split(df)`` (yielding positional train/validation
        index arrays) and ``get_n_splits(df)``. Defaults to the module-level
        ``kf``.

    Returns
    -------
    pandas.DataFrame
        ``user_id`` plus ``<col>_target_encoded_mean``, ``..._median`` and
        ``..._std`` for each encoded column, aligned to the index of ``df``.
        Categories that do not occur in the training part of a fold are encoded
        as NaN, as is the std of a category seen only once in training.
    """
    splitter = kf if kfold is None else kfold
    stats = ('mean', 'median', 'std')
    n_rows = len(df)

    # One NaN-initialised buffer per output column, allocated once outside the
    # fold loop so every fold fills only its own held-out rows and the results
    # of earlier folds are kept.
    buffers = {
        '{}_target_encoded_{}'.format(col, stat): np.full(n_rows, np.nan)
        for col in cols
        for stat in stats
    }

    # val_ind indexes the held-out fold; train_ind indexes the other K-1 folds.
    for train_ind, val_ind in tqdm(splitter.split(df), total=splitter.get_n_splits(df)):
        for col in cols:
            # Statistics of the target per category, from the K-1 training folds only.
            fold_stats = (
                df[[col, target_feature]]
                .iloc[train_ind]
                .groupby(col)[target_feature]
                .agg(list(stats))
            )
            val_keys = df[col].iloc[val_ind]
            for stat in stats:
                name = '{}_target_encoded_{}'.format(col, stat)
                # map (not replace): unseen categories become NaN instead of
                # leaking the raw category value into the encoded column.
                buffers[name][val_ind] = val_keys.map(fold_stats[stat]).to_numpy(dtype=float)

    df_ = pd.DataFrame({'user_id': df['user_id']})
    for name, values in buffers.items():
        df_[name] = values
    return df_

In [None]:
# K-fold (out-of-fold) target encoding against gender.
train_kflod_target_gender = target_encode_kflod(
    df=target_input,
    cols=feature,
    target_feature='gender',
)

In [None]:
# K-fold (out-of-fold) target encoding against age.
train_kflod_target_age = target_encode_kflod(
    df=target_input,
    cols=feature,
    target_feature='age',
)