# 均值编码 针对高基数定性特征(类别特征)的数据预处理。

在机器学习与数据挖掘中，不论是分类问题（classification）还是回归问题（regression），采集的数据常常会包括定性特征（categorical feature）。因为定性特征表示某个数据属于一个特定的类别，所以在数值上，定性特征值通常是从0到n的离散整数。例子：花瓣的颜色（红、黄、蓝）、性别（男、女）、地址、某一列特征是否存在缺失值（这种NA 指示列常常会提供有效的额外信息）。

In [1]:
import pandas as pd
import numpy as np
from itertools import product
import warnings
warnings.filterwarnings('ignore')

In [2]:
sales = pd.read_csv('datasets/sales_train.csv')

In [3]:
sales

Unnamed: 0,date,date_block_num,shop_id,item_id,item_price,item_cnt_day
0,02.01.2013,0,59,22154,999.00,1.0
1,03.01.2013,0,25,2552,899.00,1.0
2,05.01.2013,0,25,2552,899.00,-1.0
3,06.01.2013,0,25,2554,1709.05,1.0
4,15.01.2013,0,25,2555,1099.00,1.0
...,...,...,...,...,...,...
2935844,10.10.2015,33,25,7409,299.00,1.0
2935845,09.10.2015,33,25,7460,299.00,1.0
2935846,14.10.2015,33,25,7459,349.00,1.0
2935847,22.10.2015,33,25,7440,299.00,1.0


## 将数据拆分为每个月单独的数据

In [4]:
index_cols = ['shop_id', 'item_id', 'date_block_num']

grid = [] 
for block_num in sales['date_block_num'].unique():
    cur_shops = sales[sales['date_block_num']==block_num]['shop_id'].unique()
    cur_items = sales[sales['date_block_num']==block_num]['item_id'].unique()
    grid.append(np.array(list(product(*[cur_shops, cur_items, [block_num]])),dtype='int32'))

grid = pd.DataFrame(np.vstack(grid), columns = index_cols,dtype=np.int32)

gb = sales.groupby(index_cols,as_index=False).agg({'item_cnt_day':{'target':'sum'}})

gb.columns = [col[0] if col[-1]=='' else col[-1] for col in gb.columns.values]

all_data = pd.merge(grid,gb,how='left',on=index_cols).fillna(0)
all_data.sort_values(['date_block_num','shop_id','item_id'],inplace=True)

## 此处只截取第33个月的数据

In [5]:
all_data = all_data[all_data.date_block_num == 33]

In [6]:
all_data

Unnamed: 0,shop_id,item_id,date_block_num,target
10864839,2,30,33,0.0
10860073,2,31,33,1.0
10861046,2,32,33,0.0
10859871,2,33,33,0.0
10864840,2,40,33,0.0
...,...,...,...,...
10768834,59,22162,33,0.0
10769024,59,22163,33,0.0
10769690,59,22164,33,0.0
10771216,59,22166,33,0.0


## 对item_id进行均值编码

### 均值编码，不归一化

In [7]:
item_id_target_mean = all_data.groupby('item_id').target.mean()

all_data['item_target_enc'] = all_data['item_id'].map(item_id_target_mean)

print(all_data.target.mean())
all_data['item_target_enc'].fillna(0.29833, inplace=True) # 使用样本均值填充

# 计算相关系数
encoded_feature = all_data['item_target_enc'].values
print(np.corrcoef(all_data['target'].values, encoded_feature)[0][1])

0.29833901550140235
0.31136345046609326


In [8]:
all_data

Unnamed: 0,shop_id,item_id,date_block_num,target,item_target_enc
10864839,2,30,33,0.0,0.022727
10860073,2,31,33,1.0,0.409091
10861046,2,32,33,0.0,0.500000
10859871,2,33,33,0.0,0.363636
10864840,2,40,33,0.0,0.022727
...,...,...,...,...,...
10768834,59,22162,33,0.0,0.227273
10769024,59,22163,33,0.0,0.590909
10769690,59,22164,33,0.0,0.340909
10771216,59,22166,33,0.0,0.250000


## KFold 均值编码，更加平均

In [9]:
from sklearn.model_selection import KFold
kf = KFold(5, shuffle=False)
all_data['item_target_enc'] = np.nan

for train_idx, val_idx in kf.split(all_data):
    X_train, X_val = all_data.iloc[train_idx], all_data.iloc[val_idx]
    all_data['item_target_enc'].iloc[val_idx] = X_val.item_id.map(X_train.groupby('item_id').target.mean())

all_data['item_target_enc'].fillna(0.29833, inplace=True)
    
#KFold均值编码后的相关系数
corr = np.corrcoef(all_data['target'].values, encoded_feature)[0][1]
print(corr)

nan


In [10]:
all_data

Unnamed: 0,shop_id,item_id,date_block_num,target,item_target_enc
10864839,2,30,33,0.0,0.028571
10860073,2,31,33,1.0,0.428571
10861046,2,32,33,0.0,0.485714
10859871,2,33,33,0.0,0.400000
10864840,2,40,33,0.0,0.028571
...,...,...,...,...,...
10768834,59,22162,33,0.0,0.257143
10769024,59,22163,33,0.0,0.600000
10769690,59,22164,33,0.0,0.400000
10771216,59,22166,33,0.0,0.000000


## Leave-one-out 均值编码

核心：

`(Leave_one_out_sum - all_data.target) / (n_objects - 1)`

In [11]:
all_data['item_target_enc'] = np.nan

Leave_one_out_sum = all_data.item_id.map(all_data.groupby('item_id').target.sum())
n_objects = all_data.item_id.map(all_data.groupby('item_id').target.count())

all_data['item_target_enc'] = (Leave_one_out_sum - all_data.target) / (n_objects - 1)

all_data['item_target_enc'].fillna(0.29833, inplace=True)

encoded_feature = all_data['item_target_enc'].values
corr = np.corrcoef(all_data['target'].values, encoded_feature)[0][1]
print(corr)

0.2433021270751907


In [12]:
all_data

Unnamed: 0,shop_id,item_id,date_block_num,target,item_target_enc
10864839,2,30,33,0.0,0.023256
10860073,2,31,33,1.0,0.395349
10861046,2,32,33,0.0,0.511628
10859871,2,33,33,0.0,0.372093
10864840,2,40,33,0.0,0.023256
...,...,...,...,...,...
10768834,59,22162,33,0.0,0.232558
10769024,59,22163,33,0.0,0.604651
10769690,59,22164,33,0.0,0.348837
10771216,59,22166,33,0.0,0.255814


## Smoothing 均值编码

<script type="text/javascript" src="http://cdn.mathjax.org/mathjax/latest/MathJax.js?config=default"></script>

$$ \frac{mean(target)*nrows + globalmean * alpha}{nrows+alpha} $$

In [13]:
alpha = 100
globalmean = 0.29833
nrows = all_data.groupby('item_id').target.size()
dfmean = all_data.groupby('item_id').target.mean()
all_data['item_target_enc'] = np.nan

all_data['item_target_enc'] = all_data.item_id.map((np.multiply(dfmean, nrows) + globalmean * alpha) / (nrows + alpha))
all_data['item_target_enc'].fillna(0.29833, inplace=True)

encoded_feature = all_data['item_target_enc'].values

corr = np.corrcoef(all_data['target'].values, encoded_feature)[0][1]
print(corr)

0.3113634504660932


In [14]:
all_data

Unnamed: 0,shop_id,item_id,date_block_num,target,item_target_enc
10864839,2,30,33,0.0,0.214118
10860073,2,31,33,1.0,0.332174
10861046,2,32,33,0.0,0.359951
10859871,2,33,33,0.0,0.318285
10864840,2,40,33,0.0,0.214118
...,...,...,...,...,...
10768834,59,22162,33,0.0,0.276618
10769024,59,22163,33,0.0,0.387729
10769690,59,22164,33,0.0,0.311340
10771216,59,22166,33,0.0,0.283562


## Expanding 均值编码

In [15]:
all_data['item_target_enc'] = np.nan

cumsum = all_data.groupby('item_id').target.cumsum() - all_data.target
cumcount = all_data.groupby('item_id').target.cumcount()
all_data['item_target_enc'] = cumsum / cumcount

all_data['item_target_enc'].fillna(0.29833, inplace=True)

encoded_feature = all_data['item_target_enc'].values
corr = np.corrcoef(all_data['target'].values, encoded_feature)[0][1]
print(corr)

0.18436257958213445


In [16]:
all_data

Unnamed: 0,shop_id,item_id,date_block_num,target,item_target_enc
10864839,2,30,33,0.0,0.298330
10860073,2,31,33,1.0,0.298330
10861046,2,32,33,0.0,0.298330
10859871,2,33,33,0.0,0.298330
10864840,2,40,33,0.0,0.298330
...,...,...,...,...,...
10768834,59,22162,33,0.0,0.232558
10769024,59,22163,33,0.0,0.604651
10769690,59,22164,33,0.0,0.348837
10771216,59,22166,33,0.0,0.255814


## 选择相关系数最高的那一个