说明：把提供的 target encoding 代码改为 cython 代码并比较速度区别

In [353]:
import numpy as np
import pandas as pd

y = np.random.randint(2, size=(5000, 1))
x = np.random.randint(10, size=(5000, 1))
data = pd.DataFrame(np.concatenate([y, x], axis=1), columns=['y', 'x'])

data.head

<bound method NDFrame.head of       y  x
0     1  2
1     1  9
2     0  7
3     1  3
4     0  1
...  .. ..
4995  0  9
4996  1  9
4997  0  1
4998  0  2
4999  1  9

[5000 rows x 2 columns]>

In [354]:
# 函数处理逻辑探索1
groupby_data = data[data.index != 1].groupby(['x'], as_index=False).agg(['mean', 'count'])
groupby_data

Unnamed: 0_level_0,y,y
Unnamed: 0_level_1,mean,count
x,Unnamed: 1_level_2,Unnamed: 2_level_2
0,0.460501,519
1,0.489879,494
2,0.506849,511
3,0.535637,463
4,0.521127,497
5,0.539554,493
6,0.505859,512
7,0.537402,508
8,0.484725,491
9,0.506849,511


In [355]:
# 函数处理逻辑探索2
groupby_data.loc[data.loc[0, 'x'], ('y', 'mean')]

0.5068493150684932

In [356]:
def target_mean_v1(data, y_name, x_name):
    '''
    原版 v1
    1. 遍历 DataFrame，刨除遍历到的索引对应值外，根据 x 值 分组，统计平均值和总数
    2. 找到 data[i] 的 x 值，看 groupby_result[x] 对应的 y 列的 mean 值
    '''
    result = np.zeros(data.shape[0])
    for i in range(data.shape[0]):
        groupby_result = data[data.index != i].groupby([x_name], as_index=False).agg(['mean', 'count'])
        result[i] = groupby_result.loc[groupby_result.index == data.loc[i, x_name], (y_name, 'mean')]
    return result

target_mean_v1(data, 'y', 'x')

array([0.50588235, 0.50684932, 0.53846154, ..., 0.49087221, 0.50784314,
       0.50684932])

In [357]:
def target_mean_v2(data, y_name, x_name):
    '''
    课堂优化 v2
    '''
    result = np.zeros(data.shape[0])
    value_dict = dict()
    count_dict = dict()
    for i in range(data.shape[0]):
        if data.loc[i, x_name] not in value_dict.keys():
            value_dict[data.loc[i, x_name]] = data.loc[i, y_name]
            count_dict[data.loc[i, x_name]] = 1
        else:
            value_dict[data.loc[i, x_name]] += data.loc[i, y_name]
            count_dict[data.loc[i, x_name]] += 1
    for i in range(data.shape[0]):
        result[i] = (value_dict[data.loc[i, x_name]] - data.loc[i, y_name]) / (count_dict[data.loc[i, x_name]] - 1)
    return result

target_mean_v2(data, 'y', 'x')

array([0.50588235, 0.50684932, 0.53846154, ..., 0.49087221, 0.50784314,
       0.50684932])

#### 优化思路 v3 - Python 
1. 减少多次索引
2. Python 字典替换 Pandas DataFrame loc 操作

In [358]:
def target_mean_v3(data, y_name, x_name):
    '''
    优化思路 v3 - Python 
    1. 减少重复索引
    2. Python 字典替换 Pandas DataFrame loc 操作
    '''
    # data 行数
    shape = data.shape[0]
    result = np.zeros(shape)
    value_dict = {}
    count_dict = {}
    
    # x、y 对应值
    x_values = data[x_name].values
    y_values = data[y_name].values

    for i in range(shape):
        idx = x_values[i]
        count_dict[idx] = 1 if idx not in value_dict else count_dict[idx] + 1
        value_dict[idx] = y_values[i] if idx not in value_dict else value_dict[idx] + y_values[i]

    for i in range(shape):
        idx = x_values[i]
        result[i] = (value_dict[idx] - y_values[i]) / (count_dict[idx] - 1)
    return result

target_mean_v3(data, 'y', 'x')

array([0.50588235, 0.50684932, 0.53846154, ..., 0.49087221, 0.50784314,
       0.50684932])

#### 优化思路 v4 - Cython 
1. 指定变量类型

In [359]:
%load_ext Cython

The Cython extension is already loaded. To reload it, use:
  %reload_ext Cython


In [360]:
%%cython -a

import numpy as np 
cimport numpy as cnp

cpdef target_mean_v4(data, y_name, x_name):
    '''
    优化思路 v4 - Cython 
    1. 指定变量类型
    '''
    # data 行数
    cdef int shape = data.shape[0]
    # cdef cnp.ndarray[double, ndim=1] result = np.zeros(shape, dtype=np.float64)
    cdef cnp.ndarray[double] result = np.zeros(shape)
    value_dict = {}
    count_dict = {}
    
    # x、y 对应值
    cdef cnp.ndarray[long] x_values = data[x_name].values
    cdef cnp.ndarray[long] y_values = data[y_name].values

    for i in range(shape):
        idx = x_values[i]
        count_dict[idx] = 1 if idx not in value_dict else count_dict[idx] + 1
        value_dict[idx] = y_values[i] if idx not in value_dict else value_dict[idx] + y_values[i]

    for i in range(shape):
        idx = x_values[i]
        result[i] = (value_dict[idx] - y_values[i]) / (count_dict[idx] - 1)
    return result

#### 优化思路 v5 - Cython 
1. 去除检查数组是否出界，去除函数封装
2. 修改 range 循环用为 cython 语法

In [361]:
%%cython -a

import numpy as np 
cimport numpy as cnp
cimport cython

@cython.boundscheck(False) 
@cython.wraparound(False)
cpdef target_mean_v5(data, y_name, x_name):
    '''
    优化思路 v5 - Cython 
    1. 去除检查数组是否出界，去除函数封装
    2. 修改 range 循环用为更 cython 语法
    '''
    # data 行数
    cdef int shape = data.shape[0]
    cdef cnp.ndarray[double, ndim=1] result = np.zeros(shape, dtype=np.float64)
    value_dict = {}
    count_dict = {}
    
    # x、y 对应值
    cdef cnp.ndarray[long] x_values = data[x_name].values
    cdef cnp.ndarray[long] y_values = data[y_name].values

    cdef int i = 0
    for i from 0 <= i < shape:
        idx = x_values[i]
        count_dict[idx] = 1 if idx not in value_dict else count_dict[idx] + 1
        value_dict[idx] = y_values[i] if idx not in value_dict else value_dict[idx] + y_values[i]

    for i from 0 <= i < shape:
        idx = x_values[i]
        result[i] = (value_dict[idx] - y_values[i]) / (count_dict[idx] - 1)
    return result

#### 优化思路 v6 - Cython 
1. 在 cython 中运用 openmp，用 prange 改写循环
2. 使用数组改写 Python 字典和 ndarray：Indexing Python object not allowed without gil
3. 添加 boundscheck 和 wraparound，速度变快

In [362]:
%%cython -a

import numpy as np 
cimport numpy as cnp
cimport cython
from cython.parallel import prange

@cython.boundscheck(False)
@cython.wraparound(False)
cpdef target_mean_v6(data, y_name, x_name):
    '''
    优化思路 v6 - Cython 
    1. 在 cython 中运用 openmp，用 prange 改写循环
    2. 使用数组改写 Python 字典和 ndarray：Indexing Python object not allowed without gil
    3. 添加 boundscheck 和 wraparound，速度变快
    '''
    # data 行数
    cdef int shape = data.shape[0]
    # cdef double[:] result = np.zeros(shape, dtype=np.float64)
    # cdef double[:] value_arr = np.zeros(10)
    # cdef double[:] count_arr = np.zeros(10)
    cdef cnp.ndarray[double, ndim=1] result = np.zeros(shape, dtype=np.float64)
    cdef cnp.ndarray[double, ndim=1] value_arr = np.zeros(10)
    cdef cnp.ndarray[double, ndim=1] count_arr = np.zeros(10)
    
    # x、y 对应值
    # cdef long[:] x_values = data[x_name].values
    # cdef long[:] y_values = data[y_name].values
    cdef cnp.ndarray[long] x_values = data[x_name].values
    cdef cnp.ndarray[long] y_values = data[y_name].values

    cdef int i = 0
    # prange() can only be used without the GIL
    for i in prange(shape, nogil=True):
        value_arr[x_values[i]] += y_values[i]
        count_arr[x_values[i]] += 1

    for i in prange(shape, nogil=True):
        result[i] = (value_arr[x_values[i]] - y_values[i]) / (count_arr[x_values[i]] - 1)
    return result

#### 优化思路 v7 - Cython 
1. 使用 ray
感觉没有发挥 ray 的优势，再琢磨一下

In [368]:
# !pip install ray
import ray
import numpy as np 

ray.shutdown()
ray.init()

@ray.remote
def target_mean_by_ray(shape, x_values, y_values):
    value_dict = {}
    count_dict = {}
    result = np.zeros(shape)

    for i in range(shape):
        idx = x_values[i]
        count_dict[idx] = 1 if idx not in value_dict else count_dict[idx] + 1
        value_dict[idx] = y_values[i] if idx not in value_dict else value_dict[idx] + y_values[i]

    for i in range(shape):
        result[i] = (value_dict[x_values[i]] - y_values[i]) / (count_dict[x_values[i]] - 1)
    return result


def target_mean_v7(data, y_name, x_name):
    '''
    优化思路 v7 - Python 
    1. 使用 ray
    '''
    # data 行数
    shape = data.shape[0]
    
    # x、y 对应值
    x_values = data[x_name].values
    y_values = data[y_name].values
    
    futures = [target_mean_by_ray.remote(shape, x_values, y_values)]
    ray.get(futures)
    # print(ray.get(futures))

2021-01-10 07:43:04,920	INFO services.py:1173 -- View the Ray dashboard at [1m[32mhttp://127.0.0.1:8265[39m[22m


In [373]:
target_mean_v7(data, 'y', 'x')

#### 对比

In [370]:
def compare(res_1, res_2):
    print(np.linalg.norm(res_1 - res_2))

In [371]:
res_v1 = target_mean_v1(data, 'y', 'x')
compare(res_v1, target_mean_v2(data, 'y', 'x'))
compare(res_v1, target_mean_v3(data, 'y', 'x'))
compare(res_v1, target_mean_v4(data, 'y', 'x'))
compare(res_v1, target_mean_v5(data, 'y', 'x'))
compare(res_v1, target_mean_v6(data, 'y', 'x'))

0.0
0.0
0.0
0.0
0.0


In [372]:
# %%timeit -r 100
%timeit -n 1 target_mean_v1(data, 'y', 'x')
%timeit -n 1 target_mean_v2(data, 'y', 'x')
%timeit -n 100 target_mean_v3(data, 'y', 'x')
%timeit -n 100 target_mean_v4(data, 'y', 'x')
%timeit -n 100 target_mean_v5(data, 'y', 'x')
%timeit -n 100 target_mean_v6(data, 'y', 'x')
%timeit -n 100 target_mean_v7(data, 'y', 'x')

1 loop, best of 3: 29.8 s per loop
1 loop, best of 3: 287 ms per loop
100 loops, best of 3: 9.23 ms per loop
100 loops, best of 3: 1.19 ms per loop
100 loops, best of 3: 1.15 ms per loop
100 loops, best of 3: 37.8 µs per loop
100 loops, best of 3: 13.3 ms per loop
