In [1]:
%load_ext Cython

In [2]:
import pandas as pd
import numpy as np

In [3]:
def target_mean_v1(data, y_name, x_name):
    result = np.zeros(data.shape[0])
    for i in range(data.shape[0]):
        groupby_result = data[data.index != i].groupby([x_name], as_index=False).agg(['mean', 'count'])
        result[i] = groupby_result.loc[groupby_result.index == data.loc[i, x_name], (y_name, 'mean')]
    return result

def target_mean_v2(data, y_name, x_name):
    result = np.zeros(data.shape[0])
    value_dict = dict()
    count_dict = dict()
    for i in range(data.shape[0]):
        if data.loc[i, x_name] not in value_dict.keys():
            value_dict[data.loc[i, x_name]] = data.loc[i, y_name]
            count_dict[data.loc[i, x_name]] = 1
        else:
            value_dict[data.loc[i, x_name]] += data.loc[i, y_name]
            count_dict[data.loc[i, x_name]] += 1
    for i in range(data.shape[0]):
        result[i] = (value_dict[data.loc[i, x_name]] - data.loc[i, y_name]) / (count_dict[data.loc[i, x_name]] - 1)
    return result

In [4]:
y = np.random.randint(2, size=(5000, 1))
x = np.random.randint(10, size=(5000, 1))
data = pd.DataFrame(np.concatenate([y, x], axis=1), columns=['y', 'x'])

In [None]:
%%timeit
result_1 = target_mean_v1(data, 'y', 'x')

1 loop, best of 3: 23.1 s per loop


In [None]:
%%timeit
result_2 = target_mean_v2(data, 'y', 'x')

1 loop, best of 3: 257 ms per loop


In [5]:
result_1 = target_mean_v1(data, 'y', 'x')
result_2 = target_mean_v2(data, 'y', 'x')
diff = np.linalg.norm(result_1 - result_2)
print(diff)

0.0


## V3 重复变量合并

In [35]:
def target_mean_v3(data, y_name, x_name):
    len_result = data.shape[0]
    result = np.zeros(len_result)
    value_dict = dict()
    count_dict = dict()
    for i in range(len_result):
        x_value = data.loc[i, x_name]
        y_value = data.loc[i, y_name]
        if x_value not in value_dict:
            value_dict[x_value] = y_value
            count_dict[x_value] = 1
        else:
            value_dict[x_value] += y_value
            count_dict[x_value] += 1
    for i in range(len_result):
        x_value = data.loc[i, x_name]
        result[i] = (value_dict[x_value] - data.loc[i, y_name]) / (count_dict[x_value] - 1)
    return result


In [36]:
result_3 = target_mean_v3(data, 'y', 'x')
diff = np.linalg.norm(result_1 - result_3)
print(diff)

0.0


In [37]:
%%timeit
target_mean_v3(data, 'y', 'x')

10 loops, best of 3: 150 ms per loop


## V4 使用Cython定义变量

In [99]:
%%cython
cimport numpy as cnp
import numpy as np

cpdef target_mean_v4(data, y_name, x_name):
    cpdef int len_result = data.shape[0]
    cpdef dict value_dict = {}
    cpdef dict count_dict = {}
    cpdef cnp.ndarray[double, ndim=1] result = np.zeros([len_result], dtype=np.float64)
    cpdef cnp.ndarray[int, ndim=1] x_columns = np.array(data[x_name], dtype=np.intc)
    cpdef cnp.ndarray[int, ndim=1] y_columns = np.array(data[y_name], dtype=np.intc)
    for i in range(len_result):
        x_value = x_columns[i]
        y_value = y_columns[i]
        if x_value not in value_dict:
            value_dict[x_value] = y_value
            count_dict[x_value] = 1
        else:
            value_dict[x_value] += y_value
            count_dict[x_value] += 1
    for i in range(len_result):
        x_value = x_columns[i]
        result[i] = (value_dict[x_value] - y_columns[i]) / (count_dict[x_value] - 1)
    return result

In [100]:
result_4 = target_mean_v4(data, 'y', 'x')
diff = np.linalg.norm(result_1 - result_4)
print(diff)

0.0


In [101]:
%%timeit
target_mean_v4(data, 'y', 'x')

1000 loops, best of 3: 1.05 ms per loop


## V5 用数组替换字典

In [43]:
%%cython
cimport numpy as cnp
import numpy as np

cpdef target_mean_v5(data, cnp.str y_name, cnp.str x_name):
    cpdef int len_result = data.shape[0]
    cpdef int len_array = len(data[x_name].unique())
    cpdef cnp.ndarray[int, ndim=1] value_array = np.zeros(len_array, dtype=np.intc)
    cpdef cnp.ndarray[int, ndim=1] count_array = np.zeros(len_array, dtype=np.intc)
    cpdef cnp.ndarray[double, ndim=1] result = np.zeros([len_result], dtype=np.float64)
    cpdef cnp.ndarray[int, ndim=1] x_columns = np.array(data[x_name], dtype=np.intc)
    cpdef cnp.ndarray[int, ndim=1] y_columns = np.array(data[y_name], dtype=np.intc)
    for i in range(len_result):
        value_array[x_columns[i]] +=  y_columns[i]
        count_array[x_columns[i]] += 1
    for i in range(len_result):
        result[i] = (value_array[x_columns[i]] - y_columns[i]) / (count_array[x_columns[i]] - 1)
    return result

In [44]:
result_5 = target_mean_v5(data, 'y', 'x')
diff = np.linalg.norm(result_1 - result_5)
print(diff)

0.0


In [45]:
%%timeit
target_mean_v5(data, 'y', 'x')

The slowest run took 4.91 times longer than the fastest. This could mean that an intermediate result is being cached.
10000 loops, best of 3: 156 µs per loop


## V6 关闭boundscheck wraparound

In [69]:
%%cython
cimport numpy as cnp
cimport cython
from cython.parallel cimport prange
import numpy as np

@cython.boundscheck(False)
@cython.wraparound(False)
cpdef target_mean_v6(data, cnp.str y_name, cnp.str x_name):
    cpdef int len_result = data.shape[0]
    cpdef int len_array = len(data[x_name].unique())
    cpdef cnp.ndarray[int, ndim=1] value_array = np.zeros(len_array, dtype=np.intc)
    cpdef cnp.ndarray[int, ndim=1] count_array = np.zeros(len_array, dtype=np.intc)
    cpdef cnp.ndarray[double, ndim=1] result = np.zeros([len_result], dtype=np.float64)
    cpdef cnp.ndarray[int, ndim=1] x_columns = np.array(data[x_name], dtype=np.intc)
    cpdef cnp.ndarray[int, ndim=1] y_columns = np.array(data[y_name], dtype=np.intc)
    for i in range(len_result):
        value_array[x_columns[i]] +=  y_columns[i]
        count_array[x_columns[i]] += 1
    for i in range(len_result):
        result[i] = (value_array[x_columns[i]] - y_columns[i]) / (count_array[x_columns[i]] - 1)
    return result

In [70]:
result_6 = target_mean_v6(data, 'y', 'x')
diff = np.linalg.norm(result_1 - result_6)
print(diff)

0.0


In [71]:
%%timeit
target_mean_v6(data, 'y', 'x')

The slowest run took 9.95 times longer than the fastest. This could mean that an intermediate result is being cached.
10000 loops, best of 3: 138 µs per loop


## V7 假设数组长度已知为10

In [74]:
%%cython
cimport numpy as cnp
cimport cython
from cython.parallel cimport prange
import numpy as np

@cython.boundscheck(False)
@cython.wraparound(False)
cpdef target_mean_v7(data, cnp.str y_name, cnp.str x_name):
    cpdef int len_result = data.shape[0]
    cpdef cnp.ndarray[int, ndim=1] value_array = np.zeros(10, dtype=np.intc)
    cpdef cnp.ndarray[int, ndim=1] count_array = np.zeros(10, dtype=np.intc)
    cpdef cnp.ndarray[double, ndim=1] result = np.zeros([len_result], dtype=np.float64)
    cpdef cnp.ndarray[int, ndim=1] x_columns = np.array(data[x_name], dtype=np.intc)
    cpdef cnp.ndarray[int, ndim=1] y_columns = np.array(data[y_name], dtype=np.intc)
    for i in range(len_result):
        value_array[x_columns[i]] +=  y_columns[i]
        count_array[x_columns[i]] += 1
    for i in range(len_result):
        result[i] = (value_array[x_columns[i]] - y_columns[i]) / (count_array[x_columns[i]] - 1)
    return result

In [75]:
result_7 = target_mean_v7(data, 'y', 'x')
diff = np.linalg.norm(result_1 - result_7)
print(diff)

0.0


In [76]:
%%timeit
target_mean_v6(data, 'y', 'x')

The slowest run took 5.26 times longer than the fastest. This could mean that an intermediate result is being cached.
10000 loops, best of 3: 60.9 µs per loop
