In [38]:
%load_ext Cython  

The Cython extension is already loaded. To reload it, use:
  %reload_ext Cython


In [39]:
%reload_ext Cython

In [40]:
import numpy as np
import pandas as pd


In [41]:

def target_mean_v1(data, y_name, x_name):
    result = np.zeros(data.shape[0])
    for i in range(data.shape[0]):
        groupby_result = data[data.index != i].groupby([x_name], as_index=False).agg(['mean', 'count'])
        result[i] = groupby_result.loc[groupby_result.index == data.loc[i, x_name], (y_name, 'mean')]
    return result

In [42]:
def target_mean_v2(data, y_name, x_name):
    result = np.zeros(data.shape[0])
    value_dict = dict()
    count_dict = dict()
    for i in range(data.shape[0]):
        if data.loc[i, x_name] not in value_dict.keys():
            value_dict[data.loc[i, x_name]] = data.loc[i, y_name]
            count_dict[data.loc[i, x_name]] = 1
        else:
            value_dict[data.loc[i, x_name]] += data.loc[i, y_name]
            count_dict[data.loc[i, x_name]] += 1
    for i in range(data.shape[0]):
        result[i] = (value_dict[data.loc[i, x_name]] - data.loc[i, y_name]) / (count_dict[data.loc[i, x_name]] - 1)
    return result

In [43]:
y = np.random.randint(2, size=(5000, 1))
x = np.random.randint(10, size=(5000, 1))

In [44]:
data = pd.DataFrame(np.concatenate([y, x], axis=1), columns=['y', 'x'])

In [45]:
%%cython --cplus 
from cython.operator cimport dereference as deref, preincrement as inc
from libcpp.unordered_map cimport unordered_map

import numpy as np
cimport numpy as cnp
cpdef target_mean_v3(data, y_name, x_name):
    cdef long nrow = data.shape[0]
    cdef np.ndarray[double] result = np.asfortranarray(np.zeros(nrow), dtype=np.float64)
    cdef np.ndarray[double] y = np.asfortranarray(data[y_name], dtype=np.float64)
    cdef np.ndarray[double] x = np.asfortranarray(data[x_name], dtype=np.float64)

    target_mean_v3_impl(result, y, x, nrow)
    return result

cdef void target_mean_v3_impl(double[:] result, double[:] y, double[:] x, const long nrow):
    cdef dict value_dict = dict()
    cdef dict count_dict = dict()

    cdef long i
    for i in range(nrow):
        if x[i] not in value_dict.keys():
            value_dict[x[i]] = y[i]
            count_dict[x[i]] = 1
        else:
            value_dict[x[i]] += y[i]
            count_dict[x[i]] += 1

    i=0
    for i in range(nrow):
        result[i] = (value_dict[x[i]] - y[i])/(count_dict[x[i]]-1)


Error compiling Cython file:
------------------------------------------------------------
...

import numpy as np
cimport numpy as cnp
cpdef target_mean_v3(data, y_name, x_name):
    cdef long nrow = data.shape[0]
    cdef np.ndarray[double] result = np.asfortranarray(np.zeros(nrow), dtype=np.float64)
        ^
------------------------------------------------------------

/home/icuipi/.cache/ipython/cython/_cython_magic_70c3025850623a6cc3c7cbe97e1a8a70.pyx:8:9: 'np' is not a cimported module

Error compiling Cython file:
------------------------------------------------------------
...
import numpy as np
cimport numpy as cnp
cpdef target_mean_v3(data, y_name, x_name):
    cdef long nrow = data.shape[0]
    cdef np.ndarray[double] result = np.asfortranarray(np.zeros(nrow), dtype=np.float64)
    cdef np.ndarray[double] y = np.asfortranarray(data[y_name], dtype=np.float64)
        ^
------------------------------------------------------------

/home/icuipi/.cache/ipython/cython/_cython_ma

In [46]:
%%cython --cplus 
from cython.operator cimport dereference as deref, preincrement as inc
from libcpp.unordered_map cimport unordered_map

import numpy as np
cimport numpy as cnp


cpdef target_mean_v4(data, y_name, x_name):
    cdef long nrow = data.shape[0]
    cdef cnp.ndarray[double] result = np.asfortranarray(np.zeros(nrow), dtype=np.float64)
    cdef cnp.ndarray[double] y = np.asfortranarray(data[y_name], dtype=np.float64)
    cdef cnp.ndarray[double] x = np.asfortranarray(data[x_name], dtype=np.float64)

    target_mean_v4_impl(result, y, x, nrow)
    return result

cdef void target_mean_v4_impl(double[:] result, double[:] y, double[:] x, const long nrow):
    cdef unordered_map[double, double] value
    cdef unordered_map[double, double] count

    cdef long i
    for i in range(nrow):
        if value.find(x[i]) == value.end():
            value[x[i]] = y[i]
            count[x[i]] = 1
        else:
            value[x[i]] += y[i]
            count[x[i]] += 1

    i=0
    for i in range(nrow):
        result[i] = (value[x[i]] - y[i])/(count[x[i]]-1)

In [47]:
#%%timeit
#result_1 = target_mean_v1(data, 'y', 'x')

In [48]:
%%timeit
result_2 = target_mean_v2(data, 'y', 'x')

223 ms ± 4.88 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)


In [49]:
%%timeit
result_3 = target_mean_v3(data, 'y', 'x')

412 µs ± 7.56 µs per loop (mean ± std. dev. of 7 runs, 1000 loops each)


In [50]:
%%timeit
result_4 = target_mean_v4(data, 'y', 'x')

411 µs ± 7.37 µs per loop (mean ± std. dev. of 7 runs, 1000 loops each)
