From https://stackoverflow.com/questions/52673285/performance-of-pandas-apply-vs-np-vectorize-to-create-new-column-from-existing-c

In [2]:
import pandas as pd
import numpy as np
import time

def divide(a, b):
    if b == 0:
        return 0.0
    return float(a)/b

# A_list = np.random.randint(1, 100, N)
# B_list = np.random.randint(1, 100, N)
# df = pd.DataFrame({'A': A_list, 'B': B_list})

NameError: name 'N' is not defined

In [4]:


for N in [1000, 10000, 100000, 1000000]:    

    print ('')
    A_list = np.random.randint(1, 100, N)
    B_list = np.random.randint(1, 100, N)
    df = pd.DataFrame({'A': A_list, 'B': B_list})

    start_epoch_sec = int(time.time())
    df['result'] = df.apply(lambda row: divide(row['A'], row['B']), axis=1)
    end_epoch_sec = int(time.time())
    result_apply = end_epoch_sec - start_epoch_sec

    start_epoch_sec = int(time.time())
    df['result2'] = np.vectorize(divide)(df['A'], df['B'])
    end_epoch_sec = int(time.time())
    result_vectorize = end_epoch_sec - start_epoch_sec


    print ('N=%d, df.apply: %d sec, np.vectorize: %d sec' % (N, result_apply, result_vectorize))

    # Make sure results from df.apply and np.vectorize match.
    assert(df['result'].equals(df['result2']))


N=1000, df.apply: 0 sec, np.vectorize: 0 sec

N=10000, df.apply: 1 sec, np.vectorize: 0 sec

N=100000, df.apply: 0 sec, np.vectorize: 0 sec

N=1000000, df.apply: 6 sec, np.vectorize: 0 sec


In [6]:
np.random.seed(0)
N = 10**5

%time list(map(divide, df['A'], df['B']))                                   # 43.9 ms
%time np.vectorize(divide)(df['A'], df['B'])                                # 48.1 ms
%time [divide(a, b) for a, b in zip(df['A'], df['B'])];                     # 49.4 ms
%time [divide(a, b) for a, b in df[['A', 'B']].itertuples(index=False)]     # 112 ms
%time df.apply(lambda row: divide(*row), axis=1, raw=True)                  # 760 ms
%time df.apply(lambda row: divide(*row), axis=1)              # 4.83 s
# %time df.apply(lambda row: divide(row['A'], row['B']), axis=1)              # 4.83 s
%time [divide(row['A'], row['B']) for _, row in df[['A', 'B']].iterrows()]; # 11.6 s

CPU times: user 170 ms, sys: 8.05 ms, total: 178 ms
Wall time: 177 ms
CPU times: user 127 ms, sys: 3.97 ms, total: 131 ms
Wall time: 131 ms
CPU times: user 167 ms, sys: 11.9 ms, total: 179 ms
Wall time: 178 ms
CPU times: user 353 ms, sys: 8.03 ms, total: 361 ms
Wall time: 361 ms


TypeError: divide() takes 2 positional arguments but 4 were given

TypeError: divide() takes 2 positional arguments but 4 were given

In [7]:
#ignore divide by 0 and possible np.inf
%time (df['A'] / df['B']);

CPU times: user 2.85 ms, sys: 0 ns, total: 2.85 ms
Wall time: 2.15 ms


In [8]:
# true vecorization
%time (df['A'] / df['B']).replace([np.inf,-np.inf],0);  # .644 ms
%time np.where(df['B'] == 0, 0, df['A'] / df['B']);     # .644 ms

CPU times: user 6.88 ms, sys: 72 µs, total: 6.96 ms
Wall time: 5.74 ms
CPU times: user 4 ms, sys: 0 ns, total: 4 ms
Wall time: 3.9 ms


In [9]:
#want more speedup? compile to C
from numba import njit

@njit
def divide(a, b):
    res = np.empty(a.shape)
    for i in range(len(a)):
        if b[i] != 0:
            res[i] = a[i] / b[i]
        else:
            res[i] = 0
    return res

%timeit divide(df['A'].values, df['B'].values);  # 717 µs

1.28 ms ± 91.4 µs per loop (mean ± std. dev. of 7 runs, 1 loop each)


In [None]:
# Using @njit(parallel=True) may provide a further boost for larger arrays.