# Pandas vs Numba code


[Original Material](https://pandas.pydata.org/pandas-docs/stable/enhancingperf.html)


In [1]:
import pandas as pd
import numpy as np
import numba

In [None]:
# Synthetic benchmark frame with 1000 rows: two standard-normal float columns
# ('a', 'b'), an integer column 'N' drawn from [100, 1000), and a constant
# string column 'x'.
df = pd.DataFrame(
    {
        'a': np.random.randn(1000),
        'b': np.random.randn(1000),
        'N': np.random.randint(100, 1000, size=1000),
        'x': 'x',
    }
)

### Original Pandas Code

In [3]:
def f(x):
    """Return ``x * (x - 1)``, the integrand evaluated by ``integrate_f``."""
    shifted = x - 1
    return x * shifted

def integrate_f(a, b, N):
    """Integrate ``f`` from ``a`` to ``b`` with a left Riemann sum.

    The interval is split into ``N`` slices of equal width and ``f`` is
    evaluated at the left edge of every slice.

    Parameters
    ----------
    a, b : lower and upper integration bounds.
    N : number of slices; used both as the divisor of the slice width and
        as the ``range`` bound of the loop.

    Returns
    -------
    The accumulated sum of ``f`` values multiplied by the slice width.
    """
    width = (b - a) / N
    total = 0
    # An explicit running sum keeps the plain left-to-right accumulation order.
    for k in range(N):
        left_edge = a + k * width
        total += f(left_edge)
    return total * width

In [17]:
%%timeit 

# Row-wise baseline: call the pure-Python integrate_f once per DataFrame row.
df.apply(lambda row: integrate_f(row['a'], row['b'], row['N']), axis=1)

178 ms ± 3.24 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)


### Simple Numba Code

Copying the original Pandas code and compiling it with Numba
- Considerable speedup (about 5x) without modifying the code beyond adding the `@numba.jit` decorator

In [12]:
import numba

@numba.jit
def f_plainnumba(x):
    """Return ``x * (x - 1)``; same formula as ``f``, wrapped in ``numba.jit``."""
    shifted = x - 1
    return x * shifted

@numba.jit
def integrate_f_plainnumba(a, b, N):
    """Integrate ``f_plainnumba`` from ``a`` to ``b`` with a left Riemann sum.

    Same algorithm as ``integrate_f``: the interval is split into ``N``
    equal-width slices and the integrand is evaluated at the left edge of
    each slice.

    Parameters
    ----------
    a, b : lower and upper integration bounds.
    N : number of slices; used as the divisor of the slice width and as the
        ``range`` bound of the loop.

    Returns
    -------
    The accumulated sum of integrand values multiplied by the slice width.
    """
    s = 0
    dx = (b - a) / N
    for i in range(N):
        # Call the jitted integrand, not the pure-Python ``f``: Numba cannot
        # compile a call to an undecorated Python function in nopython mode,
        # so calling ``f`` here either fails to compile or forces the slow
        # object-mode fallback, depending on the Numba version.
        s += f_plainnumba(a + i * dx)
    return s * dx

In [13]:
%%timeit 

# Same row-wise apply as the baseline, but each row calls the jitted integrator.
df.apply(lambda row: integrate_f_plainnumba(row['a'], row['b'], row['N']), axis=1)

33.6 ms ± 4.03 ms per loop (mean ± std. dev. of 7 runs, 10 loops each)


### Numba Code

Numba still requires several code changes to achieve the best performance:
- Types can be specified explicitly (how much does that influence performance?)
- Bounds checks need to be performed explicitly to keep the same behavior as the original Pandas code
    - We need to take this into account


In [7]:
import numba

@numba.jit
def f_plain(x):
    """Return ``x * (x - 1)``; the integrand called by ``integrate_f_numba``."""
    shifted = x - 1
    return x * shifted

@numba.jit
def integrate_f_numba(a, b, N):
    """Integrate ``f_plain`` from ``a`` to ``b`` with a left Riemann sum.

    The interval is split into ``N`` equal-width slices and ``f_plain`` is
    evaluated at the left edge of each slice.

    Parameters
    ----------
    a, b : lower and upper integration bounds.
    N : number of slices; used as the divisor of the slice width and as the
        ``range`` bound of the loop.

    Returns
    -------
    The accumulated sum of ``f_plain`` values multiplied by the slice width.
    """
    width = (b - a) / N
    total = 0
    for k in range(N):
        left_edge = a + k * width
        total += f_plain(left_edge)
    return total * width

@numba.jit
def apply_integrate_f_numba(col_a, col_b, col_N):
    """Apply ``integrate_f_numba`` element-wise over three equal-length columns.

    Parameters
    ----------
    col_a, col_b : sequences of lower and upper integration bounds.
    col_N : sequence of slice counts; its length sets the output length.

    Returns
    -------
    A new float64 array whose i-th entry is
    ``integrate_f_numba(col_a[i], col_b[i], col_N[i])``.

    Raises
    ------
    AssertionError if the three inputs do not all have the same length.
    """
    size = len(col_N)
    out = np.empty(size, dtype='float64')
    # Guard the indexed reads below: every column must supply one value per row.
    assert len(col_a) == size and len(col_b) == size
    for row in range(size):
        lower = col_a[row]
        upper = col_b[row]
        out[row] = integrate_f_numba(lower, upper, col_N[row])
    return out

def compute_numba(df):
    """Integrate every row of ``df`` via ``apply_integrate_f_numba``.

    Reads the ``'a'``, ``'b'`` and ``'N'`` columns through ``.values`` so
    that the jitted function is handed the columns' underlying values
    rather than pandas objects.

    Returns
    -------
    A ``pd.Series`` named ``'result'`` that shares ``df``'s index.
    """
    lower = df['a'].values
    upper = df['b'].values
    steps = df['N'].values
    integrals = apply_integrate_f_numba(lower, upper, steps)
    return pd.Series(integrals, index=df.index, name='result')

In [9]:
%%timeit 
# Benchmark the array-based Numba path on the whole DataFrame.
compute_numba(df)

1.05 ms ± 66.6 µs per loop (mean ± std. dev. of 7 runs, 1 loop each)
