In [77]:
from numba import jit
import numpy as np
from time import time
from joblib import Parallel, delayed
import matplotlib.pyplot as plt

Define the basic preprocessing function as a numba jit-compiled function

In [44]:
@jit
def my_tensordot(X, Y, p):
    """Return the absolute value of ``X`` contracted with slice ``p`` of ``Y``.

    ``Y[p, :, :]`` is selected and contracted against ``X`` with
    ``np.tensordot(..., axes=2)`` (the last two axes of ``X`` against the
    two axes of the slice); ``np.abs`` is then applied to the result.

    NOTE(review): ``np.tensordot`` is presumably not something numba can
    compile natively, so ``@jit`` likely adds nothing here -- confirm
    against the numba supported-features list.
    """
    selected = Y[p, :, :]
    contracted = np.tensordot(X, selected, axes=2)
    return np.abs(contracted)

Set up basic simulation

In [83]:
# Problem size: `pic_num` square arrays of side `pic_size`, and `filter_num`
# arrays of the same side length (pictures and filters, going by the names).
pic_size = 148
pic_num = 4500
filter_num = 500

# Seed the global NumPy RNG so the synthetic data is identical on every run.
rng_seed = 0
np.random.seed(rng_seed)

# Synthetic data drawn uniformly from [0, 1).
X = np.random.rand(pic_num, pic_size, pic_size)
Y = np.random.rand(filter_num, pic_size, pic_size)

Compare numba to standard tensordot...

In [57]:
%%timeit
# Time the numba-decorated my_tensordot on slice 0 of Y.
my_tensordot(X,Y,0)

10 loops, best of 3: 69 ms per loop


In [58]:
%%timeit
# Baseline: the same expression as my_tensordot's body (with p = 0), without the numba decorator.
np.abs(np.tensordot(X,Y[0,:,:],axes=2))

10 loops, best of 3: 68.9 ms per loop


^^^ Absolutely no benefit: 69 ms with numba versus 68.9 ms without. Now let's compare joblib parallelism to the standard serial loop...

In [68]:
def foo(A,B):
    """Contract ``A`` with ``B`` over two axes and return the absolute values.

    Thin wrapper: ``np.tensordot(A, B, axes=2)`` followed by ``np.abs``.
    """
    contracted = np.tensordot(A, B, axes=2)
    return np.abs(contracted)

Standard...

In [69]:
# Baseline: apply foo to one slice of Y at a time in a plain Python loop.
start = time()
outputs = np.zeros((pic_num,filter_num))  # column p receives foo(X, Y[p,:,:])
for p in range(filter_num):
    outputs[:,p]=foo(X,Y[p,:,:])
# Parenthesised single-argument print behaves the same on Python 2 and 3.
print('%f' % (time()-start))

35.441913


...and with joblib

In [70]:
# Same per-slice work as the serial loop, dispatched through joblib.
# n_jobs=-1 asks joblib to use all available CPUs.
start = time()
new_outs = Parallel(n_jobs=-1)(
    delayed(foo)(X, Y[p,:,:]) for p in range(filter_num)
)
# Parenthesised single-argument print behaves the same on Python 2 and 3.
print('%f' % (time()-start))

23.717820


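Parallelising with joblib (whose default backend runs separate worker processes rather than threads) gives a significant speedup for this loop: about 23.7 s versus 35.4 s.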

In [79]:
# Stack the per-slice joblib results and transpose so the layout matches
# `outputs`, i.e. (pic_num, filter_num).
parallel_outputs = np.array(new_outs).T
print(parallel_outputs.shape)
print(outputs.shape)
# `plt.figure` without parentheses only references the function and does
# nothing; call it so a new figure is actually created.
plt.figure()
plt.scatter(parallel_outputs.ravel(), outputs.ravel())
plt.xlabel('joblib outputs')
plt.ylabel('serial loop outputs')
plt.show()

(4500, 500)
(4500, 500)


...and the outputs match!

One more time: let's try to numba the loop, instead of just the function

In [81]:
@jit
def baz(A,B):
    """Apply ``foo`` to ``A`` and each leading-axis slice of ``B`` in turn.

    Returns an array of shape ``(A.shape[0], B.shape[0])`` whose column
    ``p`` is ``foo(A, B[p, :, :])``.

    The output size and loop bound are taken from the arguments rather than
    from the notebook-level ``pic_num`` / ``filter_num`` globals, so the
    function stays correct for inputs of any size. For the notebook's own
    ``X`` and ``Y`` the result is unchanged.

    NOTE(review): numba reportedly treats globals as compile-time
    constants, which would make relying on them here doubly fragile --
    confirm against the numba documentation.
    """
    n_rows = A.shape[0]
    n_cols = B.shape[0]
    outputs = np.zeros((n_rows, n_cols))
    for p in range(n_cols):
        outputs[:,p]=foo(A,B[p,:,:])
    return outputs

In [82]:
# Time the numba-decorated loop.
# NOTE(review): if this is the first call to baz, the measurement presumably
# also includes numba's compilation time -- confirm.
start = time()
blarg = baz(X,Y)
# Parenthesised single-argument print behaves the same on Python 2 and 3.
print('%f' % (time()-start))

35.319017


No joy: the numba-compiled loop (35.3 s) is no faster than the plain loop (35.4 s). Overall: joblib > numba = standard.