In [25]:
import numpy as np
import torch
from minai import mnist_load
from pathlib import Path

# Location of the MNIST files on disk, relative to the notebook.
path_data = Path('data')

# Read the four splits, then convert each of them to a torch tensor.
x_train, y_train, x_valid, y_valid = mnist_load.load_data(path_data)
x_train, y_train, x_valid, y_valid = [
    torch.tensor(split) for split in (x_train, y_train, x_valid, y_valid)
]

# Randomly initialised parameters: a 784x10 weight matrix and a 10-element bias.
weights = torch.randn(784, 10)
biases = torch.randn(10)
#m1 = x_valid[:10]
#m2 = weights


In [26]:

# reminder of our fastest version so far, using broadcasting:

def matmul(a,b):
    """Matrix product of two 2-D tensors, one output row at a time via broadcasting.

    Args:
        a: tensor of shape (ar, ac).
        b: tensor of shape (br, bc); br must equal ac.

    Returns:
        Tensor of shape (ar, bc) on the same device as `a`. Its dtype is the
        promotion of the input dtypes with torch's default dtype, so float32
        inputs give float32 and float64 inputs keep float64.

    Raises:
        ValueError: if either input is not 2-D (shape unpacking fails) or the
            inner dimensions do not match.
    """
    (ar,ac),(br,bc) = a.shape,b.shape
    # Check explicitly: when ac == 1 the broadcast below would otherwise accept
    # any br and silently return a wrong result instead of failing.
    if ac != br:
        raise ValueError(
            f"matmul: inner dimensions must match, got a: {tuple(a.shape)} and b: {tuple(b.shape)}"
        )
    # Promote with the default dtype so every input that used to produce a
    # default-dtype result still does, while float64 is no longer downcast.
    out_dtype = torch.promote_types(torch.result_type(a, b), torch.get_default_dtype())
    c = torch.zeros(ar, bc, dtype=out_dtype, device=a.device)
    for i in range(ar):
#       c[i,j] = (a[i,:] * b[:,j]).sum()      # previous version
        # a[i,:,None] has shape (ac, 1); multiplying by b (ac, bc) scales each
        # row of b by the matching element of a[i], and summing over dim 0
        # yields row i of the product.
        c[i]   = (a[i,:,None] * b).sum(dim=0) # broadcast version
    return c
%timeit -n 5 _=matmul(x_train, weights)

477 ms ± 3.81 ms per loop (mean ± std. dev. of 7 runs, 5 loops each)


Remember, this was about 5000 times faster than our naive Python version.

For this notebook, I will use the pdl_gpu environment, which can use CUDA.

In [28]:
# einsum version
def matmul(a, b):
    """Matrix product of `a` and `b` via torch.einsum.

    In the subscript string, the shared index `k` is summed over, leaving an
    output indexed by `i` (rows of `a`) and `j` (columns of `b`).
    """
    product = torch.einsum('ik,kj->ij', a, b)
    return product
%timeit -n 50 _=matmul(x_train, weights)

6.89 ms ± 89.4 μs per loop (mean ± std. dev. of 7 runs, 50 loops each)


Using native PyTorch einsum, we get roughly another 70x speed-up (477 ms → 6.89 ms).

In [29]:
# cuda version
cuda0 = torch.device('cuda:0')
weights = weights.to(cuda0)
x_valid = x_train.to(cuda0)
# force compile
_ = matmul(x_valid, weights)
%timeit -n 50 _=matmul(x_valid, weights)

22.3 μs ± 806 ns per loop (mean ± std. dev. of 7 runs, 50 loops each)


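And finally, we get another factor of roughly 100x from using CUDA, which makes this about 1e6 times faster than the naive Python version. Caveat: CUDA kernels are launched asynchronously, so a GPU timing is only trustworthy when the timed statement waits for the device to finish (e.g. with `torch.cuda.synchronize()`).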