In [None]:
import numpy as np
import numba

In [None]:
# Benchmark inputs: two dense n x n float64 operands and a zeroed output buffer.
n = 1600  # edge length of the square test matrices
SEED = 0  # fixed seed so every run multiplies exactly the same matrices
rng = np.random.default_rng(SEED)
A = rng.random((n, n))  # uniform samples in [0, 1)
B = rng.random((n, n))
C = np.zeros((n, n), dtype=np.float64)  # preallocated result buffer shared by the kernels

In [None]:
%%timeit
# Baseline: time NumPy's own matrix product of A and B.
C = A @ B

In [None]:
@numba.jit(nopython=True, parallel=True)
def matmul_numba(A, B, C):
    """Write the matrix product ``A @ B`` into the preallocated array ``C``.

    Straightforward triple-loop product. The row loop is a ``numba.prange``,
    which only runs in parallel when the function is compiled with
    ``parallel=True`` (otherwise ``prange`` behaves exactly like ``range``).
    Numba parallelises only the outermost ``prange`` of a nest, so the inner
    loops are ordinary ``range`` loops.

    Parameters
    ----------
    A : 2-D array, shape (m, k)
        Left operand.
    B : 2-D array, shape (k, p)
        Right operand.
    C : 2-D array, shape (m, p)
        Output buffer; every element is overwritten.

    Raises
    ------
    ValueError
        If the shapes of ``A``, ``B`` and ``C`` are not compatible. Checked
        up front because compiled code does not bounds-check array indexing.
    """
    n_rows, n_inner = A.shape
    n_cols = B.shape[1]
    if B.shape[0] != n_inner or C.shape[0] != n_rows or C.shape[1] != n_cols:
        raise ValueError("shape mismatch: expected A (m, k), B (k, p), C (m, p)")

    for i in numba.prange(n_rows):
        for j in range(n_cols):
            # Accumulate in a scalar local to this iteration, then store once.
            acc = 0.0
            for k in range(n_inner):
                acc += A[i, k] * B[k, j]
            C[i, j] = acc

In [None]:
# Warm-up call: no signature is given to @numba.jit, so compilation happens on
# this first invocation and stays out of the timing cell that follows.
matmul_numba(A, B, C)

In [None]:
%%timeit
# Time the Numba triple-loop kernel; each repetition overwrites C in place.
matmul_numba(A, B, C)

In [None]:
# NOTE(review): bare scratch arithmetic (evaluates to 40.0); presumably the ratio
# of two timings read off the cells above — confirm and name the operands.
1000/25

In [None]:
CS = 20  # tile edge length (elements) used by dot_chunked
# Number of whole tiles along the first axis of the notebook-level A. Retained
# for any other cell that reads it; dot_chunked itself derives its tile counts
# from the arrays it is given.
NCHUNKS = A.shape[0] // CS

@numba.jit(nopython=True, parallel=True)
def dot_chunked(A, B, C):
    """Write the tiled (blocked) matrix product ``A @ B`` into ``C``.

    The output is processed in ``CS`` x ``CS`` tiles. Tile rows are spread
    over threads by the outer ``numba.prange``; Numba parallelises only the
    outermost ``prange`` of a nest, so the remaining loops are plain ``range``.

    Each output tile is zeroed before accumulation, so the result does not
    depend on the previous contents of ``C`` and repeated calls (for example
    under ``%%timeit``) keep producing ``A @ B`` rather than adding another
    copy of the product on every call.

    Tile counts are rounded up and the last tile along each axis is clipped,
    so dimensions that are not a multiple of ``CS`` are handled as well.

    Parameters
    ----------
    A : 2-D array, shape (m, k)
        Left operand.
    B : 2-D array, shape (k, p)
        Right operand.
    C : 2-D array, shape (m, p)
        Output buffer; every element is overwritten.

    Raises
    ------
    ValueError
        If the shapes of ``A``, ``B`` and ``C`` are not compatible. Checked
        up front because compiled code does not bounds-check array indexing.
    """
    n_rows, n_inner = A.shape
    n_cols = B.shape[1]
    if B.shape[0] != n_inner or C.shape[0] != n_rows or C.shape[1] != n_cols:
        raise ValueError("shape mismatch: expected A (m, k), B (k, p), C (m, p)")

    # Ceiling division: a trailing partial tile still gets processed.
    row_tiles = (n_rows + CS - 1) // CS
    col_tiles = (n_cols + CS - 1) // CS
    inner_tiles = (n_inner + CS - 1) // CS

    for ti in numba.prange(row_tiles):
        i_lo = ti * CS
        i_hi = min(i_lo + CS, n_rows)
        for tj in range(col_tiles):
            j_lo = tj * CS
            j_hi = min(j_lo + CS, n_cols)

            # Reset this output tile so the accumulation below starts from zero.
            for ii in range(i_lo, i_hi):
                for jj in range(j_lo, j_hi):
                    C[ii, jj] = 0.0

            for tk in range(inner_tiles):
                k_lo = tk * CS
                k_hi = min(k_lo + CS, n_inner)
                for ii in range(i_lo, i_hi):
                    for jj in range(j_lo, j_hi):
                        # Same summation order as updating C[ii, jj] directly,
                        # but with a single load and store per inner tile.
                        acc = C[ii, jj]
                        for kk in range(k_lo, k_hi):
                            acc += A[ii, kk] * B[kk, jj]
                        C[ii, jj] = acc

In [None]:
# Fresh zero-filled output buffer for the chunked kernel.
C = np.zeros(shape=(n, n), dtype=np.float64)

In [None]:
# Warm-up call: no signature is given to @numba.jit, so compilation happens on
# this first invocation and stays out of the timing cell that follows.
dot_chunked(A, B, C)

In [None]:
%%timeit
# Time the tiled Numba kernel; every timed repetition writes into the same C.
dot_chunked(A, B, C)

In [None]:
# Snapshot of the buffer the chunked kernel wrote into, so it can still be
# inspected after C is rebound further down.
CC = np.copy(C, order="C")

In [None]:
# Validate the snapshot of the chunked result against NumPy's own product.
# (Comparing C with CC, a copy of itself, is True whatever the kernel computed.)
np.allclose(CC, A @ B)

In [None]:
# Top-left 3x3 corner of C, the buffer the chunked kernel wrote into.
C[0:3, 0:3]

In [None]:
# Same 3x3 corner of the snapshot CC, for side-by-side comparison.
CC[0:3, 0:3]

In [None]:
# Reference product from NumPy; this rebinds C to a newly allocated array.
C = np.matmul(A, B)
# Top-left 3x3 corner of the reference result.
C[0:3, 0:3]

In [None]:
%load_ext cython

In [None]:
%%cython --compile-args=-O3
# cython: language_level=3, boundscheck=False, wraparound=False
import numpy as np
cimport numpy as cnp

def matrix_multiply(cnp.ndarray[cnp.float64_t, ndim=2] A,
                    cnp.ndarray[cnp.float64_t, ndim=2] B,
                    cnp.ndarray[cnp.float64_t, ndim=2] C):
    """Write the matrix product ``A @ B`` into the preallocated array ``C``.

    Plain triple-loop product over typed float64 buffers.

    Parameters
    ----------
    A : float64 ndarray, shape (m, k)
        Left operand.
    B : float64 ndarray, shape (k, p)
        Right operand.
    C : float64 ndarray, shape (m, p)
        Output buffer; every element is overwritten.

    Raises
    ------
    ValueError
        If the shapes of ``A``, ``B`` and ``C`` are not compatible.
    """
    # Py_ssize_t is the native index type; it avoids narrowing the shape to int.
    cdef Py_ssize_t n_rows = A.shape[0]
    cdef Py_ssize_t n_inner = A.shape[1]
    cdef Py_ssize_t n_cols = B.shape[1]
    cdef Py_ssize_t i, j, k
    cdef double acc

    # Bounds and wraparound checks are disabled for this cell, so the shapes
    # must be validated here: after this every index below is provably in range.
    if B.shape[0] != n_inner:
        raise ValueError("shape mismatch: A is (m, k) but B does not have k rows")
    if C.shape[0] != n_rows or C.shape[1] != n_cols:
        raise ValueError("shape mismatch: C must have shape (m, p)")

    for i in range(n_rows):
        for j in range(n_cols):
            # Accumulate in a C double, then store once.
            acc = 0.0
            for k in range(n_inner):
                acc += A[i, k] * B[k, j]
            C[i, j] = acc


In [None]:
%%time
# Single-shot timing of the Cython kernel; the result is written into C.
matrix_multiply(A, B, C)

In [None]:
%%time
# Single-shot timing of the Numba kernel on the same inputs, for comparison.
matmul_numba(A, B, C)