In [1]:
import numpy as np

# Matrix side length and the side length of one square tile.
N = 4
BLOCK_SIZE = 2

# Operands are built directly as float32; every entry is a small integer.
A = np.array(
    [
        [1, 2, 3, 4],
        [5, 6, 7, 8],
        [9, 10, 11, 12],
        [13, 14, 15, 16],
    ],
    dtype=np.float32,
)
B = np.array(
    [
        [1, -2, 3, -4],
        [5, -6, 7, -8],
        [-9, 10, -11, -12],
        [-13, -14, 15, -16],
    ],
    dtype=np.float32,
)

# Reference product that the blocked variants must reproduce. Expected value:
# [[ -68  -40   44 -120]
#  [-132  -88  100 -280]
#  [-196 -136  156 -440]
#  [-260 -184  212 -600]]
A @ B

array([[ -68.,  -40.,   44., -120.],
       [-132.,  -88.,  100., -280.],
       [-196., -136.,  156., -440.],
       [-260., -184.,  212., -600.]], dtype=float32)

We can use Python functionality to build a block slicer to make accessing the individual blocks much easier:

In [13]:
# blocks[y][x] is the (row slice, column slice) pair selecting tile (y, x).
# The ranges step over the first row/column of every complete tile.
blocks: list[list[tuple[slice, slice]]] = [
    [
        (slice(top, top + BLOCK_SIZE), slice(left, left + BLOCK_SIZE))
        for left in range(0, N // BLOCK_SIZE * BLOCK_SIZE, BLOCK_SIZE)
    ]
    for top in range(0, N // BLOCK_SIZE * BLOCK_SIZE, BLOCK_SIZE)
]

Using this slicer, we can access the blocks of the matrices directly and apply naive matrix multiplication within each block.

In [36]:
# Blocked product, scratch-tile variant: each output tile is accumulated in
# a separate buffer and written into C once all partial products are added.
num_blocks = N // BLOCK_SIZE
C = np.zeros((N, N))
for row in range(num_blocks):
    for col in range(num_blocks):
        tile = np.zeros((BLOCK_SIZE, BLOCK_SIZE))
        for inner in range(num_blocks):
            tile += A[blocks[row][inner]] @ B[blocks[inner][col]]
        C[blocks[row][col]] = tile
assert (C == A @ B).all()

In [37]:
# Blocked product, in-place variant: partial products are added straight
# into the destination tile of C, so no scratch buffer is needed.
C = np.zeros((N, N))
block_ids = range(N // BLOCK_SIZE)
for row in block_ids:
    for col in block_ids:
        target = blocks[row][col]
        for inner in block_ids:
            lhs = A[blocks[row][inner]]
            rhs = B[blocks[inner][col]]
            C[target] += lhs @ rhs
assert (C == A @ B).all()

If we multiply the entire row of blocks with the entire column of blocks in one pass, we'd get the following simplified variant:

In [38]:
# Strip variant: each output tile is the product of a full-width band of
# rows of A with a full-height band of columns of B, computed in one matmul.
C = np.zeros((N, N))
for rows, cols in (tile for strip in blocks for tile in strip):
    C[rows, cols] = A[rows, :] @ B[:, cols]
assert (C == A @ B).all()

If we now inline the block slicers we're already at the stage that we can implement this procedure in C!

In [48]:
# Fully inlined blocked product: the slice objects are replaced by explicit
# index arithmetic, leaving only scalar loads, multiplies and adds.
C = np.zeros((N, N))
for by in range(N // BLOCK_SIZE):          # block row of C
    for bx in range(N // BLOCK_SIZE):      # block column of C
        for bk in range(N // BLOCK_SIZE):  # block index along the shared dimension
            for dy in range(BLOCK_SIZE):
                row = by * BLOCK_SIZE + dy
                for dx in range(BLOCK_SIZE):
                    col = bx * BLOCK_SIZE + dx
                    temp = 0
                    for i in range(BLOCK_SIZE):
                        inner = bk * BLOCK_SIZE + i
                        temp += A[row, inner] * B[inner, col]
                    C[row, col] += temp
        # Show C each time one output block has been fully accumulated.
        print(C)
assert (C == A@B).all()

[[11.  0.  0.  0.]
 [ 0.  0.  0.  0.]
 [ 0.  0.  0.  0.]
 [ 0.  0.  0.  0.]]
[[ 11. -14.   0.   0.]
 [  0.   0.   0.   0.]
 [  0.   0.   0.   0.]
 [  0.   0.   0.   0.]]
[[ 11. -14.   0.   0.]
 [ 35.   0.   0.   0.]
 [  0.   0.   0.   0.]
 [  0.   0.   0.   0.]]
[[ 11. -14.   0.   0.]
 [ 35. -46.   0.   0.]
 [  0.   0.   0.   0.]
 [  0.   0.   0.   0.]]
[[-68. -14.   0.   0.]
 [ 35. -46.   0.   0.]
 [  0.   0.   0.   0.]
 [  0.   0.   0.   0.]]
[[-68. -40.   0.   0.]
 [ 35. -46.   0.   0.]
 [  0.   0.   0.   0.]
 [  0.   0.   0.   0.]]
[[ -68.  -40.    0.    0.]
 [-132.  -46.    0.    0.]
 [   0.    0.    0.    0.]
 [   0.    0.    0.    0.]]
[[ -68.  -40.    0.    0.]
 [-132.  -88.    0.    0.]
 [   0.    0.    0.    0.]
 [   0.    0.    0.    0.]]
[[ -68.  -40.   17.    0.]
 [-132.  -88.    0.    0.]
 [   0.    0.    0.    0.]
 [   0.    0.    0.    0.]]
[[ -68.  -40.   17.  -20.]
 [-132.  -88.    0.    0.]
 [   0.    0.    0.    0.]
 [   0.    0.    0.    0.]]
[[ -68.  -40.   17.  -