In [1]:
%reset -f
import numpy as np
from timeit import timeit
from numba import jit
from timeit import default_timer as timer

### Exercise 1 - Unrolling for loops

In [2]:
N = 1000000
a = np.random.random(N)
b = np.random.random(N)

In [3]:
def no_unroll(N):
    sum=0
    i = 0
    while i < N:
        sum += a[i] * b[i]
        i += 1
    return sum

def unroll_two_step(N):
    i = 0
    while i < N//2:
        temp1 = a[2*i] * b[2*i] # evens
        temp2 = a[2*i+1] * b[2*i+1] #odds
        i += 1
    return temp1 + temp2

def unroll_four_step(N):
    i = 1
    while i < N//4:
        temp1 = a[4*i] * b[4*i]
        temp2 = a[4*i+1] * b[4*i+1]
        temp3 = a[4*i+2] * b[4*i+2]
        temp4 = a[4*i+3] * b[4*i+3]
        i += 1
    return temp1 + temp2 + temp3 + temp4

In [4]:
print("No unrolling")
%timeit no_unroll(N)

print("\n2-step unroll")
%timeit unroll_two_step(N)

print("\n4-step unroll")
%timeit unroll_four_step(N)

No unrolling
555 ms ± 45.9 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)

2-step unroll
569 ms ± 38.6 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)

4-step unroll
537 ms ± 33.9 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)


In [5]:
@jit
def no_unroll(N):
    sum=0
    i = 0
    while i < N:
        sum += a[i] * b[i]
        i += 1
    return sum

@jit
def unroll_two_step(N):
    i = 0
    while i < N//2:
        temp1 = a[2*i] * b[2*i] # evens
        temp2 = a[2*i+1] * b[2*i+1] #odds
        i += 1
    return temp1 + temp2

@jit
def unroll_four_step(N):
    i = 1
    while i < N//4:
        temp1 = a[4*i] * b[4*i]
        temp2 = a[4*i+1] * b[4*i+1]
        temp3 = a[4*i+2] * b[4*i+2]
        temp4 = a[4*i+3] * b[4*i+3]
        i += 1
    return temp1 + temp2 + temp3 + temp4

In [6]:
print("No unrolling")
%timeit no_unroll(N)

print("\n2-step unroll")
%timeit unroll_two_step(N)

print("\n4-step unroll")
%timeit unroll_four_step(N)

No unrolling
1.9 ms ± 75.8 μs per loop (mean ± std. dev. of 7 runs, 1 loop each)

2-step unroll
371 ns ± 9.97 ns per loop (mean ± std. dev. of 7 runs, 1,000,000 loops each)

4-step unroll
The slowest run took 12.65 times longer than the fastest. This could mean that an intermediate result is being cached.
1.65 μs ± 2.1 μs per loop (mean ± std. dev. of 7 runs, 1 loop each)


### Exercise 2 - Cache blocking

In [None]:
N = 10000
data = np.zeros(N)

@jit
def cache_blocking(l1_size, n_runs, data):
    b = 0
    n = 0
    i = 0
    while b in range(data.shape[0]//l1_size):
        blockstart = 0
        while n in range(n_runs):
            while i in range(l1_size):
                data[blockstart+i] = 2.3*data[blockstart+i]+1.2
                i += 1
        n += 1
    blockstart += l1_size
    b += 1
    
    
start_time = timer()
cache_blocking(5000, 5, data)
end_time = timer()
print(f"time elapsed: {end_time - start_time}")