# Performance

In this section we analyze the performance of a very simple operation: the addition of two arrays, $y = x_1 + x_2$.

In [None]:
def add1(x1, x2):
    """Add element-wise using indexed access and repeated ``append``.

    Walks the positions of ``x1`` and looks both operands up by index, so
    ``x2`` must be at least as long as ``x1``. Returns a new list.
    """
    total = []
    for pos in range(len(x1)):
        total.append(x1[pos] + x2[pos])
    return total


def add2(x1, x2):
    """Add element-wise by iterating both inputs in lockstep and appending.

    Pairs come from ``zip``, so the result is as long as the shorter input.
    Returns a new list.
    """
    total = []
    for left, right in zip(x1, x2):
        total.append(left + right)
    return total


def add3(x1, x2):
    """Add element-wise with a single list comprehension over ``zip``.

    The result is a new list as long as the shorter input.
    """
    return [left + right for left, right in zip(x1, x2)]


def add4(x1, x2):
    """Add with the built-in NumPy addition.

    Applies ``+`` to the whole inputs at once; for NumPy arrays this is a
    vectorized element-wise sum with no Python-level loop.
    """
    return x1 + x2

In [None]:
import numpy as np
from timeit import timeit
from pandas import DataFrame

# One row per array size (1, 2, 4, ..., 2048); a column is added per function.
times = DataFrame(index=[2 ** i for i in range(12)])

for size in times.index:
    # Two random vectors of length `size`, shared by every variant.
    x1, x2 = np.random.rand(2, size)
    for add in [
        add1,
        add2,
        add3,
        add4,
    ]:
        # Total time of 1000 calls, stored under the function's name.
        times.at[size, add.__name__] = timeit(lambda: add(x1, x2), number=1000)

times.plot(ylabel="Time x1000 [seconds]", xlabel="Size")

In [None]:
# Same timings on log-log axes, so small and large sizes are both readable.
times.plot(logx=True, logy=True, ylabel="Time x1000 [seconds]", xlabel="Size")

In [None]:
# Throughput: each column holds the seconds taken by 1000 calls, so
# size / time is thousands of element additions per second.
perf = times.apply(lambda x: np.array(times.index) / x)
perf.plot(logx=True, logy=True, ylabel="kFLOPS", xlabel="Size")

In [None]:
# Repeat the benchmark for the NumPy variant only, up to 2**19 elements.
times = DataFrame(index=[2 ** i for i in range(20)])

for size in times.index:
    x1, x2 = np.random.rand(2, size)
    for add in [
        add4,
    ]:
        # Total time of 1000 calls.
        times.at[size, add.__name__] = timeit(lambda: add(x1, x2), number=1000)

times.plot(logx=True, logy=True, ylabel="Time x1000 [seconds]", xlabel="Size")

# Thousands of element additions per second (time is per 1000 calls).
perf = times.apply(lambda x: np.array(times.index) / x)
perf.plot(logx=True, logy=True, ylabel="kFLOPS", xlabel="Size")

# Multithreading


In Python, the `threading` module provides a very simple and intuitive API for spawning multiple threads in a program.

In [None]:
import threading

# Show the built-in documentation of the Thread class.
help(threading.Thread)

## Example 1: print

In [None]:
# Create (but do not start yet) a thread that will call print("Hi").
thread = threading.Thread(target=print, args=("Hi",))
thread

In [None]:
# Run the target in a new thread, without waiting for it to finish.
thread.start()

In [None]:
# print() returns almost instantly, so by now the thread has normally
# finished and is_alive() reports False.
thread.is_alive()

## Example 2: sleep

In [None]:
from time import sleep

# A thread that sleeps for 10 seconds: long enough to observe it running.
thread = threading.Thread(target=sleep, args=(10,))
thread

In [None]:
# Start the sleeping thread; the notebook stays responsive meanwhile.
thread.start()

In [None]:
# If checked within 10 seconds of start(), the thread is still running (True).
thread.is_alive()

In [None]:
# join() blocks here until the thread has finished.
thread.join()
print("done")

## Example 3: ThreadPool

In [None]:
from multiprocessing.pool import ThreadPool


def sleep_print(*args):
    """Sleep for one second, then print ``args``."""
    sleep(1)
    print(*args)


# Eight worker threads handle the eight calls concurrently, so the whole
# map should take about one second rather than eight.
# NOTE(review): this pool is never closed; consider close()/join() or a
# `with` block once it is no longer needed.
pool = ThreadPool(8)
pool.map(sleep_print, range(8))

## Example 4: Race condition

In [None]:
def inc_x(n):
    """Increment the module-level counter ``x`` by one, ``n`` times.

    Each step is a plain ``x += 1`` with no locking, so concurrent callers
    can interleave their read-modify-write steps and lose increments.
    """
    global x

    for _ in range(n):
        x += 1


# Single-threaded sanity check: 100 increments starting from 0 give 100.
x = 0
inc_x(100)
x

In [None]:
# Repeat the experiment ten times: eight threads share the work of
# incrementing the global counter `x` 2**16 times in total. If no update
# were lost, every printed value would equal `total`.
# NOTE(review): how often updates are actually lost depends on the
# interpreter version; confirm the effect on the target Python.
total = 2 ** 16
threads = 8
for i in range(10):
    x = 0
    # The context manager shuts the pool down once map() has returned, so
    # each iteration's worker threads are released instead of piling up.
    with ThreadPool(threads) as workers:
        workers.map(inc_x, [total // threads] * threads)
    print(x)

## Example 5: something real

In [None]:
# Pool of eight worker threads used for every measurement below.
pool = ThreadPool(8)

# Sizes 2**3 ... 2**23: each one is divisible by every thread count used.
times = DataFrame(index=[2 ** i for i in range(3, 24)])

for size in times.index:
    x = np.random.rand(size)
    for threads in [1, 2, 4, 8]:
        # Split the array into `threads` equal chunks and time one pass of
        # np.sin over all chunks through the pool.
        xs = np.split(x, threads)
        times.at[size, threads] = timeit(lambda: pool.map(np.sin, xs), number=1)

times.plot(ylabel="Time [seconds]", xlabel="Size")

In [None]:
# Elements processed per second for each thread count (each timing is a
# single run, so size / time needs no further scaling).
perf = times.apply(lambda x: np.array(times.index) / x)
perf.plot(logx=True, ylabel="sin per second", xlabel="Size")

## Example 6: multiprocessing vs multithreading

In [None]:
def countdown(n):
    """Count ``n`` down to zero in a pure-Python loop; returns nothing."""
    while n > 0:
        n -= 1


# Baseline: time the full countdown in the current thread.
count = 50000000
timeit(lambda: countdown(count), number=1)

In [None]:
# Same total work, split into two halves that run on the thread pool.
timeit(lambda: pool.map(countdown, [count // 2] * 2), number=1)

In [None]:
from multiprocessing import Pool

# Same two halves, but run in two worker processes instead of threads.
# NOTE(review): pool2 is never closed. Also, with the "spawn" start method,
# functions defined interactively may not be importable by the workers;
# verify on the target platform.
pool2 = Pool(2)
timeit(lambda: pool2.map(countdown, [count // 2] * 2), number=1)

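In this case multiprocessing is faster because of the Global Interpreter Lock ([GIL](https://realpython.com/python-gil/#:~:text=The%20Python%20Global%20Interpreter%20Lock%20or%20GIL%2C%20in%20simple%20words,at%20any%20point%20in%20time.)): only one thread can execute Python bytecode at a time, so the two threads take turns, whereas each worker process has its own interpreter and can run in parallel.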