# Runtime experiments for CPU and GPU benchmarking of our algorithms

In [1]:
# Print the CPU model name so the CPU timings below can be tied to the hardware they ran on.
!lscpu |grep 'Model name'

Model name:          Intel(R) Xeon(R) W-2133 CPU @ 3.60GHz


In [2]:
# Print the GPU/driver status so the GPU timings below can be tied to the hardware they ran on.
!nvidia-smi

Mon Oct 19 21:43:43 2020       
+-----------------------------------------------------------------------------+
| NVIDIA-SMI 455.23.05    Driver Version: 455.23.05    CUDA Version: 11.1     |
|-------------------------------+----------------------+----------------------+
| GPU  Name        Persistence-M| Bus-Id        Disp.A | Volatile Uncorr. ECC |
| Fan  Temp  Perf  Pwr:Usage/Cap|         Memory-Usage | GPU-Util  Compute M. |
|                               |                      |               MIG M. |
|   0  Quadro P2000        On   | 00000000:91:00.0  On |                  N/A |
| 47%   36C    P5     8W /  75W |   4722MiB /  5050MiB |      3%      Default |
|                               |                      |                  N/A |
+-------------------------------+----------------------+----------------------+
                                                                               
+-----------------------------------------------------------------------------+
| Proces

### Imports

In [36]:
from jax import jit, devices, make_jaxpr
from jax.config import config
import jax.numpy as jnp
from matplotlib import rcParams
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import time
import tqdm

from parsmooth.parallel import ieks, icks
from parsmooth.sequential import ieks as seq_ieks, icks as seq_icks
from parsmooth.models.bearings import get_data, make_parameters, plot_bearings
from parsmooth.utils import MVNormalParameters

In [30]:
# Plot styling. The explicit font-family override below is intentionally left disabled:
# rcParams['font.family'] = 'sans-serif'
# Preferred sans-serif font list used by matplotlib when a sans-serif family is requested.
rcParams['font.sans-serif'] = ['Computer Modern Sans serif']

### Input parameters

In [5]:
# --- Experiment size ---
T = 100  # number of observations
dt = 0.01  # discretization time step

# --- Sensor geometry ---
s1 = jnp.array([-1.5, 0.5])  # location of the first sensor
s2 = jnp.array([1., 1.])  # location of the second sensor

# --- Trajectory start ---
x0 = jnp.array([0.1, 0.2, 1, 0])  # initial true location

# --- Noise levels ---
r = 5.  # observation noise (standard deviation), kept large because IEKS is not very stable
qc = 0.1  # noise level, kept large because IEKS is not very stable
qw = 0.1  # noise level, kept small because IEKS is not very stable

### Get parameters

In [6]:
# Build the model from the settings above. Q and R are presumably the process and
# observation noise covariances (confirm against parsmooth.models.bearings); the
# two callables are the observation and transition functions used by the smoothers.
(
    Q,
    R,
    observation_function,
    transition_function,
) = make_parameters(qc, qw, r, dt, s1, s2)

In [7]:
# Wrap both model functions with jnp.vectorize so that they accept a batch of state
# vectors: the signature's core dimension is the last axis, and any leading axes
# are mapped over.
observation_function = jnp.vectorize(
    observation_function,
    signature="(m)->(d)",  # state vector -> observation vector
)
transition_function = jnp.vectorize(
    transition_function,
    signature="(m)->(m)",  # state vector -> state vector
)

### Get data

In [8]:
# Obtain the time stamps, ground-truth states and observations for T steps.
# random_state is pinned so that reruns should see the same data.
ts, true_states, observations = get_data(
    x0, dt, r, T, s1, s2, qw,
    random_state=42,
)

### We can now run the filter

Initial state guess

In [9]:
# Prior on the initial state: mean vector `m` with an identity covariance `P`.
m = jnp.array([-1., -1., 0., 0., 0.])
P = jnp.eye(m.shape[0])
initial_guess = MVNormalParameters(m, P)

# Starting linearization trajectory: one zero vector and one identity matrix per
# time step, giving arrays of shape (T, 5) and (T, 5, 5).
initial_linearization_points = jnp.zeros((T, m.shape[0]), dtype=m.dtype)
initial_linearization_covariances = jnp.repeat(P[jnp.newaxis], T, axis=0)

### Sequential vs Parallel computation time comparison

In [10]:
# Compile every smoother variant once per backend. Positional arguments 2, 4 and 7
# are declared static; in the call made by `profile_smoother` these are the
# transition function, the observation function and the iteration count.

# GPU variants
gpu_par_ieks = jit(ieks, backend="gpu", static_argnums=(2, 4, 7))
gpu_seq_ieks = jit(seq_ieks, backend="gpu", static_argnums=(2, 4, 7))
gpu_par_icks = jit(icks, backend="gpu", static_argnums=(2, 4, 7))
gpu_seq_icks = jit(seq_icks, backend="gpu", static_argnums=(2, 4, 7))

# CPU variants
cpu_par_ieks = jit(ieks, backend="cpu", static_argnums=(2, 4, 7))
cpu_seq_ieks = jit(seq_ieks, backend="cpu", static_argnums=(2, 4, 7))
cpu_par_icks = jit(icks, backend="cpu", static_argnums=(2, 4, 7))
cpu_seq_icks = jit(seq_icks, backend="cpu", static_argnums=(2, 4, 7))

In [11]:
def profile_smoother(s_method, lengths, n_runs=1, n_iter=10):
    """Time a smoother on growing prefixes of the observation sequence.

    For every entry ``j`` of ``lengths`` the smoother is called on the first ``j``
    observations and the first ``j`` initial linearization states. One untimed
    call is made first so that compilation is excluded from the measurement,
    then ``n_runs`` timed calls are made and their durations averaged.

    The function reads the notebook-level names ``observations``,
    ``initial_guess``, ``initial_linearization_points``,
    ``initial_linearization_covariances``, ``transition_function``,
    ``observation_function``, ``Q`` and ``R``; they must be defined before it
    is called.

    Parameters
    ----------
    s_method : callable
        Smoother to benchmark. It is called with the eight positional arguments
        assembled below and must return an object whose ``mean`` attribute
        exposes ``block_until_ready()``.
    lengths : iterable of int
        Prefix lengths to benchmark.
    n_runs : int, optional
        Number of timed calls per length (default 1).
    n_iter : int, optional
        Iteration count forwarded to ``s_method`` as its last positional
        argument (default 10).

    Returns
    -------
    numpy.ndarray
        Mean run time in seconds, one entry per element of ``lengths``.
    """
    mean_run_times = []
    for j in tqdm.tqdm(lengths):
        observations_slice = observations[:j]
        init_linearizations_states = MVNormalParameters(
            initial_linearization_points[:j],
            initial_linearization_covariances[:j],
        )
        args = (initial_guess, observations_slice, transition_function, Q,
                observation_function, R, init_linearizations_states, n_iter)

        # Warm-up call: it triggers compilation, which is slow in JAX at the moment
        # and must not be counted in the benchmark (AOT compilation is being worked
        # on upstream and should reduce this overhead substantially).
        warm_up = s_method(*args)
        warm_up.mean.block_until_ready()

        run_times = []
        for _ in range(n_runs):
            # perf_counter is monotonic and high resolution, unlike time.time(),
            # which can jump when the system clock is adjusted.
            tic = time.perf_counter()
            s_states = s_method(*args)
            # Wait for the result to actually be computed before stopping the clock.
            s_states.mean.block_until_ready()
            toc = time.perf_counter()
            run_times.append(toc - tic)
        mean_run_times.append(np.mean(run_times))
    return np.array(mean_run_times)

Let's now run the sequential and the parallel implementations to measure the performance gain obtained from parallelisation.

In [12]:
# Sequence lengths to benchmark: 20 logarithmically spaced values from 10 up to T,
# truncated to int32 by the cast.
lengths_space = np.logspace(1, np.log10(T), num=20).astype(np.int32)

In [13]:
# Benchmark every (algorithm, implementation, backend) combination over the same
# lengths. Each call returns one mean run time per entry of `lengths_space`.

# IEKS, parallel implementation
gpu_par_ieks_time = profile_smoother(gpu_par_ieks, lengths_space)
cpu_par_ieks_time = profile_smoother(cpu_par_ieks, lengths_space)

# IEKS, sequential implementation
gpu_seq_ieks_time = profile_smoother(gpu_seq_ieks, lengths_space)
cpu_seq_ieks_time = profile_smoother(cpu_seq_ieks, lengths_space)

# ICKS, parallel implementation
gpu_par_icks_time = profile_smoother(gpu_par_icks, lengths_space)
cpu_par_icks_time = profile_smoother(cpu_par_icks, lengths_space)

# ICKS, sequential implementation
gpu_seq_icks_time = profile_smoother(gpu_seq_icks, lengths_space)
cpu_seq_icks_time = profile_smoother(cpu_seq_icks, lengths_space)


100%|██████████| 3/3 [01:58<00:00, 39.46s/it]
100%|██████████| 3/3 [00:07<00:00,  2.49s/it]
100%|██████████| 3/3 [02:21<00:00, 47.26s/it]
100%|██████████| 3/3 [00:08<00:00,  2.96s/it]


In [41]:
# Collect the timing vectors side by side: one row per sequence length and one
# column per benchmarked configuration (same order as `columns` below).
data = np.stack([
                 gpu_par_ieks_time,
                 cpu_par_ieks_time,
                 gpu_seq_ieks_time,
                 cpu_seq_ieks_time,
                 gpu_par_icks_time,
                 cpu_par_icks_time,
                 gpu_seq_icks_time,
                 cpu_seq_icks_time],
               axis=1)

columns = ["GPU_par_IEKS",
           "CPU_par_IEKS",
           "GPU_seq_IEKS",
           "CPU_seq_IEKS",
           "GPU_par_ICKS",
           "CPU_par_ICKS",
           "GPU_seq_ICKS",
           "CPU_seq_ICKS"]

# The keyword must be spelled `columns`; the previous `colums=` raised a TypeError.
df = pd.DataFrame(index=lengths_space, data=data, columns=columns)
# NOTE(review): "..." looks like a placeholder output path — set a real file name before running.
df.to_csv("...")