In [1]:
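# Benchmark of different J-measure implementations (J_measure.py) and
# D-measure implementations (D_measure.py).
# Compares: toro (baseline), toro2, toro2_1, toro2_2, toro2_2 + joblib (parallel CPU),
# toro2_2_torch_batch (PyTorch: CPU/CUDA/MPS), and the D-measure variants toroD,
# toroD_joblib_batch (joblib, parallel CPU) and toroD_torch_batch (PyTorch).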

# Standard library
import importlib
from timeit import default_timer as timer

# Third-party
import numpy as np

# Local modules under benchmark
import J_measure as jm
import D_measure as dm

# Reload the local modules so that edits made to them after the kernel first
# imported them are picked up without restarting the kernel.
importlib.reload(jm)
importlib.reload(dm)

# --- Benchmark configuration ---
# Series size (n) and number of signal pairs (m)
n = 1000
m = 100

# Number of times the whole experiment is repeated; every timing array below
# holds one entry per repetition.
N_RUNS = 100

# Fixed seed so the random test data is identical on every
# "Restart Kernel -> Run All".
SEED = 0

# Test data: m pairs of random series of length n, stored column-wise
# (column i of `a` is paired with column i of `b`), uniform on [0, 1).
rng = np.random.default_rng(SEED)
a = rng.random((n, m))
b = rng.random((n, m))

# Arrays to store the timing result of each of the N_RUNS repetitions
dt1 = np.zeros(N_RUNS)  # toro
dt2 = np.zeros(N_RUNS)  # toro2
dt3 = np.zeros(N_RUNS)  # toro2_1
dt4 = np.zeros(N_RUNS)  # toro2_2
dt5 = np.zeros(N_RUNS)  # toro2_2 parallel (joblib)
dt6 = np.zeros(N_RUNS)  # toro2_2_torch_batch (PyTorch)
dt7 = np.zeros(N_RUNS)  # toroD
dt8 = np.zeros(N_RUNS)  # toroD_joblib_batch (joblib)
dt9 = np.zeros(N_RUNS)  # toroD_torch_batch (PyTorch)

# Positional arguments passed, after the two series, to every implementation.
# Their meaning is defined by J_measure / D_measure. The second one is half the
# series length; it is computed once here instead of on every call (the
# original mixed int(n/2) and n//2, which are equal for a non-negative int n).
extra_args = (2, n // 2)


def _time_per_column(func, x, y, *args):
    """Time `func` applied to every pair of matching columns of `x` and `y`.

    Parameters
    ----------
    func : callable
        Called as ``func(x[:, i], y[:, i], *args)``. Its return value is
        stored in a float array, so it must be convertible to a scalar.
    x, y : 2-D arrays
        Column i of `x` is paired with column i of `y`.
    *args
        Extra positional arguments forwarded unchanged to `func`.

    Returns
    -------
    results : np.ndarray
        One value per column.
    elapsed : float
        Wall-clock time of the column loop as reported by `timer`; the
        allocation of `results` is not included.
    """
    n_cols = x.shape[1]
    results = np.zeros(n_cols)
    start = timer()
    for col in range(n_cols):
        results[col] = func(x[:, col], y[:, col], *args)
    return results, timer() - start


def _time_batch(func, x, y, *args):
    """Time a single call ``func(x, y, *args)`` on the full 2-D arrays.

    Returns
    -------
    result
        Whatever `func` returns, unmodified.
    elapsed : float
        Wall-clock time of the call as reported by `timer`.
    """
    start = timer()
    result = func(x, y, *args)
    return result, timer() - start


# Outer loop: repeat the experiment once per slot of the timing arrays so the
# execution times can be averaged afterwards.
# NOTE(review): any one-off start-up cost of an implementation is paid in the
# first repetition and is therefore included in the averages.
for ii in range(len(dt1)):
    # --- toro (baseline implementation with loops) ---
    tt1, dt1[ii] = _time_per_column(jm.toro, a, b, *extra_args)

    # --- toro2 (Vectorized NumPy with explicit quadrants) ---
    tt2, dt2[ii] = _time_per_column(jm.toro2, a, b, *extra_args)

    # --- toro2_1 (Vectorized NumPy with modular wrapping) ---
    tt3, dt3[ii] = _time_per_column(jm.toro2_1, a, b, *extra_args)

    # --- toro2_2 (Optimized NumPy using complex numbers) ---
    tt4, dt4[ii] = _time_per_column(jm.toro2_2, a, b, *extra_args)

    # --- toro2_2 column-wise parallel (CPU, joblib.Parallel) ---
    tt5, dt5[ii] = _time_batch(jm.toro2_2_joblib_batch, a, b, *extra_args)

    # --- toro2_2_torch_batch (PyTorch, batch on CPU/CUDA/MPS) ---
    # NOTE(review): timing stops as soon as the call returns. If the torch
    # implementations hand back a tensor that still lives on an asynchronous
    # device (CUDA/MPS) without synchronizing, queued kernels may not have
    # finished and the measured time would be too low -- confirm against
    # J_measure / D_measure before trusting the torch speed-ups.
    tt6, dt6[ii] = _time_batch(jm.toro2_2_torch_batch, a, b, *extra_args)  # returns a tensor of size m

    # --- toroD (Vectorized NumPy) ---
    tt7, dt7[ii] = _time_per_column(dm.toroD, a, b, *extra_args)

    # --- toroD_joblib_batch column-wise parallel (CPU, joblib.Parallel) ---
    tt8, dt8[ii] = _time_batch(dm.toroD_joblib_batch, a, b, *extra_args)

    # --- toroD_torch_batch (PyTorch, batch on CPU/CUDA/MPS) ---
    tt9, dt9[ii] = _time_batch(dm.toroD_torch_batch, a, b, *extra_args)  # returns a tensor of size m

In [2]:
# Relative performance summary compared to the baseline implementation (toro)


def _speedup(reference_times, candidate_times):
    """Return mean(reference_times) / mean(candidate_times), rounded to 2 decimals.

    A value above 1 means the candidate was, on average, faster than the
    reference. Using one helper keeps the rounding identical on every line
    (the toro2_2 line previously rounded to zero decimals).
    """
    return np.round(np.mean(reference_times) / np.mean(candidate_times), 2)


print('Average time to analyze', m, 'pairs of', n, 'data points, averaged over 100 runs with toro:', np.mean(dt1))

# Implementations that are compared against the toro baseline only.
for label, times in (
    ('toro2', dt2),
    ('toro2_1', dt3),
    ('toro2_2', dt4),
    ('toro2_2_joblib_batch', dt5),
    ('toro2_2_torch_batch', dt6),
    ('toroD', dt7),
):
    print(label, 'is on average (100 runs)', _speedup(dt1, times), 'times faster than toro')

# The batched toroD variants are additionally compared with their toro2_2 counterparts.
print('toroD_joblib_batch is on average (100 runs)', _speedup(dt1, dt8), 'times faster than toro and', _speedup(dt5, dt8), 'times faster than toro2_2_joblib_batch')
print('toroD_torch_batch is on average (100 runs)', _speedup(dt1, dt9), 'times faster than toro and', _speedup(dt6, dt9), 'times faster than toro2_2_torch_batch')

Average time to analyze 100 pairs of 1000 data points, averaged over 100 runs with toro: 1.303110475000003
toro2 is on average (100 runs) 49.87 times faster than toro
toro2_1 is on average (100 runs) 82.03 times faster than toro
toro2_2 is on average (100 runs) 98.0 times faster than toro
toro2_2_joblib_batch is on average (100 runs) 30.74 times faster than toro
toro2_2_torch_batch is on average (100 runs) 46.42 times faster than toro
toroD is on average (100 runs) 72.5 times faster than toro
toroD_joblib_batch is on average (100 runs) 35.18 times faster than toro and 1.14 times faster than toro2_2_joblib_batch
toroD_torch_batch is on average (100 runs) 630.2 times faster than toro and 13.57 times faster than toro2_2_torch_batch
