diff --git a/.github/workflows/run_dataset_generation.yaml b/.github/workflows/run_dataset_generation.yaml new file mode 100644 index 0000000..69d9be0 --- /dev/null +++ b/.github/workflows/run_dataset_generation.yaml @@ -0,0 +1,35 @@ +name: Generate benchmarking dataset tables + +on: + workflow_dispatch: + +jobs: + test-ubuntu: + runs-on: ubuntu-latest + strategy: + matrix: + python-version: ["3.9"] + steps: + - uses: actions/checkout@v4 + - name: Setup python ${{ matrix.python-version }} + uses: actions/setup-python@v5 + with: + python-version: ${{ matrix.python-version }} + cache: 'pip' + - name: Install octave + run: | + sudo apt-get update + sudo apt-get install -y build-essential octave + - name: Install pyspi dependencies + run: | + python -m pip install --upgrade pip + pip install -r requirements.txt + pip install . + - name: Run data generation + run: | + python tests/generate_benchmark_tables.py + - name: Upload artifact + uses: actions/upload-artifact@v3 + with: + name: benchmark-tables + path: tests/CML7_benchmark_tables_new.pkl diff --git a/.github/workflows/run_unit_tests.yaml b/.github/workflows/run_unit_tests.yaml index 157ca3d..2885034 100644 --- a/.github/workflows/run_unit_tests.yaml +++ b/.github/workflows/run_unit_tests.yaml @@ -32,7 +32,3 @@ jobs: - name: Run pyspi SPI unit tests run: | pytest -v ./tests/test_SPIs.py - - - - \ No newline at end of file diff --git a/pyproject.toml b/pyproject.toml index 71ef2e9..7188e37 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -4,7 +4,7 @@ build-backend = "setuptools.build_meta" [project] name = "pyspi" -version = "1.0.1" +version = "1.0.2" authors = [ { name ="Oliver M. Cliff", email="oliver.m.cliff@gmail.com"}, ] diff --git a/pyspi/config.yaml b/pyspi/config.yaml index 266c075..5e9fbd2 100644 --- a/pyspi/config.yaml +++ b/pyspi/config.yaml @@ -310,6 +310,17 @@ statistic: max squared: True + # Gromov-Wasserstain Distance + GromovWasserstainTau: + labels: + - unsigned + - distance + - unordered + - nonlinear + - undirected + dependencies: + configs: + .statistics.causal: # Additive noise model AdditiveNoiseModel: diff --git a/pyspi/fast_config.yaml b/pyspi/fast_config.yaml index e421604..4cb2647 100644 --- a/pyspi/fast_config.yaml +++ b/pyspi/fast_config.yaml @@ -169,6 +169,17 @@ - mode: euclidean statistic: max squared: True + + # Gromov-Wasserstain Distance + GromovWasserstainTau: + labels: + - unsigned + - distance + - unordered + - nonlinear + - undirected + dependencies: + configs: .statistics.causal: # Additive noise model diff --git a/pyspi/statistics/distance.py b/pyspi/statistics/distance.py index 65634da..0ba6ae8 100644 --- a/pyspi/statistics/distance.py +++ b/pyspi/statistics/distance.py @@ -259,3 +259,62 @@ def bivariate(self, data, i=None, j=None): data.barycenter[self._mode][(j, i)] = data.barycenter[self._mode][(i, j)] return self._statfn(bc) + + +class GromovWasserstainTau(Undirected, Unsigned): + """Gromov-Wasserstain distance (GWTau)""" + + name = "Gromov-Wasserstain Distance" + identifier = "gwtau" + labels = ["unsigned", "distance", "unordered", "nonlinear", "undirected"] + + @staticmethod + def vec_geo_dist(x): + diffs = np.diff(x, axis=0) + distances = np.linalg.norm(diffs, axis=1) + return np.cumsum(distances) + + @staticmethod + def wass_sorted(x1, x2): + x1 = np.sort(x1)[::-1] # sort in descending order + x2 = np.sort(x2)[::-1] + + if len(x1) == len(x2): + res = np.sqrt(np.mean((x1 - x2) ** 2)) + else: + N, M = len(x1), len(x2) + i_ratios = np.arange(1, N + 1) / N + j_ratios = np.arange(1, M + 1) / M + + + min_values = np.minimum.outer(i_ratios, j_ratios) + max_values = np.maximum.outer(i_ratios - 1/N, j_ratios - 1/M) + + lam = np.where(min_values > max_values, min_values - max_values, 0) + + diffs_squared = (x1[:, None] - x2) ** 2 + my_sum = np.sum(lam * diffs_squared) + + res = np.sqrt(my_sum) + + return res + + @staticmethod + def gwtau(xi, xj): + timei = np.arange(len(xi)) + timej = np.arange(len(xj)) + traji = np.column_stack([timei, xi]) + trajj = np.column_stack([timej, xj]) + + vi = GromovWasserstainTau.vec_geo_dist(traji) + vj = GromovWasserstainTau.vec_geo_dist(trajj) + gw = GromovWasserstainTau.wass_sorted(vi, vj) + + return gw + + @parse_bivariate + def bivariate(self, data, i=None, j=None): + x, y = data.to_numpy()[[i, j]] + # insert compute SPI code here (computes on x and y) + stat = self.gwtau(x, y) + return stat diff --git a/setup.py b/setup.py index ccb1ed8..6b17053 100644 --- a/setup.py +++ b/setup.py @@ -61,7 +61,7 @@ 'data/standard_normal.npy', 'data/cml7.npy']}, include_package_data=True, - version='1.0.1', + version='1.0.2', description='Library for pairwise analysis of time series data.', author='Oliver M. Cliff', author_email='oliver.m.cliff@gmail.com', diff --git a/tests/CML7_benchmark_tables.pkl b/tests/CML7_benchmark_tables.pkl index cc96e9c..cc51736 100644 Binary files a/tests/CML7_benchmark_tables.pkl and b/tests/CML7_benchmark_tables.pkl differ diff --git a/tests/generate_benchmark_tables.py b/tests/generate_benchmark_tables.py new file mode 100644 index 0000000..9650af2 --- /dev/null +++ b/tests/generate_benchmark_tables.py @@ -0,0 +1,40 @@ +import numpy as np +import dill +from pyspi.calculator import Calculator + +""""Script to generate benchmarking dataset""" +def get_benchmark_tables(calc_list): + # get the spis from the first calculator + spis = list(calc_list[1].spis.keys()) + num_procs = calc_list[1].dataset.n_processes + # create a dict to store the mean and std for each spi + benchmarks = {key : {'mean': None, 'std': None} for key in spis} + num_trials = len(calc_list) + for spi in spis: + mpi_tensor = np.zeros(shape=(num_trials, num_procs, num_procs)) + for (index, calc) in enumerate(calc_list): + mpi_tensor[index, :, :] = calc.table[spi].to_numpy() + mean_matrix = np.mean(mpi_tensor, axis=0) # compute element-wise mean across the first dimension + std_matrix = np.std(mpi_tensor, axis=0) # compute element-wise std across the first dimension + benchmarks[spi]['mean'] = mean_matrix + benchmarks[spi]['std'] = std_matrix + + return benchmarks + +# load and transpose dataset +dataset = np.load("pyspi/data/cml7.npy").T + +# create list to store the calculator objects +store_calcs = list() + +for i in range(75): + np.random.seed(42) + calc = Calculator(dataset=dataset) + calc.compute() + store_calcs.append(calc) + +mpi_benchmarks = get_benchmark_tables(store_calcs) + +# save data +with open("tests/CML7_benchmark_tables_new.pkl", "wb") as f: + dill.dump(mpi_benchmarks, f) diff --git a/tests/test_SPIs.py b/tests/test_SPIs.py index d3412bd..139cf7c 100644 --- a/tests/test_SPIs.py +++ b/tests/test_SPIs.py @@ -61,11 +61,9 @@ def test_mpi(est, est_ob, mpi_benchmark, mpi_new, spi_warning_logger): mean_table = mpi_benchmark['mean'] std_table = mpi_benchmark['std'] - # check std stable for zeros and impute with smallest non-zero value - min_nonzero_std = np.nanmin(std_table[std_table > 0]) - std_table[std_table == 0] = min_nonzero_std - - + # check std table for zeros and impute with smallest non-zero value + std_table = np.where(std_table == 0, 1e-10, std_table) + # check that the shapes are equal assert mean_table.shape == mpi_new.shape, f"SPI: {est}| Different table shapes. " @@ -79,24 +77,26 @@ def test_mpi(est, est_ob, mpi_benchmark, mpi_new, spi_warning_logger): # get the module name for easy reference module_name = est_ob.__module__.split(".")[-1] - if (mpi_new == mpi_mean).all() == False: + if not np.allclose(mpi_new, mpi_mean): # tables are not equivalent, quantify the difference by z-scoring. diff = abs(mpi_new - mpi_mean) zscores = diff/std_table + idxs_greater_than_thresh = np.argwhere(zscores > zscore_threshold) + if len(idxs_greater_than_thresh) > 0: - sigs = list() - for idx in idxs_greater_than_thresh: - sigs.append(zscores[idx[0], idx[1]]) + sigs = zscores[idxs_greater_than_thresh[:, 0], idxs_greater_than_thresh[:, 1]] # get the max max_z = max(sigs) + # number of interactions - num_iteractions = (mpi_new.shape[0] * mpi_new.shape[1]) - mpi_new.shape[0] + num_interactions = mpi_new.size - mpi_new.shape[0] # count exceedances num_exceed = len(sigs) + if isSymmetric: # number of unique exceedences is half - num_exceed /= 2 - num_iteractions /= 2 + num_exceed //= 2 + num_interactions //= 2 - spi_warning_logger(est, module_name, max_z, int(num_exceed), int(num_iteractions)) + spi_warning_logger(est, module_name, max_z, int(num_exceed), int(num_interactions))