Merge pull request #66 from DynamicsAndNeuralSystems/jmoo2880-pypi-pu…

…blish Add new SPI (GWTau) and benchmark generation script + workflow.
DynamicsAndNeuralSystems · Apr 10, 2024 · e2fe31d · e2fe31d
2 parents 6c8d9e0 + c337bdf
commit e2fe31d
Show file tree

Hide file tree

Showing 10 changed files with 171 additions and 19 deletions.
diff --git a/.github/workflows/run_dataset_generation.yaml b/.github/workflows/run_dataset_generation.yaml
@@ -0,0 +1,35 @@
+name: Generate benchmarking dataset tables
+
+on: 
+  workflow_dispatch:
+
+jobs:
+  test-ubuntu:
+    runs-on: ubuntu-latest
+    strategy:
+      matrix:
+        python-version: ["3.9"]
+    steps:
+      - uses: actions/checkout@v4
+      - name: Setup python ${{ matrix.python-version }}
+        uses: actions/setup-python@v5
+        with:
+          python-version: ${{ matrix.python-version }}
+          cache: 'pip'
+      - name: Install octave
+        run: |
+          sudo apt-get update
+          sudo apt-get install -y build-essential octave
+      - name: Install pyspi dependencies
+        run: |
+          python -m pip install --upgrade pip
+          pip install -r requirements.txt
+          pip install .
+      - name: Run data generation
+        run: |
+          python tests/generate_benchmark_tables.py
+      - name: Upload artifact
+        uses: actions/upload-artifact@v3
+        with:
+          name: benchmark-tables
+          path: tests/CML7_benchmark_tables_new.pkl
diff --git a/.github/workflows/run_unit_tests.yaml b/.github/workflows/run_unit_tests.yaml
@@ -32,7 +32,3 @@ jobs:
       - name: Run pyspi SPI unit tests
         run: |
           pytest -v ./tests/test_SPIs.py
-
-  
-
-
diff --git a/pyproject.toml b/pyproject.toml
@@ -4,7 +4,7 @@ build-backend = "setuptools.build_meta"
 
 [project]
 name = "pyspi"
-version = "1.0.1"
+version = "1.0.2"
 authors = [
     { name ="Oliver M. Cliff", email="oliver.m.cliff@gmail.com"},
 ]

diff --git a/pyspi/config.yaml b/pyspi/config.yaml
@@ -310,6 +310,17 @@
         statistic: max
         squared: True
 
+  # Gromov-Wasserstain Distance
+  GromovWasserstainTau:
+    labels:
+      - unsigned
+      - distance
+      - unordered
+      - nonlinear
+      - undirected
+    dependencies:
+    configs:
+
 .statistics.causal:
   # Additive noise model
   AdditiveNoiseModel:

diff --git a/pyspi/fast_config.yaml b/pyspi/fast_config.yaml
@@ -169,6 +169,17 @@
       - mode: euclidean
         statistic: max
         squared: True
+
+  # Gromov-Wasserstain Distance
+  GromovWasserstainTau:
+    labels:
+      - unsigned
+      - distance
+      - unordered
+      - nonlinear
+      - undirected
+    dependencies:
+    configs:
 
 .statistics.causal:
   # Additive noise model

diff --git a/pyspi/statistics/distance.py b/pyspi/statistics/distance.py
@@ -259,3 +259,62 @@ def bivariate(self, data, i=None, j=None):
             data.barycenter[self._mode][(j, i)] = data.barycenter[self._mode][(i, j)]
 
         return self._statfn(bc)
+
+
+class GromovWasserstainTau(Undirected, Unsigned):
+    """Gromov-Wasserstain distance (GWTau)"""
+
+    name = "Gromov-Wasserstain Distance"
+    identifier = "gwtau"
+    labels = ["unsigned", "distance", "unordered", "nonlinear", "undirected"]
+
+    @staticmethod
+    def vec_geo_dist(x):
+        diffs = np.diff(x, axis=0)
+        distances = np.linalg.norm(diffs, axis=1)
+        return np.cumsum(distances)
+
+    @staticmethod
+    def wass_sorted(x1, x2):
+        x1 = np.sort(x1)[::-1] # sort in descending order
+        x2 = np.sort(x2)[::-1] 
+
+        if len(x1) == len(x2):
+            res = np.sqrt(np.mean((x1 - x2) ** 2))
+        else:
+            N, M = len(x1), len(x2)
+            i_ratios = np.arange(1, N + 1) / N
+            j_ratios = np.arange(1, M + 1) / M
+
+
+            min_values = np.minimum.outer(i_ratios, j_ratios)
+            max_values = np.maximum.outer(i_ratios - 1/N, j_ratios - 1/M)
+
+            lam = np.where(min_values > max_values, min_values - max_values, 0)
+
+            diffs_squared = (x1[:, None] - x2) ** 2
+            my_sum = np.sum(lam * diffs_squared)
+
+            res = np.sqrt(my_sum)
+
+        return res
+
+    @staticmethod
+    def gwtau(xi, xj):
+        timei = np.arange(len(xi))
+        timej = np.arange(len(xj))
+        traji = np.column_stack([timei, xi])
+        trajj = np.column_stack([timej, xj])
+
+        vi = GromovWasserstainTau.vec_geo_dist(traji)
+        vj = GromovWasserstainTau.vec_geo_dist(trajj)
+        gw = GromovWasserstainTau.wass_sorted(vi, vj)
+
+        return gw
+
+    @parse_bivariate
+    def bivariate(self, data, i=None, j=None):
+        x, y = data.to_numpy()[[i, j]]
+        # insert compute SPI code here (computes on x and y)
+        stat = self.gwtau(x, y)
+        return stat
diff --git a/setup.py b/setup.py
@@ -61,7 +61,7 @@
                         'data/standard_normal.npy',
                         'data/cml7.npy']},
     include_package_data=True,
-    version='1.0.1',
+    version='1.0.2',
     description='Library for pairwise analysis of time series data.',
     author='Oliver M. Cliff',
     author_email='oliver.m.cliff@gmail.com',

diff --git a/tests/CML7_benchmark_tables.pkl b/tests/CML7_benchmark_tables.pkl
diff --git a/tests/generate_benchmark_tables.py b/tests/generate_benchmark_tables.py
@@ -0,0 +1,40 @@
+import numpy as np
+import dill
+from pyspi.calculator import Calculator
+
+""""Script to generate benchmarking dataset"""
+def get_benchmark_tables(calc_list):
+    # get the spis from the first calculator
+    spis = list(calc_list[1].spis.keys())
+    num_procs = calc_list[1].dataset.n_processes
+    # create a dict to store the mean and std for each spi
+    benchmarks = {key : {'mean': None, 'std': None} for key in spis}
+    num_trials = len(calc_list)
+    for spi in spis:
+        mpi_tensor = np.zeros(shape=(num_trials, num_procs, num_procs))
+        for (index, calc) in enumerate(calc_list):
+            mpi_tensor[index, :, :] = calc.table[spi].to_numpy()
+        mean_matrix = np.mean(mpi_tensor, axis=0) # compute element-wise mean across the first dimension
+        std_matrix = np.std(mpi_tensor, axis=0) # compute element-wise std across the first dimension
+        benchmarks[spi]['mean'] = mean_matrix
+        benchmarks[spi]['std'] = std_matrix
+
+    return benchmarks
+
+# load and transpose dataset
+dataset = np.load("pyspi/data/cml7.npy").T
+
+# create list to store the calculator objects
+store_calcs = list()
+
+for i in range(75):
+    np.random.seed(42)
+    calc = Calculator(dataset=dataset)
+    calc.compute()
+    store_calcs.append(calc)
+
+mpi_benchmarks = get_benchmark_tables(store_calcs)
+
+# save data 
+with open("tests/CML7_benchmark_tables_new.pkl", "wb") as f:
+    dill.dump(mpi_benchmarks, f)
diff --git a/tests/test_SPIs.py b/tests/test_SPIs.py
@@ -61,11 +61,9 @@ def test_mpi(est, est_ob, mpi_benchmark, mpi_new, spi_warning_logger):
     mean_table = mpi_benchmark['mean']
     std_table = mpi_benchmark['std']
 
-    # check std stable for zeros and impute with smallest non-zero value
-    min_nonzero_std = np.nanmin(std_table[std_table > 0])
-    std_table[std_table == 0] = min_nonzero_std
-
-
+    # check std table for zeros and impute with smallest non-zero value
+    std_table = np.where(std_table == 0, 1e-10, std_table)
+
     # check that the shapes are equal
     assert mean_table.shape == mpi_new.shape, f"SPI: {est}| Different table shapes. "
 
@@ -79,24 +77,26 @@ def test_mpi(est, est_ob, mpi_benchmark, mpi_new, spi_warning_logger):
     # get the module name for easy reference
     module_name = est_ob.__module__.split(".")[-1]
 
-    if (mpi_new == mpi_mean).all() == False:
+    if not np.allclose(mpi_new, mpi_mean):
         # tables are not equivalent, quantify the difference by z-scoring.
         diff = abs(mpi_new - mpi_mean)
         zscores = diff/std_table
+
         idxs_greater_than_thresh = np.argwhere(zscores > zscore_threshold)
+
         if len(idxs_greater_than_thresh) > 0:
-            sigs = list()
-            for idx in idxs_greater_than_thresh:
-                sigs.append(zscores[idx[0], idx[1]])
+            sigs = zscores[idxs_greater_than_thresh[:, 0], idxs_greater_than_thresh[:, 1]]
             # get the max
             max_z = max(sigs)
+
             # number of interactions
-            num_iteractions = (mpi_new.shape[0] * mpi_new.shape[1]) - mpi_new.shape[0]
+            num_interactions = mpi_new.size - mpi_new.shape[0]
             # count exceedances
             num_exceed = len(sigs)
+
             if isSymmetric:
                 # number of unique exceedences is half
-                num_exceed /= 2
-                num_iteractions /= 2
+                num_exceed //= 2
+                num_interactions //= 2
 
-            spi_warning_logger(est, module_name, max_z, int(num_exceed), int(num_iteractions))
+            spi_warning_logger(est, module_name, max_z, int(num_exceed), int(num_interactions))