Skip to content

Commit

Permalink
Merge pull request #32 from CSchoel/v0.6
Browse files Browse the repository at this point in the history
Fix for correlation dimension
  • Loading branch information
CSchoel committed Oct 29, 2023
2 parents e8b2955 + 5962f0e commit 5da1ee7
Show file tree
Hide file tree
Showing 4 changed files with 109 additions and 14 deletions.
59 changes: 59 additions & 0 deletions .ruff.toml
Original file line number Diff line number Diff line change
@@ -0,0 +1,59 @@
# Ruff configuration.
# NOTE: In a standalone `.ruff.toml` (as opposed to `pyproject.toml`), settings
# must live at the top level and in bare `[lint]` / `[format]` sections — the
# `[tool.ruff]` prefix is only valid inside `pyproject.toml`. With the prefixed
# sections, Ruff rejects/ignores this configuration entirely.

# Exclude a variety of commonly ignored directories.
exclude = [
    ".bzr",
    ".direnv",
    ".eggs",
    ".git",
    ".git-rewrite",
    ".hg",
    ".mypy_cache",
    ".nox",
    ".pants.d",
    ".pytype",
    ".ruff_cache",
    ".svn",
    ".tox",
    ".venv",
    "__pypackages__",
    "_build",
    "buck-out",
    "build",
    "dist",
    "node_modules",
    "venv",
]

# Same as Black.
line-length = 88
indent-width = 4

# Assume Python 3.8
target-version = "py38"

[lint]
# Enable Pyflakes (`F`) and a subset of the pycodestyle (`E`) codes by default.
# Unlike Flake8, Ruff doesn't enable pycodestyle warnings (`W`) or
# McCabe complexity (`C901`) by default.
select = ["E4", "E7", "E9", "F", "N", "D", "S", "PL", "NPY"]
ignore = []

# Allow fix for all enabled rules (when `--fix`) is provided.
fixable = ["ALL"]
unfixable = []

# Allow unused variables when underscore-prefixed.
dummy-variable-rgx = "^(_+|(_+[a-zA-Z0-9_]*[a-zA-Z0-9]+?))$"

[format]
# Like Black, use double quotes for strings.
quote-style = "double"

# Like Black, indent with spaces, rather than tabs.
indent-style = "space"

# Like Black, respect magic trailing commas.
skip-magic-trailing-comma = false

# Like Black, automatically detect the appropriate line ending.
line-ending = "auto"
8 changes: 5 additions & 3 deletions README.rst
Original file line number Diff line number Diff line change
@@ -1,8 +1,9 @@
NOnLinear measures for Dynamical Systems (nolds)
================================================

.. image:: https://travis-ci.com/CSchoel/nolds.svg?branch=master
:target: https://travis-ci.com/CSchoel/nolds
.. image:: https://github.com/CSchoel/nolds/actions/workflows/ci.yaml/badge.svg
:target: https://github.com/CSchoel/nolds/actions/workflows/ci.yaml

.. image:: https://zenodo.org/badge/DOI/10.5281/zenodo.3814723.svg
:target: https://doi.org/10.5281/zenodo.3814723

Expand All @@ -13,9 +14,10 @@ NOnLinear measures for Dynamical Systems (nolds)
.. image:: https://api.codacy.com/project/badge/Grade/42c30d253b384e87a86b70c8fa0da6b2
:target: https://www.codacy.com/app/christopher.schoelzel/nolds?utm_source=github.com&utm_medium=referral&utm_content=CSchoel/nolds&utm_campaign=Badge_Grade

.. image:: https://codecov.io/gh/CSchoel/nolds/branch/master/graph/badge.svg
.. image:: https://codecov.io/gh/CSchoel/nolds/branch/master/graph/badge.svg?token=Xgr82QKhFi
:target: https://codecov.io/gh/CSchoel/nolds


Nolds is a small numpy-based library that provides an implementation and a learning resource for nonlinear measures for dynamical systems based on one-dimensional time series.

Currently the following measures are implemented:
Expand Down
35 changes: 27 additions & 8 deletions nolds/measures.py
Original file line number Diff line number Diff line change
Expand Up @@ -1825,7 +1825,7 @@ def mfhurst_dm(data, qvals=None, max_dists=range(5, 20), detrend=True,
return [mH, sH]


def corr_dim(data, emb_dim, rvals=None, dist=rowwise_euclidean,
def corr_dim(data, emb_dim, lag=1, rvals=None, dist=rowwise_euclidean,
fit="RANSAC", debug_plot=False, debug_data=False, plot_file=None):
"""
Calculates the correlation dimension with the Grassberger-Procaccia algorithm
Expand Down Expand Up @@ -1857,9 +1857,12 @@ def corr_dim(data, emb_dim, rvals=None, dist=rowwise_euclidean,
This version of the algorithm is created for one-dimensional (scalar) time
series. Therefore, before calculating C(r), a delay embedding of the time
series is performed to yield emb_dim dimensional vectors
Y_i = [X_i, X_(i+1), X_(i+2), ... X_(i+embd_dim-1)]. Choosing a higher
value for emb_dim allows to reconstruct higher dimensional dynamics and
avoids "systematic errors due to corrections to scaling".
Y_i = [X_i, X_(i+1*lag), X_(i+2*lag), ... X_(i+(emb_dim-1)*lag)]. Choosing
a higher value for emb_dim allows the reconstruction of higher-dimensional
dynamics and avoids "systematic errors due to corrections to scaling".
Choosing a higher value for lag helps to avoid overestimating the correlation
caused by X_i ~= X_(i+1). However, lag should also not be set too high, since
this leads to underestimating the correlation due to the exponential
divergence of trajectories in chaotic systems.
References:
.. [cd_1] P. Grassberger and I. Procaccia, “Characterization of strange
Expand Down Expand Up @@ -1911,19 +1914,35 @@ def corr_dim(data, emb_dim, rvals=None, dist=rowwise_euclidean,
``csums`` are the corresponding log(C(r)) and ``poly`` are the line
coefficients (``[slope, intercept]``)
"""
# TODO determine lag in units of time instead of number of datapoints
data = np.asarray(data)

# TODO what are good values for r?
# TODO do this for multiple values of emb_dim?
if rvals is None:
sd = np.std(data, ddof=1)
rvals = logarithmic_r(0.1 * sd, 0.5 * sd, 1.03)
n = len(data)
orbit = delay_embedding(data, emb_dim, lag=1)
dists = np.array([dist(orbit, orbit[i]) for i in range(len(orbit))])
orbit = delay_embedding(data, emb_dim, lag=lag)
n = len(orbit)
dists = np.zeros((len(orbit), len(orbit)), dtype=np.float64)
for i in range(len(orbit)):
# calculate distances between X_i and X_i+1, X_i+2, ... , X_n-1
# NOTE: strictly speaking, [cd_1] does not specify to exclude self-matches
# however, since both [cd_2] and [cd_3] specify to only compare i with j != i
# or j > i respectively, it is safe to assume that this was an oversight in
# [cd_1]
d = dist(orbit[i+1:], orbit[i])
dists[i+1:,i] = d # fill column i
dists[i,i+1:] = d # fill row i
csums = []
for r in rvals:
s = 1.0 / (n * (n - 1)) * np.sum(dists < r)
# NOTE: Both [cd_1] and [cd_2] use the factor 1/N^2 here.
# However, since we only use these values to fit a line in a log-log plot
# any multiplicative constant doesn't change the result since it will
# only result in an offset on the y-axis. Also, [cd_3] has a point here
# in that if we exclude self-matches in the numerator, it makes sense to
# also exclude self-matches from the denominator.
s = 1.0 / (n * (n - 1)) * np.sum(dists <= r)
csums.append(s)
csums = np.array(csums)
# filter zeros from csums
Expand Down
21 changes: 18 additions & 3 deletions nolds/test_measures.py
Original file line number Diff line number Diff line change
Expand Up @@ -8,8 +8,8 @@
import numpy as np

# import internal module to test helping functions
from . import measures as nolds
from . import datasets
from nolds import measures as nolds
from nolds import datasets
import unittest
import warnings

Expand Down Expand Up @@ -333,7 +333,22 @@ def test_corr_dim(self):
cd = nolds.corr_dim(data, 4, fit="poly")
self.assertAlmostEqual(cd, 0.5, delta=0.15)
# TODO test example for cd > 1
def test_corr_dim_logistic(self):

def test_lorenz(self):
  """Check corr_dim against the known value for the Lorenz attractor.

  Grassberger and Procaccia (1983) report a correlation dimension of
  about 2.05 for the x-coordinate (here: column 1) of the Lorenz system
  with the classic parameters sigma=10, rho=28, beta=8/3.
  """
  # NOTE: The settings of n, discard, lag, emb_dim, and rvals were determined
  # experimentally to find the smallest dataset that yields the results reported
  # by Grassberger and Procaccia (1983).
  discard = 5000
  n = 5000
  lag = 5
  emb_dim = 5
  data = datasets.lorenz_euler(n + discard, 10, 28, 8/3, start=(1,1,1), dt=0.012)
  # drop the transient before the trajectory settles onto the attractor
  x = data[discard:,1]
  rvals = nolds.logarithmic_r(1, np.e, 1.1)  # determined experimentally
  # pass the lag variable defined above (the original hard-coded the literal 5,
  # leaving the variable unused) and keep debug_plot off so the test suite
  # does not try to open a plot window (e.g. on headless CI machines)
  cd = nolds.corr_dim(x, emb_dim, fit="poly", rvals=rvals, lag=lag, debug_plot=False)
  self.assertAlmostEqual(cd, 2.05, delta=0.1)

def test_logistic(self):
  """Placeholder: corr_dim of the logistic map is not yet tested."""
  # TODO replicate tests with logistic map from grassberger-procaccia
  pass

Expand Down

0 comments on commit 5da1ee7

Please sign in to comment.