Merge 05fadb2 into e75b7ef

Crunch-io · Jan 18, 2019 · 8b7e388 · 8b7e388
2 parents e75b7ef + 05fadb2
commit 8b7e388
Show file tree

Hide file tree

Showing 11 changed files with 1,788 additions and 8 deletions.
diff --git a/src/cr/cube/crunch_cube.py b/src/cr/cube/crunch_cube.py
@@ -713,6 +713,17 @@ def zscore(self, weighted=True, prune=False, hs_dims=None):
         res = [s.zscore(weighted, prune, hs_dims) for s in self.slices]
         return np.array(res) if self.ndim == 3 else res[0]
 
+    def pairwise_pvals(self, axis=0):
+        """Return matrices of column-comparison p-values as list of numpy.ndarrays.
+
+        Square, symmetric matrix along *axis* of pairwise p-values for the
+        null hypothesis that col[i] = col[j] for each pair of columns.
+
+        *axis* (int): axis along which to perform comparison. Only columns (0)
+        are implemented currently.
+        """
+        return [slice_.pairwise_pvals(axis=axis) for slice_ in self.slices]
+
     def _adjust_axis(self, axis):
         """Return raw axis/axes corresponding to apparent axis/axes.
 

diff --git a/src/cr/cube/cube_slice.py b/src/cr/cube/cube_slice.py
@@ -14,6 +14,7 @@
 
 from cr.cube.enum import DIMENSION_TYPE as DT
 from cr.cube.measures.scale_means import ScaleMeans
+from cr.cube.measures.pairwise_pvalues import PairwisePvalues
 from cr.cube.util import compress_pruned, lazyproperty, memoize
 
 
@@ -306,12 +307,26 @@ def table_name(self):
         table_name = self._cube.labels()[0][self._index]
         return "%s: %s" % (title, table_name)
 
+    def pairwise_pvals(self, axis=0):
+        """Return square symmetric matrix of pairwise column-comparison p-values.
+
+        Square, symmetric matrix along *axis* of pairwise p-values for the
+        null hypothesis that col[i] = col[j] for each pair of columns.
+
+        *axis* (int): axis along which to perform comparison. Only columns (0)
+        are implemented currently.
+        """
+        if axis != 0:
+            raise NotImplementedError("Pairwise comparison only implemented for colums")
+        return PairwisePvalues(self, axis=axis).values
+
     def pvals(self, weighted=True, prune=False, hs_dims=None):
-        """Return 2D ndarray with calculated p-vals.
+        """Return 2D ndarray with calculated P values
 
-        This function calculates statistically significant results for
-        categorical contingency tables. The values are calculated for 2D tables
-        only.
+        This function calculates statistically significant cells for
+        categorical contingency tables under the null hypothesis that the
+        row and column variables are independent (uncorrelated).
+        The values are calculated for 2D tables only.
 
         :param weighted: Use weighted counts for zscores
         :param prune: Prune based on unweighted counts
@@ -325,15 +340,22 @@ def pvals(self, weighted=True, prune=False, hs_dims=None):
         return self._apply_pruning_mask(pvals, hs_dims) if prune else pvals
 
     def zscore(self, weighted=True, prune=False, hs_dims=None):
-        """Return ndarray with slices's zscore measurements.
+        """Return ndarray with slices's standardized residuals (Z-scores).
+
+        (Only applicable to a 2D contingency tables.) The Z-score or
+        standardized residual is the difference between observed and expected
+        cell counts if row and column variables were independent divided
+        by the residual cell variance. They are assumed to come from a N(0,1)
+        or standard Normal distribution, and can show which cells deviate from
+        the null hypothesis that the row and column variables are uncorrelated.
 
-        Zscore is a measure of statistical signifficance of observed vs.
-        expected counts. It's only applicable to a 2D contingency tables.
+        See also *pairwise_chisq*, *pairwise_pvals* for a pairwise column-
+        or row-based test of statistical significance.
 
         :param weighted: Use weighted counts for zscores
         :param prune: Prune based on unweighted counts
         :param hs_dims: Include headers and subtotals (as NaN values)
-        :returns zscore: ndarray representing zscore measurements
+        :returns zscore: ndarray representing cell standardized residuals (Z)
         """
         counts = self.as_array(weighted=weighted)
         total = self.margin(weighted=weighted)

diff --git a/src/cr/cube/distributions/__init__.py b/src/cr/cube/distributions/__init__.py
diff --git a/src/cr/cube/distributions/wishart.py b/src/cr/cube/distributions/wishart.py
@@ -0,0 +1,177 @@
+# encoding: utf-8
+# pylint: disable=too-few-public-methods, invalid-name
+
+"""The CDF of the largest eigenvalue of a Wishart distribution.
+
+Compute the CDF of the Wishart distribution using the algorithm of
+Chiani (2014). The Wishart CDF is not defined in scipy.stats.wishart, nor in
+tensorflow_probability.distributions.wishart. At the time of writing, the only
+implementation of this CDF is the R package rootWishart. This largely
+follows that, but using scipy implementations rather than Boost, and omitting
+multi-precision support.
+
+For more info about the formulae, and the terse variable naming,
+please refer to the paper:
+
+"Distribution of the largest eigenvalue for real Wishart and Gaussian random
+matrices and a simple approximation for the Tracy-Widom distribution"
+
+found at: https://arxiv.org/pdf/1209.3394.pdf
+"""
+
+from __future__ import division
+
+import numpy as np
+from scipy.special import gamma, gammainc
+from scipy.linalg import det
+from scipy.constants import pi
+from cr.cube.util import lazyproperty
+
+try:
+    xrange
+except NameError:  # pragma: no cover
+    xrange = range
+
+
+class WishartCDF:
+    """Implementation of Cumulative Distribution Function (CDF)."""
+
+    def __init__(self, chisq, n_min, n_max):
+        self._chisq = chisq
+        self.n_min = n_min
+        self._n_max = n_max
+
+    @lazyproperty
+    def values(self):
+        """ndarray of wishart CDF values"""
+        return self.K * self.wishart_pfaffian
+
+    @lazyproperty
+    def wishart_pfaffian(self):
+        """ndarray of wishart pfaffian CDF, before normalization"""
+        return np.array(
+            [Pfaffian(self, val).value for i, val in np.ndenumerate(self._chisq)]
+        ).reshape(self._chisq.shape)
+
+    @lazyproperty
+    def alpha(self):
+        """int representing common basis of off-diagonal elements of A"""
+        return 0.5 * (self._n_max - self.n_min - 1)
+
+    @lazyproperty
+    def alpha_ind(self):
+        """indices of off-diagonal elements of A"""
+        return np.arange(self.n_min)
+
+    @lazyproperty
+    def alpha_last(self):
+        """elements of row and column to append to A when its order/size is odd"""
+        return (self._n_max + self.n_min + 1) / 2
+
+    @lazyproperty
+    def alpha_vec(self):
+        """elements of row and column to append to A when its order/size is odd"""
+        return self.alpha_ind + self.alpha + 1
+
+    @lazyproperty
+    def other_ind(self):
+        """last row or column of square A"""
+        return np.full(self.n_min, self.size - 1, dtype=np.int)
+
+    @staticmethod
+    def _mgamma(x, m):
+        mgamma = np.float_power(pi, 0.25 * m * (m - 1))
+        for i in xrange(m):
+            mgamma *= gamma(x - 0.5 * i)
+        return mgamma
+
+    @lazyproperty
+    def K(self):
+        """Normalizing constant for wishart CDF."""
+        K1 = np.float_power(pi, 0.5 * self.n_min * self.n_min)
+        K1 /= (
+            np.float_power(2, 0.5 * self.n_min * self._n_max)
+            * self._mgamma(0.5 * self._n_max, self.n_min)
+            * self._mgamma(0.5 * self.n_min, self.n_min)
+        )
+
+        K2 = np.float_power(
+            2, self.alpha * self.size + 0.5 * self.size * (self.size + 1)
+        )
+        for i in xrange(self.size):
+            K2 *= gamma(self.alpha + i + 1)
+        return K1 * K2
+
+    @lazyproperty
+    def size(self):
+        """int representing the size of the wishart matrix.
+
+        If the original size is even, that this is the size that's returned. If the
+        size is odd, then the next even number is returned, because of the row/col
+        insertion, that happens as the part of the algorithm
+        """
+        return self.n_min + (self.n_min % 2)
+
+
+class Pfaffian:
+    """Implementation of the Pfaffian value object class."""
+
+    def __init__(self, wishart_cdf, chisq_ordinal):
+        self._wishart_cdf = wishart_cdf
+        self._chisq_val = chisq_ordinal
+
+    @lazyproperty
+    def value(self):
+        """return float Cumulative Distribution Function.
+
+        The return value represents a floating point number of the CDF of the
+        largest eigenvalue of a Wishart(n, p) evaluated at chisq_val.
+        """
+        wishart = self._wishart_cdf
+
+        # Prepare variables for integration algorithm
+        A = self.A
+        p = self._gammainc_a
+        g = gamma(wishart.alpha_vec)
+        q_ind = np.arange(2 * wishart.n_min - 2)
+        q_vec = 2 * wishart.alpha + q_ind + 2
+        q = np.float_power(0.5, q_vec) * gamma(q_vec) * gammainc(q_vec, self._chisq_val)
+
+        # Perform integration (i.e. calculate Pfaffian CDF)
+        for i in xrange(wishart.n_min):
+            # TODO consider index tricks instead of iteration here
+            b = 0.5 * p[i] * p[i]
+            for j in xrange(i, wishart.n_min - 1):
+                b -= q[i + j] / (g[i] * g[j + 1])
+                A[j + 1, i] = p[i] * p[j + 1] - 2 * b
+                A[i, j + 1] = -A[j + 1, i]
+        return np.sqrt(det(A))
+
+    @lazyproperty
+    def A(self):
+        """ndarray - a skew-symmetric matrix for integrating the target distribution"""
+        wishart = self._wishart_cdf
+
+        base = np.zeros([wishart.size, wishart.size])
+        if wishart.n_min % 2:
+            # If matrix has odd number of elements, we need to append a
+            # row and a col, in order for the pfaffian algorithm to work
+            base = self._make_size_even(base)
+        return base
+
+    @lazyproperty
+    def _gammainc_a(self):
+        return gammainc(self._wishart_cdf.alpha_vec, 0.5 * self._chisq_val)
+
+    def _make_size_even(self, base):
+        wishart = self._wishart_cdf
+        alpha_ind = wishart.alpha_ind
+        other_ind = wishart.other_ind
+        alpha_last = wishart.alpha_last
+
+        base[other_ind, alpha_ind] = (
+            np.float_power(2, -alpha_last) * self._gammainc_a / gamma(alpha_last)
+        )
+        base[alpha_ind, other_ind] = -base[other_ind, alpha_ind]
+
+        return base
diff --git a/src/cr/cube/measures/pairwise_pvalues.py b/src/cr/cube/measures/pairwise_pvalues.py
@@ -0,0 +1,94 @@
+# encoding: utf-8
+
+"""P-values of pairwise comparison or columns of a contingency table."""
+
+from __future__ import division
+
+import numpy as np
+
+# from cr.cube.distributions.wishart import wishartCDF
+from cr.cube.distributions.wishart import WishartCDF
+from cr.cube.util import lazyproperty
+
+try:
+    xrange
+except NameError:  # pragma: no cover
+    # pylint: disable=invalid-name
+    xrange = range
+
+
+# pylint: disable=too-few-public-methods
+class PairwisePvalues:
+    """Value object providing matrix of pairwise-comparison P-values"""
+
+    def __init__(self, slice_, axis=0, weighted=True):
+        self._slice = slice_
+        self._axis = axis
+        self._weighted = weighted
+
+    @lazyproperty
+    def values(self):
+        """Square matrix of pairwise Chi-square along axis, as numpy.ndarray."""
+        return 1.0 - WishartCDF(self._pairwise_chisq, self._n_min, self._n_max).values
+
+    @lazyproperty
+    def _categorical_pairwise_chisq(self):
+        """Pairwise comparisons (Chi-Square) along axis, as numpy.ndarray.
+
+        Returns a square, symmetric matrix of test statistics for the null
+        hypothesis that each vector along *axis* is equal to each other.
+        """
+        chisq = np.zeros([self._numel, self._numel])
+        for i in xrange(1, self._numel):
+            for j in xrange(0, self._numel - 1):
+                chisq[i, j] = chisq[j, i] = np.sum(
+                    np.square(self._proportions[:, i] - self._proportions[:, j])
+                    / self._observed
+                ) / (1 / self._margin[i] + 1 / self._margin[j])
+
+        return chisq
+
+    @lazyproperty
+    def _margin(self):
+        """Margin for the axis as numpy.ndarray."""
+        return self._slice.margin(axis=self._axis)
+
+    @lazyproperty
+    def _off_margin(self):
+        """Margin for the opposite axis as numpy.ndarray."""
+        return self._slice.margin(axis=(1 - self._axis))
+
+    @lazyproperty
+    def _pairwise_chisq(self):
+        """Pairwise Chi-squared statistics along axis, as numpy.ndarray.
+
+        Zscore is a measure of statistical significance of observed vs.
+        expected counts. It's only applicable to a 2D contingency tables.
+        """
+        return self._categorical_pairwise_chisq
+
+    @lazyproperty
+    def _proportions(self):
+        """Slice proportions for *axis* as numpy.ndarray."""
+        return self._slice.proportions(axis=self._axis)
+
+    @lazyproperty
+    def _n_max(self):
+        """Size (zero based) of the bigger of the two slice's dimension, as int."""
+        return max(self._slice.get_shape()) - 1
+
+    @lazyproperty
+    def _n_min(self):
+        """Size (zero based) of the smaller of the two slice's dimension, as int."""
+        return min(self._slice.get_shape()) - 1
+
+    @lazyproperty
+    def _numel(self):
+        """Number of elements of the dimension opposite to axis, as int."""
+        return self._slice.get_shape()[1 - self._axis]
+
+    @lazyproperty
+    def _observed(self):
+        """Observed marginal proportions, as float."""
+        total = self._slice.margin()
+        return self._off_margin / total