Skip to content

Commit

Permalink
Merge 05fadb2 into e75b7ef
Browse files Browse the repository at this point in the history
  • Loading branch information
slobodan-ilic committed Jan 18, 2019
2 parents e75b7ef + 05fadb2 commit 8b7e388
Show file tree
Hide file tree
Showing 11 changed files with 1,788 additions and 8 deletions.
11 changes: 11 additions & 0 deletions src/cr/cube/crunch_cube.py
Original file line number Diff line number Diff line change
Expand Up @@ -713,6 +713,17 @@ def zscore(self, weighted=True, prune=False, hs_dims=None):
res = [s.zscore(weighted, prune, hs_dims) for s in self.slices]
return np.array(res) if self.ndim == 3 else res[0]

def pairwise_pvals(self, axis=0):
"""Return matrices of column-comparison p-values as list of numpy.ndarrays.
Square, symmetric matrix along *axis* of pairwise p-values for the
null hypothesis that col[i] = col[j] for each pair of columns.
*axis* (int): axis along which to perform comparison. Only columns (0)
are implemented currently.
"""
return [slice_.pairwise_pvals(axis=axis) for slice_ in self.slices]

def _adjust_axis(self, axis):
"""Return raw axis/axes corresponding to apparent axis/axes.
Expand Down
38 changes: 30 additions & 8 deletions src/cr/cube/cube_slice.py
Original file line number Diff line number Diff line change
Expand Up @@ -14,6 +14,7 @@

from cr.cube.enum import DIMENSION_TYPE as DT
from cr.cube.measures.scale_means import ScaleMeans
from cr.cube.measures.pairwise_pvalues import PairwisePvalues
from cr.cube.util import compress_pruned, lazyproperty, memoize


Expand Down Expand Up @@ -306,12 +307,26 @@ def table_name(self):
table_name = self._cube.labels()[0][self._index]
return "%s: %s" % (title, table_name)

def pairwise_pvals(self, axis=0):
"""Return square symmetric matrix of pairwise column-comparison p-values.
Square, symmetric matrix along *axis* of pairwise p-values for the
null hypothesis that col[i] = col[j] for each pair of columns.
*axis* (int): axis along which to perform comparison. Only columns (0)
are implemented currently.
"""
if axis != 0:
raise NotImplementedError("Pairwise comparison only implemented for colums")
return PairwisePvalues(self, axis=axis).values

def pvals(self, weighted=True, prune=False, hs_dims=None):
"""Return 2D ndarray with calculated p-vals.
"""Return 2D ndarray with calculated P values
This function calculates statistically significant results for
categorical contingency tables. The values are calculated for 2D tables
only.
This function calculates statistically significant cells for
categorical contingency tables under the null hypothesis that the
row and column variables are independent (uncorrelated).
The values are calculated for 2D tables only.
:param weighted: Use weighted counts for zscores
:param prune: Prune based on unweighted counts
Expand All @@ -325,15 +340,22 @@ def pvals(self, weighted=True, prune=False, hs_dims=None):
return self._apply_pruning_mask(pvals, hs_dims) if prune else pvals

def zscore(self, weighted=True, prune=False, hs_dims=None):
"""Return ndarray with slices's zscore measurements.
"""Return ndarray with slices's standardized residuals (Z-scores).
(Only applicable to a 2D contingency tables.) The Z-score or
standardized residual is the difference between observed and expected
cell counts if row and column variables were independent divided
by the residual cell variance. They are assumed to come from a N(0,1)
or standard Normal distribution, and can show which cells deviate from
the null hypothesis that the row and column variables are uncorrelated.
Zscore is a measure of statistical signifficance of observed vs.
expected counts. It's only applicable to a 2D contingency tables.
See also *pairwise_chisq*, *pairwise_pvals* for a pairwise column-
or row-based test of statistical significance.
:param weighted: Use weighted counts for zscores
:param prune: Prune based on unweighted counts
:param hs_dims: Include headers and subtotals (as NaN values)
:returns zscore: ndarray representing zscore measurements
:returns zscore: ndarray representing cell standardized residuals (Z)
"""
counts = self.as_array(weighted=weighted)
total = self.margin(weighted=weighted)
Expand Down
Empty file.
177 changes: 177 additions & 0 deletions src/cr/cube/distributions/wishart.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,177 @@
# encoding: utf-8
# pylint: disable=too-few-public-methods, invalid-name

"""The CDF of the largest eigenvalue of a Wishart distribution.
Compute the CDF of the Wishart distribution using the algorithm of
Chiani (2014). The Wishart CDF is not defined in scipy.stats.wishart, nor in
tensorflow_probability.distributions.wishart. At the time of writing, the only
implementation of this CDF is the R package rootWishart. This largely
follows that, but using scipy implementations rather than Boost, and omitting
multi-precision support.
For more info about the formulae, and the terse variable naming,
please refer to the paper:
"Distribution of the largest eigenvalue for real Wishart and Gaussian random
matrices and a simple approximation for the Tracy-Widom distribution"
found at: https://arxiv.org/pdf/1209.3394.pdf
"""

from __future__ import division

import numpy as np
from scipy.special import gamma, gammainc
from scipy.linalg import det
from scipy.constants import pi
from cr.cube.util import lazyproperty

try:
xrange
except NameError: # pragma: no cover
xrange = range


class WishartCDF:
"""Implementation of Cumulative Distribution Function (CDF)."""

def __init__(self, chisq, n_min, n_max):
self._chisq = chisq
self.n_min = n_min
self._n_max = n_max

@lazyproperty
def values(self):
"""ndarray of wishart CDF values"""
return self.K * self.wishart_pfaffian

@lazyproperty
def wishart_pfaffian(self):
"""ndarray of wishart pfaffian CDF, before normalization"""
return np.array(
[Pfaffian(self, val).value for i, val in np.ndenumerate(self._chisq)]
).reshape(self._chisq.shape)

@lazyproperty
def alpha(self):
"""int representing common basis of off-diagonal elements of A"""
return 0.5 * (self._n_max - self.n_min - 1)

@lazyproperty
def alpha_ind(self):
"""indices of off-diagonal elements of A"""
return np.arange(self.n_min)

@lazyproperty
def alpha_last(self):
"""elements of row and column to append to A when its order/size is odd"""
return (self._n_max + self.n_min + 1) / 2

@lazyproperty
def alpha_vec(self):
"""elements of row and column to append to A when its order/size is odd"""
return self.alpha_ind + self.alpha + 1

@lazyproperty
def other_ind(self):
"""last row or column of square A"""
return np.full(self.n_min, self.size - 1, dtype=np.int)

@staticmethod
def _mgamma(x, m):
mgamma = np.float_power(pi, 0.25 * m * (m - 1))
for i in xrange(m):
mgamma *= gamma(x - 0.5 * i)
return mgamma

@lazyproperty
def K(self):
"""Normalizing constant for wishart CDF."""
K1 = np.float_power(pi, 0.5 * self.n_min * self.n_min)
K1 /= (
np.float_power(2, 0.5 * self.n_min * self._n_max)
* self._mgamma(0.5 * self._n_max, self.n_min)
* self._mgamma(0.5 * self.n_min, self.n_min)
)

K2 = np.float_power(
2, self.alpha * self.size + 0.5 * self.size * (self.size + 1)
)
for i in xrange(self.size):
K2 *= gamma(self.alpha + i + 1)
return K1 * K2

@lazyproperty
def size(self):
"""int representing the size of the wishart matrix.
If the original size is even, that this is the size that's returned. If the
size is odd, then the next even number is returned, because of the row/col
insertion, that happens as the part of the algorithm
"""
return self.n_min + (self.n_min % 2)


class Pfaffian:
"""Implementation of the Pfaffian value object class."""

def __init__(self, wishart_cdf, chisq_ordinal):
self._wishart_cdf = wishart_cdf
self._chisq_val = chisq_ordinal

@lazyproperty
def value(self):
"""return float Cumulative Distribution Function.
The return value represents a floating point number of the CDF of the
largest eigenvalue of a Wishart(n, p) evaluated at chisq_val.
"""
wishart = self._wishart_cdf

# Prepare variables for integration algorithm
A = self.A
p = self._gammainc_a
g = gamma(wishart.alpha_vec)
q_ind = np.arange(2 * wishart.n_min - 2)
q_vec = 2 * wishart.alpha + q_ind + 2
q = np.float_power(0.5, q_vec) * gamma(q_vec) * gammainc(q_vec, self._chisq_val)

# Perform integration (i.e. calculate Pfaffian CDF)
for i in xrange(wishart.n_min):
# TODO consider index tricks instead of iteration here
b = 0.5 * p[i] * p[i]
for j in xrange(i, wishart.n_min - 1):
b -= q[i + j] / (g[i] * g[j + 1])
A[j + 1, i] = p[i] * p[j + 1] - 2 * b
A[i, j + 1] = -A[j + 1, i]
return np.sqrt(det(A))

@lazyproperty
def A(self):
"""ndarray - a skew-symmetric matrix for integrating the target distribution"""
wishart = self._wishart_cdf

base = np.zeros([wishart.size, wishart.size])
if wishart.n_min % 2:
# If matrix has odd number of elements, we need to append a
# row and a col, in order for the pfaffian algorithm to work
base = self._make_size_even(base)
return base

@lazyproperty
def _gammainc_a(self):
return gammainc(self._wishart_cdf.alpha_vec, 0.5 * self._chisq_val)

def _make_size_even(self, base):
wishart = self._wishart_cdf
alpha_ind = wishart.alpha_ind
other_ind = wishart.other_ind
alpha_last = wishart.alpha_last

base[other_ind, alpha_ind] = (
np.float_power(2, -alpha_last) * self._gammainc_a / gamma(alpha_last)
)
base[alpha_ind, other_ind] = -base[other_ind, alpha_ind]

return base
94 changes: 94 additions & 0 deletions src/cr/cube/measures/pairwise_pvalues.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,94 @@
# encoding: utf-8

"""P-values of pairwise comparison or columns of a contingency table."""

from __future__ import division

import numpy as np

# from cr.cube.distributions.wishart import wishartCDF
from cr.cube.distributions.wishart import WishartCDF
from cr.cube.util import lazyproperty

try:
xrange
except NameError: # pragma: no cover
# pylint: disable=invalid-name
xrange = range


# pylint: disable=too-few-public-methods
class PairwisePvalues:
"""Value object providing matrix of pairwise-comparison P-values"""

def __init__(self, slice_, axis=0, weighted=True):
self._slice = slice_
self._axis = axis
self._weighted = weighted

@lazyproperty
def values(self):
"""Square matrix of pairwise Chi-square along axis, as numpy.ndarray."""
return 1.0 - WishartCDF(self._pairwise_chisq, self._n_min, self._n_max).values

@lazyproperty
def _categorical_pairwise_chisq(self):
"""Pairwise comparisons (Chi-Square) along axis, as numpy.ndarray.
Returns a square, symmetric matrix of test statistics for the null
hypothesis that each vector along *axis* is equal to each other.
"""
chisq = np.zeros([self._numel, self._numel])
for i in xrange(1, self._numel):
for j in xrange(0, self._numel - 1):
chisq[i, j] = chisq[j, i] = np.sum(
np.square(self._proportions[:, i] - self._proportions[:, j])
/ self._observed
) / (1 / self._margin[i] + 1 / self._margin[j])

return chisq

@lazyproperty
def _margin(self):
"""Margin for the axis as numpy.ndarray."""
return self._slice.margin(axis=self._axis)

@lazyproperty
def _off_margin(self):
"""Margin for the opposite axis as numpy.ndarray."""
return self._slice.margin(axis=(1 - self._axis))

@lazyproperty
def _pairwise_chisq(self):
"""Pairwise Chi-squared statistics along axis, as numpy.ndarray.
Zscore is a measure of statistical significance of observed vs.
expected counts. It's only applicable to a 2D contingency tables.
"""
return self._categorical_pairwise_chisq

@lazyproperty
def _proportions(self):
"""Slice proportions for *axis* as numpy.ndarray."""
return self._slice.proportions(axis=self._axis)

@lazyproperty
def _n_max(self):
"""Size (zero based) of the bigger of the two slice's dimension, as int."""
return max(self._slice.get_shape()) - 1

@lazyproperty
def _n_min(self):
"""Size (zero based) of the smaller of the two slice's dimension, as int."""
return min(self._slice.get_shape()) - 1

@lazyproperty
def _numel(self):
"""Number of elements of the dimension opposite to axis, as int."""
return self._slice.get_shape()[1 - self._axis]

@lazyproperty
def _observed(self):
"""Observed marginal proportions, as float."""
total = self._slice.margin()
return self._off_margin / total
Loading

0 comments on commit 8b7e388

Please sign in to comment.