diff --git a/src/cr/cube/crunch_cube.py b/src/cr/cube/crunch_cube.py index e677521d2..3f057ce04 100644 --- a/src/cr/cube/crunch_cube.py +++ b/src/cr/cube/crunch_cube.py @@ -713,6 +713,17 @@ def zscore(self, weighted=True, prune=False, hs_dims=None): res = [s.zscore(weighted, prune, hs_dims) for s in self.slices] return np.array(res) if self.ndim == 3 else res[0] + def pairwise_pvals(self, axis=0): + """Return matrices of column-comparison p-values as list of numpy.ndarrays. + + Square, symmetric matrix along *axis* of pairwise p-values for the + null hypothesis that col[i] = col[j] for each pair of columns. + + *axis* (int): axis along which to perform comparison. Only columns (0) + are implemented currently. + """ + return [slice_.pairwise_pvals(axis=axis) for slice_ in self.slices] + def _adjust_axis(self, axis): """Return raw axis/axes corresponding to apparent axis/axes. diff --git a/src/cr/cube/cube_slice.py b/src/cr/cube/cube_slice.py index 9857de99e..8751949da 100644 --- a/src/cr/cube/cube_slice.py +++ b/src/cr/cube/cube_slice.py @@ -14,6 +14,7 @@ from cr.cube.enum import DIMENSION_TYPE as DT from cr.cube.measures.scale_means import ScaleMeans +from cr.cube.measures.pairwise_pvalues import PairwisePvalues from cr.cube.util import compress_pruned, lazyproperty, memoize @@ -306,12 +307,26 @@ def table_name(self): table_name = self._cube.labels()[0][self._index] return "%s: %s" % (title, table_name) + def pairwise_pvals(self, axis=0): + """Return square symmetric matrix of pairwise column-comparison p-values. + + Square, symmetric matrix along *axis* of pairwise p-values for the + null hypothesis that col[i] = col[j] for each pair of columns. + + *axis* (int): axis along which to perform comparison. Only columns (0) + are implemented currently. + """ + if axis != 0: + raise NotImplementedError("Pairwise comparison only implemented for colums") + return PairwisePvalues(self, axis=axis).values + def pvals(self, weighted=True, prune=False, hs_dims=None): - """Return 2D ndarray with calculated p-vals. + """Return 2D ndarray with calculated P values - This function calculates statistically significant results for - categorical contingency tables. The values are calculated for 2D tables - only. + This function calculates statistically significant cells for + categorical contingency tables under the null hypothesis that the + row and column variables are independent (uncorrelated). + The values are calculated for 2D tables only. :param weighted: Use weighted counts for zscores :param prune: Prune based on unweighted counts @@ -325,15 +340,22 @@ def pvals(self, weighted=True, prune=False, hs_dims=None): return self._apply_pruning_mask(pvals, hs_dims) if prune else pvals def zscore(self, weighted=True, prune=False, hs_dims=None): - """Return ndarray with slices's zscore measurements. + """Return ndarray with slices's standardized residuals (Z-scores). + + (Only applicable to a 2D contingency tables.) The Z-score or + standardized residual is the difference between observed and expected + cell counts if row and column variables were independent divided + by the residual cell variance. They are assumed to come from a N(0,1) + or standard Normal distribution, and can show which cells deviate from + the null hypothesis that the row and column variables are uncorrelated. - Zscore is a measure of statistical signifficance of observed vs. - expected counts. It's only applicable to a 2D contingency tables. + See also *pairwise_chisq*, *pairwise_pvals* for a pairwise column- + or row-based test of statistical significance. :param weighted: Use weighted counts for zscores :param prune: Prune based on unweighted counts :param hs_dims: Include headers and subtotals (as NaN values) - :returns zscore: ndarray representing zscore measurements + :returns zscore: ndarray representing cell standardized residuals (Z) """ counts = self.as_array(weighted=weighted) total = self.margin(weighted=weighted) diff --git a/src/cr/cube/distributions/__init__.py b/src/cr/cube/distributions/__init__.py new file mode 100644 index 000000000..e69de29bb diff --git a/src/cr/cube/distributions/wishart.py b/src/cr/cube/distributions/wishart.py new file mode 100644 index 000000000..a385df3b0 --- /dev/null +++ b/src/cr/cube/distributions/wishart.py @@ -0,0 +1,177 @@ +# encoding: utf-8 +# pylint: disable=too-few-public-methods, invalid-name + +"""The CDF of the largest eigenvalue of a Wishart distribution. + +Compute the CDF of the Wishart distribution using the algorithm of +Chiani (2014). The Wishart CDF is not defined in scipy.stats.wishart, nor in +tensorflow_probability.distributions.wishart. At the time of writing, the only +implementation of this CDF is the R package rootWishart. This largely +follows that, but using scipy implementations rather than Boost, and omitting +multi-precision support. + +For more info about the formulae, and the terse variable naming, +please refer to the paper: + +"Distribution of the largest eigenvalue for real Wishart and Gaussian random +matrices and a simple approximation for the Tracy-Widom distribution" + +found at: https://arxiv.org/pdf/1209.3394.pdf +""" + +from __future__ import division + +import numpy as np +from scipy.special import gamma, gammainc +from scipy.linalg import det +from scipy.constants import pi +from cr.cube.util import lazyproperty + +try: + xrange +except NameError: # pragma: no cover + xrange = range + + +class WishartCDF: + """Implementation of Cumulative Distribution Function (CDF).""" + + def __init__(self, chisq, n_min, n_max): + self._chisq = chisq + self.n_min = n_min + self._n_max = n_max + + @lazyproperty + def values(self): + """ndarray of wishart CDF values""" + return self.K * self.wishart_pfaffian + + @lazyproperty + def wishart_pfaffian(self): + """ndarray of wishart pfaffian CDF, before normalization""" + return np.array( + [Pfaffian(self, val).value for i, val in np.ndenumerate(self._chisq)] + ).reshape(self._chisq.shape) + + @lazyproperty + def alpha(self): + """int representing common basis of off-diagonal elements of A""" + return 0.5 * (self._n_max - self.n_min - 1) + + @lazyproperty + def alpha_ind(self): + """indices of off-diagonal elements of A""" + return np.arange(self.n_min) + + @lazyproperty + def alpha_last(self): + """elements of row and column to append to A when its order/size is odd""" + return (self._n_max + self.n_min + 1) / 2 + + @lazyproperty + def alpha_vec(self): + """elements of row and column to append to A when its order/size is odd""" + return self.alpha_ind + self.alpha + 1 + + @lazyproperty + def other_ind(self): + """last row or column of square A""" + return np.full(self.n_min, self.size - 1, dtype=np.int) + + @staticmethod + def _mgamma(x, m): + mgamma = np.float_power(pi, 0.25 * m * (m - 1)) + for i in xrange(m): + mgamma *= gamma(x - 0.5 * i) + return mgamma + + @lazyproperty + def K(self): + """Normalizing constant for wishart CDF.""" + K1 = np.float_power(pi, 0.5 * self.n_min * self.n_min) + K1 /= ( + np.float_power(2, 0.5 * self.n_min * self._n_max) + * self._mgamma(0.5 * self._n_max, self.n_min) + * self._mgamma(0.5 * self.n_min, self.n_min) + ) + + K2 = np.float_power( + 2, self.alpha * self.size + 0.5 * self.size * (self.size + 1) + ) + for i in xrange(self.size): + K2 *= gamma(self.alpha + i + 1) + return K1 * K2 + + @lazyproperty + def size(self): + """int representing the size of the wishart matrix. + + If the original size is even, that this is the size that's returned. If the + size is odd, then the next even number is returned, because of the row/col + insertion, that happens as the part of the algorithm + """ + return self.n_min + (self.n_min % 2) + + +class Pfaffian: + """Implementation of the Pfaffian value object class.""" + + def __init__(self, wishart_cdf, chisq_ordinal): + self._wishart_cdf = wishart_cdf + self._chisq_val = chisq_ordinal + + @lazyproperty + def value(self): + """return float Cumulative Distribution Function. + + The return value represents a floating point number of the CDF of the + largest eigenvalue of a Wishart(n, p) evaluated at chisq_val. + """ + wishart = self._wishart_cdf + + # Prepare variables for integration algorithm + A = self.A + p = self._gammainc_a + g = gamma(wishart.alpha_vec) + q_ind = np.arange(2 * wishart.n_min - 2) + q_vec = 2 * wishart.alpha + q_ind + 2 + q = np.float_power(0.5, q_vec) * gamma(q_vec) * gammainc(q_vec, self._chisq_val) + + # Perform integration (i.e. calculate Pfaffian CDF) + for i in xrange(wishart.n_min): + # TODO consider index tricks instead of iteration here + b = 0.5 * p[i] * p[i] + for j in xrange(i, wishart.n_min - 1): + b -= q[i + j] / (g[i] * g[j + 1]) + A[j + 1, i] = p[i] * p[j + 1] - 2 * b + A[i, j + 1] = -A[j + 1, i] + return np.sqrt(det(A)) + + @lazyproperty + def A(self): + """ndarray - a skew-symmetric matrix for integrating the target distribution""" + wishart = self._wishart_cdf + + base = np.zeros([wishart.size, wishart.size]) + if wishart.n_min % 2: + # If matrix has odd number of elements, we need to append a + # row and a col, in order for the pfaffian algorithm to work + base = self._make_size_even(base) + return base + + @lazyproperty + def _gammainc_a(self): + return gammainc(self._wishart_cdf.alpha_vec, 0.5 * self._chisq_val) + + def _make_size_even(self, base): + wishart = self._wishart_cdf + alpha_ind = wishart.alpha_ind + other_ind = wishart.other_ind + alpha_last = wishart.alpha_last + + base[other_ind, alpha_ind] = ( + np.float_power(2, -alpha_last) * self._gammainc_a / gamma(alpha_last) + ) + base[alpha_ind, other_ind] = -base[other_ind, alpha_ind] + + return base diff --git a/src/cr/cube/measures/pairwise_pvalues.py b/src/cr/cube/measures/pairwise_pvalues.py new file mode 100644 index 000000000..de1e19b1f --- /dev/null +++ b/src/cr/cube/measures/pairwise_pvalues.py @@ -0,0 +1,94 @@ +# encoding: utf-8 + +"""P-values of pairwise comparison or columns of a contingency table.""" + +from __future__ import division + +import numpy as np + +# from cr.cube.distributions.wishart import wishartCDF +from cr.cube.distributions.wishart import WishartCDF +from cr.cube.util import lazyproperty + +try: + xrange +except NameError: # pragma: no cover + # pylint: disable=invalid-name + xrange = range + + +# pylint: disable=too-few-public-methods +class PairwisePvalues: + """Value object providing matrix of pairwise-comparison P-values""" + + def __init__(self, slice_, axis=0, weighted=True): + self._slice = slice_ + self._axis = axis + self._weighted = weighted + + @lazyproperty + def values(self): + """Square matrix of pairwise Chi-square along axis, as numpy.ndarray.""" + return 1.0 - WishartCDF(self._pairwise_chisq, self._n_min, self._n_max).values + + @lazyproperty + def _categorical_pairwise_chisq(self): + """Pairwise comparisons (Chi-Square) along axis, as numpy.ndarray. + + Returns a square, symmetric matrix of test statistics for the null + hypothesis that each vector along *axis* is equal to each other. + """ + chisq = np.zeros([self._numel, self._numel]) + for i in xrange(1, self._numel): + for j in xrange(0, self._numel - 1): + chisq[i, j] = chisq[j, i] = np.sum( + np.square(self._proportions[:, i] - self._proportions[:, j]) + / self._observed + ) / (1 / self._margin[i] + 1 / self._margin[j]) + + return chisq + + @lazyproperty + def _margin(self): + """Margin for the axis as numpy.ndarray.""" + return self._slice.margin(axis=self._axis) + + @lazyproperty + def _off_margin(self): + """Margin for the opposite axis as numpy.ndarray.""" + return self._slice.margin(axis=(1 - self._axis)) + + @lazyproperty + def _pairwise_chisq(self): + """Pairwise Chi-squared statistics along axis, as numpy.ndarray. + + Zscore is a measure of statistical significance of observed vs. + expected counts. It's only applicable to a 2D contingency tables. + """ + return self._categorical_pairwise_chisq + + @lazyproperty + def _proportions(self): + """Slice proportions for *axis* as numpy.ndarray.""" + return self._slice.proportions(axis=self._axis) + + @lazyproperty + def _n_max(self): + """Size (zero based) of the bigger of the two slice's dimension, as int.""" + return max(self._slice.get_shape()) - 1 + + @lazyproperty + def _n_min(self): + """Size (zero based) of the smaller of the two slice's dimension, as int.""" + return min(self._slice.get_shape()) - 1 + + @lazyproperty + def _numel(self): + """Number of elements of the dimension opposite to axis, as int.""" + return self._slice.get_shape()[1 - self._axis] + + @lazyproperty + def _observed(self): + """Observed marginal proportions, as float.""" + total = self._slice.margin() + return self._off_margin / total diff --git a/tests/fixtures/pairwise-4x4.json b/tests/fixtures/pairwise-4x4.json new file mode 100644 index 000000000..9f5ac2bef --- /dev/null +++ b/tests/fixtures/pairwise-4x4.json @@ -0,0 +1,238 @@ +{ + "element": "shoji:view", + "self": "/api/datasets/4bb830/cube/?query=%7B%22dimensions%22%3A%5B%7B%22variable%22%3A%22%2Fapi%2Fdatasets%2F4bb830%2Fvariables%2F000001%2F%22%7D%2C%7B%22variable%22%3A%22%2Fapi%2Fdatasets%2F4bb830%2Fvariables%2F066c45%2F%22%7D%5D%2C%22measures%22%3A%7B%22count%22%3A%7B%22function%22%3A%22cube_count%22%2C%22args%22%3A%5B%5D%7D%7D%2C%22weight%22%3Anull%7D&filter=%7B%7D", + "value": { + "query": { + "measures": { + "count": { + "function": "cube_count", + "args": [ + + ] + } + }, + "dimensions": [ + { + "variable": "/api/datasets/4bb830/variables/000001/" + }, + { + "variable": "/api/datasets/4bb830/variables/066c45/" + } + ], + "weight": null + }, + "query_environment": { + "filter": [ + + ] + }, + "result": { + "dimensions": [ + { + "references": { + "alias": "J", + "name": "J" + }, + "derived": false, + "type": { + "ordinal": false, + "class": "categorical", + "categories": [ + { + "numeric_value": 1, + "id": 1, + "name": "A", + "missing": false + }, + { + "numeric_value": 2, + "id": 2, + "name": "B", + "missing": false + }, + { + "numeric_value": 3, + "id": 3, + "name": "C", + "missing": false + }, + { + "numeric_value": 4, + "id": 4, + "name": "D", + "missing": false + }, + { + "numeric_value": null, + "id": -1, + "name": "No Data", + "missing": true + } + ] + } + }, + { + "references": { + "alias": "K", + "name": "K" + }, + "derived": false, + "type": { + "ordinal": false, + "class": "categorical", + "categories": [ + { + "numeric_value": 1, + "id": 1, + "name": "A", + "missing": false + }, + { + "numeric_value": 2, + "id": 2, + "name": "B", + "missing": false + }, + { + "numeric_value": 3, + "id": 3, + "name": "C", + "missing": false + }, + { + "numeric_value": 4, + "id": 4, + "name": "D", + "missing": false + }, + { + "numeric_value": 5, + "id": 5, + "name": "E", + "missing": true + }, + { + "numeric_value": 6, + "id": 6, + "name": "F", + "missing": true + }, + { + "numeric_value": null, + "id": -1, + "name": "No Data", + "missing": true + } + ] + } + } + ], + "missing": 0, + "measures": { + "count": { + "data": [ + 14, + 14, + 13, + 16, + 0, + 0, + 0, + 22, + 14, + 19, + 19, + 0, + 0, + 0, + 14, + 6, + 19, + 8, + 0, + 0, + 0, + 17, + 9, + 28, + 11, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0 + ], + "n_missing": 0, + "metadata": { + "references": { + + }, + "derived": true, + "type": { + "integer": true, + "missing_rules": { + + }, + "missing_reasons": { + "No Data": -1 + }, + "class": "numeric" + } + } + } + }, + "n": 243, + "unfiltered": { + "unweighted_n": 243, + "weighted_n": 243 + }, + "filtered": { + "unweighted_n": 243, + "weighted_n": 243 + }, + "counts": [ + 14, + 14, + 13, + 16, + 0, + 0, + 0, + 22, + 14, + 19, + 19, + 0, + 0, + 0, + 14, + 6, + 19, + 8, + 0, + 0, + 0, + 17, + 9, + 28, + 11, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0 + ], + "element": "crunch:cube" + } + } +} diff --git a/tests/fixtures/pairwise-hirotsu-illness-x-occupation.json b/tests/fixtures/pairwise-hirotsu-illness-x-occupation.json new file mode 100644 index 000000000..e1b641e0a --- /dev/null +++ b/tests/fixtures/pairwise-hirotsu-illness-x-occupation.json @@ -0,0 +1,274 @@ +{ + "element": "shoji:view", + "self": "/api/datasets/9271a0/cube/?query=%7B%22dimensions%22%3A%5B%7B%22variable%22%3A%22%2Fapi%2Fdatasets%2F9271a0%2Fvariables%2F000000%2F%22%7D%2C%7B%22variable%22%3A%22%2Fapi%2Fdatasets%2F9271a0%2Fvariables%2F000001%2F%22%7D%5D%2C%22measures%22%3A%7B%22count%22%3A%7B%22function%22%3A%22cube_count%22%2C%22args%22%3A%5B%5D%7D%7D%2C%22weight%22%3Anull%7D&filter=%7B%7D", + "value": { + "query": { + "measures": { + "count": { + "function": "cube_count", + "args": [ + + ] + } + }, + "dimensions": [ + { + "variable": "/api/datasets/9271a0/variables/000000/" + }, + { + "variable": "/api/datasets/9271a0/variables/000001/" + } + ], + "weight": null + }, + "query_environment": { + "filter": [ + + ] + }, + "result": { + "dimensions": [ + { + "references": { + "alias": "illness", + "name": "illness" + }, + "derived": false, + "type": { + "ordinal": false, + "class": "categorical", + "categories": [ + { + "numeric_value": 3, + "id": 3, + "name": "slight", + "missing": false + }, + { + "numeric_value": 1, + "id": 1, + "name": "medium", + "missing": false + }, + { + "numeric_value": 2, + "id": 2, + "name": "serious", + "missing": false + }, + { + "numeric_value": null, + "id": -1, + "name": "No Data", + "missing": true + } + ] + } + }, + { + "derived": false, + "references": { + "alias": "occupation", + "name": "occupation" + }, + "type": { + "ordinal": false, + "class": "categorical", + "categories": [ + { + "numeric_value": 1, + "missing": false, + "id": 1, + "name": "1" + }, + { + "numeric_value": 2, + "missing": false, + "id": 2, + "name": "2" + }, + { + "numeric_value": 3, + "missing": false, + "id": 3, + "name": "3" + }, + { + "numeric_value": 4, + "missing": false, + "id": 4, + "name": "4" + }, + { + "numeric_value": 5, + "missing": false, + "id": 5, + "name": "5" + }, + { + "numeric_value": 6, + "missing": false, + "id": 6, + "name": "6" + }, + { + "numeric_value": 7, + "missing": false, + "id": 7, + "name": "7" + }, + { + "numeric_value": 8, + "missing": false, + "id": 8, + "name": "8" + }, + { + "numeric_value": 9, + "missing": false, + "id": 9, + "name": "9" + }, + { + "numeric_value": 10, + "missing": false, + "id": 10, + "name": "10" + }, + { + "numeric_value": null, + "missing": true, + "id": -1, + "name": "No Data" + } + ] + } + } + ], + "missing": 0, + "measures": { + "count": { + "data": [ + 148, + 111, + 645, + 165, + 383, + 96, + 98, + 199, + 59, + 262, + 0, + 444, + 352, + 1911, + 771, + 1829, + 293, + 330, + 874, + 199, + 1320, + 0, + 86, + 49, + 328, + 119, + 311, + 47, + 58, + 155, + 30, + 236, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0 + ], + "n_missing": 0, + "metadata": { + "references": { + + }, + "derived": true, + "type": { + "integer": true, + "missing_rules": { + + }, + "missing_reasons": { + "No Data": -1 + }, + "class": "numeric" + } + } + } + }, + "n": 11908, + "unfiltered": { + "unweighted_n": 11908, + "weighted_n": 11908 + }, + "filtered": { + "unweighted_n": 11908, + "weighted_n": 11908 + }, + "counts": [ + 148, + 111, + 645, + 165, + 383, + 96, + 98, + 199, + 59, + 262, + 0, + 444, + 352, + 1911, + 771, + 1829, + 293, + 330, + 874, + 199, + 1320, + 0, + 86, + 49, + 328, + 119, + 311, + 47, + 58, + 155, + 30, + 236, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0 + ], + "element": "crunch:cube" + } + } +} diff --git a/tests/fixtures/pairwise-hirotsu-occupation-x-illness.json b/tests/fixtures/pairwise-hirotsu-occupation-x-illness.json new file mode 100644 index 000000000..26dcc2614 --- /dev/null +++ b/tests/fixtures/pairwise-hirotsu-occupation-x-illness.json @@ -0,0 +1,274 @@ +{ + "element": "shoji:view", + "self": "/api/datasets/9271a0/cube/?query=%7B%22dimensions%22%3A%5B%7B%22variable%22%3A%22%2Fapi%2Fdatasets%2F9271a0%2Fvariables%2F000001%2F%22%7D%2C%7B%22variable%22%3A%22%2Fapi%2Fdatasets%2F9271a0%2Fvariables%2F000000%2F%22%7D%5D%2C%22measures%22%3A%7B%22count%22%3A%7B%22function%22%3A%22cube_count%22%2C%22args%22%3A%5B%5D%7D%7D%2C%22weight%22%3Anull%7D&filter=%7B%7D", + "value": { + "query": { + "measures": { + "count": { + "function": "cube_count", + "args": [ + + ] + } + }, + "dimensions": [ + { + "variable": "/api/datasets/9271a0/variables/000001/" + }, + { + "variable": "/api/datasets/9271a0/variables/000000/" + } + ], + "weight": null + }, + "query_environment": { + "filter": [ + + ] + }, + "result": { + "dimensions": [ + { + "derived": false, + "references": { + "alias": "occupation", + "name": "occupation" + }, + "type": { + "ordinal": false, + "class": "categorical", + "categories": [ + { + "numeric_value": 1, + "missing": false, + "id": 1, + "name": "1" + }, + { + "numeric_value": 2, + "missing": false, + "id": 2, + "name": "2" + }, + { + "numeric_value": 3, + "missing": false, + "id": 3, + "name": "3" + }, + { + "numeric_value": 4, + "missing": false, + "id": 4, + "name": "4" + }, + { + "numeric_value": 5, + "missing": false, + "id": 5, + "name": "5" + }, + { + "numeric_value": 6, + "missing": false, + "id": 6, + "name": "6" + }, + { + "numeric_value": 7, + "missing": false, + "id": 7, + "name": "7" + }, + { + "numeric_value": 8, + "missing": false, + "id": 8, + "name": "8" + }, + { + "numeric_value": 9, + "missing": false, + "id": 9, + "name": "9" + }, + { + "numeric_value": 10, + "missing": false, + "id": 10, + "name": "10" + }, + { + "numeric_value": null, + "missing": true, + "id": -1, + "name": "No Data" + } + ] + } + }, + { + "references": { + "alias": "illness", + "name": "illness" + }, + "derived": false, + "type": { + "ordinal": false, + "class": "categorical", + "categories": [ + { + "numeric_value": 3, + "id": 3, + "name": "slight", + "missing": false + }, + { + "numeric_value": 1, + "id": 1, + "name": "medium", + "missing": false + }, + { + "numeric_value": 2, + "id": 2, + "name": "serious", + "missing": false + }, + { + "numeric_value": null, + "id": -1, + "name": "No Data", + "missing": true + } + ] + } + } + ], + "missing": 0, + "measures": { + "count": { + "data": [ + 148, + 444, + 86, + 0, + 111, + 352, + 49, + 0, + 645, + 1911, + 328, + 0, + 165, + 771, + 119, + 0, + 383, + 1829, + 311, + 0, + 96, + 293, + 47, + 0, + 98, + 330, + 58, + 0, + 199, + 874, + 155, + 0, + 59, + 199, + 30, + 0, + 262, + 1320, + 236, + 0, + 0, + 0, + 0, + 0 + ], + "n_missing": 0, + "metadata": { + "references": { + + }, + "derived": true, + "type": { + "integer": true, + "missing_rules": { + + }, + "missing_reasons": { + "No Data": -1 + }, + "class": "numeric" + } + } + } + }, + "n": 11908, + "unfiltered": { + "unweighted_n": 11908, + "weighted_n": 11908 + }, + "filtered": { + "unweighted_n": 11908, + "weighted_n": 11908 + }, + "counts": [ + 148, + 444, + 86, + 0, + 111, + 352, + 49, + 0, + 645, + 1911, + 328, + 0, + 165, + 771, + 119, + 0, + 383, + 1829, + 311, + 0, + 96, + 293, + 47, + 0, + 98, + 330, + 58, + 0, + 199, + 874, + 155, + 0, + 59, + 199, + 30, + 0, + 262, + 1320, + 236, + 0, + 0, + 0, + 0, + 0 + ], + "element": "crunch:cube" + } + } +} diff --git a/tests/fixtures/pairwise-random-3x4.json b/tests/fixtures/pairwise-random-3x4.json new file mode 100644 index 000000000..f16010686 --- /dev/null +++ b/tests/fixtures/pairwise-random-3x4.json @@ -0,0 +1,190 @@ +{ + "element": "shoji:view", + "self": "/api/datasets/4bb830/cube/?query=%7B%22dimensions%22%3A%5B%7B%22variable%22%3A%22%2Fapi%2Fdatasets%2F4bb830%2Fvariables%2F000000%2F%22%7D%2C%7B%22variable%22%3A%22%2Fapi%2Fdatasets%2F4bb830%2Fvariables%2F000001%2F%22%7D%5D%2C%22measures%22%3A%7B%22count%22%3A%7B%22function%22%3A%22cube_count%22%2C%22args%22%3A%5B%5D%7D%7D%2C%22weight%22%3Anull%7D&filter=%7B%7D", + "value": { + "query": { + "measures": { + "count": { + "function": "cube_count", + "args": [ + + ] + } + }, + "dimensions": [ + { + "variable": "/api/datasets/4bb830/variables/000000/" + }, + { + "variable": "/api/datasets/4bb830/variables/000001/" + } + ], + "weight": null + }, + "query_environment": { + "filter": [ + + ] + }, + "result": { + "dimensions": [ + { + "derived": false, + "references": { + "alias": "I", + "name": "I" + }, + "type": { + "ordinal": false, + "class": "categorical", + "categories": [ + { + "numeric_value": 1, + "missing": false, + "id": 1, + "name": "one" + }, + { + "numeric_value": 2, + "missing": false, + "id": 2, + "name": "two" + }, + { + "numeric_value": 3, + "missing": false, + "id": 3, + "name": "three" + }, + { + "numeric_value": null, + "missing": true, + "id": -1, + "name": "No Data" + } + ] + } + }, + { + "derived": false, + "references": { + "alias": "J", + "name": "J" + }, + "type": { + "ordinal": false, + "class": "categorical", + "categories": [ + { + "numeric_value": 1, + "missing": false, + "id": 1, + "name": "A" + }, + { + "numeric_value": 2, + "missing": false, + "id": 2, + "name": "B" + }, + { + "numeric_value": 3, + "missing": false, + "id": 3, + "name": "C" + }, + { + "numeric_value": 4, + "missing": false, + "id": 4, + "name": "D" + }, + { + "numeric_value": null, + "missing": true, + "id": -1, + "name": "No Data" + } + ] + } + } + ], + "missing": 0, + "measures": { + "count": { + "data": [ + 12, + 23, + 10, + 20, + 0, + 23, + 28, + 14, + 24, + 0, + 22, + 23, + 23, + 21, + 0, + 0, + 0, + 0, + 0, + 0 + ], + "n_missing": 0, + "metadata": { + "references": { + + }, + "derived": true, + "type": { + "integer": true, + "missing_rules": { + + }, + "missing_reasons": { + "No Data": -1 + }, + "class": "numeric" + } + } + } + }, + "n": 243, + "unfiltered": { + "unweighted_n": 243, + "weighted_n": 243 + }, + "filtered": { + "unweighted_n": 243, + "weighted_n": 243 + }, + "counts": [ + 12, + 23, + 10, + 20, + 0, + 23, + 28, + 14, + 24, + 0, + 22, + 23, + 23, + 21, + 0, + 0, + 0, + 0, + 0, + 0 + ], + "element": "crunch:cube" + } + } +} diff --git a/tests/fixtures/same-counts-3x4.json b/tests/fixtures/same-counts-3x4.json new file mode 100644 index 000000000..8219bcda9 --- /dev/null +++ b/tests/fixtures/same-counts-3x4.json @@ -0,0 +1,190 @@ +{ + "element": "shoji:view", + "self": "/api/datasets/9c9001/cube/?query=%7B%22dimensions%22%3A%5B%7B%22variable%22%3A%22%2Fapi%2Fdatasets%2F9c9001%2Fvariables%2F000000%2F%22%7D%2C%7B%22variable%22%3A%22%2Fapi%2Fdatasets%2F9c9001%2Fvariables%2F000001%2F%22%7D%5D%2C%22measures%22%3A%7B%22count%22%3A%7B%22function%22%3A%22cube_count%22%2C%22args%22%3A%5B%5D%7D%7D%2C%22weight%22%3Anull%7D&filter=%7B%7D", + "value": { + "query": { + "measures": { + "count": { + "function": "cube_count", + "args": [ + + ] + } + }, + "dimensions": [ + { + "variable": "/api/datasets/9c9001/variables/000000/" + }, + { + "variable": "/api/datasets/9c9001/variables/000001/" + } + ], + "weight": null + }, + "query_environment": { + "filter": [ + + ] + }, + "result": { + "dimensions": [ + { + "references": { + "alias": "Var1", + "name": "Var1" + }, + "derived": false, + "type": { + "ordinal": false, + "class": "categorical", + "categories": [ + { + "numeric_value": 1, + "id": 1, + "name": "1", + "missing": false + }, + { + "numeric_value": 2, + "id": 2, + "name": "2", + "missing": false + }, + { + "numeric_value": 3, + "id": 3, + "name": "3", + "missing": false + }, + { + "numeric_value": null, + "id": -1, + "name": "No Data", + "missing": true + } + ] + } + }, + { + "references": { + "alias": "Var2", + "name": "Var2" + }, + "derived": false, + "type": { + "ordinal": false, + "class": "categorical", + "categories": [ + { + "numeric_value": 1, + "id": 1, + "name": "a", + "missing": false + }, + { + "numeric_value": 2, + "id": 2, + "name": "b", + "missing": false + }, + { + "numeric_value": 3, + "id": 3, + "name": "c", + "missing": false + }, + { + "numeric_value": 4, + "id": 4, + "name": "d", + "missing": false + }, + { + "numeric_value": null, + "id": -1, + "name": "No Data", + "missing": true + } + ] + } + } + ], + "missing": 0, + "measures": { + "count": { + "data": [ + 10, + 10, + 10, + 10, + 0, + 20, + 20, + 20, + 20, + 0, + 30, + 30, + 30, + 30, + 0, + 0, + 0, + 0, + 0, + 0 + ], + "n_missing": 0, + "metadata": { + "references": { + + }, + "derived": true, + "type": { + "integer": true, + "missing_rules": { + + }, + "missing_reasons": { + "No Data": -1 + }, + "class": "numeric" + } + } + } + }, + "n": 240, + "unfiltered": { + "unweighted_n": 240, + "weighted_n": 240 + }, + "filtered": { + "unweighted_n": 240, + "weighted_n": 240 + }, + "counts": [ + 10, + 10, + 10, + 10, + 0, + 20, + 20, + 20, + 20, + 0, + 30, + 30, + 30, + 30, + 0, + 0, + 0, + 0, + 0, + 0 + ], + "element": "crunch:cube" + } + } +} diff --git a/tests/integration/test_pairwise.py b/tests/integration/test_pairwise.py new file mode 100644 index 000000000..a8babfed7 --- /dev/null +++ b/tests/integration/test_pairwise.py @@ -0,0 +1,310 @@ +# encoding: utf-8 +# pylint: disable=protected-access + +"""Integration tests for pairwise comparisons.""" + +from unittest import TestCase +import pytest + +import numpy as np + +from cr.cube.crunch_cube import CrunchCube +from cr.cube.measures.pairwise_pvalues import PairwisePvalues + +from ..fixtures import CR + + +# pylint: disable=missing-docstring, invalid-name, no-self-use +class TestStandardizedResiduals(TestCase): + """Test cr.cube implementation of column family pairwise comparisons""" + + def test_same_col_counts(self): + """Test statistics for columns that are all the same.""" + cube = CrunchCube(CR.SAME_COUNTS_3x4) + pairwise_pvalues = PairwisePvalues(cube.slices[0], axis=0) + expected = np.zeros([4, 4]) + actual = pairwise_pvalues._pairwise_chisq + np.testing.assert_equal(actual, expected) + + def test_hirotsu_chisq(self): + """Test statistic for hirotsu data matches R""" + cube = CrunchCube(CR.PAIRWISE_HIROTSU_ILLNESS_X_OCCUPATION) + pairwise_pvalues = PairwisePvalues(cube.slices[0], axis=0) + expected = np.array( + [ + [ + 0.0, + 2.821910158116655, + 0.9259711818781733, + 12.780855448128131, + 16.79727869630099, + 0.924655442873681, + 0.8008976269312448, + 9.616972398702428, + 1.4496863124510315, + 18.556098937181705, + ], + [ + 2.821910158116655, + 0.0, + 1.6831132737959318, + 8.683471852181562, + 13.451053159265136, + 0.38467827774871005, + 1.5094961530071807, + 9.081312924348003, + 0.25833985406056126, + 16.3533306337074, + ], + [ + 0.9259711818781733, + 1.6831132737959318, + 0.0, + 24.348935423464653, + 46.689386077899826, + 0.18470822825752797, + 1.376598707986204, + 22.063658540387774, + 1.0102118795109807, + 47.62124004565971, + ], + [ + 12.780855448128131, + 8.683471852181562, + 24.348935423464653, + 0.0, + 0.8073979083263744, + 8.490641259215641, + 5.141740694105387, + 1.2536004848874829, + 3.576241745092247, + 2.1974561987876613, + ], + [ + 16.79727869630099, + 13.451053159265136, + 46.689386077899826, + 0.8073979083263744, + 0.0, + 11.792012011326468, + 6.847609367845222, + 0.743555569450378, + 5.218390456727495, + 0.725476017865348, + ], + [ + 0.924655442873681, + 0.38467827774871005, + 0.18470822825752797, + 8.490641259215641, + 11.792012011326468, + 0.0, + 0.7072537831958036, + 7.620018353425002, + 0.3321969685319031, + 14.087591553810693, + ], + [ + 0.8008976269312448, + 1.5094961530071807, + 1.376598707986204, + 5.141740694105387, + 6.847609367845222, + 0.7072537831958036, + 0.0, + 3.6724354409467352, + 0.39674326208673527, + 8.546159019524978, + ], + [ + 9.616972398702428, + 9.081312924348003, + 22.063658540387774, + 1.2536004848874829, + 0.743555569450378, + 7.620018353425002, + 3.6724354409467352, + 0.0, + 3.4464292421171003, + 1.5916695633869193, + ], + [ + 1.4496863124510315, + 0.25833985406056126, + 1.0102118795109807, + 3.576241745092247, + 5.218390456727495, + 0.3321969685319031, + 0.39674326208673527, + 3.4464292421171003, + 0.0, + 6.85424450468994, + ], + [ + 18.556098937181705, + 16.3533306337074, + 47.62124004565971, + 2.1974561987876613, + 0.725476017865348, + 14.087591553810693, + 8.546159019524978, + 1.5916695633869193, + 6.85424450468994, + 0.0, + ], + ] + ) + actual = pairwise_pvalues._pairwise_chisq + np.testing.assert_almost_equal(actual, expected) + + def test_same_col_pvals(self): + """P-values for columns that are all the same.""" + cube = CrunchCube(CR.SAME_COUNTS_3x4) + expected = [np.ones([4, 4])] + actual = cube.pairwise_pvals(axis=0) + np.testing.assert_equal(actual, expected) + + # Assert correct exception in case of not-implemented direction + with pytest.raises(NotImplementedError): + cube.pairwise_pvals(axis=1) + + def test_hirotsu_pvals(self): + cube = CrunchCube(CR.PAIRWISE_HIROTSU_ILLNESS_X_OCCUPATION) + actual = cube.pairwise_pvals(axis=0) + expected = [ + np.array( + [ + 1, + 0.999603716443816, + 0.99999993076784, + 0.435830186989942, + 0.171365670494448, + 0.999999931581745, + 0.999999979427862, + 0.726740806122402, + 0.999997338047414, + 0.105707739899106, + 0.999603716443816, + 1, + 0.999991395396033, + 0.806407150042716, + 0.380296648898666, + 0.999999999961875, + 0.999996333717649, + 0.773583582093158, + 0.999999999998836, + 0.192375246184738, + 0.99999993076784, + 0.999991395396033, + 1, + 0.017277623171216, + 3.29012189337341e-06, + 0.99999999999994, + 0.999998237045896, + 0.0365273119329589, + 0.999999857555538, + 2.23456306602809e-06, + 0.435830186989942, + 0.806407150042716, + 0.017277623171216, + 1, + 0.999999977981595, + 0.821586701043061, + 0.982573114952466, + 0.999999169027016, + 0.998041030837588, + 0.999934687968906, + 0.171365670494448, + 0.380296648898666, + 3.29012189337341e-06, + 0.999999977981595, + 1, + 0.52406354520284, + 0.926322806048378, + 0.99999998900118, + 0.981100354607917, + 0.999999991067971, + 0.999999931581745, + 0.999999999961875, + 0.99999999999994, + 0.821586701043061, + 0.52406354520284, + 1, + 0.999999992799126, + 0.883025655503086, + 0.99999999998941, + 0.33149560264078, + 0.999999979427862, + 0.999996333717649, + 0.999998237045896, + 0.982573114952466, + 0.926322806048378, + 0.999999992799126, + 1, + 0.997674862917282, + 0.99999999995011, + 0.81726901111794, + 0.726740806122402, + 0.773583582093158, + 0.0365273119329589, + 0.999999169027016, + 0.99999998900118, + 0.883025655503086, + 0.997674862917282, + 1, + 0.998461227115608, + 0.999994436499243, + 0.999997338047414, + 0.999999999998836, + 0.999999857555538, + 0.998041030837588, + 0.981100354607917, + 0.99999999998941, + 0.99999999995011, + 0.998461227115608, + 1, + 0.925999125959122, + 0.105707739899106, + 0.192375246184738, + 2.23456306602809e-06, + 0.999934687968906, + 0.999999991067971, + 0.33149560264078, + 0.81726901111794, + 0.999994436499243, + 0.925999125959122, + 1, + ] + ).reshape(10, 10) + ] + np.testing.assert_almost_equal(actual, expected) + + def test_odd_latent_dimensions(self): + """Test code path for the pfaffian of an odd-dimensioned matrix + Latent matrix size is (n_min - 1) so 3 for a 4x4 table. + """ + cube = CrunchCube(CR.PAIRWISE_4X4) + actual = cube.pairwise_pvals(axis=0) + expected = [ + np.array( + [ + 1, + 0.949690252544668, + 0.917559534718816, + 0.97630069244232, + 0.949690252544668, + 1, + 0.35511045473507, + 0.999999119644564, + 0.917559534718816, + 0.35511045473507, + 1, + 0.313869429972919, + 0.97630069244232, + 0.999999119644564, + 0.313869429972919, + 1, + ] + ).reshape(4, 4) + ] + np.testing.assert_almost_equal(actual, expected)