From 0ad760bf6e8197175ed052ba0d855c5c005dd0db Mon Sep 17 00:00:00 2001 From: Michael Malecki Date: Fri, 4 Jan 2019 09:44:00 -0500 Subject: [PATCH 01/12] fixtures and test stub --- ...pairwise-hirotsu-illness-x-occupation.json | 274 ++++++++++++++++++ ...pairwise-hirotsu-occupation-x-illness.json | 274 ++++++++++++++++++ tests/fixtures/pairwise-random-3x4.json | 190 ++++++++++++ tests/fixtures/same-counts-3x4.json | 190 ++++++++++++ tests/integration/test_pairwise.py | 26 ++ 5 files changed, 954 insertions(+) create mode 100644 tests/fixtures/pairwise-hirotsu-illness-x-occupation.json create mode 100644 tests/fixtures/pairwise-hirotsu-occupation-x-illness.json create mode 100644 tests/fixtures/pairwise-random-3x4.json create mode 100644 tests/fixtures/same-counts-3x4.json create mode 100644 tests/integration/test_pairwise.py diff --git a/tests/fixtures/pairwise-hirotsu-illness-x-occupation.json b/tests/fixtures/pairwise-hirotsu-illness-x-occupation.json new file mode 100644 index 000000000..e1b641e0a --- /dev/null +++ b/tests/fixtures/pairwise-hirotsu-illness-x-occupation.json @@ -0,0 +1,274 @@ +{ + "element": "shoji:view", + "self": "/api/datasets/9271a0/cube/?query=%7B%22dimensions%22%3A%5B%7B%22variable%22%3A%22%2Fapi%2Fdatasets%2F9271a0%2Fvariables%2F000000%2F%22%7D%2C%7B%22variable%22%3A%22%2Fapi%2Fdatasets%2F9271a0%2Fvariables%2F000001%2F%22%7D%5D%2C%22measures%22%3A%7B%22count%22%3A%7B%22function%22%3A%22cube_count%22%2C%22args%22%3A%5B%5D%7D%7D%2C%22weight%22%3Anull%7D&filter=%7B%7D", + "value": { + "query": { + "measures": { + "count": { + "function": "cube_count", + "args": [ + + ] + } + }, + "dimensions": [ + { + "variable": "/api/datasets/9271a0/variables/000000/" + }, + { + "variable": "/api/datasets/9271a0/variables/000001/" + } + ], + "weight": null + }, + "query_environment": { + "filter": [ + + ] + }, + "result": { + "dimensions": [ + { + "references": { + "alias": "illness", + "name": "illness" + }, + "derived": false, + "type": { + "ordinal": false, + "class": "categorical", + "categories": [ + { + "numeric_value": 3, + "id": 3, + "name": "slight", + "missing": false + }, + { + "numeric_value": 1, + "id": 1, + "name": "medium", + "missing": false + }, + { + "numeric_value": 2, + "id": 2, + "name": "serious", + "missing": false + }, + { + "numeric_value": null, + "id": -1, + "name": "No Data", + "missing": true + } + ] + } + }, + { + "derived": false, + "references": { + "alias": "occupation", + "name": "occupation" + }, + "type": { + "ordinal": false, + "class": "categorical", + "categories": [ + { + "numeric_value": 1, + "missing": false, + "id": 1, + "name": "1" + }, + { + "numeric_value": 2, + "missing": false, + "id": 2, + "name": "2" + }, + { + "numeric_value": 3, + "missing": false, + "id": 3, + "name": "3" + }, + { + "numeric_value": 4, + "missing": false, + "id": 4, + "name": "4" + }, + { + "numeric_value": 5, + "missing": false, + "id": 5, + "name": "5" + }, + { + "numeric_value": 6, + "missing": false, + "id": 6, + "name": "6" + }, + { + "numeric_value": 7, + "missing": false, + "id": 7, + "name": "7" + }, + { + "numeric_value": 8, + "missing": false, + "id": 8, + "name": "8" + }, + { + "numeric_value": 9, + "missing": false, + "id": 9, + "name": "9" + }, + { + "numeric_value": 10, + "missing": false, + "id": 10, + "name": "10" + }, + { + "numeric_value": null, + "missing": true, + "id": -1, + "name": "No Data" + } + ] + } + } + ], + "missing": 0, + "measures": { + "count": { + "data": [ + 148, + 111, + 645, + 165, + 383, + 96, + 98, + 199, + 59, + 262, + 0, + 
444, + 352, + 1911, + 771, + 1829, + 293, + 330, + 874, + 199, + 1320, + 0, + 86, + 49, + 328, + 119, + 311, + 47, + 58, + 155, + 30, + 236, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0 + ], + "n_missing": 0, + "metadata": { + "references": { + + }, + "derived": true, + "type": { + "integer": true, + "missing_rules": { + + }, + "missing_reasons": { + "No Data": -1 + }, + "class": "numeric" + } + } + } + }, + "n": 11908, + "unfiltered": { + "unweighted_n": 11908, + "weighted_n": 11908 + }, + "filtered": { + "unweighted_n": 11908, + "weighted_n": 11908 + }, + "counts": [ + 148, + 111, + 645, + 165, + 383, + 96, + 98, + 199, + 59, + 262, + 0, + 444, + 352, + 1911, + 771, + 1829, + 293, + 330, + 874, + 199, + 1320, + 0, + 86, + 49, + 328, + 119, + 311, + 47, + 58, + 155, + 30, + 236, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0 + ], + "element": "crunch:cube" + } + } +} diff --git a/tests/fixtures/pairwise-hirotsu-occupation-x-illness.json b/tests/fixtures/pairwise-hirotsu-occupation-x-illness.json new file mode 100644 index 000000000..26dcc2614 --- /dev/null +++ b/tests/fixtures/pairwise-hirotsu-occupation-x-illness.json @@ -0,0 +1,274 @@ +{ + "element": "shoji:view", + "self": "/api/datasets/9271a0/cube/?query=%7B%22dimensions%22%3A%5B%7B%22variable%22%3A%22%2Fapi%2Fdatasets%2F9271a0%2Fvariables%2F000001%2F%22%7D%2C%7B%22variable%22%3A%22%2Fapi%2Fdatasets%2F9271a0%2Fvariables%2F000000%2F%22%7D%5D%2C%22measures%22%3A%7B%22count%22%3A%7B%22function%22%3A%22cube_count%22%2C%22args%22%3A%5B%5D%7D%7D%2C%22weight%22%3Anull%7D&filter=%7B%7D", + "value": { + "query": { + "measures": { + "count": { + "function": "cube_count", + "args": [ + + ] + } + }, + "dimensions": [ + { + "variable": "/api/datasets/9271a0/variables/000001/" + }, + { + "variable": "/api/datasets/9271a0/variables/000000/" + } + ], + "weight": null + }, + "query_environment": { + "filter": [ + + ] + }, + "result": { + "dimensions": [ + { + "derived": false, + "references": { + "alias": "occupation", + "name": "occupation" + }, + "type": { + "ordinal": false, + "class": "categorical", + "categories": [ + { + "numeric_value": 1, + "missing": false, + "id": 1, + "name": "1" + }, + { + "numeric_value": 2, + "missing": false, + "id": 2, + "name": "2" + }, + { + "numeric_value": 3, + "missing": false, + "id": 3, + "name": "3" + }, + { + "numeric_value": 4, + "missing": false, + "id": 4, + "name": "4" + }, + { + "numeric_value": 5, + "missing": false, + "id": 5, + "name": "5" + }, + { + "numeric_value": 6, + "missing": false, + "id": 6, + "name": "6" + }, + { + "numeric_value": 7, + "missing": false, + "id": 7, + "name": "7" + }, + { + "numeric_value": 8, + "missing": false, + "id": 8, + "name": "8" + }, + { + "numeric_value": 9, + "missing": false, + "id": 9, + "name": "9" + }, + { + "numeric_value": 10, + "missing": false, + "id": 10, + "name": "10" + }, + { + "numeric_value": null, + "missing": true, + "id": -1, + "name": "No Data" + } + ] + } + }, + { + "references": { + "alias": "illness", + "name": "illness" + }, + "derived": false, + "type": { + "ordinal": false, + "class": "categorical", + "categories": [ + { + "numeric_value": 3, + "id": 3, + "name": "slight", + "missing": false + }, + { + "numeric_value": 1, + "id": 1, + "name": "medium", + "missing": false + }, + { + "numeric_value": 2, + "id": 2, + "name": "serious", + "missing": false + }, + { + "numeric_value": null, + "id": -1, + "name": "No Data", + "missing": true + } + ] + } + } + ], + "missing": 0, + "measures": { + "count": { + "data": [ + 
148, + 444, + 86, + 0, + 111, + 352, + 49, + 0, + 645, + 1911, + 328, + 0, + 165, + 771, + 119, + 0, + 383, + 1829, + 311, + 0, + 96, + 293, + 47, + 0, + 98, + 330, + 58, + 0, + 199, + 874, + 155, + 0, + 59, + 199, + 30, + 0, + 262, + 1320, + 236, + 0, + 0, + 0, + 0, + 0 + ], + "n_missing": 0, + "metadata": { + "references": { + + }, + "derived": true, + "type": { + "integer": true, + "missing_rules": { + + }, + "missing_reasons": { + "No Data": -1 + }, + "class": "numeric" + } + } + } + }, + "n": 11908, + "unfiltered": { + "unweighted_n": 11908, + "weighted_n": 11908 + }, + "filtered": { + "unweighted_n": 11908, + "weighted_n": 11908 + }, + "counts": [ + 148, + 444, + 86, + 0, + 111, + 352, + 49, + 0, + 645, + 1911, + 328, + 0, + 165, + 771, + 119, + 0, + 383, + 1829, + 311, + 0, + 96, + 293, + 47, + 0, + 98, + 330, + 58, + 0, + 199, + 874, + 155, + 0, + 59, + 199, + 30, + 0, + 262, + 1320, + 236, + 0, + 0, + 0, + 0, + 0 + ], + "element": "crunch:cube" + } + } +} diff --git a/tests/fixtures/pairwise-random-3x4.json b/tests/fixtures/pairwise-random-3x4.json new file mode 100644 index 000000000..f16010686 --- /dev/null +++ b/tests/fixtures/pairwise-random-3x4.json @@ -0,0 +1,190 @@ +{ + "element": "shoji:view", + "self": "/api/datasets/4bb830/cube/?query=%7B%22dimensions%22%3A%5B%7B%22variable%22%3A%22%2Fapi%2Fdatasets%2F4bb830%2Fvariables%2F000000%2F%22%7D%2C%7B%22variable%22%3A%22%2Fapi%2Fdatasets%2F4bb830%2Fvariables%2F000001%2F%22%7D%5D%2C%22measures%22%3A%7B%22count%22%3A%7B%22function%22%3A%22cube_count%22%2C%22args%22%3A%5B%5D%7D%7D%2C%22weight%22%3Anull%7D&filter=%7B%7D", + "value": { + "query": { + "measures": { + "count": { + "function": "cube_count", + "args": [ + + ] + } + }, + "dimensions": [ + { + "variable": "/api/datasets/4bb830/variables/000000/" + }, + { + "variable": "/api/datasets/4bb830/variables/000001/" + } + ], + "weight": null + }, + "query_environment": { + "filter": [ + + ] + }, + "result": { + "dimensions": [ + { + "derived": false, + "references": { + "alias": "I", + "name": "I" + }, + "type": { + "ordinal": false, + "class": "categorical", + "categories": [ + { + "numeric_value": 1, + "missing": false, + "id": 1, + "name": "one" + }, + { + "numeric_value": 2, + "missing": false, + "id": 2, + "name": "two" + }, + { + "numeric_value": 3, + "missing": false, + "id": 3, + "name": "three" + }, + { + "numeric_value": null, + "missing": true, + "id": -1, + "name": "No Data" + } + ] + } + }, + { + "derived": false, + "references": { + "alias": "J", + "name": "J" + }, + "type": { + "ordinal": false, + "class": "categorical", + "categories": [ + { + "numeric_value": 1, + "missing": false, + "id": 1, + "name": "A" + }, + { + "numeric_value": 2, + "missing": false, + "id": 2, + "name": "B" + }, + { + "numeric_value": 3, + "missing": false, + "id": 3, + "name": "C" + }, + { + "numeric_value": 4, + "missing": false, + "id": 4, + "name": "D" + }, + { + "numeric_value": null, + "missing": true, + "id": -1, + "name": "No Data" + } + ] + } + } + ], + "missing": 0, + "measures": { + "count": { + "data": [ + 12, + 23, + 10, + 20, + 0, + 23, + 28, + 14, + 24, + 0, + 22, + 23, + 23, + 21, + 0, + 0, + 0, + 0, + 0, + 0 + ], + "n_missing": 0, + "metadata": { + "references": { + + }, + "derived": true, + "type": { + "integer": true, + "missing_rules": { + + }, + "missing_reasons": { + "No Data": -1 + }, + "class": "numeric" + } + } + } + }, + "n": 243, + "unfiltered": { + "unweighted_n": 243, + "weighted_n": 243 + }, + "filtered": { + "unweighted_n": 243, + "weighted_n": 243 + }, + 
"counts": [ + 12, + 23, + 10, + 20, + 0, + 23, + 28, + 14, + 24, + 0, + 22, + 23, + 23, + 21, + 0, + 0, + 0, + 0, + 0, + 0 + ], + "element": "crunch:cube" + } + } +} diff --git a/tests/fixtures/same-counts-3x4.json b/tests/fixtures/same-counts-3x4.json new file mode 100644 index 000000000..8219bcda9 --- /dev/null +++ b/tests/fixtures/same-counts-3x4.json @@ -0,0 +1,190 @@ +{ + "element": "shoji:view", + "self": "/api/datasets/9c9001/cube/?query=%7B%22dimensions%22%3A%5B%7B%22variable%22%3A%22%2Fapi%2Fdatasets%2F9c9001%2Fvariables%2F000000%2F%22%7D%2C%7B%22variable%22%3A%22%2Fapi%2Fdatasets%2F9c9001%2Fvariables%2F000001%2F%22%7D%5D%2C%22measures%22%3A%7B%22count%22%3A%7B%22function%22%3A%22cube_count%22%2C%22args%22%3A%5B%5D%7D%7D%2C%22weight%22%3Anull%7D&filter=%7B%7D", + "value": { + "query": { + "measures": { + "count": { + "function": "cube_count", + "args": [ + + ] + } + }, + "dimensions": [ + { + "variable": "/api/datasets/9c9001/variables/000000/" + }, + { + "variable": "/api/datasets/9c9001/variables/000001/" + } + ], + "weight": null + }, + "query_environment": { + "filter": [ + + ] + }, + "result": { + "dimensions": [ + { + "references": { + "alias": "Var1", + "name": "Var1" + }, + "derived": false, + "type": { + "ordinal": false, + "class": "categorical", + "categories": [ + { + "numeric_value": 1, + "id": 1, + "name": "1", + "missing": false + }, + { + "numeric_value": 2, + "id": 2, + "name": "2", + "missing": false + }, + { + "numeric_value": 3, + "id": 3, + "name": "3", + "missing": false + }, + { + "numeric_value": null, + "id": -1, + "name": "No Data", + "missing": true + } + ] + } + }, + { + "references": { + "alias": "Var2", + "name": "Var2" + }, + "derived": false, + "type": { + "ordinal": false, + "class": "categorical", + "categories": [ + { + "numeric_value": 1, + "id": 1, + "name": "a", + "missing": false + }, + { + "numeric_value": 2, + "id": 2, + "name": "b", + "missing": false + }, + { + "numeric_value": 3, + "id": 3, + "name": "c", + "missing": false + }, + { + "numeric_value": 4, + "id": 4, + "name": "d", + "missing": false + }, + { + "numeric_value": null, + "id": -1, + "name": "No Data", + "missing": true + } + ] + } + } + ], + "missing": 0, + "measures": { + "count": { + "data": [ + 10, + 10, + 10, + 10, + 0, + 20, + 20, + 20, + 20, + 0, + 30, + 30, + 30, + 30, + 0, + 0, + 0, + 0, + 0, + 0 + ], + "n_missing": 0, + "metadata": { + "references": { + + }, + "derived": true, + "type": { + "integer": true, + "missing_rules": { + + }, + "missing_reasons": { + "No Data": -1 + }, + "class": "numeric" + } + } + } + }, + "n": 240, + "unfiltered": { + "unweighted_n": 240, + "weighted_n": 240 + }, + "filtered": { + "unweighted_n": 240, + "weighted_n": 240 + }, + "counts": [ + 10, + 10, + 10, + 10, + 0, + 20, + 20, + 20, + 20, + 0, + 30, + 30, + 30, + 30, + 0, + 0, + 0, + 0, + 0, + 0 + ], + "element": "crunch:cube" + } + } +} diff --git a/tests/integration/test_pairwise.py b/tests/integration/test_pairwise.py new file mode 100644 index 000000000..49776a310 --- /dev/null +++ b/tests/integration/test_pairwise.py @@ -0,0 +1,26 @@ +from unittest import TestCase + +import numpy as np + +from ..fixtures import CR + +from cr.cube.crunch_cube import CrunchCube + + +# pylint: disable=missing-docstring, invalid-name, no-self-use +class TestStandardizedResiduals(TestCase): + '''Test cr.cube implementation of column family pairwise comparisons''' + + def test_same_col_counts(self): + '''Test statistics for columns that are all the same.''' + cube = CrunchCube(CR.SAME_COUNTS_3x4) + 
expected = np.zeros([4,4]) + actual = cube.pairwise_column_chisq() + np.testing.assert_equal(actual, expected) + + def test_same_col_pvals(self): + '''P-values for columns that are all the same.''' + cube = CrunchCube(CR.SAME_COUNTS_3x4) + expected = np.ones([4,4]) + actual = cube.pairwise_column_pvals() + np.testing.assert_equal(actual, expected) From 1fab51d05de6256fd699a5012dc29b9d16905440 Mon Sep 17 00:00:00 2001 From: Michael Malecki Date: Mon, 7 Jan 2019 08:21:11 -0500 Subject: [PATCH 02/12] beginning of pairwise --- src/cr/cube/crunch_cube.py | 16 ++++++++ src/cr/cube/cube_slice.py | 59 ++++++++++++++++++++++++++---- tests/integration/test_pairwise.py | 17 +++++---- 3 files changed, 77 insertions(+), 15 deletions(-) diff --git a/src/cr/cube/crunch_cube.py b/src/cr/cube/crunch_cube.py index e677521d2..43c1b9624 100644 --- a/src/cr/cube/crunch_cube.py +++ b/src/cr/cube/crunch_cube.py @@ -713,6 +713,22 @@ def zscore(self, weighted=True, prune=False, hs_dims=None): res = [s.zscore(weighted, prune, hs_dims) for s in self.slices] return np.array(res) if self.ndim == 3 else res[0] + def pairwise_chisq(self, axis=0, weighted=True, prune=False, hs_dims=None): + """Return square ndarray of pairwise Chi-square along axis. + + Zscore is a measure of statistical signifficance of observed vs. + expected counts. It's only applicable to a 2D contingency tables. + For 3D cubes, the measures of separate slices are stacked together + and returned as the result. + + :param weighted: Use weighted counts for zscores + :param prune: Prune based on unweighted counts + :param hs_dims: Include headers and subtotals (as NaN values) + :returns zscore: ndarray representing zscore measurements + """ + res = [s.pairwise_chisq(axis, weighted) for s in self.slices] + return res[0] + def _adjust_axis(self, axis): """Return raw axis/axes corresponding to apparent axis/axes. diff --git a/src/cr/cube/cube_slice.py b/src/cr/cube/cube_slice.py index 9857de99e..12bd23288 100644 --- a/src/cr/cube/cube_slice.py +++ b/src/cr/cube/cube_slice.py @@ -307,11 +307,12 @@ def table_name(self): return "%s: %s" % (title, table_name) def pvals(self, weighted=True, prune=False, hs_dims=None): - """Return 2D ndarray with calculated p-vals. + """Return 2D ndarray with calculated P values - This function calculates statistically significant results for - categorical contingency tables. The values are calculated for 2D tables - only. + This function calculates statistically significant cells for + categorical contingency tables under the null hypothesis that the + row and column variables are independent (uncorrelated). + The values are calculated for 2D tables only. :param weighted: Use weighted counts for zscores :param prune: Prune based on unweighted counts @@ -325,15 +326,22 @@ def pvals(self, weighted=True, prune=False, hs_dims=None): return self._apply_pruning_mask(pvals, hs_dims) if prune else pvals def zscore(self, weighted=True, prune=False, hs_dims=None): - """Return ndarray with slices's zscore measurements. + """Return ndarray with slices's standardized residuals (Z-scores). - Zscore is a measure of statistical signifficance of observed vs. - expected counts. It's only applicable to a 2D contingency tables. + (Only applicable to a 2D contingency tables.) The Z-score or + standardized residual is the difference between observed and expected + cell counts if row and column variables were independent divided + by the residual cell variance. 
They are assumed to come from a N(0,1) + or standard Normal distribution, and can show which cells deviate from + the null hypothesis that the row and column variables are uncorrelated. + + See also *pairwise_chisq*, *pairwise_pvals* for a pairwise column- + or row-based test of statistical significance. :param weighted: Use weighted counts for zscores :param prune: Prune based on unweighted counts :param hs_dims: Include headers and subtotals (as NaN values) - :returns zscore: ndarray representing zscore measurements + :returns zscore: ndarray representing cell standardized residuals (Z) """ counts = self.as_array(weighted=weighted) total = self.margin(weighted=weighted) @@ -349,6 +357,24 @@ def zscore(self, weighted=True, prune=False, hs_dims=None): return zscore + def pairwise_chisq(self, weighted=True, prune=False, hs_dims=None): + """Return square ndarray of pairwise Chi-squared statistics along axis. + + Zscore is a measure of statistical signifficance of observed vs. + expected counts. It's only applicable to a 2D contingency tables. + + :param weighted: Use weighted counts for zscores + :param prune: Prune based on unweighted counts + :param hs_dims: Include headers and subtotals (as NaN values) + :returns zscore: ndarray representing zscore measurements + """ + counts = self.as_array(weighted=weighted) + colsum = self.margin(axis=0, weighted=weighted) + rowsum = self.margin(axis=1, weighted=weighted) + chisq = self._categorical_pairwise_chisq(counts, axis, colsum, rowsum) + + return chisq + def _apply_pruning_mask(self, res, hs_dims=None): array = self.as_array(prune=True, include_transforms_for_dims=hs_dims) @@ -457,6 +483,23 @@ def _scalar_type_std_res(self, counts, total, colsum, rowsum): ) return residuals / np.sqrt(variance) + def _categorical_pairwise_chisq(self, counts, axis, margin, offmargin): + """Return ndarray containing pairwise comparisons along axis + + Returns a square, symmetric matrix of test statistics for the null + hypothesis that each vector along *axis* is equal to each other. + """ + if axis == 0: + proportions = counts.proportions(axis=0) + wts = offmargin / counts.margin() + elements = counts.shape[0] + chisq = np.zeros([elements, elements]) + for i, _ in np.ndenumerate(chisq): + for j, _ in np.ndenumerate(chisq): + chisq[i, j] = chisq[j, i] = np.sum( + np.square((props[i, :] - props[j, :])) / wts + ) / (1 / margin[i] + 1 / margin[j]) + def _update_args(self, kwargs): if self._cube.ndim < 3: # If cube is 2D it doesn't actually have slices (itself is a slice). 
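For reference, the standardized residual described in the zscore docstring above can be written out on a plain counts matrix as below. This is the textbook adjusted-residual form and is only an illustrative sketch; the helper name and standalone signature are not part of this patch, and CubeSlice derives the same quantity from its own as_array/margin accessors.

import numpy as np

def standardized_residuals(counts):
    """Adjusted (standardized) Pearson residuals of a 2D contingency table."""
    counts = np.asarray(counts, dtype=float)
    total = counts.sum()
    rowsum = counts.sum(axis=1, keepdims=True)   # row margins, shape (R, 1)
    colsum = counts.sum(axis=0, keepdims=True)   # column margins, shape (1, C)
    # Expected counts under independence of the row and column variables.
    expected = rowsum * colsum / total
    # Residual cell variance used to scale the raw residual.
    variance = expected * (1 - rowsum / total) * (1 - colsum / total)
    return (counts - expected) / np.sqrt(variance)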
diff --git a/tests/integration/test_pairwise.py b/tests/integration/test_pairwise.py index 49776a310..26bcb3e2f 100644 --- a/tests/integration/test_pairwise.py +++ b/tests/integration/test_pairwise.py @@ -1,6 +1,7 @@ from unittest import TestCase import numpy as np +import pytest from ..fixtures import CR @@ -9,18 +10,20 @@ # pylint: disable=missing-docstring, invalid-name, no-self-use class TestStandardizedResiduals(TestCase): - '''Test cr.cube implementation of column family pairwise comparisons''' + """Test cr.cube implementation of column family pairwise comparisons""" + @pytest.mark.xfail def test_same_col_counts(self): - '''Test statistics for columns that are all the same.''' + """Test statistics for columns that are all the same.""" cube = CrunchCube(CR.SAME_COUNTS_3x4) - expected = np.zeros([4,4]) - actual = cube.pairwise_column_chisq() + expected = np.zeros([4, 4]) + actual = cube.pairwise_chisq(axis=0) np.testing.assert_equal(actual, expected) + @pytest.mark.xfail def test_same_col_pvals(self): - '''P-values for columns that are all the same.''' + """P-values for columns that are all the same.""" cube = CrunchCube(CR.SAME_COUNTS_3x4) - expected = np.ones([4,4]) - actual = cube.pairwise_column_pvals() + expected = np.ones([4, 4]) + actual = cube.pairwise_pvals(axis=0) np.testing.assert_equal(actual, expected) From a7a11dbddc1c6c619929ce210b6cafb212d1ccd5 Mon Sep 17 00:00:00 2001 From: Michael Malecki Date: Mon, 7 Jan 2019 16:52:40 -0500 Subject: [PATCH 03/12] hirotsu chisq workkksss --- src/cr/cube/crunch_cube.py | 4 +- src/cr/cube/cube_slice.py | 28 +++--- tests/integration/test_pairwise.py | 131 ++++++++++++++++++++++++++++- 3 files changed, 149 insertions(+), 14 deletions(-) diff --git a/src/cr/cube/crunch_cube.py b/src/cr/cube/crunch_cube.py index 43c1b9624..c35e1f1fa 100644 --- a/src/cr/cube/crunch_cube.py +++ b/src/cr/cube/crunch_cube.py @@ -713,7 +713,7 @@ def zscore(self, weighted=True, prune=False, hs_dims=None): res = [s.zscore(weighted, prune, hs_dims) for s in self.slices] return np.array(res) if self.ndim == 3 else res[0] - def pairwise_chisq(self, axis=0, weighted=True, prune=False, hs_dims=None): + def pairwise_chisq(self, axis=0): """Return square ndarray of pairwise Chi-square along axis. Zscore is a measure of statistical signifficance of observed vs. @@ -726,7 +726,7 @@ def pairwise_chisq(self, axis=0, weighted=True, prune=False, hs_dims=None): :param hs_dims: Include headers and subtotals (as NaN values) :returns zscore: ndarray representing zscore measurements """ - res = [s.pairwise_chisq(axis, weighted) for s in self.slices] + res = [s.pairwise_chisq(axis=0) for s in self.slices] return res[0] def _adjust_axis(self, axis): diff --git a/src/cr/cube/cube_slice.py b/src/cr/cube/cube_slice.py index 12bd23288..3682976d3 100644 --- a/src/cr/cube/cube_slice.py +++ b/src/cr/cube/cube_slice.py @@ -357,7 +357,7 @@ def zscore(self, weighted=True, prune=False, hs_dims=None): return zscore - def pairwise_chisq(self, weighted=True, prune=False, hs_dims=None): + def pairwise_chisq(self, axis=0, weighted=True): """Return square ndarray of pairwise Chi-squared statistics along axis. Zscore is a measure of statistical signifficance of observed vs. 
@@ -369,9 +369,7 @@ def pairwise_chisq(self, weighted=True, prune=False, hs_dims=None): :returns zscore: ndarray representing zscore measurements """ counts = self.as_array(weighted=weighted) - colsum = self.margin(axis=0, weighted=weighted) - rowsum = self.margin(axis=1, weighted=weighted) - chisq = self._categorical_pairwise_chisq(counts, axis, colsum, rowsum) + chisq = self._categorical_pairwise_chisq(counts, axis) return chisq @@ -483,23 +481,31 @@ def _scalar_type_std_res(self, counts, total, colsum, rowsum): ) return residuals / np.sqrt(variance) - def _categorical_pairwise_chisq(self, counts, axis, margin, offmargin): + def _categorical_pairwise_chisq(self, counts, axis=0): """Return ndarray containing pairwise comparisons along axis Returns a square, symmetric matrix of test statistics for the null hypothesis that each vector along *axis* is equal to each other. """ if axis == 0: - proportions = counts.proportions(axis=0) - wts = offmargin / counts.margin() - elements = counts.shape[0] + margin = self.margin(0) + offmargin = self.margin(1) + proportions = self.proportions(axis=0) + wts = offmargin / self.margin() + elements = counts.shape[1] chisq = np.zeros([elements, elements]) - for i, _ in np.ndenumerate(chisq): - for j, _ in np.ndenumerate(chisq): + for i in xrange(1, elements): + for j in xrange(0, elements - 1): chisq[i, j] = chisq[j, i] = np.sum( - np.square((props[i, :] - props[j, :])) / wts + np.square(proportions[:, i] - proportions[:, j]) / wts ) / (1 / margin[i] + 1 / margin[j]) + print(chisq.tolist()) + return chisq + + # sum( (props[,i] - props[,j])^2 / weights ) / + # ( 1 / margins[i] + 1 / margins[j] ) + def _update_args(self, kwargs): if self._cube.ndim < 3: # If cube is 2D it doesn't actually have slices (itself is a slice). 
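The _categorical_pairwise_chisq method above implements the Hirotsu column-pairwise chi-square summarized in its trailing comment. A minimal NumPy sketch of the same formula on a plain counts matrix, offered only as an illustration (the function name and standalone interface are not part of this patch):

import numpy as np

def pairwise_column_chisq(counts):
    """Hirotsu chi-square statistic for every pair of columns of a counts table."""
    counts = np.asarray(counts, dtype=float)
    total = counts.sum()
    col_margin = counts.sum(axis=0)      # column totals
    row_margin = counts.sum(axis=1)      # row totals
    props = counts / col_margin          # within-column proportions
    weights = row_margin / total         # row weights
    n_cols = counts.shape[1]
    chisq = np.zeros((n_cols, n_cols))
    for i in range(n_cols):
        for j in range(i + 1, n_cols):
            # sum((props[:, i] - props[:, j])**2 / weights), scaled by the
            # harmonic combination of the two column margins.
            numerator = np.sum((props[:, i] - props[:, j]) ** 2 / weights)
            chisq[i, j] = chisq[j, i] = numerator / (1 / col_margin[i] + 1 / col_margin[j])
    return chisq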
diff --git a/tests/integration/test_pairwise.py b/tests/integration/test_pairwise.py index 26bcb3e2f..ff32ad175 100644 --- a/tests/integration/test_pairwise.py +++ b/tests/integration/test_pairwise.py @@ -12,7 +12,6 @@ class TestStandardizedResiduals(TestCase): """Test cr.cube implementation of column family pairwise comparisons""" - @pytest.mark.xfail def test_same_col_counts(self): """Test statistics for columns that are all the same.""" cube = CrunchCube(CR.SAME_COUNTS_3x4) @@ -20,6 +19,136 @@ def test_same_col_counts(self): actual = cube.pairwise_chisq(axis=0) np.testing.assert_equal(actual, expected) + def test_hirotsu_chisq(self): + """Test statistic for hirotsu data matches R""" + cube = CrunchCube(CR.PAIRWISE_HIROTSU_ILLNESS_X_OCCUPATION) + expected = np.array( + [ + [ + 0.0, + 2.821910158116655, + 0.9259711818781733, + 12.780855448128131, + 16.79727869630099, + 0.924655442873681, + 0.8008976269312448, + 9.616972398702428, + 1.4496863124510315, + 18.556098937181705, + ], + [ + 2.821910158116655, + 0.0, + 1.6831132737959318, + 8.683471852181562, + 13.451053159265136, + 0.38467827774871005, + 1.5094961530071807, + 9.081312924348003, + 0.25833985406056126, + 16.3533306337074, + ], + [ + 0.9259711818781733, + 1.6831132737959318, + 0.0, + 24.348935423464653, + 46.689386077899826, + 0.18470822825752797, + 1.376598707986204, + 22.063658540387774, + 1.0102118795109807, + 47.62124004565971, + ], + [ + 12.780855448128131, + 8.683471852181562, + 24.348935423464653, + 0.0, + 0.8073979083263744, + 8.490641259215641, + 5.141740694105387, + 1.2536004848874829, + 3.576241745092247, + 2.1974561987876613, + ], + [ + 16.79727869630099, + 13.451053159265136, + 46.689386077899826, + 0.8073979083263744, + 0.0, + 11.792012011326468, + 6.847609367845222, + 0.743555569450378, + 5.218390456727495, + 0.725476017865348, + ], + [ + 0.924655442873681, + 0.38467827774871005, + 0.18470822825752797, + 8.490641259215641, + 11.792012011326468, + 0.0, + 0.7072537831958036, + 7.620018353425002, + 0.3321969685319031, + 14.087591553810693, + ], + [ + 0.8008976269312448, + 1.5094961530071807, + 1.376598707986204, + 5.141740694105387, + 6.847609367845222, + 0.7072537831958036, + 0.0, + 3.6724354409467352, + 0.39674326208673527, + 8.546159019524978, + ], + [ + 9.616972398702428, + 9.081312924348003, + 22.063658540387774, + 1.2536004848874829, + 0.743555569450378, + 7.620018353425002, + 3.6724354409467352, + 0.0, + 3.4464292421171003, + 1.5916695633869193, + ], + [ + 1.4496863124510315, + 0.25833985406056126, + 1.0102118795109807, + 3.576241745092247, + 5.218390456727495, + 0.3321969685319031, + 0.39674326208673527, + 3.4464292421171003, + 0.0, + 6.85424450468994, + ], + [ + 18.556098937181705, + 16.3533306337074, + 47.62124004565971, + 2.1974561987876613, + 0.725476017865348, + 14.087591553810693, + 8.546159019524978, + 1.5916695633869193, + 6.85424450468994, + 0.0, + ], + ] + ) + actual = cube.pairwise_chisq(axis=0) + np.testing.assert_almost_equal(actual, expected) + @pytest.mark.xfail def test_same_col_pvals(self): """P-values for columns that are all the same.""" From 0376f61cad754d813a73420890f621c243938353 Mon Sep 17 00:00:00 2001 From: Michael Malecki Date: Mon, 14 Jan 2019 14:58:18 -0500 Subject: [PATCH 04/12] attempt to numpify some c code --- src/cr/cube/crunch_cube.py | 15 +++++++++++++++ src/cr/cube/cube_slice.py | 6 +----- tests/integration/test_pairwise.py | 7 ++++++- 3 files changed, 22 insertions(+), 6 deletions(-) diff --git a/src/cr/cube/crunch_cube.py b/src/cr/cube/crunch_cube.py index 
c35e1f1fa..d21d94d1e 100644 --- a/src/cr/cube/crunch_cube.py +++ b/src/cr/cube/crunch_cube.py @@ -17,6 +17,7 @@ from cr.cube.dimension import AllDimensions from cr.cube.enum import DIMENSION_TYPE as DT from cr.cube.measures.index import Index +from cr.cube.measures.pairwise_pvalues import PairwisePvalues from cr.cube.measures.scale_means import ScaleMeans from cr.cube.util import lazyproperty @@ -729,6 +730,20 @@ def pairwise_chisq(self, axis=0): res = [s.pairwise_chisq(axis=0) for s in self.slices] return res[0] + def pairwise_pvals(self, axis=0): + """Return square ndarray of pairwise Chi-square along axis. + + Square, symmetric matrix along *axis* of pairwise p-values for the + null hypothesis that col[i] = col[j] for each pair of columns. + + :param axis: axis along which to perform comparison. Only columns (0) + implement currently. + :returns pvals: square, symmetric matrix of column-comparison p-values. + """ + statistics = [s.pairwise_chisq(axis=0) for s in self.slices] + shapes = [s.get_shape() for s in self.slices] + return PairwisePvalues(statistics[0], shapes[0], axis=axis).values + def _adjust_axis(self, axis): """Return raw axis/axes corresponding to apparent axis/axes. diff --git a/src/cr/cube/cube_slice.py b/src/cr/cube/cube_slice.py index 3682976d3..7b21135bd 100644 --- a/src/cr/cube/cube_slice.py +++ b/src/cr/cube/cube_slice.py @@ -360,7 +360,7 @@ def zscore(self, weighted=True, prune=False, hs_dims=None): def pairwise_chisq(self, axis=0, weighted=True): """Return square ndarray of pairwise Chi-squared statistics along axis. - Zscore is a measure of statistical signifficance of observed vs. + Zscore is a measure of statistical significance of observed vs. expected counts. It's only applicable to a 2D contingency tables. :param weighted: Use weighted counts for zscores @@ -500,12 +500,8 @@ def _categorical_pairwise_chisq(self, counts, axis=0): np.square(proportions[:, i] - proportions[:, j]) / wts ) / (1 / margin[i] + 1 / margin[j]) - print(chisq.tolist()) return chisq - # sum( (props[,i] - props[,j])^2 / weights ) / - # ( 1 / margins[i] + 1 / margins[j] ) - def _update_args(self, kwargs): if self._cube.ndim < 3: # If cube is 2D it doesn't actually have slices (itself is a slice). 
diff --git a/tests/integration/test_pairwise.py b/tests/integration/test_pairwise.py index ff32ad175..51120e2cd 100644 --- a/tests/integration/test_pairwise.py +++ b/tests/integration/test_pairwise.py @@ -149,10 +149,15 @@ def test_hirotsu_chisq(self): actual = cube.pairwise_chisq(axis=0) np.testing.assert_almost_equal(actual, expected) - @pytest.mark.xfail def test_same_col_pvals(self): """P-values for columns that are all the same.""" cube = CrunchCube(CR.SAME_COUNTS_3x4) expected = np.ones([4, 4]) actual = cube.pairwise_pvals(axis=0) np.testing.assert_equal(actual, expected) + + def test_hirotsu_pvals(self): + cube = CrunchCube(CR.PAIRWISE_HIROTSU_ILLNESS_X_OCCUPATION) + actual = cube.pairwise_pvals(axis=0) + expected = np.ones([10, 10]) + np.testing.assert_almost_equal(actual, expected) From 5a6600ab24d208911ef1f3a656ee72394b6d8ef2 Mon Sep 17 00:00:00 2001 From: Michael Malecki Date: Mon, 14 Jan 2019 15:20:38 -0500 Subject: [PATCH 05/12] mark xfail for actual values --- tests/integration/test_pairwise.py | 1 + 1 file changed, 1 insertion(+) diff --git a/tests/integration/test_pairwise.py b/tests/integration/test_pairwise.py index 51120e2cd..e1a88c816 100644 --- a/tests/integration/test_pairwise.py +++ b/tests/integration/test_pairwise.py @@ -156,6 +156,7 @@ def test_same_col_pvals(self): actual = cube.pairwise_pvals(axis=0) np.testing.assert_equal(actual, expected) + @pytest.mark.xfail def test_hirotsu_pvals(self): cube = CrunchCube(CR.PAIRWISE_HIROTSU_ILLNESS_X_OCCUPATION) actual = cube.pairwise_pvals(axis=0) From 151b3d0b5b5926ee3bb1909f411f97694c43463f Mon Sep 17 00:00:00 2001 From: Michael Malecki Date: Mon, 14 Jan 2019 15:36:26 -0500 Subject: [PATCH 06/12] add files --- src/cr/cube/distributions/__init__.py | 0 src/cr/cube/distributions/wishart.py | 134 +++++++++++++++++++++++ src/cr/cube/measures/pairwise_pvalues.py | 23 ++++ 3 files changed, 157 insertions(+) create mode 100644 src/cr/cube/distributions/__init__.py create mode 100644 src/cr/cube/distributions/wishart.py create mode 100644 src/cr/cube/measures/pairwise_pvalues.py diff --git a/src/cr/cube/distributions/__init__.py b/src/cr/cube/distributions/__init__.py new file mode 100644 index 000000000..e69de29bb diff --git a/src/cr/cube/distributions/wishart.py b/src/cr/cube/distributions/wishart.py new file mode 100644 index 000000000..c0c07bbdb --- /dev/null +++ b/src/cr/cube/distributions/wishart.py @@ -0,0 +1,134 @@ +# encoding: utf-8 + +"""The CDF of the largest eigenvalue of a Wishart distribution. + +Compute the CDF of the Wishart distribution using the algorithm of +Chiani (2014). The Wishart CDF is not defined in scipy.stats.wishart, nor in +tensorflow_probability.distributions.wishart. At the time of writing, the only +implementation of this CDF is the R package rootWishart. This largely +follows that, but using scipy implementations rather than Boost, and omitting +multi-precision support. +""" + +from __future__ import division + +import numpy as np +from scipy.special import gamma, gammainc +from scipy.linalg import det +from scipy.constants import pi + + +def wishartCDF(X, n_min, n_max): + w_p = np.zeros(X.shape) + for i, x in np.ndenumerate(X): + w_p[i] = _wishart_pfaffian(x, n_min, n_max) + return _normalizing_const(n_min, n_max) * w_p + + +def _wishart_pfaffian(x, n_min, n_max): + """Value providing the CDF of the largest eigenvalue of a Wishart(n, p) + evaluated at x. 
+ + if (n_min != n_mat) { + // Fill in extra column + VectorXmp alpha_vec(n_min); + T alpha_last = 0.5*(n_max + n_min + 1); + for (int i = 0; i < n_min; i++) { + alpha_vec(i) = alpha + i + 1; + A(i, n_min) = pow(2, -alpha_last)*boost::math::gamma_p(alpha_vec(i), 0.5*xx)/boost::math::tgamma(alpha_last); + A(n_min, i) = - A(i, n_min); + } + } + """ + size = n_min + (n_min % 2) + alpha = 0.5 * (n_max - n_min - 1) + A = np.zeros([size, size]) + alpha_ind = np.arange(n_min, dtype=float) + alpha_vec = alpha_ind + alpha + 1 + gammainc_a = gammainc(alpha_vec, x / 2) + + if n_min != size: + alpha_ind = alpha_ind.astype(np.int) + other_ind = np.full(n_min, size - 1, dtype=np.int) + alpha_last = (n_max + n_min + 1) / 2 + A[other_ind, alpha_ind] = ( + np.power(2, -alpha_last) * gammainc_a / gamma(alpha_last) + ) + # TODO check that this is not transposed + A[alpha_ind, other_ind] = -A[other_ind, alpha_ind] + # print(A) + """ + + for (int i = 0; i < n_min; i++) { + p(i) = boost::math::gamma_p(alpha + i + 1, 0.5*xx); + g(i) = boost::math::tgamma(alpha + i + 1); + } + for (int i = 0; i < (2*n_min - 2); i++) { + q(i) = pow(0.5, 2*alpha + i + 2) * boost::math::tgamma(2*alpha+i+2)*boost::math::gamma_p(2*alpha + i + 2, xx); + } + """ + p = gammainc_a + g = gamma(alpha_vec) + q_ind = np.arange(2 * n_min - 2) + q = ( + np.power(0.5, 2 * alpha + q_ind + 2) + * gamma(2 * alpha + q_ind + 2) + * gammainc(2 * alpha + q_ind + 2, x) + ) + + """ + + + for (int i = 0; i < n_min; i++) { + b = 0.5*p(i)*p(i); + for(int j = i; j < (n_min - 1); j++) { + b -= q(i+j)/(g(i)*g(j+1)); + A(i, j+1) = p(i)*p(j+1) - 2*b; + A(j+1, i) = -A(i, j+1); + } + Rcpp::checkUserInterrupt(); + } + """ + + # Lets do the inefficient iteration first and figure out index tricks later + for i in range(n_min): + b = 0.5 * p[i] * p[i] + for j in range(i, n_min - 1): + b -= q[i + j] / g[i] * g[j + 1] + A[j + 1, i] = p[i] * p[j + 1] - 2 * b + A[i, j + 1] = -A[j + 1, i] + + return np.sqrt(det(A)) + + +def _normalizing_const(n_min, n_max): + """int n_mat = n_min + (n_min % 2); + double alpha = 0.5*(n_max - n_min - 1); + // Compute constant + double K1 = pow(M_PI, 0.5*n_min*n_min); + K1 /= pow(2, 0.5*n_min*n_max)*mgamma_C(0.5*n_max, n_min, false)*mgamma_C(0.5*n_min, n_min, false); + double K2 = pow(2, alpha*n_mat+0.5*n_mat*(n_mat+1)); + for (int k = 0; k < n_mat; k++) { + K2 *= boost::math::tgamma(alpha + k + 1); + } + return K1*K2;""" + size = n_min + (n_min % 2) + alpha = 0.5 * (n_max - n_min - 1) + K1 = ( + np.power(pi, 0.5 * n_min * n_min) + / np.power(2, 0.5 * n_min * n_max) + * _mgamma(0.5 * n_max, n_min) + * _mgamma(0.5 * n_min, n_min) + ) + K2 = np.power(2, alpha * size + 0.5 * size * (size + 1)) + for i in range(size): + K2 *= gamma(alpha + i + 1) + print K1*K2 + return K1 * K2 + + +def _mgamma(x, m): + res = pow(pi, 0.25 * m * (m - 1)) + for i in range(m): + res *= gamma(x - 0.5 * i) + return res diff --git a/src/cr/cube/measures/pairwise_pvalues.py b/src/cr/cube/measures/pairwise_pvalues.py new file mode 100644 index 000000000..c94ea38a0 --- /dev/null +++ b/src/cr/cube/measures/pairwise_pvalues.py @@ -0,0 +1,23 @@ +# encoding: utf-8 + +"""P-values of pairwise comparison or columns of a contingency table +""" + +from cr.cube.distributions.wishart import wishartCDF +from cr.cube.util import lazyproperty + + +class PairwisePvalues(object): + """Value object providing matrix of pairwise-comparison P-values""" + + def __init__(self, chisq, shape, axis=0): + if axis != 0: + raise NotImplementedError("Pairwise p-values only implemented for colums") + 
self.axis = axis + self._chisq = chisq + self.n_max = min(shape) - 1 + self.n_min = max(shape) - 1 + + @lazyproperty + def values(self): + return 1.0 - wishartCDF(self._chisq, self.n_max, self.n_min) From fc3126a7dcc307ee6e71ccbcaa7ab086c6d68542 Mon Sep 17 00:00:00 2001 From: Michael Malecki Date: Mon, 14 Jan 2019 16:41:31 -0500 Subject: [PATCH 07/12] :boom: get the pvals --- src/cr/cube/distributions/wishart.py | 26 +++--- src/cr/cube/measures/pairwise_pvalues.py | 6 +- tests/integration/test_pairwise.py | 106 ++++++++++++++++++++++- 3 files changed, 118 insertions(+), 20 deletions(-) diff --git a/src/cr/cube/distributions/wishart.py b/src/cr/cube/distributions/wishart.py index c0c07bbdb..4d9f46a99 100644 --- a/src/cr/cube/distributions/wishart.py +++ b/src/cr/cube/distributions/wishart.py @@ -43,20 +43,18 @@ def _wishart_pfaffian(x, n_min, n_max): size = n_min + (n_min % 2) alpha = 0.5 * (n_max - n_min - 1) A = np.zeros([size, size]) - alpha_ind = np.arange(n_min, dtype=float) + alpha_ind = np.arange(n_min, dtype=np.float) alpha_vec = alpha_ind + alpha + 1 - gammainc_a = gammainc(alpha_vec, x / 2) - + gammainc_a = gammainc(alpha_vec, 0.5 * x) if n_min != size: alpha_ind = alpha_ind.astype(np.int) other_ind = np.full(n_min, size - 1, dtype=np.int) alpha_last = (n_max + n_min + 1) / 2 A[other_ind, alpha_ind] = ( - np.power(2, -alpha_last) * gammainc_a / gamma(alpha_last) + np.float_power(2, -alpha_last) * gammainc_a / gamma(alpha_last) ) # TODO check that this is not transposed A[alpha_ind, other_ind] = -A[other_ind, alpha_ind] - # print(A) """ for (int i = 0; i < n_min; i++) { @@ -71,11 +69,10 @@ def _wishart_pfaffian(x, n_min, n_max): g = gamma(alpha_vec) q_ind = np.arange(2 * n_min - 2) q = ( - np.power(0.5, 2 * alpha + q_ind + 2) + np.float_power(0.5, 2 * alpha + q_ind + 2) * gamma(2 * alpha + q_ind + 2) * gammainc(2 * alpha + q_ind + 2, x) ) - """ @@ -94,10 +91,9 @@ def _wishart_pfaffian(x, n_min, n_max): for i in range(n_min): b = 0.5 * p[i] * p[i] for j in range(i, n_min - 1): - b -= q[i + j] / g[i] * g[j + 1] + b -= q[i + j] / (g[i] * g[j + 1]) A[j + 1, i] = p[i] * p[j + 1] - 2 * b A[i, j + 1] = -A[j + 1, i] - return np.sqrt(det(A)) @@ -114,21 +110,21 @@ def _normalizing_const(n_min, n_max): return K1*K2;""" size = n_min + (n_min % 2) alpha = 0.5 * (n_max - n_min - 1) - K1 = ( - np.power(pi, 0.5 * n_min * n_min) - / np.power(2, 0.5 * n_min * n_max) + K1 = np.float_power(pi, 0.5 * n_min * n_min) + K1 /= ( + np.float_power(2, 0.5 * n_min * n_max) * _mgamma(0.5 * n_max, n_min) * _mgamma(0.5 * n_min, n_min) ) - K2 = np.power(2, alpha * size + 0.5 * size * (size + 1)) + + K2 = np.float_power(2, alpha * size + 0.5 * size * (size + 1)) for i in range(size): K2 *= gamma(alpha + i + 1) - print K1*K2 return K1 * K2 def _mgamma(x, m): - res = pow(pi, 0.25 * m * (m - 1)) + res = np.float_power(pi, 0.25 * m * (m - 1)) for i in range(m): res *= gamma(x - 0.5 * i) return res diff --git a/src/cr/cube/measures/pairwise_pvalues.py b/src/cr/cube/measures/pairwise_pvalues.py index c94ea38a0..32dd69ece 100644 --- a/src/cr/cube/measures/pairwise_pvalues.py +++ b/src/cr/cube/measures/pairwise_pvalues.py @@ -15,9 +15,9 @@ def __init__(self, chisq, shape, axis=0): raise NotImplementedError("Pairwise p-values only implemented for colums") self.axis = axis self._chisq = chisq - self.n_max = min(shape) - 1 - self.n_min = max(shape) - 1 + self.n_max = max(shape) - 1 + self.n_min = min(shape) - 1 @lazyproperty def values(self): - return 1.0 - wishartCDF(self._chisq, self.n_max, self.n_min) + return 1.0 - 
wishartCDF(self._chisq, self.n_min, self.n_max) diff --git a/tests/integration/test_pairwise.py b/tests/integration/test_pairwise.py index e1a88c816..011bf2253 100644 --- a/tests/integration/test_pairwise.py +++ b/tests/integration/test_pairwise.py @@ -156,9 +156,111 @@ def test_same_col_pvals(self): actual = cube.pairwise_pvals(axis=0) np.testing.assert_equal(actual, expected) - @pytest.mark.xfail def test_hirotsu_pvals(self): cube = CrunchCube(CR.PAIRWISE_HIROTSU_ILLNESS_X_OCCUPATION) actual = cube.pairwise_pvals(axis=0) - expected = np.ones([10, 10]) + expected = np.array( + [ + 1, + 0.999603716443816, + 0.99999993076784, + 0.435830186989942, + 0.171365670494448, + 0.999999931581745, + 0.999999979427862, + 0.726740806122402, + 0.999997338047414, + 0.105707739899106, + 0.999603716443816, + 1, + 0.999991395396033, + 0.806407150042716, + 0.380296648898666, + 0.999999999961875, + 0.999996333717649, + 0.773583582093158, + 0.999999999998836, + 0.192375246184738, + 0.99999993076784, + 0.999991395396033, + 1, + 0.017277623171216, + 3.29012189337341e-06, + 0.99999999999994, + 0.999998237045896, + 0.0365273119329589, + 0.999999857555538, + 2.23456306602809e-06, + 0.435830186989942, + 0.806407150042716, + 0.017277623171216, + 1, + 0.999999977981595, + 0.821586701043061, + 0.982573114952466, + 0.999999169027016, + 0.998041030837588, + 0.999934687968906, + 0.171365670494448, + 0.380296648898666, + 3.29012189337341e-06, + 0.999999977981595, + 1, + 0.52406354520284, + 0.926322806048378, + 0.99999998900118, + 0.981100354607917, + 0.999999991067971, + 0.999999931581745, + 0.999999999961875, + 0.99999999999994, + 0.821586701043061, + 0.52406354520284, + 1, + 0.999999992799126, + 0.883025655503086, + 0.99999999998941, + 0.33149560264078, + 0.999999979427862, + 0.999996333717649, + 0.999998237045896, + 0.982573114952466, + 0.926322806048378, + 0.999999992799126, + 1, + 0.997674862917282, + 0.99999999995011, + 0.81726901111794, + 0.726740806122402, + 0.773583582093158, + 0.0365273119329589, + 0.999999169027016, + 0.99999998900118, + 0.883025655503086, + 0.997674862917282, + 1, + 0.998461227115608, + 0.999994436499243, + 0.999997338047414, + 0.999999999998836, + 0.999999857555538, + 0.998041030837588, + 0.981100354607917, + 0.99999999998941, + 0.99999999995011, + 0.998461227115608, + 1, + 0.925999125959122, + 0.105707739899106, + 0.192375246184738, + 2.23456306602809e-06, + 0.999934687968906, + 0.999999991067971, + 0.33149560264078, + 0.81726901111794, + 0.999994436499243, + 0.925999125959122, + 1, + ] + ).reshape(10, 10) np.testing.assert_almost_equal(actual, expected) From d01975f835a6407ab4dbf53ce442f58a6359668d Mon Sep 17 00:00:00 2001 From: Michael Malecki Date: Tue, 15 Jan 2019 09:09:10 -0500 Subject: [PATCH 08/12] [x]range --- src/cr/cube/cube_slice.py | 5 +++++ src/cr/cube/distributions/wishart.py | 13 +++++++++---- 2 files changed, 14 insertions(+), 4 deletions(-) diff --git a/src/cr/cube/cube_slice.py b/src/cr/cube/cube_slice.py index 7b21135bd..ec056a04c 100644 --- a/src/cr/cube/cube_slice.py +++ b/src/cr/cube/cube_slice.py @@ -16,6 +16,11 @@ from cr.cube.measures.scale_means import ScaleMeans from cr.cube.util import compress_pruned, lazyproperty, memoize +try: + xrange +except NameError: + xrange = range + class CubeSlice(object): """Two-dimensional projection of a cube. 
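At this point in the series the p-values are simply one minus the Chiani (2014) CDF of the largest Wishart eigenvalue, evaluated element-wise at the pairwise chi-square matrix. A rough usage sketch under the conventions of patches 06-07 (the helper name here is purely illustrative):

import numpy as np
from cr.cube.distributions.wishart import wishartCDF

def pairwise_pvals_from_chisq(chisq, n_rows, n_cols):
    """P-values for pairwise column comparisons of an n_rows x n_cols table."""
    # The null distribution of the pairwise chi-square statistic is that of the
    # largest eigenvalue of a Wishart matrix parameterized by the table shape
    # (see PairwisePvalues in patch 07/12).
    n_min = min(n_rows, n_cols) - 1
    n_max = max(n_rows, n_cols) - 1
    return 1.0 - wishartCDF(np.asarray(chisq, dtype=float), n_min, n_max)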
diff --git a/src/cr/cube/distributions/wishart.py b/src/cr/cube/distributions/wishart.py index 4d9f46a99..ca8de595c 100644 --- a/src/cr/cube/distributions/wishart.py +++ b/src/cr/cube/distributions/wishart.py @@ -17,6 +17,11 @@ from scipy.linalg import det from scipy.constants import pi +try: + xrange +except NameError: + xrange = range + def wishartCDF(X, n_min, n_max): w_p = np.zeros(X.shape) @@ -88,9 +93,9 @@ def _wishart_pfaffian(x, n_min, n_max): """ # Lets do the inefficient iteration first and figure out index tricks later - for i in range(n_min): + for i in xrange(n_min): b = 0.5 * p[i] * p[i] - for j in range(i, n_min - 1): + for j in xrange(i, n_min - 1): b -= q[i + j] / (g[i] * g[j + 1]) A[j + 1, i] = p[i] * p[j + 1] - 2 * b A[i, j + 1] = -A[j + 1, i] @@ -118,13 +123,13 @@ def _normalizing_const(n_min, n_max): ) K2 = np.float_power(2, alpha * size + 0.5 * size * (size + 1)) - for i in range(size): + for i in xrange(size): K2 *= gamma(alpha + i + 1) return K1 * K2 def _mgamma(x, m): res = np.float_power(pi, 0.25 * m * (m - 1)) - for i in range(m): + for i in xrange(m): res *= gamma(x - 0.5 * i) return res From 51e4d6021968bddb427c692db739c8aa0521f216 Mon Sep 17 00:00:00 2001 From: Michael Malecki Date: Tue, 15 Jan 2019 11:42:59 -0500 Subject: [PATCH 09/12] raise for not-columns; tighten one loop --- src/cr/cube/cube_slice.py | 2 ++ src/cr/cube/distributions/wishart.py | 54 +++------------------------- 2 files changed, 6 insertions(+), 50 deletions(-) diff --git a/src/cr/cube/cube_slice.py b/src/cr/cube/cube_slice.py index ec056a04c..891554508 100644 --- a/src/cr/cube/cube_slice.py +++ b/src/cr/cube/cube_slice.py @@ -492,6 +492,8 @@ def _categorical_pairwise_chisq(self, counts, axis=0): Returns a square, symmetric matrix of test statistics for the null hypothesis that each vector along *axis* is equal to each other. """ + if axis != 0: + raise NotImplementedError("Pairwise comparison only implemented for colums") if axis == 0: margin = self.margin(0) offmargin = self.margin(1) diff --git a/src/cr/cube/distributions/wishart.py b/src/cr/cube/distributions/wishart.py index ca8de595c..6c29a015a 100644 --- a/src/cr/cube/distributions/wishart.py +++ b/src/cr/cube/distributions/wishart.py @@ -33,18 +33,8 @@ def wishartCDF(X, n_min, n_max): def _wishart_pfaffian(x, n_min, n_max): """Value providing the CDF of the largest eigenvalue of a Wishart(n, p) evaluated at x. 
- - if (n_min != n_mat) { - // Fill in extra column - VectorXmp alpha_vec(n_min); - T alpha_last = 0.5*(n_max + n_min + 1); - for (int i = 0; i < n_min; i++) { - alpha_vec(i) = alpha + i + 1; - A(i, n_min) = pow(2, -alpha_last)*boost::math::gamma_p(alpha_vec(i), 0.5*xx)/boost::math::tgamma(alpha_last); - A(n_min, i) = - A(i, n_min); - } - } """ + size = n_min + (n_min % 2) alpha = 0.5 * (n_max - n_min - 1) A = np.zeros([size, size]) @@ -58,41 +48,15 @@ def _wishart_pfaffian(x, n_min, n_max): A[other_ind, alpha_ind] = ( np.float_power(2, -alpha_last) * gammainc_a / gamma(alpha_last) ) - # TODO check that this is not transposed A[alpha_ind, other_ind] = -A[other_ind, alpha_ind] - """ - for (int i = 0; i < n_min; i++) { - p(i) = boost::math::gamma_p(alpha + i + 1, 0.5*xx); - g(i) = boost::math::tgamma(alpha + i + 1); - } - for (int i = 0; i < (2*n_min - 2); i++) { - q(i) = pow(0.5, 2*alpha + i + 2) * boost::math::tgamma(2*alpha+i+2)*boost::math::gamma_p(2*alpha + i + 2, xx); - } - """ p = gammainc_a g = gamma(alpha_vec) q_ind = np.arange(2 * n_min - 2) - q = ( - np.float_power(0.5, 2 * alpha + q_ind + 2) - * gamma(2 * alpha + q_ind + 2) - * gammainc(2 * alpha + q_ind + 2, x) - ) - """ - - - for (int i = 0; i < n_min; i++) { - b = 0.5*p(i)*p(i); - for(int j = i; j < (n_min - 1); j++) { - b -= q(i+j)/(g(i)*g(j+1)); - A(i, j+1) = p(i)*p(j+1) - 2*b; - A(j+1, i) = -A(i, j+1); - } - Rcpp::checkUserInterrupt(); - } - """ + q_vec = 2 * alpha + q_ind + 2 + q = np.float_power(0.5, q_vec) * gamma(q_vec) * gammainc(q_vec, x) - # Lets do the inefficient iteration first and figure out index tricks later + # TODO consider index tricks instead of iteration here for i in xrange(n_min): b = 0.5 * p[i] * p[i] for j in xrange(i, n_min - 1): @@ -103,16 +67,6 @@ def _wishart_pfaffian(x, n_min, n_max): def _normalizing_const(n_min, n_max): - """int n_mat = n_min + (n_min % 2); - double alpha = 0.5*(n_max - n_min - 1); - // Compute constant - double K1 = pow(M_PI, 0.5*n_min*n_min); - K1 /= pow(2, 0.5*n_min*n_max)*mgamma_C(0.5*n_max, n_min, false)*mgamma_C(0.5*n_min, n_min, false); - double K2 = pow(2, alpha*n_mat+0.5*n_mat*(n_mat+1)); - for (int k = 0; k < n_mat; k++) { - K2 *= boost::math::tgamma(alpha + k + 1); - } - return K1*K2;""" size = n_min + (n_min % 2) alpha = 0.5 * (n_max - n_min - 1) K1 = np.float_power(pi, 0.5 * n_min * n_min) From 91267cd2345deabe8ebd70f05cc4f97cac945e35 Mon Sep 17 00:00:00 2001 From: Michael Malecki Date: Tue, 15 Jan 2019 14:39:53 -0500 Subject: [PATCH 10/12] test_odd_latent_dimensions --- tests/fixtures/pairwise-4x4.json | 238 +++++++++++++++++++++++++++++ tests/integration/test_pairwise.py | 28 ++++ 2 files changed, 266 insertions(+) create mode 100644 tests/fixtures/pairwise-4x4.json diff --git a/tests/fixtures/pairwise-4x4.json b/tests/fixtures/pairwise-4x4.json new file mode 100644 index 000000000..9f5ac2bef --- /dev/null +++ b/tests/fixtures/pairwise-4x4.json @@ -0,0 +1,238 @@ +{ + "element": "shoji:view", + "self": "/api/datasets/4bb830/cube/?query=%7B%22dimensions%22%3A%5B%7B%22variable%22%3A%22%2Fapi%2Fdatasets%2F4bb830%2Fvariables%2F000001%2F%22%7D%2C%7B%22variable%22%3A%22%2Fapi%2Fdatasets%2F4bb830%2Fvariables%2F066c45%2F%22%7D%5D%2C%22measures%22%3A%7B%22count%22%3A%7B%22function%22%3A%22cube_count%22%2C%22args%22%3A%5B%5D%7D%7D%2C%22weight%22%3Anull%7D&filter=%7B%7D", + "value": { + "query": { + "measures": { + "count": { + "function": "cube_count", + "args": [ + + ] + } + }, + "dimensions": [ + { + "variable": "/api/datasets/4bb830/variables/000001/" + }, + { + 
"variable": "/api/datasets/4bb830/variables/066c45/" + } + ], + "weight": null + }, + "query_environment": { + "filter": [ + + ] + }, + "result": { + "dimensions": [ + { + "references": { + "alias": "J", + "name": "J" + }, + "derived": false, + "type": { + "ordinal": false, + "class": "categorical", + "categories": [ + { + "numeric_value": 1, + "id": 1, + "name": "A", + "missing": false + }, + { + "numeric_value": 2, + "id": 2, + "name": "B", + "missing": false + }, + { + "numeric_value": 3, + "id": 3, + "name": "C", + "missing": false + }, + { + "numeric_value": 4, + "id": 4, + "name": "D", + "missing": false + }, + { + "numeric_value": null, + "id": -1, + "name": "No Data", + "missing": true + } + ] + } + }, + { + "references": { + "alias": "K", + "name": "K" + }, + "derived": false, + "type": { + "ordinal": false, + "class": "categorical", + "categories": [ + { + "numeric_value": 1, + "id": 1, + "name": "A", + "missing": false + }, + { + "numeric_value": 2, + "id": 2, + "name": "B", + "missing": false + }, + { + "numeric_value": 3, + "id": 3, + "name": "C", + "missing": false + }, + { + "numeric_value": 4, + "id": 4, + "name": "D", + "missing": false + }, + { + "numeric_value": 5, + "id": 5, + "name": "E", + "missing": true + }, + { + "numeric_value": 6, + "id": 6, + "name": "F", + "missing": true + }, + { + "numeric_value": null, + "id": -1, + "name": "No Data", + "missing": true + } + ] + } + } + ], + "missing": 0, + "measures": { + "count": { + "data": [ + 14, + 14, + 13, + 16, + 0, + 0, + 0, + 22, + 14, + 19, + 19, + 0, + 0, + 0, + 14, + 6, + 19, + 8, + 0, + 0, + 0, + 17, + 9, + 28, + 11, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0 + ], + "n_missing": 0, + "metadata": { + "references": { + + }, + "derived": true, + "type": { + "integer": true, + "missing_rules": { + + }, + "missing_reasons": { + "No Data": -1 + }, + "class": "numeric" + } + } + } + }, + "n": 243, + "unfiltered": { + "unweighted_n": 243, + "weighted_n": 243 + }, + "filtered": { + "unweighted_n": 243, + "weighted_n": 243 + }, + "counts": [ + 14, + 14, + 13, + 16, + 0, + 0, + 0, + 22, + 14, + 19, + 19, + 0, + 0, + 0, + 14, + 6, + 19, + 8, + 0, + 0, + 0, + 17, + 9, + 28, + 11, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0 + ], + "element": "crunch:cube" + } + } +} diff --git a/tests/integration/test_pairwise.py b/tests/integration/test_pairwise.py index 011bf2253..b5ae7ac7f 100644 --- a/tests/integration/test_pairwise.py +++ b/tests/integration/test_pairwise.py @@ -264,3 +264,31 @@ def test_hirotsu_pvals(self): ] ).reshape(10, 10) np.testing.assert_almost_equal(actual, expected) + + def test_odd_latent_dimensions(self): + """Test code path for the pfaffian of an odd-dimensioned matrix + Latent matrix size is (n_min - 1) so 3 for a 4x4 table. 
+ """ + cube = CrunchCube(CR.PAIRWISE_4X4) + actual = cube.pairwise_pvals(axis=0) + expected = np.array( + [ + 1, + 0.949690252544668, + 0.917559534718816, + 0.97630069244232, + 0.949690252544668, + 1, + 0.35511045473507, + 0.999999119644564, + 0.917559534718816, + 0.35511045473507, + 1, + 0.313869429972919, + 0.97630069244232, + 0.999999119644564, + 0.313869429972919, + 1, + ] + ).reshape() + np.testing.assert_almost_equal(actual, expected) From 85671a862c9baf85462fe52fcf70563a7063d886 Mon Sep 17 00:00:00 2001 From: Michael Malecki Date: Tue, 15 Jan 2019 19:49:03 +0000 Subject: [PATCH 11/12] oops,reshape expect --- tests/integration/test_pairwise.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tests/integration/test_pairwise.py b/tests/integration/test_pairwise.py index b5ae7ac7f..a42c25cbc 100644 --- a/tests/integration/test_pairwise.py +++ b/tests/integration/test_pairwise.py @@ -290,5 +290,5 @@ def test_odd_latent_dimensions(self): 0.313869429972919, 1, ] - ).reshape() + ).reshape(4, 4) np.testing.assert_almost_equal(actual, expected) From 05fadb2cbac89231e8278c035df2b6ef285d8cbd Mon Sep 17 00:00:00 2001 From: Slobodan Ilic Date: Wed, 16 Jan 2019 15:34:50 +0100 Subject: [PATCH 12/12] Rrefactor of the pairwise comparisons * Do more work in PairwisePvalues and less in cube * Take slice as a parameter, instead of calculated stats * Don't raise axis error from PairwisePvalues, only do that at the top level (from the slice, and consequently the cube) * Move properties from CubeSlice to PairwisePvalues where possible * Bring coverage back to 100% * Make as much functionality in PairwisePvalues (as possible) a lazyproperty * Make class member fields private * Normalize PairwisePvalues * Implement WishartCDF as a value object * Extract some functionality to lazyproperties * Add a class Pfaffian, for better composition * Insert links to the PDF of the paper that describes the algorithms * Insert pylint's directive for ignoring invalid-name at module level * Keep mathematical names for properties, with adequate docstrings --- src/cr/cube/crunch_cube.py | 28 +-- src/cr/cube/cube_slice.py | 58 ++--- src/cr/cube/distributions/wishart.py | 216 ++++++++++++------ src/cr/cube/measures/pairwise_pvalues.py | 95 +++++++- tests/integration/test_pairwise.py | 276 ++++++++++++----------- 5 files changed, 399 insertions(+), 274 deletions(-) diff --git a/src/cr/cube/crunch_cube.py b/src/cr/cube/crunch_cube.py index d21d94d1e..3f057ce04 100644 --- a/src/cr/cube/crunch_cube.py +++ b/src/cr/cube/crunch_cube.py @@ -17,7 +17,6 @@ from cr.cube.dimension import AllDimensions from cr.cube.enum import DIMENSION_TYPE as DT from cr.cube.measures.index import Index -from cr.cube.measures.pairwise_pvalues import PairwisePvalues from cr.cube.measures.scale_means import ScaleMeans from cr.cube.util import lazyproperty @@ -714,35 +713,16 @@ def zscore(self, weighted=True, prune=False, hs_dims=None): res = [s.zscore(weighted, prune, hs_dims) for s in self.slices] return np.array(res) if self.ndim == 3 else res[0] - def pairwise_chisq(self, axis=0): - """Return square ndarray of pairwise Chi-square along axis. - - Zscore is a measure of statistical signifficance of observed vs. - expected counts. It's only applicable to a 2D contingency tables. - For 3D cubes, the measures of separate slices are stacked together - and returned as the result. 
-
-        :param weighted: Use weighted counts for zscores
-        :param prune: Prune based on unweighted counts
-        :param hs_dims: Include headers and subtotals (as NaN values)
-        :returns zscore: ndarray representing zscore measurements
-        """
-        res = [s.pairwise_chisq(axis=0) for s in self.slices]
-        return res[0]
-
     def pairwise_pvals(self, axis=0):
-        """Return square ndarray of pairwise Chi-square along axis.
+        """Return matrices of column-comparison p-values as a list of numpy.ndarrays.
 
         Square, symmetric matrix along *axis* of pairwise p-values for the
         null hypothesis that col[i] = col[j] for each pair of columns.
 
-        :param axis: axis along which to perform comparison. Only columns (0)
-        implement currently.
-        :returns pvals: square, symmetric matrix of column-comparison p-values.
+        *axis* (int): axis along which to perform comparison. Only columns (0)
+        are implemented currently.
         """
-        statistics = [s.pairwise_chisq(axis=0) for s in self.slices]
-        shapes = [s.get_shape() for s in self.slices]
-        return PairwisePvalues(statistics[0], shapes[0], axis=axis).values
+        return [slice_.pairwise_pvals(axis=axis) for slice_ in self.slices]
 
     def _adjust_axis(self, axis):
         """Return raw axis/axes corresponding to apparent axis/axes.
diff --git a/src/cr/cube/cube_slice.py b/src/cr/cube/cube_slice.py
index 891554508..8751949da 100644
--- a/src/cr/cube/cube_slice.py
+++ b/src/cr/cube/cube_slice.py
@@ -14,13 +14,9 @@
 
 from cr.cube.enum import DIMENSION_TYPE as DT
 from cr.cube.measures.scale_means import ScaleMeans
+from cr.cube.measures.pairwise_pvalues import PairwisePvalues
 from cr.cube.util import compress_pruned, lazyproperty, memoize
 
-try:
-    xrange
-except NameError:
-    xrange = range
-
 
 class CubeSlice(object):
     """Two-dimensional projection of a cube.
@@ -311,6 +307,19 @@ def table_name(self):
         table_name = self._cube.labels()[0][self._index]
         return "%s: %s" % (title, table_name)
 
+    def pairwise_pvals(self, axis=0):
+        """Return square symmetric matrix of pairwise column-comparison p-values.
+
+        Square, symmetric matrix along *axis* of pairwise p-values for the
+        null hypothesis that col[i] = col[j] for each pair of columns.
+
+        *axis* (int): axis along which to perform comparison. Only columns (0)
+        are implemented currently.
+        """
+        if axis != 0:
+            raise NotImplementedError("Pairwise comparison only implemented for columns")
+        return PairwisePvalues(self, axis=axis).values
+
     def pvals(self, weighted=True, prune=False, hs_dims=None):
         """Return 2D ndarray with calculated P values
 
@@ -362,22 +371,6 @@ def zscore(self, weighted=True, prune=False, hs_dims=None):
 
         return zscore
 
-    def pairwise_chisq(self, axis=0, weighted=True):
-        """Return square ndarray of pairwise Chi-squared statistics along axis.
-
-        Zscore is a measure of statistical significance of observed vs.
-        expected counts. It's only applicable to a 2D contingency tables.
- - :param weighted: Use weighted counts for zscores - :param prune: Prune based on unweighted counts - :param hs_dims: Include headers and subtotals (as NaN values) - :returns zscore: ndarray representing zscore measurements - """ - counts = self.as_array(weighted=weighted) - chisq = self._categorical_pairwise_chisq(counts, axis) - - return chisq - def _apply_pruning_mask(self, res, hs_dims=None): array = self.as_array(prune=True, include_transforms_for_dims=hs_dims) @@ -486,29 +479,6 @@ def _scalar_type_std_res(self, counts, total, colsum, rowsum): ) return residuals / np.sqrt(variance) - def _categorical_pairwise_chisq(self, counts, axis=0): - """Return ndarray containing pairwise comparisons along axis - - Returns a square, symmetric matrix of test statistics for the null - hypothesis that each vector along *axis* is equal to each other. - """ - if axis != 0: - raise NotImplementedError("Pairwise comparison only implemented for colums") - if axis == 0: - margin = self.margin(0) - offmargin = self.margin(1) - proportions = self.proportions(axis=0) - wts = offmargin / self.margin() - elements = counts.shape[1] - chisq = np.zeros([elements, elements]) - for i in xrange(1, elements): - for j in xrange(0, elements - 1): - chisq[i, j] = chisq[j, i] = np.sum( - np.square(proportions[:, i] - proportions[:, j]) / wts - ) / (1 / margin[i] + 1 / margin[j]) - - return chisq - def _update_args(self, kwargs): if self._cube.ndim < 3: # If cube is 2D it doesn't actually have slices (itself is a slice). diff --git a/src/cr/cube/distributions/wishart.py b/src/cr/cube/distributions/wishart.py index 6c29a015a..a385df3b0 100644 --- a/src/cr/cube/distributions/wishart.py +++ b/src/cr/cube/distributions/wishart.py @@ -1,4 +1,5 @@ # encoding: utf-8 +# pylint: disable=too-few-public-methods, invalid-name """The CDF of the largest eigenvalue of a Wishart distribution. @@ -8,6 +9,14 @@ implementation of this CDF is the R package rootWishart. This largely follows that, but using scipy implementations rather than Boost, and omitting multi-precision support. + +For more info about the formulae, and the terse variable naming, +please refer to the paper: + +"Distribution of the largest eigenvalue for real Wishart and Gaussian random +matrices and a simple approximation for the Tracy-Widom distribution" + +found at: https://arxiv.org/pdf/1209.3394.pdf """ from __future__ import division @@ -16,74 +25,153 @@ from scipy.special import gamma, gammainc from scipy.linalg import det from scipy.constants import pi +from cr.cube.util import lazyproperty try: xrange -except NameError: +except NameError: # pragma: no cover xrange = range -def wishartCDF(X, n_min, n_max): - w_p = np.zeros(X.shape) - for i, x in np.ndenumerate(X): - w_p[i] = _wishart_pfaffian(x, n_min, n_max) - return _normalizing_const(n_min, n_max) * w_p - - -def _wishart_pfaffian(x, n_min, n_max): - """Value providing the CDF of the largest eigenvalue of a Wishart(n, p) - evaluated at x. 
- """ - - size = n_min + (n_min % 2) - alpha = 0.5 * (n_max - n_min - 1) - A = np.zeros([size, size]) - alpha_ind = np.arange(n_min, dtype=np.float) - alpha_vec = alpha_ind + alpha + 1 - gammainc_a = gammainc(alpha_vec, 0.5 * x) - if n_min != size: - alpha_ind = alpha_ind.astype(np.int) - other_ind = np.full(n_min, size - 1, dtype=np.int) - alpha_last = (n_max + n_min + 1) / 2 - A[other_ind, alpha_ind] = ( - np.float_power(2, -alpha_last) * gammainc_a / gamma(alpha_last) +class WishartCDF: + """Implementation of Cumulative Distribution Function (CDF).""" + + def __init__(self, chisq, n_min, n_max): + self._chisq = chisq + self.n_min = n_min + self._n_max = n_max + + @lazyproperty + def values(self): + """ndarray of wishart CDF values""" + return self.K * self.wishart_pfaffian + + @lazyproperty + def wishart_pfaffian(self): + """ndarray of wishart pfaffian CDF, before normalization""" + return np.array( + [Pfaffian(self, val).value for i, val in np.ndenumerate(self._chisq)] + ).reshape(self._chisq.shape) + + @lazyproperty + def alpha(self): + """int representing common basis of off-diagonal elements of A""" + return 0.5 * (self._n_max - self.n_min - 1) + + @lazyproperty + def alpha_ind(self): + """indices of off-diagonal elements of A""" + return np.arange(self.n_min) + + @lazyproperty + def alpha_last(self): + """elements of row and column to append to A when its order/size is odd""" + return (self._n_max + self.n_min + 1) / 2 + + @lazyproperty + def alpha_vec(self): + """elements of row and column to append to A when its order/size is odd""" + return self.alpha_ind + self.alpha + 1 + + @lazyproperty + def other_ind(self): + """last row or column of square A""" + return np.full(self.n_min, self.size - 1, dtype=np.int) + + @staticmethod + def _mgamma(x, m): + mgamma = np.float_power(pi, 0.25 * m * (m - 1)) + for i in xrange(m): + mgamma *= gamma(x - 0.5 * i) + return mgamma + + @lazyproperty + def K(self): + """Normalizing constant for wishart CDF.""" + K1 = np.float_power(pi, 0.5 * self.n_min * self.n_min) + K1 /= ( + np.float_power(2, 0.5 * self.n_min * self._n_max) + * self._mgamma(0.5 * self._n_max, self.n_min) + * self._mgamma(0.5 * self.n_min, self.n_min) ) - A[alpha_ind, other_ind] = -A[other_ind, alpha_ind] - - p = gammainc_a - g = gamma(alpha_vec) - q_ind = np.arange(2 * n_min - 2) - q_vec = 2 * alpha + q_ind + 2 - q = np.float_power(0.5, q_vec) * gamma(q_vec) * gammainc(q_vec, x) - - # TODO consider index tricks instead of iteration here - for i in xrange(n_min): - b = 0.5 * p[i] * p[i] - for j in xrange(i, n_min - 1): - b -= q[i + j] / (g[i] * g[j + 1]) - A[j + 1, i] = p[i] * p[j + 1] - 2 * b - A[i, j + 1] = -A[j + 1, i] - return np.sqrt(det(A)) - - -def _normalizing_const(n_min, n_max): - size = n_min + (n_min % 2) - alpha = 0.5 * (n_max - n_min - 1) - K1 = np.float_power(pi, 0.5 * n_min * n_min) - K1 /= ( - np.float_power(2, 0.5 * n_min * n_max) - * _mgamma(0.5 * n_max, n_min) - * _mgamma(0.5 * n_min, n_min) - ) - - K2 = np.float_power(2, alpha * size + 0.5 * size * (size + 1)) - for i in xrange(size): - K2 *= gamma(alpha + i + 1) - return K1 * K2 - - -def _mgamma(x, m): - res = np.float_power(pi, 0.25 * m * (m - 1)) - for i in xrange(m): - res *= gamma(x - 0.5 * i) - return res + + K2 = np.float_power( + 2, self.alpha * self.size + 0.5 * self.size * (self.size + 1) + ) + for i in xrange(self.size): + K2 *= gamma(self.alpha + i + 1) + return K1 * K2 + + @lazyproperty + def size(self): + """int representing the size of the wishart matrix. 
+ + If the original size is even, that this is the size that's returned. If the + size is odd, then the next even number is returned, because of the row/col + insertion, that happens as the part of the algorithm + """ + return self.n_min + (self.n_min % 2) + + +class Pfaffian: + """Implementation of the Pfaffian value object class.""" + + def __init__(self, wishart_cdf, chisq_ordinal): + self._wishart_cdf = wishart_cdf + self._chisq_val = chisq_ordinal + + @lazyproperty + def value(self): + """return float Cumulative Distribution Function. + + The return value represents a floating point number of the CDF of the + largest eigenvalue of a Wishart(n, p) evaluated at chisq_val. + """ + wishart = self._wishart_cdf + + # Prepare variables for integration algorithm + A = self.A + p = self._gammainc_a + g = gamma(wishart.alpha_vec) + q_ind = np.arange(2 * wishart.n_min - 2) + q_vec = 2 * wishart.alpha + q_ind + 2 + q = np.float_power(0.5, q_vec) * gamma(q_vec) * gammainc(q_vec, self._chisq_val) + + # Perform integration (i.e. calculate Pfaffian CDF) + for i in xrange(wishart.n_min): + # TODO consider index tricks instead of iteration here + b = 0.5 * p[i] * p[i] + for j in xrange(i, wishart.n_min - 1): + b -= q[i + j] / (g[i] * g[j + 1]) + A[j + 1, i] = p[i] * p[j + 1] - 2 * b + A[i, j + 1] = -A[j + 1, i] + return np.sqrt(det(A)) + + @lazyproperty + def A(self): + """ndarray - a skew-symmetric matrix for integrating the target distribution""" + wishart = self._wishart_cdf + + base = np.zeros([wishart.size, wishart.size]) + if wishart.n_min % 2: + # If matrix has odd number of elements, we need to append a + # row and a col, in order for the pfaffian algorithm to work + base = self._make_size_even(base) + return base + + @lazyproperty + def _gammainc_a(self): + return gammainc(self._wishart_cdf.alpha_vec, 0.5 * self._chisq_val) + + def _make_size_even(self, base): + wishart = self._wishart_cdf + alpha_ind = wishart.alpha_ind + other_ind = wishart.other_ind + alpha_last = wishart.alpha_last + + base[other_ind, alpha_ind] = ( + np.float_power(2, -alpha_last) * self._gammainc_a / gamma(alpha_last) + ) + base[alpha_ind, other_ind] = -base[other_ind, alpha_ind] + + return base diff --git a/src/cr/cube/measures/pairwise_pvalues.py b/src/cr/cube/measures/pairwise_pvalues.py index 32dd69ece..de1e19b1f 100644 --- a/src/cr/cube/measures/pairwise_pvalues.py +++ b/src/cr/cube/measures/pairwise_pvalues.py @@ -1,23 +1,94 @@ # encoding: utf-8 -"""P-values of pairwise comparison or columns of a contingency table -""" +"""P-values of pairwise comparison or columns of a contingency table.""" -from cr.cube.distributions.wishart import wishartCDF +from __future__ import division + +import numpy as np + +# from cr.cube.distributions.wishart import wishartCDF +from cr.cube.distributions.wishart import WishartCDF from cr.cube.util import lazyproperty +try: + xrange +except NameError: # pragma: no cover + # pylint: disable=invalid-name + xrange = range + -class PairwisePvalues(object): +# pylint: disable=too-few-public-methods +class PairwisePvalues: """Value object providing matrix of pairwise-comparison P-values""" - def __init__(self, chisq, shape, axis=0): - if axis != 0: - raise NotImplementedError("Pairwise p-values only implemented for colums") - self.axis = axis - self._chisq = chisq - self.n_max = max(shape) - 1 - self.n_min = min(shape) - 1 + def __init__(self, slice_, axis=0, weighted=True): + self._slice = slice_ + self._axis = axis + self._weighted = weighted @lazyproperty def values(self): - return 1.0 - 
wishartCDF(self._chisq, self.n_min, self.n_max) + """Square matrix of pairwise Chi-square along axis, as numpy.ndarray.""" + return 1.0 - WishartCDF(self._pairwise_chisq, self._n_min, self._n_max).values + + @lazyproperty + def _categorical_pairwise_chisq(self): + """Pairwise comparisons (Chi-Square) along axis, as numpy.ndarray. + + Returns a square, symmetric matrix of test statistics for the null + hypothesis that each vector along *axis* is equal to each other. + """ + chisq = np.zeros([self._numel, self._numel]) + for i in xrange(1, self._numel): + for j in xrange(0, self._numel - 1): + chisq[i, j] = chisq[j, i] = np.sum( + np.square(self._proportions[:, i] - self._proportions[:, j]) + / self._observed + ) / (1 / self._margin[i] + 1 / self._margin[j]) + + return chisq + + @lazyproperty + def _margin(self): + """Margin for the axis as numpy.ndarray.""" + return self._slice.margin(axis=self._axis) + + @lazyproperty + def _off_margin(self): + """Margin for the opposite axis as numpy.ndarray.""" + return self._slice.margin(axis=(1 - self._axis)) + + @lazyproperty + def _pairwise_chisq(self): + """Pairwise Chi-squared statistics along axis, as numpy.ndarray. + + Zscore is a measure of statistical significance of observed vs. + expected counts. It's only applicable to a 2D contingency tables. + """ + return self._categorical_pairwise_chisq + + @lazyproperty + def _proportions(self): + """Slice proportions for *axis* as numpy.ndarray.""" + return self._slice.proportions(axis=self._axis) + + @lazyproperty + def _n_max(self): + """Size (zero based) of the bigger of the two slice's dimension, as int.""" + return max(self._slice.get_shape()) - 1 + + @lazyproperty + def _n_min(self): + """Size (zero based) of the smaller of the two slice's dimension, as int.""" + return min(self._slice.get_shape()) - 1 + + @lazyproperty + def _numel(self): + """Number of elements of the dimension opposite to axis, as int.""" + return self._slice.get_shape()[1 - self._axis] + + @lazyproperty + def _observed(self): + """Observed marginal proportions, as float.""" + total = self._slice.margin() + return self._off_margin / total diff --git a/tests/integration/test_pairwise.py b/tests/integration/test_pairwise.py index a42c25cbc..a8babfed7 100644 --- a/tests/integration/test_pairwise.py +++ b/tests/integration/test_pairwise.py @@ -1,11 +1,17 @@ -from unittest import TestCase +# encoding: utf-8 +# pylint: disable=protected-access -import numpy as np +"""Integration tests for pairwise comparisons.""" + +from unittest import TestCase import pytest -from ..fixtures import CR +import numpy as np from cr.cube.crunch_cube import CrunchCube +from cr.cube.measures.pairwise_pvalues import PairwisePvalues + +from ..fixtures import CR # pylint: disable=missing-docstring, invalid-name, no-self-use @@ -15,13 +21,15 @@ class TestStandardizedResiduals(TestCase): def test_same_col_counts(self): """Test statistics for columns that are all the same.""" cube = CrunchCube(CR.SAME_COUNTS_3x4) + pairwise_pvalues = PairwisePvalues(cube.slices[0], axis=0) expected = np.zeros([4, 4]) - actual = cube.pairwise_chisq(axis=0) + actual = pairwise_pvalues._pairwise_chisq np.testing.assert_equal(actual, expected) def test_hirotsu_chisq(self): """Test statistic for hirotsu data matches R""" cube = CrunchCube(CR.PAIRWISE_HIROTSU_ILLNESS_X_OCCUPATION) + pairwise_pvalues = PairwisePvalues(cube.slices[0], axis=0) expected = np.array( [ [ @@ -146,123 +154,129 @@ def test_hirotsu_chisq(self): ], ] ) - actual = cube.pairwise_chisq(axis=0) + actual = 
pairwise_pvalues._pairwise_chisq np.testing.assert_almost_equal(actual, expected) def test_same_col_pvals(self): """P-values for columns that are all the same.""" cube = CrunchCube(CR.SAME_COUNTS_3x4) - expected = np.ones([4, 4]) + expected = [np.ones([4, 4])] actual = cube.pairwise_pvals(axis=0) np.testing.assert_equal(actual, expected) + # Assert correct exception in case of not-implemented direction + with pytest.raises(NotImplementedError): + cube.pairwise_pvals(axis=1) + def test_hirotsu_pvals(self): cube = CrunchCube(CR.PAIRWISE_HIROTSU_ILLNESS_X_OCCUPATION) actual = cube.pairwise_pvals(axis=0) - expected = np.array( - [ - 1, - 0.999603716443816, - 0.99999993076784, - 0.435830186989942, - 0.171365670494448, - 0.999999931581745, - 0.999999979427862, - 0.726740806122402, - 0.999997338047414, - 0.105707739899106, - 0.999603716443816, - 1, - 0.999991395396033, - 0.806407150042716, - 0.380296648898666, - 0.999999999961875, - 0.999996333717649, - 0.773583582093158, - 0.999999999998836, - 0.192375246184738, - 0.99999993076784, - 0.999991395396033, - 1, - 0.017277623171216, - 3.29012189337341e-06, - 0.99999999999994, - 0.999998237045896, - 0.0365273119329589, - 0.999999857555538, - 2.23456306602809e-06, - 0.435830186989942, - 0.806407150042716, - 0.017277623171216, - 1, - 0.999999977981595, - 0.821586701043061, - 0.982573114952466, - 0.999999169027016, - 0.998041030837588, - 0.999934687968906, - 0.171365670494448, - 0.380296648898666, - 3.29012189337341e-06, - 0.999999977981595, - 1, - 0.52406354520284, - 0.926322806048378, - 0.99999998900118, - 0.981100354607917, - 0.999999991067971, - 0.999999931581745, - 0.999999999961875, - 0.99999999999994, - 0.821586701043061, - 0.52406354520284, - 1, - 0.999999992799126, - 0.883025655503086, - 0.99999999998941, - 0.33149560264078, - 0.999999979427862, - 0.999996333717649, - 0.999998237045896, - 0.982573114952466, - 0.926322806048378, - 0.999999992799126, - 1, - 0.997674862917282, - 0.99999999995011, - 0.81726901111794, - 0.726740806122402, - 0.773583582093158, - 0.0365273119329589, - 0.999999169027016, - 0.99999998900118, - 0.883025655503086, - 0.997674862917282, - 1, - 0.998461227115608, - 0.999994436499243, - 0.999997338047414, - 0.999999999998836, - 0.999999857555538, - 0.998041030837588, - 0.981100354607917, - 0.99999999998941, - 0.99999999995011, - 0.998461227115608, - 1, - 0.925999125959122, - 0.105707739899106, - 0.192375246184738, - 2.23456306602809e-06, - 0.999934687968906, - 0.999999991067971, - 0.33149560264078, - 0.81726901111794, - 0.999994436499243, - 0.925999125959122, - 1, - ] - ).reshape(10, 10) + expected = [ + np.array( + [ + 1, + 0.999603716443816, + 0.99999993076784, + 0.435830186989942, + 0.171365670494448, + 0.999999931581745, + 0.999999979427862, + 0.726740806122402, + 0.999997338047414, + 0.105707739899106, + 0.999603716443816, + 1, + 0.999991395396033, + 0.806407150042716, + 0.380296648898666, + 0.999999999961875, + 0.999996333717649, + 0.773583582093158, + 0.999999999998836, + 0.192375246184738, + 0.99999993076784, + 0.999991395396033, + 1, + 0.017277623171216, + 3.29012189337341e-06, + 0.99999999999994, + 0.999998237045896, + 0.0365273119329589, + 0.999999857555538, + 2.23456306602809e-06, + 0.435830186989942, + 0.806407150042716, + 0.017277623171216, + 1, + 0.999999977981595, + 0.821586701043061, + 0.982573114952466, + 0.999999169027016, + 0.998041030837588, + 0.999934687968906, + 0.171365670494448, + 0.380296648898666, + 3.29012189337341e-06, + 0.999999977981595, + 1, + 0.52406354520284, + 0.926322806048378, + 
0.99999998900118, + 0.981100354607917, + 0.999999991067971, + 0.999999931581745, + 0.999999999961875, + 0.99999999999994, + 0.821586701043061, + 0.52406354520284, + 1, + 0.999999992799126, + 0.883025655503086, + 0.99999999998941, + 0.33149560264078, + 0.999999979427862, + 0.999996333717649, + 0.999998237045896, + 0.982573114952466, + 0.926322806048378, + 0.999999992799126, + 1, + 0.997674862917282, + 0.99999999995011, + 0.81726901111794, + 0.726740806122402, + 0.773583582093158, + 0.0365273119329589, + 0.999999169027016, + 0.99999998900118, + 0.883025655503086, + 0.997674862917282, + 1, + 0.998461227115608, + 0.999994436499243, + 0.999997338047414, + 0.999999999998836, + 0.999999857555538, + 0.998041030837588, + 0.981100354607917, + 0.99999999998941, + 0.99999999995011, + 0.998461227115608, + 1, + 0.925999125959122, + 0.105707739899106, + 0.192375246184738, + 2.23456306602809e-06, + 0.999934687968906, + 0.999999991067971, + 0.33149560264078, + 0.81726901111794, + 0.999994436499243, + 0.925999125959122, + 1, + ] + ).reshape(10, 10) + ] np.testing.assert_almost_equal(actual, expected) def test_odd_latent_dimensions(self): @@ -271,24 +285,26 @@ def test_odd_latent_dimensions(self): """ cube = CrunchCube(CR.PAIRWISE_4X4) actual = cube.pairwise_pvals(axis=0) - expected = np.array( - [ - 1, - 0.949690252544668, - 0.917559534718816, - 0.97630069244232, - 0.949690252544668, - 1, - 0.35511045473507, - 0.999999119644564, - 0.917559534718816, - 0.35511045473507, - 1, - 0.313869429972919, - 0.97630069244232, - 0.999999119644564, - 0.313869429972919, - 1, - ] - ).reshape(4, 4) + expected = [ + np.array( + [ + 1, + 0.949690252544668, + 0.917559534718816, + 0.97630069244232, + 0.949690252544668, + 1, + 0.35511045473507, + 0.999999119644564, + 0.917559534718816, + 0.35511045473507, + 1, + 0.313869429972919, + 0.97630069244232, + 0.999999119644564, + 0.313869429972919, + 1, + ] + ).reshape(4, 4) + ] np.testing.assert_almost_equal(actual, expected)
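
The column-comparison statistic that PairwisePvalues._categorical_pairwise_chisq computes in the patch above can be reproduced standalone as a quick sanity check. The sketch below is illustrative only: the helper name pairwise_chisq and the literal counts are not part of cr.cube, and it assumes an unweighted 2D table; it mirrors the formula in the hunk (squared differences of column proportions divided by the observed row proportions, scaled by 1 / (1/N_i + 1/N_j) for column margins N_i, N_j).

import numpy as np


def pairwise_chisq(counts):
    """Pairwise chi-square statistic for each pair of columns of a 2D table.

    Mirrors PairwisePvalues._categorical_pairwise_chisq: for columns i and j,
    sum over rows of (p[:, i] - p[:, j]) ** 2 / observed row proportions,
    divided by (1 / N_i + 1 / N_j), where N_k is the margin of column k.
    """
    counts = np.asarray(counts, dtype=float)
    col_margin = counts.sum(axis=0)        # margin along the compared axis
    row_margin = counts.sum(axis=1)        # off-axis margin
    observed = row_margin / counts.sum()   # observed row proportions
    props = counts / col_margin            # column proportions
    n_cols = counts.shape[1]
    chisq = np.zeros((n_cols, n_cols))
    for i in range(n_cols):
        for j in range(i + 1, n_cols):
            chisq[i, j] = chisq[j, i] = np.sum(
                np.square(props[:, i] - props[:, j]) / observed
            ) / (1 / col_margin[i] + 1 / col_margin[j])
    return chisq


# A table whose columns are identical yields an all-zero matrix, which is the
# property test_same_col_counts asserts for the SAME_COUNTS_3x4 fixture.
print(pairwise_chisq([[10, 10, 10, 10], [20, 20, 20, 20], [30, 30, 30, 30]]))

The p-values asserted in the tests are then 1.0 minus the Wishart CDF evaluated at these statistics, as implemented by PairwisePvalues.values and WishartCDF above.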