Crunch-io · ernestoarbitrio · Dec 5, 2019 · Nov 28, 2019 · Nov 28, 2019 · Dec 3, 2019
diff --git a/.gitignore b/.gitignore
@@ -84,6 +84,7 @@ celerybeat-schedule
 .env
 
 # virtualenv
+Pipfile
 .venv
 venv/
 venv3/

diff --git a/src/cr/cube/cube.py b/src/cr/cube/cube.py
@@ -268,6 +268,22 @@ def inflate(self):
             self._mask_size,
         )
 
+    @lazyproperty
+    def is_mr_by_itself(self):
+        """True if the last two dimensions of this cube are an MR crossed with itself.
+
+        That is the case only when the cube has at least three dimensions, both of
+        its last two dimensions are of type `DT.MR`, and those two dimensions
+        share the same alias.
+        """
+        # ---check dimensionality first so the per-dimension lookups below are
+        # ---only performed for cubes that can possibly qualify
+        if self.ndim < 3:
+            return False
+        if any(dim_type != DT.MR for dim_type in self.dimension_types[-2:]):
+            return False
+        aliases = {dimension.alias for dimension in self.dimensions[-2:]}
+        return len(aliases) == 1
+
     @lazyproperty
     def is_weighted(self):
         """True if cube response contains weighted data."""

diff --git a/src/cr/cube/cubepart.py b/src/cr/cube/cubepart.py
@@ -186,6 +186,10 @@ def inserted_row_idxs(self):
     def is_empty(self):
         return any(s == 0 for s in self.shape)
 
+    @lazyproperty
+    def cube_is_mr_by_itself(self):
+        """Value of `is_mr_by_itself` on the cube backing this object."""
+        cube = self._cube
+        return cube.is_mr_by_itself
+
     @lazyproperty
     def means(self):
         return np.array([row.means for row in self._matrix.rows])
@@ -202,6 +206,11 @@ def name(self):
         """
         return self.rows_dimension_name
 
+    @lazyproperty
+    def overlaps_tstats(self):
+        """The `overlaps_tstats` value of this object's matrix, passed through as-is."""
+        # NOTE(review): the whole value is returned; it is not indexed by
+        # `self._slice_idx` here -- confirm that is intended.
+        return self._matrix.overlaps_tstats
+
     @lazyproperty
     def pairwise_indices(self):
         alpha = self._transforms_dict.get("pairwise_indices", {}).get("alpha", 0.05)
@@ -458,6 +467,10 @@ def table_name(self):
 
         title = self._cube.name
         table_name = self._cube.dimensions[0].valid_elements[self._slice_idx].label
+
+        if self._cube.is_mr_by_itself:
+            return title
+
         return "%s: %s" % (title, table_name)
 
     @lazyproperty
@@ -562,6 +575,10 @@ def counts(self):
     def is_empty(self):
         return any(s == 0 for s in self._shape)
 
+    @lazyproperty
+    def cube_is_mr_by_itself(self):
+        """Unconditionally False for this class."""
+        return False
+
     @lazyproperty
     def inserted_row_idxs(self):
         # TODO: add integration-test coverage for this.
@@ -797,6 +814,10 @@ def base_count(self):
     def is_empty(self):
         return False if self.base_count else True
 
+    @lazyproperty
+    def cube_is_mr_by_itself(self):
+        """Unconditionally False for this class."""
+        return False
+
     @lazyproperty
     def means(self):
         return self._scalar.means

diff --git a/src/cr/cube/dimension.py b/src/cr/cube/dimension.py
@@ -258,6 +258,10 @@ def __init__(self, dimension_dict, dimension_type, dimension_transforms=None):
         self._dimension_type = dimension_type
         self._dimension_transforms_arg = dimension_transforms
 
+    @lazyproperty
+    def alias(self):
+        """Alias of this dimension, read from "references" in the dimension dict.
+
+        A missing "references" or "alias" item propagates as the lookup's own
+        error (`KeyError` for a plain dict).
+        """
+        references = self._dimension_dict["references"]
+        return references["alias"]
+
     @lazyproperty
     def all_elements(self):
         """_AllElements object providing cats or subvars of this dimension.

diff --git a/src/cr/cube/matrix.py b/src/cr/cube/matrix.py
@@ -15,7 +15,7 @@
 from scipy.stats.contingency import expected_freq
 
 from cr.cube.enum import DIMENSION_TYPE as DT
-from cr.cube.util import lazyproperty
+from cr.cube.util import lazyproperty, calculate_overlap_tstats
 
 
 class TransformedMatrix(object):
@@ -45,6 +45,14 @@ def rows(self):
             if not row.hidden
         )
 
+    @lazyproperty
+    def overlaps_tstats(self):
+        """Overlap t-stats of the unordered matrix, or None when it provides none.
+
+        The public `overlaps_tstats` of the unordered matrix is used rather than
+        its private `_is_cat_x_mr_x_itself` flag; `_CatXMrMatrix.overlaps_tstats`
+        already returns None when that flag is False. A matrix that does not
+        define `overlaps_tstats` at all is treated as having none instead of
+        raising `AttributeError`.
+        """
+        return getattr(self._unordered_matrix, "overlaps_tstats", None)
+
     @lazyproperty
     def table_base(self):
         return self.table_base_unpruned[
@@ -218,7 +226,24 @@ def factory(cls, cube, dimensions, slice_idx):
         base_counts = cube.base_counts
         counts_with_missings = cube.counts_with_missings
         dimension_types = cube.dimension_types[-2:]
+        if cube.dimension_types == (DT.CAT, DT.MR, DT.MR) and cube.is_mr_by_itself:
 
+            overlap_tstats = calculate_overlap_tstats(
+                _MrXMrMatrix, dimensions, counts, base_counts, counts_with_missings
+            )
+
+            # These are apparent dimensions which hide 'selections' dims behind 'MR'
+            dimensions = cube.dimensions[:-1]
+            counts = np.sum(counts[:, :, :, 0], axis=3)
+            base_counts = np.sum(base_counts[:, :, :, 0], axis=3)
+            counts_with_missings = np.sum(counts_with_missings[:, :, :, 0], axis=3)
+            return _CatXMrMatrix(
+                dimensions,
+                counts,
+                base_counts,
+                counts_with_missings,
+                overlaps=overlap_tstats,
+            )
         # For cubes with means, create one of the means-matrix types
         if cube.has_means:
             if cube.ndim == 3:
@@ -562,13 +587,25 @@ class _CatXMrMatrix(_MatrixWithMR):
     (which correspond to the MR dimension).
     """
 
+    def __init__(
+        self, dimensions, counts, base_counts, counts_with_missings, overlaps=None
+    ):
+        # ---the first four arguments are forwarded unchanged to the parent
+        # ---initializer; `overlaps` is optional and handled only here
+        super(_CatXMrMatrix, self).__init__(
+            dimensions, counts, base_counts, counts_with_missings
+        )
+        # ---stored as given; None when the caller supplied no overlaps---
+        self._overlaps = overlaps
+
     @lazyproperty
     def columns(self):
         return tuple(
             _CatXMrVector(counts.T, base_counts.T, element, table_margin)
             for counts, base_counts, element, table_margin in self._column_generator
         )
 
+    @lazyproperty
+    def overlaps_tstats(self):
+        """The `overlaps` value given at construction, or None when there is none."""
+        if not self._is_cat_x_mr_x_itself:
+            return None
+        return self._overlaps
+
     @lazyproperty
     def rows(self):
         return tuple(
@@ -613,6 +650,10 @@ def _baseline(self):
         dim_sum = np.sum(self._all_counts, axis=2)[self._valid_rows_idxs]
         return dim_sum / np.sum(dim_sum, axis=0)
 
+    @lazyproperty
+    def _is_cat_x_mr_x_itself(self):
+        """True when an `overlaps` value other than None is stored on this matrix."""
+        return self._overlaps is not None
+
     @lazyproperty
     def _column_generator(self):
         return zip(
@@ -626,6 +667,8 @@ def _column_generator(self):
 
     @lazyproperty
     def _zscores(self):
+        # if the cube is a special one (5D with MRxItself as last dims)
+        # the zscores should be the same as a 2D MRxMR matrix
         return self._array_type_std_res(
             self._counts[:, :, 0],
             self.table_margin,
@@ -635,9 +678,11 @@ def _zscores(self):
 
 
 class _CatXMrMeansMatrix(_CatXMrMatrix):
-    def __init__(self, dimensions, means, base_counts):
+    def __init__(self, dimensions, means, base_counts, overlaps=None):
+        # ---counts are an all-zero placeholder with the same shape as `means`---
         counts = np.zeros(means.shape)
-        super(_CatXMrMeansMatrix, self).__init__(dimensions, counts, base_counts)
+        # ---`overlaps` must go by keyword: the fourth positional parameter of
+        # ---`_CatXMrMatrix.__init__` is `counts_with_missings`, so passing
+        # ---`overlaps` positionally would store it in the wrong slot and leave
+        # ---`_overlaps` as None. None is passed for `counts_with_missings`,
+        # ---which is what the default `overlaps=None` call passed there before.
+        super(_CatXMrMeansMatrix, self).__init__(
+            dimensions, counts, base_counts, None, overlaps=overlaps
+        )
         self._means = means
 
     @lazyproperty
@@ -667,6 +712,10 @@ def columns(self):
             for counts, base_counts, element, table_margin in self._column_generator
         )
 
+    @lazyproperty
+    def _mr_shadow_proportions(self):
+        """
+        Cube containing item-wise selections, overlap, and nonoverlap
+        with all other items in a multiple response dimension, for each
+        element of any prepended dimensions:
+        A 1d interface to a 4d hypercube of underlying counts.
+
+        Computed as `self._counts[:, 0, :, 0]` divided element-wise by
+        `self._pairwise_overlap_total`.
+        """
+        return self._counts[:, 0, :, 0] / self._pairwise_overlap_total
+
     @lazyproperty
     def rows(self):
         return tuple(
@@ -725,6 +774,10 @@ def _column_generator(self):
             self.table_margin.T,
         )
 
+    @lazyproperty
+    def _pairwise_overlap_total(self):
+        """
+        Given a 4d hypercube of multiple response items, return the
+        symmetric square matrix of valid observations between all pairs.
+        n1 = 2; n2 = 2; n12 = 1; overlap total = 3
+
+        Computed by summing `self._counts` over axis 1 and then summing the
+        result over its axis 2.
+        """
+        return np.sum(np.sum(self._counts, axis=1), axis=2)
+
     @lazyproperty
     def _row_generator(self):
         return zip(
@@ -745,6 +798,53 @@ def _zscores(self):
             np.sum(self._counts, axis=3)[:, 0, :],
         )
 
+    @lazyproperty
+    def tstats_overlap(self):
+        """ndarray of pairwise t-stats corrected for overlapping observations.
+
+        t = (pi - pj) / s.e.(pi - pj)
+        where
+        s.e.(pi - pj) = sqrt(
+            p_i*(1-p_i)/n_i + p_j*(1-p_j)/n_j - 2*n_ij*(p_ij - p_i*p_j)/(n_i*n_j)
+        )
+        ni = base size for first subvar
+        nj = base size for second subvar
+        nij = number of overlapping observations
+        pij = proportion for which both subvars are True (selected)
+
+        The p_i values are taken from the diagonal of `_mr_shadow_proportions`
+        and the n_i values from the diagonal of `table_base`; the full
+        `_mr_shadow_proportions` and `table_base` arrays supply p_ij and n_ij.
+        The diagonal of the result is set to 0.
+        """
+        shadow_proportions = self._mr_shadow_proportions
+        table_base = self.table_base
+        proportions = shadow_proportions.diagonal()
+        bases = table_base.diagonal()
+
+        # ---p_i - p_j for every ordered pair (i, j)---
+        diff = np.subtract.outer(proportions, proportions)
+
+        # ---p_i*(1-p_i)/n_i + p_j*(1-p_j)/n_j for every pair---
+        variances = proportions * (1 - proportions) / bases
+        se_pi_pj = np.add.outer(variances, variances)
+
+        # ---overlap correction: 2*n_ij*(p_ij - p_i*p_j) / (n_i*n_j)---
+        covariances = shadow_proportions - np.multiply.outer(
+            proportions, proportions
+        )
+        correction_factor = (2 * table_base * covariances) / np.multiply.outer(
+            bases, bases
+        )
+
+        t_stats = diff / np.sqrt(se_pi_pj - correction_factor)
+        # ---an item compared with itself gets a t-stat of 0---
+        np.fill_diagonal(t_stats, 0)
+        return t_stats
+
 
 # ===INSERTION (SUBTOTAL) VECTORS===
 

diff --git a/src/cr/cube/measures/new_pairwise_significance.py b/src/cr/cube/measures/new_pairwise_significance.py
@@ -78,8 +78,11 @@ def t_stats(self):
         return diff / se_diff
 
     @lazyproperty
-    def t_stats_scale_means(self):
+    def t_stats_correct(self):
+        """The slice's `overlaps_tstats` indexed with `[:, self._col_idx, :]`."""
+        return self._slice.overlaps_tstats[:, self._col_idx, :]
 
+    @lazyproperty
+    def t_stats_scale_means(self):
         """
         This property calculates the Two-tailed t-test using the formula:
         t = X1 - X2 / Sx1x2 * sqrt(1/n1 + 1/n2)
@@ -119,6 +122,10 @@ def p_vals_scale_means(self):
 
     @lazyproperty
     def p_vals(self):
+        # Computed as `2 * (1 - t.cdf(|t_stats|, df=self._df))`.
+        # When the cube to which the slice belongs is MR-by-itself
+        # (`cube_is_mr_by_itself`), `t_stats_correct` is used in place of
+        # `t_stats`.
+        if self._slice.cube_is_mr_by_itself:
+            return 2 * (1 - t.cdf(abs(self.t_stats_correct), df=self._df))
         return 2 * (1 - t.cdf(abs(self.t_stats), df=self._df))
 
     @lazyproperty
@@ -167,11 +174,15 @@ def summary_p_vals(self):
 
     @lazyproperty
     def _df(self):
+        # Degrees of freedom: `column_base + selected_unweighted_n` when the
+        # cube to which the slice belongs is MR-by-itself, that sum minus 2
+        # otherwise.
+        # For a `column_base` with fewer than 2 dimensions the selected base is
+        # the entry at `_col_idx`; otherwise it is that column, kept as a column
+        # vector by `[:, None]`.
         selected_unweighted_n = (
             self._slice.column_base[self._col_idx]
             if self._slice.column_base.ndim < 2
             else self._slice.column_base[:, self._col_idx][:, None]
         )
+        if self._slice.cube_is_mr_by_itself:
+            return self._slice.column_base + selected_unweighted_n
         return self._slice.column_base + selected_unweighted_n - 2
 
     @lazyproperty

diff --git a/src/cr/cube/util.py b/src/cr/cube/util.py
@@ -13,6 +13,21 @@
     from itertools import filterfalse as ifilterfalse
 
 
+def calculate_overlap_tstats(
+    cls, mr_dimensions, mr_counts, mr_base_counts, mr_counts_with_missings
+):
+    """Return the `tstats_overlap` of every axis-0 slice, collected in one ndarray.
+
+    For each index on the first axis of `mr_counts`, `cls` is instantiated with
+    `mr_dimensions` and that index's sub-array of each of the three count
+    arrays; the instance's `tstats_overlap` is stored at the same index of the
+    result. The result is a zero-initialized ndarray whose shape is taken from
+    axes 0, 1 and 3 of `mr_counts`.
+    """
+    shape = mr_counts.shape
+    overlaps = np.zeros((shape[0], shape[1], shape[3]))
+    for slice_index, slice_counts in enumerate(mr_counts):
+        matrix = cls(
+            mr_dimensions,
+            slice_counts,
+            mr_base_counts[slice_index],
+            mr_counts_with_missings[slice_index],
+        )
+        overlaps[slice_index] = matrix.tstats_overlap
+    return overlaps
+
+
 def compress_pruned(table):
     """Compress table based on pruning mask.