Crunch-io · ernestoarbitrio · Dec 5, 2019 · Nov 28, 2019 · Nov 28, 2019 · Dec 3, 2019
diff --git a/.gitignore b/.gitignore
@@ -84,6 +84,7 @@ celerybeat-schedule
 .env
 
 # virtualenv
+Pipfile
 .venv
 venv/
 venv3/

diff --git a/src/cr/cube/cube.py b/src/cr/cube/cube.py
@@ -268,6 +268,22 @@ def inflate(self):
             self._mask_size,
         )
 
+    @lazyproperty
+    def is_mr_by_itself(self):
+        """True when the cube ends with an MR dimension crossed with itself.
+
+        All three conditions must hold: the cube has at least three
+        dimensions, its last two dimensions are both of type MR, and those
+        two dimensions share a single alias.
+        """
+        # ---fewer than three dimensions can never qualify---
+        if self.ndim < 3:
+            return False
+        # ---both trailing dimensions must be MR---
+        if not all(dim_type == DT.MR for dim_type in self.dimension_types[-2:]):
+            return False
+        # ---and they must collapse to exactly one distinct alias---
+        trailing_aliases = {dimension.alias for dimension in self.dimensions[-2:]}
+        return len(trailing_aliases) == 1
+
     @lazyproperty
     def is_weighted(self):
         """True if cube response contains weighted data."""

diff --git a/src/cr/cube/cubepart.py b/src/cr/cube/cubepart.py
@@ -57,9 +57,8 @@ def factory(
         return _Slice(cube, slice_idx, transforms, population, mask_size)
 
     @lazyproperty
-    def variable_name(self):
-        """str representing the name of the superheading variable."""
-        return self._dimensions[0 if self.ndim < 2 else 1].name
+    def cube_is_mr_by_itself(self):
+        """Always False in this base implementation."""
+        return False
 
     @lazyproperty
     def dimension_types(self):
@@ -97,6 +96,11 @@ def shape(self):
             "must be implemented by each subclass"
         )  # pragma: no cover
 
+    @lazyproperty
+    def variable_name(self):
+        """str name of the dimension used as the superheading variable.
+
+        The first dimension is used when the partition has fewer than two
+        dimensions, the second dimension otherwise.
+        """
+        superheading_idx = 1 if self.ndim >= 2 else 0
+        return self._dimensions[superheading_idx].name
+
 
 class _Slice(CubePartition):
     """2D cube partition.
@@ -167,6 +171,10 @@ def columns_margin(self):
     def counts(self):
         return np.array([row.values for row in self._matrix.rows])
 
+    @lazyproperty
+    def cube_is_mr_by_itself(self):
+        """Value of `is_mr_by_itself` read from this slice's cube."""
+        return self._cube.is_mr_by_itself
+
     @lazyproperty
     def description(self):
         """str description of this slice, which it takes from its rows-dimension."""
@@ -182,6 +190,35 @@ def inserted_column_idxs(self):
     def inserted_row_idxs(self):
         return tuple(i for i, row in enumerate(self._matrix.rows) if row.is_insertion)
 
+    @lazyproperty
+    def insertions(self):
+        """Return p-values and z-scores for the inserted rows/columns only.
+
+        The result stacks two arrays shaped like `pvals` along a new leading
+        axis: index 0 holds the p-values and index 1 the z-scores. Every cell
+        that lies in neither an inserted row nor an inserted column is replaced
+        by `np.inf`. For example, with inserted rows 2 and 4 and inserted
+        column 5, one layer of the result looks like:
+
+                   0      1      2      3      4      5      6
+            0    inf    inf    inf    inf    inf  -2.90    inf
+            1    inf    inf    inf    inf    inf  -4.30    inf
+            2   2.50   1.30   3.30  -0.70  -7.25  -6.52   2.25
+            3    inf    inf    inf    inf    inf  -2.51    inf
+            4  -1.16   2.20   5.84   1.78  -8.48  -5.92   0.93
+            5    inf    inf    inf    inf    inf   9.70    inf
+
+        Returns an empty list when the slice has no inserted rows and no
+        inserted columns.
+        """
+        inserted_rows = self.inserted_row_idxs
+        inserted_cols = self.inserted_column_idxs
+        # ---bail out only when BOTH axes are free of insertions; testing the
+        # ---columns twice would drop slices that only have inserted rows
+        if not inserted_rows and not inserted_cols:
+            return []
+        is_insertion = np.zeros(self.pvals.shape, dtype=bool)
+        # ---skip the assignment for an axis without insertions so an empty
+        # ---index sequence is never used for fancy indexing
+        if inserted_rows:
+            is_insertion[inserted_rows, :] = True
+        if inserted_cols:
+            is_insertion[:, inserted_cols] = True
+        # ---a masked-array mask marks the cells to DROP, hence the inversion---
+        not_insertion = np.logical_not(is_insertion)
+        masked_pvals = np.ma.masked_array(self.pvals, not_insertion).filled(np.inf)
+        masked_zscores = np.ma.masked_array(self.zscore, not_insertion).filled(
+            np.inf
+        )
+        return np.stack([masked_pvals, masked_zscores])
+
     @lazyproperty
     def is_empty(self):
         return any(s == 0 for s in self.shape)
@@ -202,6 +239,10 @@ def name(self):
         """
         return self.rows_dimension_name
 
+    @lazyproperty
+    def overlaps_tstats(self):
+        """Value of `overlaps_tstats` read from this slice's matrix."""
+        return self._matrix.overlaps_tstats
+
     @lazyproperty
     def pairwise_indices(self):
         alpha = self._transforms_dict.get("pairwise_indices", {}).get("alpha", 0.05)
@@ -245,6 +286,14 @@ def population_counts(self):
     def pvals(self):
         return np.array([row.pvals for row in self._matrix.rows])
 
+    @lazyproperty
+    def residual_test_stats(self):
+        """ndarray stacking `pvals` and `zscore` (with HS) on a new first axis.
+
+        Index 0 of the result holds the p-values and index 1 the z-scores.
+        Public method used as cube_method for the SOA API.
+        """
+        pvals_and_zscores = (self.pvals, self.zscore)
+        return np.stack(pvals_and_zscores)
+
     @lazyproperty
     def row_base(self):
         return np.array([row.base for row in self._matrix.rows])
@@ -458,6 +507,10 @@ def table_name(self):
 
         title = self._cube.name
         table_name = self._cube.dimensions[0].valid_elements[self._slice_idx].label
+
+        if self._cube.is_mr_by_itself:
+            return title
+
         return "%s: %s" % (title, table_name)
 
     @lazyproperty

diff --git a/src/cr/cube/dimension.py b/src/cr/cube/dimension.py
@@ -258,6 +258,15 @@ def __init__(self, dimension_dict, dimension_type, dimension_transforms=None):
         self._dimension_type = dimension_type
         self._dimension_transforms_arg = dimension_transforms
 
+    @lazyproperty
+    def alias(self):
+        """Alias stored under this dimension's "references", None when absent.
+
+        Comparing the aliases of two dimensions is one of the mandatory
+        checks for recognizing an MRxItself cube.
+        """
+        references = self._dimension_dict["references"]
+        return references.get("alias")
+
     @lazyproperty
     def all_elements(self):
         """_AllElements object providing cats or subvars of this dimension.

diff --git a/src/cr/cube/matrix.py b/src/cr/cube/matrix.py
@@ -15,7 +15,7 @@
 from scipy.stats.contingency import expected_freq
 
 from cr.cube.enum import DIMENSION_TYPE as DT
-from cr.cube.util import lazyproperty
+from cr.cube.util import lazyproperty, calculate_overlap_tstats
 
 
 class TransformedMatrix(object):
@@ -37,6 +37,14 @@ def columns(self):
             if not column.hidden
         )
 
+    @lazyproperty
+    def overlaps_tstats(self):
+        """Overlap t-stats of the unordered matrix, or None.
+
+        The values are only forwarded when the unordered matrix reports
+        `_is_cat_x_mr_x_itself`; any other matrix yields None.
+        """
+        # NOTE(review): `_is_cat_x_mr_x_itself` is only visibly defined on
+        # `_CatXMrMatrix` -- confirm the other matrix types provide it too.
+        unordered_matrix = self._unordered_matrix
+        if unordered_matrix._is_cat_x_mr_x_itself:
+            return unordered_matrix.overlaps_tstats
+        return None
+
     @lazyproperty
     def rows(self):
         return tuple(
@@ -218,7 +226,24 @@ def factory(cls, cube, dimensions, slice_idx):
         base_counts = cube.base_counts
         counts_with_missings = cube.counts_with_missings
         dimension_types = cube.dimension_types[-2:]
+        if cube.dimension_types == (DT.CAT, DT.MR, DT.MR) and cube.is_mr_by_itself:
 
+            overlap_tstats = calculate_overlap_tstats(
+                _MrXMrMatrix, dimensions, counts, base_counts, counts_with_missings
+            )
+
+            # These are apparent dimensions which hide 'selections' dims behind 'MR'
+            dimensions = cube.dimensions[:-1]
+            counts = np.sum(counts[:, :, :, 0], axis=3)
+            base_counts = np.sum(base_counts[:, :, :, 0], axis=3)
+            counts_with_missings = np.sum(counts_with_missings[:, :, :, 0], axis=3)
+            return _CatXMrMatrix(
+                dimensions,
+                counts,
+                base_counts,
+                counts_with_missings,
+                overlaps=overlap_tstats,
+            )
         # For cubes with means, create one of the means-matrix types
         if cube.has_means:
             if cube.ndim == 3:
@@ -562,13 +587,25 @@ class _CatXMrMatrix(_MatrixWithMR):
     (which correspond to the MR dimension).
     """
 
+    def __init__(
+        self, dimensions, counts, base_counts, counts_with_missings, overlaps=None
+    ):
+        # ---the first four arguments are handed to the parent initializer
+        # ---unchanged; only `overlaps` is handled here
+        super(_CatXMrMatrix, self).__init__(
+            dimensions, counts, base_counts, counts_with_missings
+        )
+        # ---optional; stays None unless the caller supplies overlap t-stats.
+        # ---Read back by `overlaps_tstats` and `_is_cat_x_mr_x_itself`.
+        self._overlaps = overlaps
+
     @lazyproperty
     def columns(self):
+        """tuple of _CatXMrVector, one per item yielded by `_column_generator`.
+
+        The counts and base-counts of each item are transposed before being
+        handed to the vector.
+        """
         return tuple(
             _CatXMrVector(counts.T, base_counts.T, element, table_margin)
             for counts, base_counts, element, table_margin in self._column_generator
         )
 
+    @lazyproperty
+    def overlaps_tstats(self):
+        """The `overlaps` value given at construction, or None.
+
+        None is returned whenever `_is_cat_x_mr_x_itself` is falsy.
+        """
+        return self._overlaps if self._is_cat_x_mr_x_itself else None
+
     @lazyproperty
     def rows(self):
         return tuple(
@@ -624,8 +661,14 @@ def _column_generator(self):
             self.table_margin,
         )
 
+    @lazyproperty
+    def _is_cat_x_mr_x_itself(self):
+        """True when overlap values were supplied, i.e. `_overlaps` is not None."""
+        return self._overlaps is not None
+
     @lazyproperty
     def _zscores(self):
+        # if the cube is a special one (5D with MRxItself as last dims)
+        # the zscores should be the same as a 2D MRxMR matrix
         return self._array_type_std_res(
             self._counts[:, :, 0],
             self.table_margin,
@@ -635,9 +678,11 @@ def _zscores(self):
 
 
 class _CatXMrMeansMatrix(_CatXMrMatrix):
-    def __init__(self, dimensions, means, base_counts):
+    def __init__(self, dimensions, means, base_counts, overlaps=None):
+        """Build a means matrix on top of zero-filled counts.
+
+        `means` is stored on the instance and also fixes the shape of the
+        zero-filled `counts` array handed to the parent. `overlaps` is
+        optional and is forwarded to the parent as its `overlaps` argument.
+        """
         counts = np.zeros(means.shape)
-        super(_CatXMrMeansMatrix, self).__init__(dimensions, counts, base_counts)
+        # ---`_CatXMrMatrix.__init__` takes `counts_with_missings` as its fourth
+        # ---positional parameter, so `overlaps` must go by keyword or it is
+        # ---silently bound to `counts_with_missings` instead.
+        super(_CatXMrMeansMatrix, self).__init__(
+            dimensions,
+            counts,
+            base_counts,
+            counts_with_missings=None,
+            overlaps=overlaps,
+        )
         self._means = means
 
     @lazyproperty
@@ -667,6 +712,10 @@ def columns(self):
             for counts, base_counts, element, table_margin in self._column_generator
         )
 
+    @lazyproperty
+    def _mr_shadow_proportions(self):
+        """Proportions of item-wise selections and overlaps between MR items.
+
+        Cube containing item-wise selections, overlap, and nonoverlap with
+        all other items in a multiple response dimension, for each element of
+        any prepended dimensions: a 1d interface to a 4d hypercube of
+        underlying counts.
+
+        Computed as the counts at index 0 of axes 1 and 3 divided by
+        `_pairwise_overlap_total`.
+        """
+        # NOTE(review): index 0 on axes 1 and 3 presumably picks the
+        # "selected" category of each MR dimension -- confirm.
+        return self._counts[:, 0, :, 0] / self._pairwise_overlap_total
+
     @lazyproperty
     def rows(self):
         return tuple(
@@ -696,6 +745,53 @@ def table_base(self):
     def table_margin(self):
         return np.sum(self._counts, axis=(1, 3))
 
+    @lazyproperty
+    def tstats_overlap(self):
+        """ndarray of pairwise t-stats corrected for overlapping observations.
+
+        The statistic follows:
+
+            t = (pi - pj) / s.e.(pi - pj)
+
+            s.e.(pi - pj) = sqrt(
+                pi * (1 - pi) / ni
+                + pj * (1 - pj) / nj
+                - 2 * nij * (pij - pi * pj) / (ni * nj)
+            )
+
+        where ni and nj are the base sizes of the two subvars, nij is the
+        number of overlapping observations and pij is the proportion for which
+        both subvars are selected. In this MRxMR case pi and ni are read from
+        the diagonals of `_mr_shadow_proportions` and `table_base`, while pij
+        and nij are those same two matrices taken whole. The diagonal of the
+        result is set to 0.
+        """
+        proportions = self._mr_shadow_proportions
+        bases = self.table_base
+        p_diag = proportions.diagonal()
+        n_diag = bases.diagonal()
+
+        # ---numerator: pairwise difference pi - pj of the subvar proportions---
+        numerator = np.subtract.outer(p_diag, p_diag)
+
+        # ---uncorrected part: pi * (1 - pi) / ni + pj * (1 - pj) / nj---
+        per_subvar_variance = p_diag * (1 - p_diag) / n_diag
+        summed_variance = np.add.outer(per_subvar_variance, per_subvar_variance)
+
+        # ---overlap correction: 2 * nij * (pij - pi * pj) / (ni * nj)---
+        overlap_term = (
+            2 * bases * (proportions - np.multiply.outer(p_diag, p_diag))
+        ) / np.multiply.outer(n_diag, n_diag)
+
+        t_stats = numerator / np.sqrt(summed_variance - overlap_term)
+        # ---a subvar compared with itself is reported as 0---
+        np.fill_diagonal(t_stats, 0)
+        return t_stats
+
     @lazyproperty
     def _baseline(self):
         """ndarray of baseline values for column index.
@@ -725,6 +821,10 @@ def _column_generator(self):
             self.table_margin.T,
         )
 
+    @lazyproperty
+    def _pairwise_overlap_total(self):
+        """Symmetric square matrix of valid observations between all pairs.
+
+        Given a 4d hypercube of multiple response items, the counts are
+        summed over axis 1 and then over the last remaining axis, leaving
+        one total per pair of items.
+
+        Example: n1 = 2; n2 = 2; n12 = 1; overlap total = 3
+        """
+        # ---after axis 1 is summed away the array is 3d, so `axis=2` below is
+        # ---what was axis 3 of `_counts`
+        return np.sum(np.sum(self._counts, axis=1), axis=2)
+
     @lazyproperty
     def _row_generator(self):
         return zip(

diff --git a/src/cr/cube/measures/new_pairwise_significance.py b/src/cr/cube/measures/new_pairwise_significance.py
@@ -78,8 +78,11 @@ def t_stats(self):
         return diff / se_diff
 
     @lazyproperty
-    def t_stats_scale_means(self):
+    def t_stats_correct(self):
+        """The slice's `overlaps_tstats` taken at this column index on axis 1."""
+        return self._slice.overlaps_tstats[:, self._col_idx, :]
 
+    @lazyproperty
+    def t_stats_scale_means(self):
         """
         This property calculates the Two-tailed t-test using the formula:
         t = X1 - X2 / Sx1x2 * sqrt(1/n1 + 1/n2)
@@ -119,6 +122,10 @@ def p_vals_scale_means(self):
 
     @lazyproperty
     def p_vals(self):
+        """Two-tailed p-values, computed as 2 * (1 - t.cdf(|t-stats|, df)).
+
+        The t-stats come from `t_stats_correct` when the slice reports
+        `cube_is_mr_by_itself`, and from `t_stats` otherwise.
+        """
+        # if the cube to which the slice belongs is a CATxMRxITSELF
+        # returns the pvals using the t_stats_correct values
+        if self._slice.cube_is_mr_by_itself:
+            return 2 * (1 - t.cdf(abs(self.t_stats_correct), df=self._df))
         return 2 * (1 - t.cdf(abs(self.t_stats), df=self._df))
 
     @lazyproperty
@@ -167,11 +174,15 @@ def summary_p_vals(self):
 
     @lazyproperty
     def _df(self):
+        """Degrees of freedom: column bases plus the selected column's base.
+
+        2 is subtracted from that sum unless the slice reports
+        `cube_is_mr_by_itself`.
+        """
+        # if the cube to which the slice belongs is a CATxMRxITSELF
+        # returns the n1 + n2 as degrees of freedom, n1 + n2 -2 otherwise
         selected_unweighted_n = (
             self._slice.column_base[self._col_idx]
             if self._slice.column_base.ndim < 2
             else self._slice.column_base[:, self._col_idx][:, None]
         )
+        if self._slice.cube_is_mr_by_itself:
+            return self._slice.column_base + selected_unweighted_n
         return self._slice.column_base + selected_unweighted_n - 2
 
     @lazyproperty

diff --git a/src/cr/cube/util.py b/src/cr/cube/util.py
@@ -13,6 +13,21 @@
     from itertools import filterfalse as ifilterfalse
 
 
+def calculate_overlap_tstats(
+    cls, mr_dimensions, mr_counts, mr_base_counts, mr_counts_with_missings
+):
+    """Return the overlap t-stats of every slice along the first axis.
+
+    For each index on axis 0 of `mr_counts`, `cls` is instantiated with
+    `mr_dimensions` and that index's counts, base counts and
+    counts-with-missings, and its `tstats_overlap` is stored at the same index
+    of the result. The result takes its shape from axes 0, 1 and 3 of
+    `mr_counts`.
+    """
+    counts_shape = mr_counts.shape
+    overlaps = np.zeros((counts_shape[0], counts_shape[1], counts_shape[3]))
+    for slice_index, slice_counts in enumerate(mr_counts):
+        slice_matrix = cls(
+            mr_dimensions,
+            slice_counts,
+            mr_base_counts[slice_index],
+            mr_counts_with_missings[slice_index],
+        )
+        overlaps[slice_index] = slice_matrix.tstats_overlap
+    return overlaps
+
+
 def compress_pruned(table):
     """Compress table based on pruning mask.