Merge b19c139 into 9745bb1

Crunch-io · Nov 13, 2020 · 77deb7e · 77deb7e
2 parents 9745bb1 + b19c139
commit 77deb7e
Show file tree

Hide file tree

Showing 22 changed files with 557 additions and 386 deletions.
diff --git a/src/cr/cube/cubepart.py b/src/cr/cube/cubepart.py
@@ -298,31 +298,31 @@ def columns_margin(self):
         return np.array([column.margin for column in self._matrix.columns]).T
 
     @lazyproperty
-    def columns_percentages_moe(self):
-        """1D/2D np.float64 ndarray of margin-of-error (MoE) for columns percentages.
+    def column_proportions_moe(self):
+        """1D/2D np.float64 ndarray of margin-of-error (MoE) for columns proportions.
 
-        The values are represented as percentages, analogue to the `table_percentages`
-        property. This means that the value of 3.5% will have the value 3.5 (not 0.035).
+        The values are represented as fractions, analogue to the `column_proportions`
+        property. This means that the value of 3.5% will have the value 0.035.
         The values can be np.nan when the corresponding percentage is also np.nan, which
         happens when the respective columns margin is 0.
         """
-        return Z_975 * 100 * self.columns_std_err
+        return Z_975 * self.column_std_err
 
     @lazyproperty
-    def columns_std_dev(self):
-        """standard deviation for column percentages
+    def column_std_err(self):
+        """standard error for column percentages
 
-        `std_deviation = sqrt(variance)`
+        `std_error = sqrt(variance/N)`
         """
-        return np.sqrt(self._columns_variance)
+        return np.sqrt(self._column_variance / self.columns_margin)
 
     @lazyproperty
-    def columns_std_err(self):
-        """standard error for column percentages
+    def column_std_dev(self):
+        """standard deviation for column percentages
 
-        `std_error = sqrt(variance/N)`
+        `std_deviation = sqrt(variance)`
         """
-        return np.sqrt(self._columns_variance / self.columns_margin)
+        return np.sqrt(self._column_variance)
 
     @lazyproperty
     def counts(self):
@@ -459,6 +459,19 @@ def population_counts(self):
             self.table_proportions * self._population * self._cube.population_fraction
         )
 
+    @lazyproperty
+    def population_moe(self):
+        """2D np.float64 ndarray of population margin-of-error (MoE) for table percents.
+
+        The values are represented as population estimates, analogue to the
+        `population_counts` property. This means that the values will be presented by
+        actual estimated counts of the population. The values can be np.nan when the
+        corresponding percentage is also np.nan, which happens when the respective
+        table margin is 0.
+        """
+        total_filtered_population = self._population * self._cube.population_fraction
+        return Z_975 * total_filtered_population * self.table_std_err
+
     @lazyproperty
     def pvals(self):
         return np.array([row.pvals for row in self._matrix.rows])
@@ -526,6 +539,29 @@ def rows_dimension_type(self):
     def rows_margin(self):
         return np.array([row.margin for row in self._matrix.rows])
 
+    @lazyproperty
+    def row_proportions_moe(self):
+        """2D np.float64 ndarray of margin-of-error (MoE) for rows proportions.
+
+        The values are represented as percentage-fractions, analogue to the
+        `row_proportions` property. This means that the value of 3.5% will have the
+        value 0.035. The values can be np.nan when the corresponding percentage is also
+        np.nan, which happens when the respective rows margin is 0.
+        """
+        return Z_975 * self.row_std_err
+
+    @lazyproperty
+    def row_std_dev(self):
+        """2D np.float64 ndarray of standard deviation for row percentages."""
+        return np.sqrt(self._row_variance)
+
+    @lazyproperty
+    def row_std_err(self):
+        """2D np.float64 ndarray of standard errors for row percentages."""
+        # --- We need to add `np.newaxis` to cast the rows margin vector to an actual
+        # --- column, in NumPy terms, to be able to divide correctly.
+        return np.sqrt(self._row_variance / self.rows_margin[:, np.newaxis])
+
     @lazyproperty
     def scale_mean_pairwise_indices(self):
         """Sequence of column-idx tuples indicating pairwise-t result of scale-means.
@@ -802,15 +838,15 @@ def table_percentages(self):
         return self.table_proportions * 100
 
     @lazyproperty
-    def table_percentages_moe(self):
-        """1D/2D np.float64 ndarray of margin-of-error (MoE) for table percentages.
+    def table_proportions_moe(self):
+        """1D/2D np.float64 ndarray of margin-of-error (MoE) for table proportions.
 
-        The values are represented as percentages, analogue to the `table_percentages`
-        property. This means that the value of 3.5% will have the value 3.5 (not 0.035).
-        The values can be np.nan when the corresponding percentage is also np.nan, which
+        The values are represented as fractions, analogue to the `table_proportions`
+        property. This means that the value of 3.5% will have the value 0.035. The
+        values can be np.nan when the corresponding percentage is also np.nan, which
         happens when the respective table margin is 0.
         """
-        return Z_975 * 100 * self.table_std_err
+        return Z_975 * self.table_std_err
 
     @lazyproperty
     def table_proportions(self):
@@ -891,7 +927,7 @@ def _columns_dimension_numeric_values(self):
         return np.array([column.numeric_value for column in self._matrix.columns])
 
     @lazyproperty
-    def _columns_variance(self):
+    def _column_variance(self):
         """variance for column percentages
 
         `variance = p * (1-p)`
@@ -900,6 +936,16 @@ def _columns_variance(self):
             self.counts / self.columns_margin * (1 - self.counts / self.columns_margin)
         )
 
+    @lazyproperty
+    def _row_variance(self):
+        """ndarray of variances for row percentages"""
+        # --- Rows margin is a vector, that's supposed to represent a column (to the
+        # --- right of the crosstab). We need to divide all values in the crosstab by it
+        # --- and therefore need to cast it to an actual column (because of how NumPy
+        # --- does broadcasting).
+        margin = self.rows_margin[:, np.newaxis]
+        return self.counts / margin * (1 - self.counts / margin)
+
     @lazyproperty
     def _dimensions(self):
         """tuple of (rows_dimension, columns_dimension) Dimension objects."""
@@ -1006,6 +1052,19 @@ def population_counts(self):
             * self._cube.population_fraction
         )
 
+    @lazyproperty
+    def population_moe(self):
+        """1D np.float64 ndarray of population margin-of-error (MoE) for table percents.
+
+        The values are represented as population estimates, analogue to the
+        `population_counts` property. This means that the values will be presented by
+        actual estimated counts of the population. The values can be np.nan when the
+        corresponding percentage is also np.nan, which happens when the respective
+        table margin is 0.
+        """
+        total_filtered_population = self._population * self._cube.population_fraction
+        return Z_975 * total_filtered_population * self.standard_error
+
     @lazyproperty
     def row_base(self):
         return np.array([row.base for row in self._stripe.rows])
@@ -1127,15 +1186,15 @@ def standard_error(self):
         return np.sqrt(self._variance / np.sum(self.rows_margin))
 
     @lazyproperty
-    def table_percentages_moe(self):
-        """1D np.float64 ndarray of margin-of-error (MoE) for table percentages.
+    def table_proportions_moe(self):
+        """1D np.float64 ndarray of margin-of-error (MoE) for table proportions.
 
-        The values are represented as percentages, analogue to the `table_percentages`
-        property. This means that the value of 3.5% will have the value 3.5 (not 0.035).
-        The values can be np.nan when the corresponding percentage is also np.nan, which
+        The values are represented as fractions, analogue to the `table_proportions`
+        property. This means that the value of 3.5% will have the value 0.035. The
+        values can be np.nan when the corresponding proportion is also np.nan, which
         happens when the respective columns margin is 0.
         """
-        return Z_975 * 100 * self.standard_error
+        return Z_975 * self.standard_error
 
     @lazyproperty
     def table_base(self):

diff --git a/tests/expectations/admit-x-dept-unweighted-col-prop-moe.py b/tests/expectations/admit-x-dept-unweighted-col-prop-moe.py
@@ -0,0 +1,18 @@
+[
+    [
+        0.0307207565,
+        0.0390691882,
+        0.030869894,
+        0.0329827837,
+        0.0351988285,
+        0.0180081013,
+    ],
+    [
+        0.0307207565,
+        0.0390691882,
+        0.030869894,
+        0.0329827837,
+        0.0351988285,
+        0.0180081013,
+    ],
+]
diff --git a/tests/expectations/cat-hs-x-mr-col-moe.py b/tests/expectations/cat-hs-x-mr-col-moe.py
@@ -1,10 +1,10 @@
 [
-    [17.30181459, 7.76167031, 2.58919072, 1.96696835, 2.47551804],
-    [13.7464937, 8.90617586, 6.33244289, 3.30105103, 3.99757694],
-    [16.79140176, 10.72624566, 6.68369737, 3.71703613, 4.42265749],
+    [0.17301815, 0.0776167, 0.02589191, 0.01966968, 0.02475518],
+    [0.13746494, 0.08906176, 0.06332443, 0.03301051, 0.03997577],
+    [0.16791402, 0.10726246, 0.06683697, 0.03717036, 0.04422657],
     [0.0, 0.0, 0.0, 0.0, 0.0],
-    [11.79621344, 11.12694011, 8.58251349, 4.91075221, 4.79812657],
-    [14.76383504, 11.06914477, 8.61521467, 5.11357472, 4.93988229],
+    [0.11796213, 0.1112694, 0.08582513, 0.04910752, 0.04798127],
+    [0.14763835, 0.11069145, 0.08615215, 0.05113575, 0.04939882],
     [0.0, 0.0, 0.0, 0.0, 0.0],
-    [16.79140176, 10.72624566, 6.68369737, 3.71703613, 4.42265749],
+    [0.16791402, 0.10726246, 0.06683697, 0.03717036, 0.04422657],
 ]
diff --git a/tests/expectations/cat-x-cat-pruning-hs-col-prop-moe.py b/tests/expectations/cat-x-cat-pruning-hs-col-prop-moe.py
@@ -0,0 +1,30 @@
+[
+    [
+        0.1351426726,
+        0.1079256616,
+        0.1659188199,
+        0.2248817088,
+        float("NaN"),
+        0.5331123764,
+    ],
+    [
+        0.1289084933,
+        0.104849174,
+        0.163342716,
+        0.2223457567,
+        float("NaN"),
+        0.5331123764,
+    ],
+    [0.0539900809, 0.0284154374, 0.0, 0.0993830606, float("NaN"), 0.0],
+    [
+        0.0899374047,
+        0.0709061236,
+        0.1108349041,
+        0.1403617724,
+        float("NaN"),
+        0.5331123764,
+    ],
+    [0.0899374047, 0.0857336265, 0.1452191695, 0.1975240786, float("NaN"), 0.0],
+    [0.0, 0.0, 0.0, 0.0, float("NaN"), 0.0],
+    [0.0524385075, 0.0387372796, 0.0573464003, 0.0993830606, float("NaN"), 0.0],
+]
diff --git a/tests/expectations/cat-x-cat-pruning-hs-table-prop-moe.py b/tests/expectations/cat-x-cat-pruning-hs-table-prop-moe.py
@@ -0,0 +1,30 @@
+[
+    [
+        0.0947425342,
+        0.1023031735,
+        0.0851416179,
+        0.064011466,
+        0.0,
+        0.0210138857,
+    ],
+    [
+        0.0548548061,
+        0.083257033,
+        0.0691062699,
+        0.0577563348,
+        0.0,
+        0.030001713,
+    ],
+    [0.0216468583, 0.0216468583, 0.0, 0.0210138857, 0.0, 0.0],
+    [
+        0.0367141987,
+        0.054641654,
+        0.0419511734,
+        0.0304381859,
+        0.0,
+        0.030001713,
+    ],
+    [0.0367141987, 0.0666431728, 0.0577563348, 0.046236001, 0.0, 0.0],
+    [0.0, 0.0, 0.0, 0.0, 0.0, 0.0],
+    [0.0210138857, 0.0295581821, 0.0210138857, 0.0210138857, 0.0, 0.0],
+]
diff --git a/tests/expectations/col-per-moe-cat-x-cat-hs-2rows-1col.py b/tests/expectations/col-per-moe-cat-x-cat-hs-2rows-1col.py
@@ -1,40 +1,8 @@
 [
-    [
-        13.03595844,
-        7.67698551,
-        3.46251469,
-        4.55693081,
-        4.13969905,
-        3.06644326,
-        7.58177966,
-    ],
-    [
-        9.31746956,
-        8.36644659,
-        3.78951977,
-        5.23042895,
-        3.72360922,
-        3.15148999,
-        7.65643283,
-    ],
-    [11.77008734, 8.47930382, 3.85500973, 5.5463129, 4.8153303, 3.66939254, 7.5418196],
-    [6.0015905, 7.16459682, 3.25399504, 4.39795907, 3.1556904, 2.63154691, 6.03640099],
-    [
-        10.57125967,
-        8.64082889,
-        3.91804373,
-        5.56024488,
-        4.45804303,
-        3.59253748,
-        8.05245981,
-    ],
-    [
-        10.91512996,
-        6.50723624,
-        2.9825236,
-        4.90998204,
-        4.89378128,
-        3.57587294,
-        5.83679508,
-    ],
+    [0.13035958, 0.07676986, 0.03462515, 0.04556931, 0.04139699, 0.03066443, 0.0758178],
+    [0.0931747, 0.08366447, 0.0378952, 0.05230429, 0.03723609, 0.0315149, 0.07656433],
+    [0.11770087, 0.08479304, 0.0385501, 0.05546313, 0.0481533, 0.03669393, 0.0754182],
+    [0.06001591, 0.07164597, 0.03253995, 0.04397959, 0.0315569, 0.02631547, 0.06036401],
+    [0.1057126, 0.08640829, 0.03918044, 0.05560245, 0.04458043, 0.03592537, 0.0805246],
+    [0.1091513, 0.06507236, 0.02982524, 0.04909982, 0.04893781, 0.03575873, 0.05836795],
 ]
diff --git a/tests/expectations/econ-gender-x-ideology-weighted-col-prop-moe.py b/tests/expectations/econ-gender-x-ideology-weighted-col-prop-moe.py
@@ -0,0 +1,18 @@
+[
+    [
+        0.115249326,
+        0.072633194,
+        0.050491687,
+        0.065859452,
+        0.088723517,
+        0.147331947,
+    ],
+    [
+        0.115249326,
+        0.072633194,
+        0.050491687,
+        0.065859452,
+        0.088723517,
+        0.147331947,
+    ],
+]
diff --git a/tests/expectations/econ-gender-x-ideology-weighted-row-prop-moe.py b/tests/expectations/econ-gender-x-ideology-weighted-row-prop-moe.py
@@ -0,0 +1,4 @@
+[
+    [0.02175933, 0.03332428, 0.04187784, 0.03716728, 0.0308031, 0.01415677],
+    [0.02346025, 0.03427124, 0.04290557, 0.0354381, 0.02346025, 0.01953654],
+]
diff --git a/tests/expectations/econ-gender-x-ideology-weighted-row-std-dev.py b/tests/expectations/econ-gender-x-ideology-weighted-row-std-dev.py
@@ -0,0 +1,4 @@
+[
+    [0.24824605, 0.3801874, 0.47777249, 0.42403103, 0.35142393, 0.16151057],
+    [0.26765143, 0.39099102, 0.48949758, 0.40430337, 0.26765143, 0.22288691],
+]
diff --git a/tests/expectations/econ-gender-x-ideology-weighted-table-prop-moe.py b/tests/expectations/econ-gender-x-ideology-weighted-table-prop-moe.py
@@ -0,0 +1,18 @@
+[
+    [
+        0.0110701312,
+        0.0175249771,
+        0.0236182549,
+        0.0199602358,
+        0.016037868,
+        0.0071269548,
+    ],
+    [
+        0.0119745296,
+        0.0181024584,
+        0.0247465565,
+        0.0188321449,
+        0.0119745296,
+        0.0099024628,
+    ],
+]