Skip to content

Commit

Permalink
Merge ebaafdb into 4ceebee
Browse files Browse the repository at this point in the history
  • Loading branch information
ernestoarbitrio committed Jun 9, 2020
2 parents 4ceebee + ebaafdb commit f1c7826
Show file tree
Hide file tree
Showing 2 changed files with 54 additions and 144 deletions.
174 changes: 41 additions & 133 deletions src/cr/cube/cubepart.py
Original file line number Diff line number Diff line change
Expand Up @@ -86,6 +86,11 @@ def ndim(self):
"""int count of dimensions for this partition."""
return len(self._dimensions)

@lazyproperty
def population_fraction(self):
    """Population fraction, as reported by the cube behind this partition."""
    cube = self._cube
    return cube.population_fraction

@lazyproperty
def shape(self):
"""Tuple of int vector counts for this partition.
Expand Down Expand Up @@ -301,11 +306,6 @@ def population_counts(self):
self.table_proportions * self._population * self._cube.population_fraction
)

@lazyproperty
def population_fraction(self):
    """Population fraction, as reported by the cube behind this partition."""
    cube = self._cube
    return cube.population_fraction

@lazyproperty
def pvals(self):
    """ndarray built from the `pvals` attribute of each row of `self._matrix`."""
    matrix_rows = self._matrix.rows
    return np.array([matrix_row.pvals for matrix_row in matrix_rows])
Expand Down Expand Up @@ -436,77 +436,43 @@ def scale_means_rows_margin(self):

@lazyproperty
def scale_median_column(self):
""" -> np.int64 array of the columns scale median
The median is calculated in a way that assumes that the n point scale represents
a continuous random variable rather than n discrete categories.
Steps:
1. The middle point is calculated by dividing the sum of the counts by 2 if the
total counts is odd, for even number of entries, so we would actually take
the mean of the values at positions middle and middle + 1
2. Identify in which category (our numeric values) this middle point falls
""" -> np.int64 ndarray of the columns scale median
The median is calculated using the standard algebra applied to the numeric
values repeated for each related counts value
"""
if np.all(np.isnan(self._columns_dimension_numeric)):
return None

not_a_nan_index = ~np.isnan(self._columns_dimension_numeric)
numeric_values = self._columns_dimension_numeric[not_a_nan_index]
counts = self.counts[:, not_a_nan_index]
total_counts = np.sum(counts, axis=1)
# --- sorting counts by numeric values ---
sorted_counts = np.array(list(zip(*sorted(zip(numeric_values, counts.T))))[1]).T
# --- calc of the middle points considering even and odd case ---
middle_points = (
total_counts // 2
if counts.shape[1] % 2 == 1
else ((total_counts // 2) + ((total_counts // 2) + 1)) / 2
counts = self.counts[:, not_a_nan_index].astype("int64")
scale_median = np.array(
[
self._median(np.repeat(numeric_values, counts[i, :]))
for i in range(counts.shape[0])
]
)
# --- the median indices represent a list of idxs that express where the ---
# --- middle point falls ---
median_indices = self._compose_median_col_idxs(sorted_counts, middle_points)
# --- returns for each column the numeric value corresponding to the idx ---
# --- of the median_indices ---
return [
np.sort(numeric_values)[i] if not np.isnan(i) else np.nan
for i in median_indices
]
return scale_median

@lazyproperty
def scale_median_row(self):
""" -> np.int64 array of the rows scale median
The median is calculated in a way that assumes that the n point scale represents
a continuous random variable rather than n discrete categories.
Steps:
1. The middle point is calculated by dividing the sum of the counts by 2 if the
total counts is odd, for even number of entries, so we would actually take
the mean of the values at positions middle and middle + 1
2. Identify in which category (our numeric values) this middle point falls
""" -> np.int64 ndarray of the rows scale median
The median is calculated using the standard algebra applied to the numeric
values repeated for each related counts value
"""
if np.all(np.isnan(self._rows_dimension_numeric)):
return None

not_a_nan_index = ~np.isnan(self._rows_dimension_numeric)
numeric_values = self._rows_dimension_numeric[not_a_nan_index]
counts = self.counts[not_a_nan_index, :]
total_counts = np.sum(counts, axis=0)
# --- sorting counts by numeric values ---
sorted_counts = np.array(list(zip(*sorted(zip(numeric_values, counts))))[1])
# --- calc of the middle points considering even and odd case ---
middle_points = (
total_counts // 2
if counts.shape[0] % 2 == 1
else ((total_counts // 2) + ((total_counts // 2) + 1)) / 2
counts = self.counts[not_a_nan_index, :].astype("int64")
scale_median = np.array(
[
self._median(np.repeat(numeric_values, counts[:, i]))
for i in range(counts.shape[1])
]
)
# --- the median indices represent a list of idx that express where the ---
# --- middle point falls ---
median_indices = self._compose_median_row_idxs(sorted_counts, middle_points)
# --- returns for each row the numeric value corresponding to the idx ---
# --- of the median_indices ---
return [
np.sort(numeric_values)[i] if not np.isnan(i) else np.nan
for i in median_indices
]
return scale_median

@lazyproperty
def scale_median_column_margin(self):
Expand All @@ -518,16 +484,10 @@ def scale_median_column_margin(self):
columns_margin = columns_margin[0]
not_a_nan_index = ~np.isnan(self._columns_dimension_numeric)
numeric_values = self._columns_dimension_numeric[not_a_nan_index]
counts = columns_margin[not_a_nan_index]
middle_point = (
np.sum(counts) // 2
if len(counts) % 2 == 1
else ((np.sum(counts) // 2) + ((np.sum(counts) // 2) + 1)) / 2
)
sorted_counts = np.array(list(zip(*sorted(zip(numeric_values, counts))))[1])
median_index = np.where(np.cumsum(sorted_counts) > middle_point)[0]
counts = columns_margin[not_a_nan_index].astype("int64")
unwrapped_num_values = np.repeat(numeric_values, counts)
return (
np.sort(numeric_values)[median_index[0]] if median_index.size != 0 else None
np.median(unwrapped_num_values) if unwrapped_num_values.size != 0 else None
)

@lazyproperty
Expand All @@ -540,16 +500,10 @@ def scale_median_row_margin(self):
rows_margin = rows_margin[:, 0]
not_a_nan_index = ~np.isnan(self._rows_dimension_numeric)
numeric_values = self._rows_dimension_numeric[not_a_nan_index]
counts = rows_margin[not_a_nan_index]
middle_point = (
np.sum(counts) // 2
if len(counts) % 2 == 1
else ((np.sum(counts) // 2) + ((np.sum(counts) // 2) + 1)) / 2
)
sorted_counts = np.array(list(zip(*sorted(zip(numeric_values, counts))))[1])
median_index = np.where(np.cumsum(sorted_counts) > middle_point)[0]
counts = rows_margin[not_a_nan_index].astype("int64")
unwrapped_num_values = np.repeat(numeric_values, counts)
return (
np.sort(numeric_values)[median_index[0]] if median_index.size != 0 else None
np.median(unwrapped_num_values) if unwrapped_num_values.size != 0 else None
)

@lazyproperty
Expand Down Expand Up @@ -741,32 +695,6 @@ def _columns_variance(self):
self.counts / self.columns_margin * (1 - self.counts / self.columns_margin)
)

def _compose_median_row_idxs(self, sorted_counts, middle_points):
    """ -> list of idx corresponding to the median values of rows scale

    `sorted_counts` is a 2D array of counts; for each of its columns, the
    counts are accumulated down axis 0 and the first position whose running
    total is strictly greater than the matching entry of `middle_points` is
    recorded. A column whose running total never exceeds its middle point
    contributes `np.nan` instead of an index.
    """
    # --- the running totals do not depend on the column being inspected, ---
    # --- so compute them once instead of once per loop iteration ---
    cumulative_counts = np.cumsum(sorted_counts, axis=0)
    median_indices = []
    for idx in range(cumulative_counts.shape[1]):
        idx_array = np.where(cumulative_counts[:, idx] > middle_points[idx])[0]
        median_indices.append(idx_array[0] if idx_array.size != 0 else np.nan)
    return median_indices

def _compose_median_col_idxs(self, sorted_counts, middle_points):
    """ -> list of idx corresponding to the median values of cols scale

    `sorted_counts` is a 2D array of counts; for each of its rows, the counts
    are accumulated along axis 1 and the first position whose running total
    is strictly greater than the matching entry of `middle_points` is
    recorded. A row whose running total never exceeds its middle point
    contributes `np.nan` instead of an index.
    """
    # --- the running totals do not depend on the row being inspected, ---
    # --- so compute them once instead of once per loop iteration ---
    cumulative_counts = np.cumsum(sorted_counts, axis=1)
    median_indices = []
    for idx in range(cumulative_counts.shape[0]):
        idx_array = np.where(cumulative_counts[idx, :] > middle_points[idx])[0]
        median_indices.append(idx_array[0] if idx_array.size != 0 else np.nan)
    return median_indices

@lazyproperty
def _dimensions(self):
"""tuple of (rows_dimension, columns_dimension) Dimension objects."""
Expand All @@ -782,6 +710,9 @@ def _matrix(self):
"""The TransformedMatrix object for this slice."""
return TransformedMatrix.matrix(self._cube, self._dimensions, self._slice_idx)

def _median(self, values):
    """Return the median of ndarray `values`, or `np.nan` when it is empty."""
    if values.size == 0:
        return np.nan
    return np.median(values)

@lazyproperty
def _rows_dimension(self):
    """First item of `self._dimensions`, i.e. the rows dimension."""
    dimensions = self._dimensions
    return dimensions[0]
Expand Down Expand Up @@ -883,11 +814,6 @@ def population_counts(self):
* self._cube.population_fraction
)

@lazyproperty
def population_fraction(self):
    """Population fraction, as reported by the cube behind this partition."""
    cube = self._cube
    return cube.population_fraction

@lazyproperty
def row_base(self):
    """ndarray built from the `base` attribute of each row of `self._stripe`."""
    stripe_rows = self._stripe.rows
    return np.array([stripe_row.base for stripe_row in stripe_rows])
Expand Down Expand Up @@ -954,33 +880,15 @@ def scale_mean(self):
def scale_median(self):
""" -> np.int64, the median of scales
The median is calculated in a way that assumes that the n point scale represents
a continuous random variable rather than n discrete categories.
Steps:
1. The middle point is calculated by dividing the sum of the counts by 2 if the
total counts is odd, for even number of entries, so you would actually take
the mean of the values at positions middle and middle + 1
2. Identify in which category (our numeric values) this middle point falls
The median is calculated using the standard algebra applied to the numeric
values repeated for each related counts value
"""
if np.all(np.isnan(self._numeric_values)):
return None
numeric_values = self._numeric_values[self._numeric_values_mask]
counts = self._counts_as_array[self._numeric_values_mask]
middle_point = (
np.sum(counts) // 2
if len(counts) % 2 == 1
else ((np.sum(counts) // 2) + ((np.sum(counts) // 2) + 1)) / 2
)
sorted_counts = np.array(list(zip(*sorted(zip(numeric_values, counts))))[1])
# ---the median index contains all the indices where the middle point is lower
# ---than the cumsum elements
median_index = np.where(np.cumsum(sorted_counts) > middle_point)[0]
# ---returns the corresponding numeric value given the first median index---
return (
np.sort(numeric_values)[median_index[0]]
if median_index.size != 0
else np.nan
)
counts = self._counts_as_array[self._numeric_values_mask].astype("int64")
unwrapped_numeric_values = np.repeat(numeric_values, counts)
return np.median(unwrapped_numeric_values)

@lazyproperty
def scale_std_dev(self):
Expand Down
24 changes: 13 additions & 11 deletions tests/integration/test_scale_additional_measures.py
Original file line number Diff line number Diff line change
Expand Up @@ -103,7 +103,7 @@ def test_cat_x_ca_cat_x_items():
np.testing.assert_almost_equal(
slice_.scale_std_err_row, [0.216994, 0.3202223, 0.302742, 0.3055608]
)
np.testing.assert_almost_equal(slice_.scale_median_row, [1, 1, 4, 1])
np.testing.assert_almost_equal(slice_.scale_median_row, [1, 1, 2.5, 1])
assert slice_.scale_median_row_margin == 1
assert slice_.scale_median_column_margin is None
assert slice_.scale_std_dev_column is None
Expand Down Expand Up @@ -321,7 +321,7 @@ def test_cat_x_cat_with_hs():
[0.0847993, 0.0533474, 0.0515249, 0.0718528, 0.104389, 0.2488725],
)
np.testing.assert_almost_equal(slice_.scale_median_row, [2, 2, 2, 1, 1, 5])
np.testing.assert_almost_equal(slice_.scale_median_column, [4, 3, 3, 4, 4])
np.testing.assert_almost_equal(slice_.scale_median_column, [4, 3, 3, 3.5, 4])
assert slice_.scale_median_row_margin == 2
assert slice_.scale_median_column_margin == 3

Expand All @@ -344,7 +344,7 @@ def test_cat_x_cat_with_hs():
[0.0847993, 0.0533474, 0.0515249, 0.0718528, 0.104389, 0.2488725],
)
np.testing.assert_almost_equal(slice_.scale_median_row, [2, 2, 2, 1, 1, 5])
np.testing.assert_almost_equal(slice_.scale_median_column, [4, 3, 3, 3, 4, 4])
np.testing.assert_almost_equal(slice_.scale_median_column, [4, 3, 3, 3, 3.5, 4])
assert slice_.scale_median_row_margin == 2
assert slice_.scale_median_column_margin == 3

Expand Down Expand Up @@ -375,7 +375,7 @@ def test_cat_x_cat_with_hs_on_both_dims():
[0.0847993, 0.0533474, 0.0515249, 0.0718528, 0.104389, 0.2488725],
)
np.testing.assert_almost_equal(slice_.scale_median_row, [2, 2, 2, 1, 1, 5])
np.testing.assert_almost_equal(slice_.scale_median_column, [4, 3, 3, 4, 4])
np.testing.assert_almost_equal(slice_.scale_median_column, [4, 3, 3, 3.5, 4])
assert slice_.scale_median_row_margin == 2
assert slice_.scale_median_column_margin == 3

Expand All @@ -398,7 +398,7 @@ def test_cat_x_cat_with_hs_on_both_dims():
[0.0847993, 0.0533474, 0.0515249, 0.0388506, 0.0718528, 0.104389, 0.2488725],
)
np.testing.assert_almost_equal(slice_.scale_median_row, [2, 2, 2, 2, 1, 1, 5])
np.testing.assert_almost_equal(slice_.scale_median_column, [4, 3, 3, 3, 4, 4])
np.testing.assert_almost_equal(slice_.scale_median_column, [4, 3, 3, 3, 3.5, 4])
assert slice_.scale_median_row_margin == 2
assert slice_.scale_median_column_margin == 3

Expand Down Expand Up @@ -516,7 +516,9 @@ def test_cat_x_cat_pruning_and_hs():
[0.1102738, 0.7064704, 0.4111442, 0.17486, np.nan, 0.4784233],
)
np.testing.assert_almost_equal(slice_.scale_median_row, [1, 1, 1, np.nan, 3])
np.testing.assert_almost_equal(slice_.scale_median_column, [2, 1, 2, 2, np.nan, 2])
np.testing.assert_almost_equal(
slice_.scale_median_column, [2, 1, 2, 2, np.nan, np.nan]
)
assert slice_.scale_median_row_margin == 1
assert slice_.scale_median_column_margin == 2

Expand All @@ -540,7 +542,7 @@ def test_cat_x_cat_pruning_and_hs():
)
np.testing.assert_almost_equal(slice_.scale_median_row, [1, 1, 1, 1, np.nan, 3])
np.testing.assert_almost_equal(
slice_.scale_median_column, [2, 2, 1, 2, 2, np.nan, 2]
slice_.scale_median_column, [2, 2, 1, 2, 2, np.nan, np.nan]
)
assert slice_.scale_median_row_margin == 1
assert slice_.scale_median_column_margin == 2
Expand All @@ -566,7 +568,7 @@ def test_cat_x_cat_pruning_and_hs():
[0.1102738, 0.1933713, 0.7064704, 0.4111442, 0.17486, 0.4784233],
)
np.testing.assert_almost_equal(slice_.scale_median_row, [1, 1, 1, 1, 3])
np.testing.assert_almost_equal(slice_.scale_median_column, [2, 2, 3, 2, 2, 2])
np.testing.assert_almost_equal(slice_.scale_median_column, [2, 2, 1, 2, 2, np.nan])
assert slice_.scale_median_row_margin == 1
assert slice_.scale_median_column_margin == 2

Expand All @@ -591,7 +593,7 @@ def test_cat_x_cat_pruning_and_hs():
[0.1102738, 0.7064704, 0.4111442, 0.17486, 0.4784233],
)
np.testing.assert_almost_equal(slice_.scale_median_row, [1, 1, 1, 3])
np.testing.assert_almost_equal(slice_.scale_median_column, [2, 3, 2, 2, 2])
np.testing.assert_almost_equal(slice_.scale_median_column, [2, 1, 2, 2, np.nan])
assert slice_.scale_median_row_margin == 1
assert slice_.scale_median_column_margin == 2

Expand Down Expand Up @@ -629,7 +631,7 @@ def test_bivariate_cat():
[0.0558603, 0.0486317, 0.0447584, 0.063111, 0.7698004, 0.1938773],
)
np.testing.assert_almost_equal(slice_.scale_median_row, [2, 2, 2, 1, 1, 5])
np.testing.assert_almost_equal(slice_.scale_median_column, [4, 3, 3, 3, 4, 4])
np.testing.assert_almost_equal(slice_.scale_median_column, [4, 3, 3, 3, 3.5, 4])
assert slice_.scale_median_row_margin == 2
assert slice_.scale_median_column_margin == 3

Expand Down Expand Up @@ -741,7 +743,7 @@ def test_cat_x_cat_arr_pets_first():
np.testing.assert_almost_equal(slice_.scale_std_err_column, [0.0774597, 0.0724569])
np.testing.assert_almost_equal(slice_.scale_median_row, [1, 2])
np.testing.assert_almost_equal(slice_.scale_median_column, [2, 2])
assert slice_.scale_median_row_margin == 2
assert slice_.scale_median_row_margin == 1.5
assert slice_.scale_median_column_margin == 2

slice_ = Cube(CR.FRUIT_X_PETS_ARRAY_PETS_FIRST).partitions[2]
Expand Down

0 comments on commit f1c7826

Please sign in to comment.