Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Unweighted n in tstat variance 165485967 #154

Merged
merged 3 commits into from
Apr 22, 2019
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 1 addition & 1 deletion src/cr/cube/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,4 +2,4 @@

"""Initialization module for crunch-cube package."""

__version__ = "1.9.16"
__version__ = "1.9.17"
6 changes: 3 additions & 3 deletions src/cr/cube/measures/pairwise_significance.py
Original file line number Diff line number Diff line change
Expand Up @@ -85,10 +85,10 @@ def t_stats(self):
axis=0, include_transforms_for_dims=self._hs_dims
)
diff = props - props[:, [self._col_idx]]
margin = self._slice.margin(
axis=0, weighted=self._weighted, include_transforms_for_dims=self._hs_dims
unweighted_margin = self._slice.margin(
axis=0, weighted=False, include_transforms_for_dims=self._hs_dims
)
var_props = props * (1.0 - props) / margin
var_props = props * (1.0 - props) / unweighted_margin
se_diff = np.sqrt(var_props + var_props[:, [self._col_idx]])
return diff / se_diff

Expand Down
227 changes: 227 additions & 0 deletions tests/fixtures/cat-x-cat-weighted-ttests.json
Original file line number Diff line number Diff line change
@@ -0,0 +1,227 @@
{
"value": {
"query": {
"dimensions": [
{
"variable": "/api/datasets/123/variables/ac62f49fe0944baf8030839227d7cf06"
}, {
"variable": "/api/datasets/123/variables/772559e879734d5dbda31d67bb0e7a2c/"
}
],
"filters": [],
"measures": {
"count": {
"args": [],
"function": "cube_count"
}
},
"weight": "/api/datasets/123/variables/c1820eb7befd4704beacfdbcb430969c/"
},
"result": {
"counts": [
10,
17,
110,
13,
0,
19,
11,
113,
11,
0,
11,
17,
14,
14,
0,
3,
0,
3,
0,
0,
18,
13,
11,
3,
3,
3,
3,
0,
0,
0,
0,
0,
0,
0,
0,
0,
0,
0,
0,
0
],
"dimensions": [
{
"derived": false,
"references": {
"alias": "ShutdownBlame",
"description": "If President Oba",
"discarded": false,
"header_order": 0,
"name": "ShutdownBlame"
},
"type": {
"categories": [
{
"id": 1,
"missing": false,
"name": "President Obama",
"numeric_value": 1
}, {
"id": 2,
"missing": false,
"name": "Republicans",
"numeric_value": 2
}, {
"id": 3,
"missing": false,
"name": "Both",
"numeric_value": 3
}, {
"id": 4,
"missing": false,
"name": "Neither",
"numeric_value": 4
}, {
"id": 5,
"missing": false,
"name": "Not sure",
"numeric_value": 5
}, {
"id": 8,
"missing": true,
"name": "Skipped",
"numeric_value": 8
}, {
"id": 9,
"missing": true,
"name": "Not Asked",
"numeric_value": 9
}, {
"id": -1,
"missing": true,
"name": "No Data",
"numeric_value": null
}
],
"class": "categorical",
"ordinal": false
}
}, {
"derived": false,
"references": {
"alias": "Age4",
"description": "4 Category Age",
"discarded": false,
"header_order": 13,
"name": "Age4"
},
"type": {
"categories": [
{
"id": 1,
"missing": false,
"name": "18-29",
"numeric_value": 1
}, {
"id": 2,
"missing": false,
"name": "30-44",
"numeric_value": 2
}, {
"id": 3,
"missing": false,
"name": "45-64",
"numeric_value": 3
}, {
"id": 4,
"missing": false,
"name": "65+",
"numeric_value": 4
}, {
"id": -1,
"missing": true,
"name": "No Data",
"numeric_value": null
}
],
"class": "categorical",
"ordinal": false
}
}
],
"element": "crunch:cube",
"measures": {
"count": {
"data": [
60.86625141813965,
97.88936399229519,
100.15929984533358,
53.85016845098138,
0,
79.36184123878606,
91.08041828026134,
113.52172513707207,
51.24133618727803,
0,
51.6860301097489,
57.19341887477966,
94.46614309659532,
34.21679409150737,
0,
7.1814887299023,
0.648732097517952,
2.160525278672988,
0,
0,
48.05709086803253,
23.04698949579007,
21.86062399732678,
3.2671706433211516,
1.24732636818006,
4.27630335512173,
2.720958443356048,
0,
0,
0,
0,
0,
0,
0,
0,
0,
0,
0,
0,
0
],
"metadata": {
"derived": true,
"references": {},
"type": {
"class": "numeric",
"integer": false,
"missing_reasons": {
"No Data": -1
},
"missing_rules": {}
}
},
"n_missing": 4
}
},
"missing": 4,
"n": 1000
}
}
}
41 changes: 41 additions & 0 deletions tests/integration/test_pairwise_significance.py
Original file line number Diff line number Diff line change
Expand Up @@ -352,3 +352,44 @@ def test_pairwise_indices_larger_and_smaller(self):
)
pairwise_indices = cube.slices[0].pairwise_indices(only_larger=False)
np.testing.assert_array_equal(pairwise_indices, expected_indices)

def test_ttests_use_unweighted_n_for_variance(self):
    """Pairwise t-tests estimate variance with the unweighted n.

    The weights on this cube demonstrate much higher variance (less
    extreme t values, and higher associated p-values) than if weighted_n
    were used in the variance estimate of the test statistic.
    """
    cube = CrunchCube(CR.CAT_X_CAT_WEIGHTED_TTESTS)
    cube_slice = cube.slices[0]

    # Column 0 is the reference column, so its own t-stats are 0.0 and its
    # p-values are 1.0 in the expectations below.
    actual = cube_slice.pairwise_significance_tests(
        column_idx=0, hs_dims=(0, 1)
    )

    expected_tstats = np.array(
        [
            [0.0, 1.3892930788974391, 0.8869425734660505, 1.402945620973322],
            [0.0, 0.1903540333363253, 0.30894158244285624, 0.3994739596013725],
            [0.0, 0.03761142927757482, 1.2682277741610029, 0.36476016345069556],
            [0.0, -1.187392798652706, -1.0206496663686406, -1.35111583891054],
            [0.0, -1.742783579889951, -2.425391682127969, -3.0738474093706927],
        ]
    )
    expected_pvals = np.array(
        [
            [1.0, 0.1673820620286901, 0.37579738470724267, 0.16373028998420036],
            [1.0, 0.8493616019040273, 0.7575734897713429, 0.6903959137827367],
            [1.0, 0.9700615941125716, 0.20566822638024163, 0.7160606992310101],
            [1.0, 0.23747780923355655, 0.30821629616167123, 0.17970733830083074],
            [1.0, 0.0839987707197456, 0.015862691173528676, 0.002723927327002773],
        ]
    )
    np.testing.assert_almost_equal(actual.t_stats, expected_tstats)
    np.testing.assert_almost_equal(actual.p_vals, expected_pvals)

    # The cells hold tuples of differing lengths, so the nested sequence is
    # ragged. `dtype=object` is required to build a (5, 4) array of tuples;
    # without it NumPy >= 1.24 raises ValueError instead of falling back to
    # an object array.
    expected_indices = np.array(
        [
            [(), (), (), ()],
            [(), (), (), ()],
            [(), (), (), ()],
            [(), (), (), ()],
            [(2, 3), (), (), ()],
        ],
        dtype=object,
    )
    pairwise_indices = cube_slice.pairwise_indices()
    np.testing.assert_array_equal(pairwise_indices, expected_indices)