From 34ae6411654eaacddc32a0265db8a2af82e7d208 Mon Sep 17 00:00:00 2001 From: Anastassia Loukina Date: Thu, 19 Mar 2020 07:45:35 -0400 Subject: [PATCH 01/13] Fixed errors in documentation --- rsmtool/analyzer.py | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/rsmtool/analyzer.py b/rsmtool/analyzer.py index ddf5016f6..f9c52b71f 100644 --- a/rsmtool/analyzer.py +++ b/rsmtool/analyzer.py @@ -601,8 +601,8 @@ def metrics_helper(human_scores, smd_method='unpooled', use_diff_std_means=False): """ - This is a helper function that computes some basic agreement - and association metrics between the system scores and the + This is a helper function that computes several basic + association metrics between the system scores and the human scores. Parameters @@ -620,16 +620,16 @@ def metrics_helper(human_scores, population_system_score_sd : float, optional Reference standard deviation for system scores. If `smd_method='williamson'`, this is used to compute SMD and should be the standard deviation for the whole population.If - `use_diff_std_means=True`, this must be used with `population_human_score_mn`. + `use_diff_std_means=True`, this must be used with `population_system_score_mn`. Otherwise, it is ignored. Defaults to None. population_human_score_mn : float, optional Reference mean for human scores. If `use_diff_std_means=True`, this must be used with - `population_human_score_mn`. Otherwise, it is ignored. + `population_human_score_sd`. Otherwise, it is ignored. Defaults to None. population_system_score_mn : float, optional Reference mean for system scores. If `use_diff_std_means=True`, this must be used with - `population_human_score_mn`. Otherwise, it is ignored. + `population_system_score_sd`. Otherwise, it is ignored. Defaults to None. smd_method : {'williamson', 'johnson', pooled', 'unpooled'}, optional The SMD method to use, only used if `use_diff_std_means=False`. From 8207e8b98457df241d3c5c7989d3f4b48a98b91d Mon Sep 17 00:00:00 2001 From: Anastassia Loukina Date: Thu, 19 Mar 2020 09:16:19 -0400 Subject: [PATCH 02/13] Correct treatment of zero standard deviation --- rsmtool/utils/metrics.py | 13 +++++++++++-- tests/test_analyzer.py | 40 ++++++++++++++++++++++++++++++++++++++++ tests/test_utils.py | 12 ++++++++++++ 3 files changed, 63 insertions(+), 2 deletions(-) diff --git a/rsmtool/utils/metrics.py b/rsmtool/utils/metrics.py index 1f44c4fed..1ccc19058 100644 --- a/rsmtool/utils/metrics.py +++ b/rsmtool/utils/metrics.py @@ -356,11 +356,13 @@ def difference_of_standardized_means(y_true_observed, y_pred_population_params = [population_y_pred_mn, population_y_pred_sd] - if any(y_true_observed_population_params) and not all(y_true_observed_population_params): + if len([param for param in y_true_observed_population_params + if param is None]) == 1: raise ValueError('You must pass both `population_y_true_observed_mn` and ' '`population_y_true_observed_sd` or neither.') - if any(y_pred_population_params) and not all(y_pred_population_params): + if len([param for param in y_pred_population_params + if param is None]) == 1: raise ValueError('You must pass both `population_y_pred_mn` and ' '`population_y_pred_sd` or neither.') @@ -382,6 +384,13 @@ def difference_of_standardized_means(y_true_observed, population_y_pred_mn) = (np.std(y_pred, ddof=ddof), np.mean(y_pred)) + # if any of the standard deviations raise a warning and return None + if population_y_pred_sd == 0 or population_y_true_observed_sd == 0: + warnings.warn("Population standard deviations for the computation of " + "DSM are zero. No value will be computed") + return None + + # calculate the z-scores for observed and predicted y_true_observed_subgroup_z = ((y_true_observed - population_y_true_observed_mn) / population_y_true_observed_sd) diff --git a/tests/test_analyzer.py b/tests/test_analyzer.py index 7ead956cb..fe86a8677 100644 --- a/tests/test_analyzer.py +++ b/tests/test_analyzer.py @@ -182,6 +182,46 @@ def test_metrics_helper_population_sds(self): assert_series_equal(computed_metrics1.sort_index(), expected_metrics1.sort_index()) assert_series_equal(computed_metrics2.sort_index(), expected_metrics2.sort_index()) + + def test_metrics_helper_zero_system_sd(self): + human_scores = [1, 3, 4, 2, 3, 1, 3, 4, 2, 1] + system_score = [2.54]*10 + computed_metrics1 = Analyzer.metrics_helper(human_scores, + system_score) + expected_metrics1 = pd.Series({'N': 10, + 'R2': -0.015806451612903283, + 'RMSE': 1.122319027727856, + 'SMD':0.11927198519188371, + 'adj_agr': 50.0, + 'corr': None, + 'exact_agr': 0, + 'h_max': 4, + 'h_mean': 2.4, + 'h_min': 1.0, + 'h_sd': 1.1737877907772674, + 'kappa': 0, + 'sys_max': 2.54, + 'sys_mean': 2.54, + 'sys_min': 2.54, + 'sys_sd': 0, + 'wtkappa': 0}) + # now compute DSM + computed_metrics2 = Analyzer.metrics_helper(human_scores, + system_score, + use_diff_std_means=True) + + # the only number that should change is the SMD + expected_metrics2 = expected_metrics1.copy() + expected_metrics2.drop("SMD", inplace=True) + expected_metrics2['DSM'] = None + assert_series_equal(computed_metrics1.sort_index(), + expected_metrics1.sort_index(), + check_dtype=False) + assert_series_equal(computed_metrics2.sort_index(), + expected_metrics2.sort_index(), + check_dtype=False) + + def test_compute_pca_less_samples_than_features(self): # test pca when we have less samples than # features. In this case the number of components diff --git a/tests/test_utils.py b/tests/test_utils.py index d0481741b..115684a31 100644 --- a/tests/test_utils.py +++ b/tests/test_utils.py @@ -423,6 +423,18 @@ def test_difference_of_standardized_means_with_no_population_info(): assert issubclass(warning_list[1].category, UserWarning) +def test_difference_of_standardized_means_zero_population_sd(): + y_true, y_pred = (np.array([3, 5, 1, 2, 2, 3, 1, 4, 1, 2]), + np.array([2, 2, 2, 2, 2, 2, 2, 2, 2, 2])) + expected = None + diff_std_means = difference_of_standardized_means(y_true, y_pred, + population_y_true_observed_mn=2.44, + population_y_true_observed_sd=0.54, + population_y_pred_mn=2.44, + population_y_pred_sd=0) + eq_(diff_std_means, expected) + + def test_quadratic_weighted_kappa(): expected_qwk = -0.09210526315789469 From fe68b222fc0be6c29aab3d94897f85824d955871 Mon Sep 17 00:00:00 2001 From: Anastassia Loukina Date: Thu, 19 Mar 2020 09:17:02 -0400 Subject: [PATCH 03/13] Correct treatment of zero standard deviation --- tests/test_utils.py | 14 +++++++++++++- 1 file changed, 13 insertions(+), 1 deletion(-) diff --git a/tests/test_utils.py b/tests/test_utils.py index 115684a31..89d3ae3f0 100644 --- a/tests/test_utils.py +++ b/tests/test_utils.py @@ -423,7 +423,7 @@ def test_difference_of_standardized_means_with_no_population_info(): assert issubclass(warning_list[1].category, UserWarning) -def test_difference_of_standardized_means_zero_population_sd(): +def test_difference_of_standardized_means_zero_population_sd_pred(): y_true, y_pred = (np.array([3, 5, 1, 2, 2, 3, 1, 4, 1, 2]), np.array([2, 2, 2, 2, 2, 2, 2, 2, 2, 2])) expected = None @@ -435,6 +435,18 @@ def test_difference_of_standardized_means_zero_population_sd(): eq_(diff_std_means, expected) +def test_difference_of_standardized_means_zero_population_sd_human(): + y_pred, y_true = (np.array([3, 5, 1, 2, 2, 3, 1, 4, 1, 2]), + np.array([2, 2, 2, 2, 2, 2, 2, 2, 2, 2])) + expected = None + diff_std_means = difference_of_standardized_means(y_true, y_pred, + population_y_pred_observed_mn=2.44, + population_y_pred_observed_sd=0.54, + population_y_true_mn=2.44, + population_y_true_sd=0) + eq_(diff_std_means, expected) + + def test_quadratic_weighted_kappa(): expected_qwk = -0.09210526315789469 From 3eb75652b4b56550b3f2777cea5fe7bc91a735e8 Mon Sep 17 00:00:00 2001 From: Anastassia Loukina Date: Thu, 19 Mar 2020 09:21:31 -0400 Subject: [PATCH 04/13] Few more fixes --- rsmtool/utils/metrics.py | 8 +++++--- tests/test_utils.py | 21 +++++++++++++++------ 2 files changed, 20 insertions(+), 9 deletions(-) diff --git a/rsmtool/utils/metrics.py b/rsmtool/utils/metrics.py index 1ccc19058..1bfb0cb8a 100644 --- a/rsmtool/utils/metrics.py +++ b/rsmtool/utils/metrics.py @@ -370,14 +370,16 @@ def difference_of_standardized_means(y_true_observed, 'thus, the calculated z-scores will be zero.') # if the population means and standard deviations were not provided, calculate from the data - if not population_y_true_observed_mn or not population_y_true_observed_sd: + # We only check for mean since the function requires + # both of these to be set of both to be None + if population_y_true_observed_mn is None: warnings.warn(warning_msg.format('y_true_observed')) (population_y_true_observed_sd, population_y_true_observed_mn) = (np.std(y_true_observed, ddof=ddof), np.mean(y_true_observed)) - if not population_y_pred_mn or not population_y_pred_sd: + if population_y_pred_mn is None: warnings.warn(warning_msg.format('y_pred')) (population_y_pred_sd, @@ -387,7 +389,7 @@ def difference_of_standardized_means(y_true_observed, # if any of the standard deviations raise a warning and return None if population_y_pred_sd == 0 or population_y_true_observed_sd == 0: warnings.warn("Population standard deviations for the computation of " - "DSM are zero. No value will be computed") + "DSM is zero. No value will be computed.") return None diff --git a/tests/test_utils.py b/tests/test_utils.py index 89d3ae3f0..5b0c8417c 100644 --- a/tests/test_utils.py +++ b/tests/test_utils.py @@ -425,7 +425,7 @@ def test_difference_of_standardized_means_with_no_population_info(): def test_difference_of_standardized_means_zero_population_sd_pred(): y_true, y_pred = (np.array([3, 5, 1, 2, 2, 3, 1, 4, 1, 2]), - np.array([2, 2, 2, 2, 2, 2, 2, 2, 2, 2])) + np.array([2, 1, 4, 1, 5, 2, 2, 2, 2, 2])) expected = None diff_std_means = difference_of_standardized_means(y_true, y_pred, population_y_true_observed_mn=2.44, @@ -436,14 +436,23 @@ def test_difference_of_standardized_means_zero_population_sd_pred(): def test_difference_of_standardized_means_zero_population_sd_human(): + y_true, y_pred = (np.array([3, 5, 1, 2, 2, 3, 1, 4, 1, 2]), + np.array([2, 1, 4, 1, 5, 2, 2, 2, 2, 2])) + expected = None + diff_std_means = difference_of_standardized_means(y_true, y_pred, + population_y_pred_mn=2.44, + population_y_pred_sd=0.54, + population_y_true_observed_mn=2.44, + population_y_true_observed_sd=0) + eq_(diff_std_means, expected) + + +def test_difference_of_standardized_means_zero_population_computed(): + # sd is computed from the data and is zero y_pred, y_true = (np.array([3, 5, 1, 2, 2, 3, 1, 4, 1, 2]), np.array([2, 2, 2, 2, 2, 2, 2, 2, 2, 2])) expected = None - diff_std_means = difference_of_standardized_means(y_true, y_pred, - population_y_pred_observed_mn=2.44, - population_y_pred_observed_sd=0.54, - population_y_true_mn=2.44, - population_y_true_sd=0) + diff_std_means = difference_of_standardized_means(y_true, y_pred) eq_(diff_std_means, expected) From 21134fe55b4e6e7752e679be6814dec1d755d85a Mon Sep 17 00:00:00 2001 From: Anastassia Loukina Date: Thu, 19 Mar 2020 12:19:29 -0400 Subject: [PATCH 05/13] PEP8 fixes --- tests/test_analyzer.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/tests/test_analyzer.py b/tests/test_analyzer.py index fe86a8677..c33a9aa72 100644 --- a/tests/test_analyzer.py +++ b/tests/test_analyzer.py @@ -185,13 +185,13 @@ def test_metrics_helper_population_sds(self): def test_metrics_helper_zero_system_sd(self): human_scores = [1, 3, 4, 2, 3, 1, 3, 4, 2, 1] - system_score = [2.54]*10 + system_score = [2.54] * 10 computed_metrics1 = Analyzer.metrics_helper(human_scores, system_score) expected_metrics1 = pd.Series({'N': 10, 'R2': -0.015806451612903283, 'RMSE': 1.122319027727856, - 'SMD':0.11927198519188371, + 'SMD': 0.11927198519188371, 'adj_agr': 50.0, 'corr': None, 'exact_agr': 0, From dbbab029baa19232c398389a766b200ce26cc460 Mon Sep 17 00:00:00 2001 From: Anastassia Loukina Date: Thu, 19 Mar 2020 12:21:36 -0400 Subject: [PATCH 06/13] Minor docstring changes --- rsmtool/utils/metrics.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/rsmtool/utils/metrics.py b/rsmtool/utils/metrics.py index 1bfb0cb8a..26ba78480 100644 --- a/rsmtool/utils/metrics.py +++ b/rsmtool/utils/metrics.py @@ -386,7 +386,8 @@ def difference_of_standardized_means(y_true_observed, population_y_pred_mn) = (np.std(y_pred, ddof=ddof), np.mean(y_pred)) - # if any of the standard deviations raise a warning and return None + # if any of the standard deviations equal zero + # raise a warning and return None if population_y_pred_sd == 0 or population_y_true_observed_sd == 0: warnings.warn("Population standard deviations for the computation of " "DSM is zero. No value will be computed.") From 85d6b04f02203ff406982edaca69e95a9d15e246 Mon Sep 17 00:00:00 2001 From: Anastassia Loukina Date: Thu, 19 Mar 2020 13:53:45 -0400 Subject: [PATCH 07/13] New test files --- rsmtool/utils/metrics.py | 10 ++- .../lr_eval_system_score_constant.json | 12 ++++ ..._eval_system_score_constant_confMatrix.csv | 6 ++ ...eval_system_score_constant_consistency.csv | 2 + ...ystem_score_constant_consistency_by_L1.csv | 6 ++ ...score_constant_consistency_by_QUESTION.csv | 4 ++ ...system_score_constant_data_composition.csv | 2 + ..._score_constant_data_composition_by_L1.csv | 5 ++ ..._constant_data_composition_by_QUESTION.csv | 3 + ...eval_system_score_constant_degradation.csv | 4 ++ ...re_constant_disattenuated_correlations.csv | 4 ++ ...stant_disattenuated_correlations_by_L1.csv | 6 ++ ...disattenuated_correlations_by_QUESTION.csv | 4 ++ ...tem_score_constant_estimates_csd_by_L1.csv | 5 ++ ...ore_constant_estimates_csd_by_QUESTION.csv | 3 + ...tem_score_constant_estimates_osa_by_L1.csv | 5 ++ ...ore_constant_estimates_osa_by_QUESTION.csv | 3 + ...tem_score_constant_estimates_osd_by_L1.csv | 5 ++ ...ore_constant_estimates_osd_by_QUESTION.csv | 3 + .../lr_eval_system_score_constant_eval.csv | 4 ++ ..._eval_system_score_constant_eval_by_L1.csv | 6 ++ ...system_score_constant_eval_by_QUESTION.csv | 4 ++ ..._eval_system_score_constant_eval_short.csv | 2 + ..._score_constant_fairness_metrics_by_L1.csv | 3 + ..._constant_fairness_metrics_by_QUESTION.csv | 3 + ...l_system_score_constant_pred_processed.csv | 70 +++++++++++++++++++ ..._eval_system_score_constant_score_dist.csv | 6 ++ ...ore_constant_test_excluded_composition.csv | 4 ++ ...ystem_score_constant_test_human_scores.csv | 70 +++++++++++++++++++ ...al_system_score_constant_test_metadata.csv | 70 +++++++++++++++++++ ...stem_score_constant_test_other_columns.csv | 70 +++++++++++++++++++ ..._system_score_constant_true_score_eval.csv | 4 ++ tests/test_experiment_rsmeval.py | 2 + 33 files changed, 408 insertions(+), 2 deletions(-) create mode 100644 tests/data/experiments/lr-eval-system-score-constant/lr_eval_system_score_constant.json create mode 100644 tests/data/experiments/lr-eval-system-score-constant/output/lr_eval_system_score_constant_confMatrix.csv create mode 100644 tests/data/experiments/lr-eval-system-score-constant/output/lr_eval_system_score_constant_consistency.csv create mode 100644 tests/data/experiments/lr-eval-system-score-constant/output/lr_eval_system_score_constant_consistency_by_L1.csv create mode 100644 tests/data/experiments/lr-eval-system-score-constant/output/lr_eval_system_score_constant_consistency_by_QUESTION.csv create mode 100644 tests/data/experiments/lr-eval-system-score-constant/output/lr_eval_system_score_constant_data_composition.csv create mode 100644 tests/data/experiments/lr-eval-system-score-constant/output/lr_eval_system_score_constant_data_composition_by_L1.csv create mode 100644 tests/data/experiments/lr-eval-system-score-constant/output/lr_eval_system_score_constant_data_composition_by_QUESTION.csv create mode 100644 tests/data/experiments/lr-eval-system-score-constant/output/lr_eval_system_score_constant_degradation.csv create mode 100644 tests/data/experiments/lr-eval-system-score-constant/output/lr_eval_system_score_constant_disattenuated_correlations.csv create mode 100644 tests/data/experiments/lr-eval-system-score-constant/output/lr_eval_system_score_constant_disattenuated_correlations_by_L1.csv create mode 100644 tests/data/experiments/lr-eval-system-score-constant/output/lr_eval_system_score_constant_disattenuated_correlations_by_QUESTION.csv create mode 100644 tests/data/experiments/lr-eval-system-score-constant/output/lr_eval_system_score_constant_estimates_csd_by_L1.csv create mode 100644 tests/data/experiments/lr-eval-system-score-constant/output/lr_eval_system_score_constant_estimates_csd_by_QUESTION.csv create mode 100644 tests/data/experiments/lr-eval-system-score-constant/output/lr_eval_system_score_constant_estimates_osa_by_L1.csv create mode 100644 tests/data/experiments/lr-eval-system-score-constant/output/lr_eval_system_score_constant_estimates_osa_by_QUESTION.csv create mode 100644 tests/data/experiments/lr-eval-system-score-constant/output/lr_eval_system_score_constant_estimates_osd_by_L1.csv create mode 100644 tests/data/experiments/lr-eval-system-score-constant/output/lr_eval_system_score_constant_estimates_osd_by_QUESTION.csv create mode 100644 tests/data/experiments/lr-eval-system-score-constant/output/lr_eval_system_score_constant_eval.csv create mode 100644 tests/data/experiments/lr-eval-system-score-constant/output/lr_eval_system_score_constant_eval_by_L1.csv create mode 100644 tests/data/experiments/lr-eval-system-score-constant/output/lr_eval_system_score_constant_eval_by_QUESTION.csv create mode 100644 tests/data/experiments/lr-eval-system-score-constant/output/lr_eval_system_score_constant_eval_short.csv create mode 100644 tests/data/experiments/lr-eval-system-score-constant/output/lr_eval_system_score_constant_fairness_metrics_by_L1.csv create mode 100644 tests/data/experiments/lr-eval-system-score-constant/output/lr_eval_system_score_constant_fairness_metrics_by_QUESTION.csv create mode 100644 tests/data/experiments/lr-eval-system-score-constant/output/lr_eval_system_score_constant_pred_processed.csv create mode 100644 tests/data/experiments/lr-eval-system-score-constant/output/lr_eval_system_score_constant_score_dist.csv create mode 100644 tests/data/experiments/lr-eval-system-score-constant/output/lr_eval_system_score_constant_test_excluded_composition.csv create mode 100644 tests/data/experiments/lr-eval-system-score-constant/output/lr_eval_system_score_constant_test_human_scores.csv create mode 100644 tests/data/experiments/lr-eval-system-score-constant/output/lr_eval_system_score_constant_test_metadata.csv create mode 100644 tests/data/experiments/lr-eval-system-score-constant/output/lr_eval_system_score_constant_test_other_columns.csv create mode 100644 tests/data/experiments/lr-eval-system-score-constant/output/lr_eval_system_score_constant_true_score_eval.csv diff --git a/rsmtool/utils/metrics.py b/rsmtool/utils/metrics.py index 26ba78480..b4bb5affd 100644 --- a/rsmtool/utils/metrics.py +++ b/rsmtool/utils/metrics.py @@ -387,8 +387,14 @@ def difference_of_standardized_means(y_true_observed, np.mean(y_pred)) # if any of the standard deviations equal zero - # raise a warning and return None - if population_y_pred_sd == 0 or population_y_true_observed_sd == 0: + # raise a warning and return None. + # We use np.isclose since sometimes sd for float + # values is a value very close to 0. + # We use the same tolerance as used for identifying + # features with zero standard deviation + + if np.isclose(population_y_pred_sd, 0, atol=1e-07) \ + or np.isclose(population_y_true_observed_sd, 0, atol=1e-07): warnings.warn("Population standard deviations for the computation of " "DSM is zero. No value will be computed.") return None diff --git a/tests/data/experiments/lr-eval-system-score-constant/lr_eval_system_score_constant.json b/tests/data/experiments/lr-eval-system-score-constant/lr_eval_system_score_constant.json new file mode 100644 index 000000000..be6ab5228 --- /dev/null +++ b/tests/data/experiments/lr-eval-system-score-constant/lr_eval_system_score_constant.json @@ -0,0 +1,12 @@ +{ + "predictions_file": "../../files/predictions_same_system_score_with_subgroups_subset_double_scored.csv", + "system_score_column": "score", + "description": "An evaluation of LinearRegression predictions.", + "human_score_column": "h1", + "second_human_score_column": "h2", + "id_column": "id", + "experiment_id": "lr_eval_system_score_constant", + "subgroups": "QUESTION, L1", + "trim_min": 1, + "trim_max": 6 +} diff --git a/tests/data/experiments/lr-eval-system-score-constant/output/lr_eval_system_score_constant_confMatrix.csv b/tests/data/experiments/lr-eval-system-score-constant/output/lr_eval_system_score_constant_confMatrix.csv new file mode 100644 index 000000000..d12819015 --- /dev/null +++ b/tests/data/experiments/lr-eval-system-score-constant/output/lr_eval_system_score_constant_confMatrix.csv @@ -0,0 +1,6 @@ +,1,2,3,4,5 +1,0,0,0,0,0 +2,0,0,0,0,0 +3,0,0,0,0,0 +4,1,9,18,34,7 +5,0,0,0,0,0 diff --git a/tests/data/experiments/lr-eval-system-score-constant/output/lr_eval_system_score_constant_consistency.csv b/tests/data/experiments/lr-eval-system-score-constant/output/lr_eval_system_score_constant_consistency.csv new file mode 100644 index 000000000..4ffd4c7b9 --- /dev/null +++ b/tests/data/experiments/lr-eval-system-score-constant/output/lr_eval_system_score_constant_consistency.csv @@ -0,0 +1,2 @@ +,N,h1_mean,h1_sd,h1_min,h1_max,h2_mean,h2_sd,h2_min,h2_max,corr,wtkappa,kappa,exact_agr,adj_agr,SMD +,49,3.4693877551020407,0.9596449173246002,1.0,5.0,3.4081632653061225,0.8878430639040038,1.0,5.0,0.8218820738677033,0.8175725986597168,0.5490797546012269,69.38775510204081,100.0,-0.06622863508167891 diff --git a/tests/data/experiments/lr-eval-system-score-constant/output/lr_eval_system_score_constant_consistency_by_L1.csv b/tests/data/experiments/lr-eval-system-score-constant/output/lr_eval_system_score_constant_consistency_by_L1.csv new file mode 100644 index 000000000..17956bae6 --- /dev/null +++ b/tests/data/experiments/lr-eval-system-score-constant/output/lr_eval_system_score_constant_consistency_by_L1.csv @@ -0,0 +1,6 @@ +,N,h1_mean,h1_sd,h1_min,h1_max,h2_mean,h2_sd,h2_min,h2_max,corr,wtkappa,kappa,exact_agr,adj_agr,DSM +All data,49,3.4693877551020407,0.9596449173246002,1.0,5.0,3.4081632653061225,0.8878430639040038,1.0,5.0,0.8218820738677033,0.8175725986597168,0.5490797546012269,69.38775510204081,100.0,0.07421776607639329 +Esperanto,12,3.5,1.0,2.0,5.0,3.25,0.8660254037844386,2.0,5.0,0.8922685978385125,0.85,0.6470588235294117,75.0,100.0,-0.13791461209462752 +Klingon,11,3.4545454545454546,0.9341987329938276,2.0,5.0,3.6363636363636362,0.8090398349558905,2.0,5.0,0.9021097956087901,0.8720930232558141,0.721518987341772,81.81818181818183,100.0,0.34772515873397636 +Navi,11,3.6363636363636362,0.8090398349558905,2.0,5.0,3.6363636363636362,0.6741998624632421,2.0,4.0,0.8333333333333335,0.8196721311475409,0.6666666666666666,81.81818181818183,100.0,0.14585044666295055 +Vulcan,15,3.3333333333333335,1.1126972805283737,1.0,5.0,3.2,1.082325538564332,1.0,5.0,0.771046169254443,0.7647058823529412,0.2857142857142857,46.666666666666664,100.0,-0.009179051765826295 diff --git a/tests/data/experiments/lr-eval-system-score-constant/output/lr_eval_system_score_constant_consistency_by_QUESTION.csv b/tests/data/experiments/lr-eval-system-score-constant/output/lr_eval_system_score_constant_consistency_by_QUESTION.csv new file mode 100644 index 000000000..7487df3ae --- /dev/null +++ b/tests/data/experiments/lr-eval-system-score-constant/output/lr_eval_system_score_constant_consistency_by_QUESTION.csv @@ -0,0 +1,4 @@ +,N,h1_mean,h1_sd,h1_min,h1_max,h2_mean,h2_sd,h2_min,h2_max,corr,wtkappa,kappa,exact_agr,adj_agr,DSM +All data,49,3.4693877551020407,0.9596449173246002,1.0,5.0,3.4081632653061225,0.8878430639040038,1.0,5.0,0.8218820738677033,0.8175725986597168,0.5490797546012269,69.38775510204081,100.0,0.07421776607639329 +QUESTION_1,40,3.5,0.9336995618478525,1.0,5.0,3.425,0.9026314805852884,1.0,5.0,0.8366627923660599,0.8333333333333334,0.5914577530176417,72.5,100.0,0.05919229464910923 +QUESTION_2,9,3.3333333333333335,1.118033988749895,2.0,5.0,3.3333333333333335,0.8660254037844386,2.0,4.0,0.7745966692414833,0.75,0.3207547169811321,55.55555555555556,100.0,0.14099763908654464 diff --git a/tests/data/experiments/lr-eval-system-score-constant/output/lr_eval_system_score_constant_data_composition.csv b/tests/data/experiments/lr-eval-system-score-constant/output/lr_eval_system_score_constant_data_composition.csv new file mode 100644 index 000000000..fab4f010b --- /dev/null +++ b/tests/data/experiments/lr-eval-system-score-constant/output/lr_eval_system_score_constant_data_composition.csv @@ -0,0 +1,2 @@ +partition,responses,QUESTION,L1 +Evaluation,69,2,4 diff --git a/tests/data/experiments/lr-eval-system-score-constant/output/lr_eval_system_score_constant_data_composition_by_L1.csv b/tests/data/experiments/lr-eval-system-score-constant/output/lr_eval_system_score_constant_data_composition_by_L1.csv new file mode 100644 index 000000000..c3ba54f7f --- /dev/null +++ b/tests/data/experiments/lr-eval-system-score-constant/output/lr_eval_system_score_constant_data_composition_by_L1.csv @@ -0,0 +1,5 @@ +L1,N responses +Esperanto,15 +Klingon,19 +Navi,14 +Vulcan,21 diff --git a/tests/data/experiments/lr-eval-system-score-constant/output/lr_eval_system_score_constant_data_composition_by_QUESTION.csv b/tests/data/experiments/lr-eval-system-score-constant/output/lr_eval_system_score_constant_data_composition_by_QUESTION.csv new file mode 100644 index 000000000..9699d98cc --- /dev/null +++ b/tests/data/experiments/lr-eval-system-score-constant/output/lr_eval_system_score_constant_data_composition_by_QUESTION.csv @@ -0,0 +1,3 @@ +QUESTION,N responses +QUESTION_1,40 +QUESTION_2,29 diff --git a/tests/data/experiments/lr-eval-system-score-constant/output/lr_eval_system_score_constant_degradation.csv b/tests/data/experiments/lr-eval-system-score-constant/output/lr_eval_system_score_constant_degradation.csv new file mode 100644 index 000000000..f84bb7da4 --- /dev/null +++ b/tests/data/experiments/lr-eval-system-score-constant/output/lr_eval_system_score_constant_degradation.csv @@ -0,0 +1,4 @@ +,corr,kappa,wtkappa,exact_agr,adj_agr,SMD +raw,,-0.5490797546012269,-0.8175725986597168,-69.38775510204081,-24.637681159420282,0.06597117168077708 +raw_trim,,-0.5490797546012269,-0.8175725986597168,-69.38775510204081,-24.637681159420282,0.06597117168077708 +raw_trim_round,,-0.5490797546012269,-0.8175725986597168,-20.112392783200228,-14.492753623188406,0.5811554368860343 diff --git a/tests/data/experiments/lr-eval-system-score-constant/output/lr_eval_system_score_constant_disattenuated_correlations.csv b/tests/data/experiments/lr-eval-system-score-constant/output/lr_eval_system_score_constant_disattenuated_correlations.csv new file mode 100644 index 000000000..135af468a --- /dev/null +++ b/tests/data/experiments/lr-eval-system-score-constant/output/lr_eval_system_score_constant_disattenuated_correlations.csv @@ -0,0 +1,4 @@ +,corr_HM,corr_HH,sqrt_HH,corr_disattenuated +raw,,0.8218820738677033,0.9065771196471392, +raw_trim,,0.8218820738677033,0.9065771196471392, +raw_trim_round,,0.8218820738677033,0.9065771196471392, diff --git a/tests/data/experiments/lr-eval-system-score-constant/output/lr_eval_system_score_constant_disattenuated_correlations_by_L1.csv b/tests/data/experiments/lr-eval-system-score-constant/output/lr_eval_system_score_constant_disattenuated_correlations_by_L1.csv new file mode 100644 index 000000000..5b78776d8 --- /dev/null +++ b/tests/data/experiments/lr-eval-system-score-constant/output/lr_eval_system_score_constant_disattenuated_correlations_by_L1.csv @@ -0,0 +1,6 @@ +,corr_HM,corr_HH,sqrt_HH,corr_disattenuated +All data,,0.8218820738677033,0.9065771196471392, +Esperanto,,0.8922685978385125,0.9445997024340589, +Klingon,,0.9021097956087901,0.9497946070644906, +Navi,,0.8333333333333335,0.9128709291752769, +Vulcan,,0.771046169254443,0.8780923466552041, diff --git a/tests/data/experiments/lr-eval-system-score-constant/output/lr_eval_system_score_constant_disattenuated_correlations_by_QUESTION.csv b/tests/data/experiments/lr-eval-system-score-constant/output/lr_eval_system_score_constant_disattenuated_correlations_by_QUESTION.csv new file mode 100644 index 000000000..bc9e96db5 --- /dev/null +++ b/tests/data/experiments/lr-eval-system-score-constant/output/lr_eval_system_score_constant_disattenuated_correlations_by_QUESTION.csv @@ -0,0 +1,4 @@ +,corr_HM,corr_HH,sqrt_HH,corr_disattenuated +All data,,0.8218820738677033,0.9065771196471392, +QUESTION_1,,0.8366627923660599,0.9146927311212547, +QUESTION_2,,0.7745966692414833,0.8801117367933934, diff --git a/tests/data/experiments/lr-eval-system-score-constant/output/lr_eval_system_score_constant_estimates_csd_by_L1.csv b/tests/data/experiments/lr-eval-system-score-constant/output/lr_eval_system_score_constant_estimates_csd_by_L1.csv new file mode 100644 index 000000000..b4a2497ca --- /dev/null +++ b/tests/data/experiments/lr-eval-system-score-constant/output/lr_eval_system_score_constant_estimates_csd_by_L1.csv @@ -0,0 +1,5 @@ +,estimate,P>[t],[0.025,0.975] +Intercept (Vulcan),-0.4640000000000002,0.0,-0.4640000000000012,-0.4639999999999992 +Klingon,1.2385925618474403e-15,0.06294831850852728,-6.89666061407639e-17,2.5461517298356442e-15 +Esperanto,3.0531133177191805e-16,0.6631333657891718,-1.089425140581065e-15,1.7000478041249011e-15 +Navi,1.2073675392798577e-15,0.09440671222808052,-2.1359961004765078e-16,2.628334688607366e-15 diff --git a/tests/data/experiments/lr-eval-system-score-constant/output/lr_eval_system_score_constant_estimates_csd_by_QUESTION.csv b/tests/data/experiments/lr-eval-system-score-constant/output/lr_eval_system_score_constant_estimates_csd_by_QUESTION.csv new file mode 100644 index 000000000..12fea80fd --- /dev/null +++ b/tests/data/experiments/lr-eval-system-score-constant/output/lr_eval_system_score_constant_estimates_csd_by_QUESTION.csv @@ -0,0 +1,3 @@ +,estimate,P>[t],[0.025,0.975] +Intercept (QUESTION_1),-0.4640000000000001,0.0,-0.4640000000000002,-0.46399999999999997 +QUESTION_2,1.0061396160665481e-16,0.22901607233412186,-6.491000204344407e-17,2.661379252567537e-16 diff --git a/tests/data/experiments/lr-eval-system-score-constant/output/lr_eval_system_score_constant_estimates_osa_by_L1.csv b/tests/data/experiments/lr-eval-system-score-constant/output/lr_eval_system_score_constant_estimates_osa_by_L1.csv new file mode 100644 index 000000000..2da86fc8a --- /dev/null +++ b/tests/data/experiments/lr-eval-system-score-constant/output/lr_eval_system_score_constant_estimates_osa_by_L1.csv @@ -0,0 +1,5 @@ +,estimate,P>[t],[0.025,0.975] +Intercept (Vulcan),1.113581714285715,1.8337863271908833e-05,0.6327358647132078,1.5944275638582222 +Klingon,-0.4469172932330826,0.2053355684667301,-1.1446017808907327,0.2507671944245675 +Esperanto,-0.33135238095238073,0.37762537621397396,-1.0762755679472913,0.4135708060425298 +Navi,-0.5868571428571425,0.12803165113359088,-1.3471411869010543,0.1734269011867694 diff --git a/tests/data/experiments/lr-eval-system-score-constant/output/lr_eval_system_score_constant_estimates_osa_by_QUESTION.csv b/tests/data/experiments/lr-eval-system-score-constant/output/lr_eval_system_score_constant_estimates_osa_by_QUESTION.csv new file mode 100644 index 000000000..359081b96 --- /dev/null +++ b/tests/data/experiments/lr-eval-system-score-constant/output/lr_eval_system_score_constant_estimates_osa_by_QUESTION.csv @@ -0,0 +1,3 @@ +,estimate,P>[t],[0.025,0.975] +Intercept (QUESTION_1),0.8512959999999999,7.482596343683655e-06,0.5014640810687143,1.2011279189312856 +QUESTION_2,-0.12344827586206905,0.6494154859715497,-0.6630644751574266,0.41616792343328846 diff --git a/tests/data/experiments/lr-eval-system-score-constant/output/lr_eval_system_score_constant_estimates_osd_by_L1.csv b/tests/data/experiments/lr-eval-system-score-constant/output/lr_eval_system_score_constant_estimates_osd_by_L1.csv new file mode 100644 index 000000000..55152bc56 --- /dev/null +++ b/tests/data/experiments/lr-eval-system-score-constant/output/lr_eval_system_score_constant_estimates_osd_by_L1.csv @@ -0,0 +1,5 @@ +,estimate,P>[t],[0.025,0.975] +Intercept (Vulcan),0.10742857142857143,0.5934146767315952,-0.29244520951067887,0.5073023523678217 +Klingon,-0.15037593984962397,0.6064812844970744,-0.7305738020758628,0.4298219223766149 +Esperanto,-0.10476190476190476,0.7366472332129983,-0.7242437024269881,0.5147198929031787 +Navi,-0.21428571428571414,0.5008872399945372,-0.8465416764613416,0.41797024788991327 diff --git a/tests/data/experiments/lr-eval-system-score-constant/output/lr_eval_system_score_constant_estimates_osd_by_QUESTION.csv b/tests/data/experiments/lr-eval-system-score-constant/output/lr_eval_system_score_constant_estimates_osd_by_QUESTION.csv new file mode 100644 index 000000000..2521c223b --- /dev/null +++ b/tests/data/experiments/lr-eval-system-score-constant/output/lr_eval_system_score_constant_estimates_osd_by_QUESTION.csv @@ -0,0 +1,3 @@ +,estimate,P>[t],[0.025,0.975] +Intercept (QUESTION_1),0.03600000000000004,0.8024137046773936,-0.2500305373954332,0.32203053739543325 +QUESTION_2,-0.0862068965517241,0.6977724389285981,-0.5274093801422773,0.35499558703882916 diff --git a/tests/data/experiments/lr-eval-system-score-constant/output/lr_eval_system_score_constant_eval.csv b/tests/data/experiments/lr-eval-system-score-constant/output/lr_eval_system_score_constant_eval.csv new file mode 100644 index 000000000..67d1afe53 --- /dev/null +++ b/tests/data/experiments/lr-eval-system-score-constant/output/lr_eval_system_score_constant_eval.csv @@ -0,0 +1,4 @@ +,N,h_mean,h_sd,h_min,h_max,sys_mean,sys_sd,sys_min,sys_max,corr,wtkappa,R2,kappa,exact_agr,adj_agr,SMD,RMSE +raw,69,3.536231884057971,0.9006486248471405,1.0,5.0,3.5360000000000005,4.473426541494861e-16,3.536,3.536,,1.787683880684223e-31,-6.726221779551622e-08,0.0,0.0,75.36231884057972,-0.0002574634009018279,0.8940983961673266 +raw_trim,69,3.536231884057971,0.9006486248471405,1.0,5.0,3.5360000000000005,4.473426541494861e-16,3.536,3.536,,1.787683880684223e-31,-6.726221779551622e-08,0.0,0.0,75.36231884057972,-0.0002574634009018279,0.8940983961673266 +raw_trim_round,69,3.536231884057971,0.9006486248471405,1.0,5.0,4.0,0.0,4.0,4.0,,0.0,-0.2690488702049396,0.0,49.275362318840585,85.5072463768116,0.5149268018043555,1.0072203103706698 diff --git a/tests/data/experiments/lr-eval-system-score-constant/output/lr_eval_system_score_constant_eval_by_L1.csv b/tests/data/experiments/lr-eval-system-score-constant/output/lr_eval_system_score_constant_eval_by_L1.csv new file mode 100644 index 000000000..32d98faac --- /dev/null +++ b/tests/data/experiments/lr-eval-system-score-constant/output/lr_eval_system_score_constant_eval_by_L1.csv @@ -0,0 +1,6 @@ +,N,h_mean,h_sd,sys_mean.raw_trim,sys_sd.raw_trim,wtkappa.raw_trim,corr.raw_trim,DSM.raw_trim,RMSE.raw_trim,R2.raw_trim,sys_mean.raw_trim_round,sys_sd.raw_trim_round,kappa.raw_trim_round,exact_agr.raw_trim_round,adj_agr.raw_trim_round,DSM.raw_trim_round +All data,69.0,3.536231884057971,0.9006486248471405,3.5360000000000005,4.473426541494861e-16,1.787683880684223e-31,,,0.8940983961673266,-6.726221779551622e-08,4.0,0.0,0.0,49.275362318840585,85.5072463768116, +Esperanto,15.0,3.533333333333333,0.9154754164341269,3.5360000000000005,4.596760034896314e-16,-1.3446370462899402e-31,,,0.884437297570231,-9.090909091069577e-06,4.0,0.0,0.0,40.0,86.66666666666667, +Klingon,19.0,3.5789473684210527,0.837707816583391,3.536,0.0,0.0,,,0.8164952057744317,-0.0027744000000000657,4.0,0.0,0.0,47.368421052631575,89.47368421052632, +Navi,14.0,3.642857142857143,0.744946343668492,3.5360000000000005,4.608531526730982e-16,-1.0697660707751459e-31,,,0.7257579289464026,-0.022158574257425956,4.0,0.0,0.0,57.14285714285714,92.85714285714286, +Vulcan,21.0,3.4285714285714284,1.0757057484009542,3.536,0.0,0.0,,,1.0552638126486258,-0.010472296296296202,4.0,0.0,0.0,52.38095238095239,76.19047619047619, diff --git a/tests/data/experiments/lr-eval-system-score-constant/output/lr_eval_system_score_constant_eval_by_QUESTION.csv b/tests/data/experiments/lr-eval-system-score-constant/output/lr_eval_system_score_constant_eval_by_QUESTION.csv new file mode 100644 index 000000000..ad201a0b8 --- /dev/null +++ b/tests/data/experiments/lr-eval-system-score-constant/output/lr_eval_system_score_constant_eval_by_QUESTION.csv @@ -0,0 +1,4 @@ +,N,h_mean,h_sd,sys_mean.raw_trim,sys_sd.raw_trim,wtkappa.raw_trim,corr.raw_trim,DSM.raw_trim,RMSE.raw_trim,R2.raw_trim,sys_mean.raw_trim_round,sys_sd.raw_trim_round,kappa.raw_trim_round,exact_agr.raw_trim_round,adj_agr.raw_trim_round,DSM.raw_trim_round +All data,69.0,3.536231884057971,0.9006486248471405,3.5360000000000005,4.473426541494861e-16,1.787683880684223e-31,,,0.8940983961673266,-6.726221779551622e-08,4.0,0.0,0.0,49.275362318840585,85.5072463768116, +QUESTION_1,40.0,3.5,0.9336995618478525,3.536,0.0,0.0,,,0.9226570327050024,-0.0015247058823528725,4.0,0.0,0.0,47.5,85.0, +QUESTION_2,29.0,3.586206896551724,0.8667361346416773,3.5360000000000005,4.519498061120574e-16,7.474667922401844e-32,,,0.8531399206097032,-0.003475304918032718,4.0,0.0,0.0,51.724137931034484,86.20689655172413, diff --git a/tests/data/experiments/lr-eval-system-score-constant/output/lr_eval_system_score_constant_eval_short.csv b/tests/data/experiments/lr-eval-system-score-constant/output/lr_eval_system_score_constant_eval_short.csv new file mode 100644 index 000000000..89060c047 --- /dev/null +++ b/tests/data/experiments/lr-eval-system-score-constant/output/lr_eval_system_score_constant_eval_short.csv @@ -0,0 +1,2 @@ +,N,h_mean,h_sd,sys_mean.raw_trim,sys_sd.raw_trim,wtkappa.raw_trim,corr.raw_trim,SMD.raw_trim,RMSE.raw_trim,R2.raw_trim,sys_mean.raw_trim_round,sys_sd.raw_trim_round,kappa.raw_trim_round,exact_agr.raw_trim_round,adj_agr.raw_trim_round,SMD.raw_trim_round +0,69,3.536231884057971,0.9006486248471405,3.5360000000000005,4.473426541494861e-16,1.787683880684223e-31,,-0.0002574634009018279,0.8940983961673266,-6.726221779551622e-08,4.0,0.0,0.0,49.275362318840585,85.5072463768116,0.5149268018043555 diff --git a/tests/data/experiments/lr-eval-system-score-constant/output/lr_eval_system_score_constant_fairness_metrics_by_L1.csv b/tests/data/experiments/lr-eval-system-score-constant/output/lr_eval_system_score_constant_fairness_metrics_by_L1.csv new file mode 100644 index 000000000..186e4a94d --- /dev/null +++ b/tests/data/experiments/lr-eval-system-score-constant/output/lr_eval_system_score_constant_fairness_metrics_by_L1.csv @@ -0,0 +1,3 @@ +,Overall score accuracy,Overall score difference,Conditional score difference,base_category +R2,-0.002410098148508011,-0.03785877645323943,0.0,Vulcan +sig,0.42391077866056215,0.9141656650925646,1.0,Vulcan diff --git a/tests/data/experiments/lr-eval-system-score-constant/output/lr_eval_system_score_constant_fairness_metrics_by_QUESTION.csv b/tests/data/experiments/lr-eval-system-score-constant/output/lr_eval_system_score_constant_fairness_metrics_by_QUESTION.csv new file mode 100644 index 000000000..b3a1232cb --- /dev/null +++ b/tests/data/experiments/lr-eval-system-score-constant/output/lr_eval_system_score_constant_fairness_metrics_by_QUESTION.csv @@ -0,0 +1,3 @@ +,Overall score accuracy,Overall score difference,Conditional score difference,base_category +R2,-0.01177664867159045,-0.01262654058967505,0.0,QUESTION_1 +sig,0.6494154859715568,0.6977724389286026,1.5088907880804768e-22,QUESTION_1 diff --git a/tests/data/experiments/lr-eval-system-score-constant/output/lr_eval_system_score_constant_pred_processed.csv b/tests/data/experiments/lr-eval-system-score-constant/output/lr_eval_system_score_constant_pred_processed.csv new file mode 100644 index 000000000..429301b68 --- /dev/null +++ b/tests/data/experiments/lr-eval-system-score-constant/output/lr_eval_system_score_constant_pred_processed.csv @@ -0,0 +1,70 @@ +spkitemid,sc1,raw,raw_trim,raw_trim_round +RESPONSE_1,4.0,3.536,3.536,4 +RESPONSE_2,4.0,3.536,3.536,4 +RESPONSE_3,4.0,3.536,3.536,4 +RESPONSE_4,3.0,3.536,3.536,4 +RESPONSE_5,2.0,3.536,3.536,4 +RESPONSE_6,2.0,3.536,3.536,4 +RESPONSE_7,4.0,3.536,3.536,4 +RESPONSE_8,2.0,3.536,3.536,4 +RESPONSE_9,5.0,3.536,3.536,4 +RESPONSE_10,3.0,3.536,3.536,4 +RESPONSE_11,4.0,3.536,3.536,4 +RESPONSE_12,5.0,3.536,3.536,4 +RESPONSE_13,4.0,3.536,3.536,4 +RESPONSE_14,4.0,3.536,3.536,4 +RESPONSE_15,3.0,3.536,3.536,4 +RESPONSE_16,4.0,3.536,3.536,4 +RESPONSE_17,4.0,3.536,3.536,4 +RESPONSE_18,3.0,3.536,3.536,4 +RESPONSE_19,2.0,3.536,3.536,4 +RESPONSE_20,4.0,3.536,3.536,4 +RESPONSE_21,4.0,3.536,3.536,4 +RESPONSE_22,5.0,3.536,3.536,4 +RESPONSE_23,3.0,3.536,3.536,4 +RESPONSE_24,3.0,3.536,3.536,4 +RESPONSE_25,4.0,3.536,3.536,4 +RESPONSE_26,4.0,3.536,3.536,4 +RESPONSE_27,4.0,3.536,3.536,4 +RESPONSE_28,4.0,3.536,3.536,4 +RESPONSE_29,3.0,3.536,3.536,4 +RESPONSE_30,3.0,3.536,3.536,4 +RESPONSE_31,3.0,3.536,3.536,4 +RESPONSE_32,3.0,3.536,3.536,4 +RESPONSE_33,1.0,3.536,3.536,4 +RESPONSE_34,4.0,3.536,3.536,4 +RESPONSE_35,4.0,3.536,3.536,4 +RESPONSE_36,3.0,3.536,3.536,4 +RESPONSE_37,4.0,3.536,3.536,4 +RESPONSE_38,5.0,3.536,3.536,4 +RESPONSE_39,2.0,3.536,3.536,4 +RESPONSE_40,4.0,3.536,3.536,4 +RESPONSE_41,4.0,3.536,3.536,4 +RESPONSE_42,4.0,3.536,3.536,4 +RESPONSE_43,4.0,3.536,3.536,4 +RESPONSE_44,3.0,3.536,3.536,4 +RESPONSE_45,2.0,3.536,3.536,4 +RESPONSE_46,2.0,3.536,3.536,4 +RESPONSE_47,4.0,3.536,3.536,4 +RESPONSE_48,2.0,3.536,3.536,4 +RESPONSE_49,5.0,3.536,3.536,4 +RESPONSE_50,3.0,3.536,3.536,4 +RESPONSE_51,4.0,3.536,3.536,4 +RESPONSE_52,5.0,3.536,3.536,4 +RESPONSE_53,4.0,3.536,3.536,4 +RESPONSE_54,4.0,3.536,3.536,4 +RESPONSE_55,3.0,3.536,3.536,4 +RESPONSE_56,4.0,3.536,3.536,4 +RESPONSE_57,4.0,3.536,3.536,4 +RESPONSE_58,3.0,3.536,3.536,4 +RESPONSE_59,2.0,3.536,3.536,4 +RESPONSE_60,4.0,3.536,3.536,4 +RESPONSE_61,4.0,3.536,3.536,4 +RESPONSE_62,5.0,3.536,3.536,4 +RESPONSE_63,3.0,3.536,3.536,4 +RESPONSE_64,3.0,3.536,3.536,4 +RESPONSE_65,4.0,3.536,3.536,4 +RESPONSE_66,4.0,3.536,3.536,4 +RESPONSE_67,4.0,3.536,3.536,4 +RESPONSE_68,4.0,3.536,3.536,4 +RESPONSE_69,3.0,3.536,3.536,4 diff --git a/tests/data/experiments/lr-eval-system-score-constant/output/lr_eval_system_score_constant_score_dist.csv b/tests/data/experiments/lr-eval-system-score-constant/output/lr_eval_system_score_constant_score_dist.csv new file mode 100644 index 000000000..91f78b5fc --- /dev/null +++ b/tests/data/experiments/lr-eval-system-score-constant/output/lr_eval_system_score_constant_score_dist.csv @@ -0,0 +1,6 @@ +,score,human,sys_raw,difference +1.0,1.0,1.4492753623188406,0.0,-1.4492753623188406 +2.0,2.0,13.043478260869565,0.0,-13.043478260869565 +3.0,3.0,26.08695652173913,0.0,-26.08695652173913 +4.0,4.0,49.275362318840585,100.0,50.724637681159415 +5.0,5.0,10.144927536231885,0.0,-10.144927536231885 diff --git a/tests/data/experiments/lr-eval-system-score-constant/output/lr_eval_system_score_constant_test_excluded_composition.csv b/tests/data/experiments/lr-eval-system-score-constant/output/lr_eval_system_score_constant_test_excluded_composition.csv new file mode 100644 index 000000000..cb4ef7ec4 --- /dev/null +++ b/tests/data/experiments/lr-eval-system-score-constant/output/lr_eval_system_score_constant_test_excluded_composition.csv @@ -0,0 +1,4 @@ +numeric system score,non-numeric system score +-,0 +0,0 +0,0 diff --git a/tests/data/experiments/lr-eval-system-score-constant/output/lr_eval_system_score_constant_test_human_scores.csv b/tests/data/experiments/lr-eval-system-score-constant/output/lr_eval_system_score_constant_test_human_scores.csv new file mode 100644 index 000000000..728c608d1 --- /dev/null +++ b/tests/data/experiments/lr-eval-system-score-constant/output/lr_eval_system_score_constant_test_human_scores.csv @@ -0,0 +1,70 @@ +spkitemid,sc1,sc2 +RESPONSE_1,4.0,4.0 +RESPONSE_2,4.0,4.0 +RESPONSE_3,4.0,3.0 +RESPONSE_4,3.0,4.0 +RESPONSE_5,2.0,3.0 +RESPONSE_6,2.0,2.0 +RESPONSE_7,4.0,4.0 +RESPONSE_8,2.0,2.0 +RESPONSE_9,5.0,4.0 +RESPONSE_10,3.0,3.0 +RESPONSE_11,4.0,4.0 +RESPONSE_12,5.0,5.0 +RESPONSE_13,4.0,3.0 +RESPONSE_14,4.0,4.0 +RESPONSE_15,3.0,4.0 +RESPONSE_16,4.0,4.0 +RESPONSE_17,4.0,4.0 +RESPONSE_18,3.0,3.0 +RESPONSE_19,2.0,3.0 +RESPONSE_20,4.0,4.0 +RESPONSE_21,4.0,3.0 +RESPONSE_22,5.0,5.0 +RESPONSE_23,3.0,3.0 +RESPONSE_24,3.0,3.0 +RESPONSE_25,4.0,4.0 +RESPONSE_26,4.0,4.0 +RESPONSE_27,4.0,4.0 +RESPONSE_28,4.0,4.0 +RESPONSE_29,3.0,3.0 +RESPONSE_30,3.0,3.0 +RESPONSE_31,3.0,3.0 +RESPONSE_32,3.0,3.0 +RESPONSE_33,1.0,1.0 +RESPONSE_34,4.0,4.0 +RESPONSE_35,4.0,3.0 +RESPONSE_36,3.0,3.0 +RESPONSE_37,4.0,4.0 +RESPONSE_38,5.0,5.0 +RESPONSE_39,2.0,1.0 +RESPONSE_40,4.0,3.0 +RESPONSE_41,4.0,4.0 +RESPONSE_42,4.0,4.0 +RESPONSE_43,4.0,3.0 +RESPONSE_44,3.0,4.0 +RESPONSE_45,2.0,3.0 +RESPONSE_46,2.0,2.0 +RESPONSE_47,4.0,4.0 +RESPONSE_48,2.0,2.0 +RESPONSE_49,5.0,4.0 +RESPONSE_50,3.0, +RESPONSE_51,4.0, +RESPONSE_52,5.0, +RESPONSE_53,4.0, +RESPONSE_54,4.0, +RESPONSE_55,3.0, +RESPONSE_56,4.0, +RESPONSE_57,4.0, +RESPONSE_58,3.0, +RESPONSE_59,2.0, +RESPONSE_60,4.0, +RESPONSE_61,4.0, +RESPONSE_62,5.0, +RESPONSE_63,3.0, +RESPONSE_64,3.0, +RESPONSE_65,4.0, +RESPONSE_66,4.0, +RESPONSE_67,4.0, +RESPONSE_68,4.0, +RESPONSE_69,3.0, diff --git a/tests/data/experiments/lr-eval-system-score-constant/output/lr_eval_system_score_constant_test_metadata.csv b/tests/data/experiments/lr-eval-system-score-constant/output/lr_eval_system_score_constant_test_metadata.csv new file mode 100644 index 000000000..3aeaf1ef5 --- /dev/null +++ b/tests/data/experiments/lr-eval-system-score-constant/output/lr_eval_system_score_constant_test_metadata.csv @@ -0,0 +1,70 @@ +spkitemid,QUESTION,L1 +RESPONSE_1,QUESTION_1,Vulcan +RESPONSE_2,QUESTION_1,Esperanto +RESPONSE_3,QUESTION_1,Vulcan +RESPONSE_4,QUESTION_1,Navi +RESPONSE_5,QUESTION_1,Vulcan +RESPONSE_6,QUESTION_1,Klingon +RESPONSE_7,QUESTION_1,Klingon +RESPONSE_8,QUESTION_1,Navi +RESPONSE_9,QUESTION_1,Esperanto +RESPONSE_10,QUESTION_1,Esperanto +RESPONSE_11,QUESTION_1,Klingon +RESPONSE_12,QUESTION_1,Esperanto +RESPONSE_13,QUESTION_1,Esperanto +RESPONSE_14,QUESTION_1,Navi +RESPONSE_15,QUESTION_1,Vulcan +RESPONSE_16,QUESTION_1,Vulcan +RESPONSE_17,QUESTION_1,Navi +RESPONSE_18,QUESTION_1,Esperanto +RESPONSE_19,QUESTION_1,Klingon +RESPONSE_20,QUESTION_1,Navi +RESPONSE_21,QUESTION_1,Vulcan +RESPONSE_22,QUESTION_1,Klingon +RESPONSE_23,QUESTION_1,Esperanto +RESPONSE_24,QUESTION_1,Klingon +RESPONSE_25,QUESTION_1,Navi +RESPONSE_26,QUESTION_1,Klingon +RESPONSE_27,QUESTION_1,Navi +RESPONSE_28,QUESTION_1,Vulcan +RESPONSE_29,QUESTION_1,Esperanto +RESPONSE_30,QUESTION_1,Navi +RESPONSE_31,QUESTION_1,Vulcan +RESPONSE_32,QUESTION_1,Klingon +RESPONSE_33,QUESTION_1,Vulcan +RESPONSE_34,QUESTION_1,Vulcan +RESPONSE_35,QUESTION_1,Vulcan +RESPONSE_36,QUESTION_1,Navi +RESPONSE_37,QUESTION_1,Klingon +RESPONSE_38,QUESTION_1,Vulcan +RESPONSE_39,QUESTION_1,Vulcan +RESPONSE_40,QUESTION_1,Vulcan +RESPONSE_41,QUESTION_2,Navi +RESPONSE_42,QUESTION_2,Esperanto +RESPONSE_43,QUESTION_2,Esperanto +RESPONSE_44,QUESTION_2,Klingon +RESPONSE_45,QUESTION_2,Vulcan +RESPONSE_46,QUESTION_2,Esperanto +RESPONSE_47,QUESTION_2,Klingon +RESPONSE_48,QUESTION_2,Esperanto +RESPONSE_49,QUESTION_2,Navi +RESPONSE_50,QUESTION_2,Vulcan +RESPONSE_51,QUESTION_2,Klingon +RESPONSE_52,QUESTION_2,Klingon +RESPONSE_53,QUESTION_2,Klingon +RESPONSE_54,QUESTION_2,Vulcan +RESPONSE_55,QUESTION_2,Klingon +RESPONSE_56,QUESTION_2,Vulcan +RESPONSE_57,QUESTION_2,Esperanto +RESPONSE_58,QUESTION_2,Esperanto +RESPONSE_59,QUESTION_2,Vulcan +RESPONSE_60,QUESTION_2,Navi +RESPONSE_61,QUESTION_2,Esperanto +RESPONSE_62,QUESTION_2,Vulcan +RESPONSE_63,QUESTION_2,Klingon +RESPONSE_64,QUESTION_2,Klingon +RESPONSE_65,QUESTION_2,Klingon +RESPONSE_66,QUESTION_2,Navi +RESPONSE_67,QUESTION_2,Vulcan +RESPONSE_68,QUESTION_2,Klingon +RESPONSE_69,QUESTION_2,Navi diff --git a/tests/data/experiments/lr-eval-system-score-constant/output/lr_eval_system_score_constant_test_other_columns.csv b/tests/data/experiments/lr-eval-system-score-constant/output/lr_eval_system_score_constant_test_other_columns.csv new file mode 100644 index 000000000..32e11246f --- /dev/null +++ b/tests/data/experiments/lr-eval-system-score-constant/output/lr_eval_system_score_constant_test_other_columns.csv @@ -0,0 +1,70 @@ +spkitemid +RESPONSE_1 +RESPONSE_2 +RESPONSE_3 +RESPONSE_4 +RESPONSE_5 +RESPONSE_6 +RESPONSE_7 +RESPONSE_8 +RESPONSE_9 +RESPONSE_10 +RESPONSE_11 +RESPONSE_12 +RESPONSE_13 +RESPONSE_14 +RESPONSE_15 +RESPONSE_16 +RESPONSE_17 +RESPONSE_18 +RESPONSE_19 +RESPONSE_20 +RESPONSE_21 +RESPONSE_22 +RESPONSE_23 +RESPONSE_24 +RESPONSE_25 +RESPONSE_26 +RESPONSE_27 +RESPONSE_28 +RESPONSE_29 +RESPONSE_30 +RESPONSE_31 +RESPONSE_32 +RESPONSE_33 +RESPONSE_34 +RESPONSE_35 +RESPONSE_36 +RESPONSE_37 +RESPONSE_38 +RESPONSE_39 +RESPONSE_40 +RESPONSE_41 +RESPONSE_42 +RESPONSE_43 +RESPONSE_44 +RESPONSE_45 +RESPONSE_46 +RESPONSE_47 +RESPONSE_48 +RESPONSE_49 +RESPONSE_50 +RESPONSE_51 +RESPONSE_52 +RESPONSE_53 +RESPONSE_54 +RESPONSE_55 +RESPONSE_56 +RESPONSE_57 +RESPONSE_58 +RESPONSE_59 +RESPONSE_60 +RESPONSE_61 +RESPONSE_62 +RESPONSE_63 +RESPONSE_64 +RESPONSE_65 +RESPONSE_66 +RESPONSE_67 +RESPONSE_68 +RESPONSE_69 diff --git a/tests/data/experiments/lr-eval-system-score-constant/output/lr_eval_system_score_constant_true_score_eval.csv b/tests/data/experiments/lr-eval-system-score-constant/output/lr_eval_system_score_constant_true_score_eval.csv new file mode 100644 index 000000000..3e6a3b5d4 --- /dev/null +++ b/tests/data/experiments/lr-eval-system-score-constant/output/lr_eval_system_score_constant_true_score_eval.csv @@ -0,0 +1,4 @@ +,N,N_single,N_double,h1_var_single,h1_var_double,h2_var_double,true_var,mse_true,prmse_true,sys_var_double,sys_var_single +raw,69,20,49,0.5368421052631579,0.9209183673469389,0.788265306122449,0.6514153713216517,0.6418279958491871,0.014717760578803696,0.0,0.0 +raw_trim,69,20,49,0.5368421052631579,0.9209183673469389,0.788265306122449,0.6514153713216517,0.6418279958491871,0.014717760578803696,0.0,0.0 +raw_trim_round,69,20,49,0.5368421052631579,0.9209183673469389,0.788265306122449,0.6514153713216517,0.9062608094085092,-0.3912180297032344,0.0,0.0 diff --git a/tests/test_experiment_rsmeval.py b/tests/test_experiment_rsmeval.py index 4fba5cc91..054490927 100644 --- a/tests/test_experiment_rsmeval.py +++ b/tests/test_experiment_rsmeval.py @@ -50,6 +50,8 @@ param('lr-eval-with-subset-double-scored', 'lr_eval_with_subset_double_scored', consistency=True), param('lr-eval-with-trim-tolerance', 'lr_evaluation_with_trim_tolerance'), param('lr-eval-with-numeric-threshold', 'lr_evaluation_with_numeric_threshold', subgroups=['QUESTION']), + param('lr-eval-system-score-constant', 'lr_eval_system_score_constant', + subgroups=['QUESTION', 'L1'], consistency=True, suppress_warnings_for=[UserWarning]) ]) def test_run_experiment_parameterized(*args, **kwargs): if TEST_DIR: From 8626fff2070b446f64f3641891d7db5c4713a06f Mon Sep 17 00:00:00 2001 From: Anastassia Loukina Date: Thu, 19 Mar 2020 14:12:52 -0400 Subject: [PATCH 08/13] Added top level warning about the number of warnings --- rsmtool/analyzer.py | 12 ++++++++++++ 1 file changed, 12 insertions(+) diff --git a/rsmtool/analyzer.py b/rsmtool/analyzer.py index f9c52b71f..c3e316f9b 100644 --- a/rsmtool/analyzer.py +++ b/rsmtool/analyzer.py @@ -10,6 +10,7 @@ import numpy as np import pandas as pd +import warnings from functools import partial @@ -1198,6 +1199,17 @@ def compute_metrics_by_group(self, for col in df_test.columns if col not in ['spkitemid', grouping_variable]} + # check if any of the standard deviations is zero and + # tell user to expect to see many warnings. + zero_sd_scores = [score for (score, sd) in population_sd_dict.items() if + np.isclose(sd, 0, atol=1e-07)] + if len(zero_sd_scores) > 0: + warnings.warn("The standard deviation for {} scores " + "is zero (all scores are the same). You " + "will see multiple warnings about DSM computation " + "since this metrics is computed separately for " + "each subgroup.".format(', '.join(zero_sd_scores))) + # create a duplicate data frame to compute evaluations # over the whole data, i.e., across groups df_preds_all = df_test.copy() From ec61cd727a21b0d7140f0df04677982d50529387 Mon Sep 17 00:00:00 2001 From: Anastassia Loukina Date: Thu, 19 Mar 2020 14:14:20 -0400 Subject: [PATCH 09/13] Apply suggestions from code review Co-Authored-By: Matt Mulholland Co-Authored-By: Nitin Madnani --- rsmtool/utils/metrics.py | 2 +- tests/test_utils.py | 16 ++++++++-------- 2 files changed, 9 insertions(+), 9 deletions(-) diff --git a/rsmtool/utils/metrics.py b/rsmtool/utils/metrics.py index b4bb5affd..960beb5cd 100644 --- a/rsmtool/utils/metrics.py +++ b/rsmtool/utils/metrics.py @@ -371,7 +371,7 @@ def difference_of_standardized_means(y_true_observed, # if the population means and standard deviations were not provided, calculate from the data # We only check for mean since the function requires - # both of these to be set of both to be None + # both of these to be set or both to be None if population_y_true_observed_mn is None: warnings.warn(warning_msg.format('y_true_observed')) diff --git a/tests/test_utils.py b/tests/test_utils.py index 5b0c8417c..4a6bdc9c9 100644 --- a/tests/test_utils.py +++ b/tests/test_utils.py @@ -428,10 +428,10 @@ def test_difference_of_standardized_means_zero_population_sd_pred(): np.array([2, 1, 4, 1, 5, 2, 2, 2, 2, 2])) expected = None diff_std_means = difference_of_standardized_means(y_true, y_pred, - population_y_true_observed_mn=2.44, - population_y_true_observed_sd=0.54, - population_y_pred_mn=2.44, - population_y_pred_sd=0) + population_y_true_observed_mn=2.44, + population_y_true_observed_sd=0.54, + population_y_pred_mn=2.44, + population_y_pred_sd=0) eq_(diff_std_means, expected) @@ -440,10 +440,10 @@ def test_difference_of_standardized_means_zero_population_sd_human(): np.array([2, 1, 4, 1, 5, 2, 2, 2, 2, 2])) expected = None diff_std_means = difference_of_standardized_means(y_true, y_pred, - population_y_pred_mn=2.44, - population_y_pred_sd=0.54, - population_y_true_observed_mn=2.44, - population_y_true_observed_sd=0) + population_y_pred_mn=2.44, + population_y_pred_sd=0.54, + population_y_true_observed_mn=2.44, + population_y_true_observed_sd=0) eq_(diff_std_means, expected) From ab96e80f078a70ec49385308dae85c2628794242 Mon Sep 17 00:00:00 2001 From: Anastassia Loukina Date: Thu, 19 Mar 2020 14:15:28 -0400 Subject: [PATCH 10/13] Missing test data file --- ...re_with_subgroups_subset_double_scored.csv | 70 +++++++++++++++++++ 1 file changed, 70 insertions(+) create mode 100644 tests/data/files/predictions_same_system_score_with_subgroups_subset_double_scored.csv diff --git a/tests/data/files/predictions_same_system_score_with_subgroups_subset_double_scored.csv b/tests/data/files/predictions_same_system_score_with_subgroups_subset_double_scored.csv new file mode 100644 index 000000000..ab7d9fb03 --- /dev/null +++ b/tests/data/files/predictions_same_system_score_with_subgroups_subset_double_scored.csv @@ -0,0 +1,70 @@ +id,QUESTION,L1,score,h1,h2 +RESPONSE_1,QUESTION_1,Vulcan,3.536,4,4 +RESPONSE_2,QUESTION_1,Esperanto,3.536,4,4 +RESPONSE_3,QUESTION_1,Vulcan,3.536,4,3 +RESPONSE_4,QUESTION_1,Navi,3.536,3,4 +RESPONSE_5,QUESTION_1,Vulcan,3.536,2,3 +RESPONSE_6,QUESTION_1,Klingon,3.536,2,2 +RESPONSE_7,QUESTION_1,Klingon,3.536,4,4 +RESPONSE_8,QUESTION_1,Navi,3.536,2,2 +RESPONSE_9,QUESTION_1,Esperanto,3.536,5,4 +RESPONSE_10,QUESTION_1,Esperanto,3.536,3,3 +RESPONSE_11,QUESTION_1,Klingon,3.536,4,4 +RESPONSE_12,QUESTION_1,Esperanto,3.536,5,5 +RESPONSE_13,QUESTION_1,Esperanto,3.536,4,3 +RESPONSE_14,QUESTION_1,Navi,3.536,4,4 +RESPONSE_15,QUESTION_1,Vulcan,3.536,3,4 +RESPONSE_16,QUESTION_1,Vulcan,3.536,4,4 +RESPONSE_17,QUESTION_1,Navi,3.536,4,4 +RESPONSE_18,QUESTION_1,Esperanto,3.536,3,3 +RESPONSE_19,QUESTION_1,Klingon,3.536,2,3 +RESPONSE_20,QUESTION_1,Navi,3.536,4,4 +RESPONSE_21,QUESTION_1,Vulcan,3.536,4,3 +RESPONSE_22,QUESTION_1,Klingon,3.536,5,5 +RESPONSE_23,QUESTION_1,Esperanto,3.536,3,3 +RESPONSE_24,QUESTION_1,Klingon,3.536,3,3 +RESPONSE_25,QUESTION_1,Navi,3.536,4,4 +RESPONSE_26,QUESTION_1,Klingon,3.536,4,4 +RESPONSE_27,QUESTION_1,Navi,3.536,4,4 +RESPONSE_28,QUESTION_1,Vulcan,3.536,4,4 +RESPONSE_29,QUESTION_1,Esperanto,3.536,3,3 +RESPONSE_30,QUESTION_1,Navi,3.536,3,3 +RESPONSE_31,QUESTION_1,Vulcan,3.536,3,3 +RESPONSE_32,QUESTION_1,Klingon,3.536,3,3 +RESPONSE_33,QUESTION_1,Vulcan,3.536,1,1 +RESPONSE_34,QUESTION_1,Vulcan,3.536,4,4 +RESPONSE_35,QUESTION_1,Vulcan,3.536,4,3 +RESPONSE_36,QUESTION_1,Navi,3.536,3,3 +RESPONSE_37,QUESTION_1,Klingon,3.536,4,4 +RESPONSE_38,QUESTION_1,Vulcan,3.536,5,5 +RESPONSE_39,QUESTION_1,Vulcan,3.536,2,1 +RESPONSE_40,QUESTION_1,Vulcan,3.536,4,3 +RESPONSE_41,QUESTION_2,Navi,3.536,4,4 +RESPONSE_42,QUESTION_2,Esperanto,3.536,4,4 +RESPONSE_43,QUESTION_2,Esperanto,3.536,4,3 +RESPONSE_44,QUESTION_2,Klingon,3.536,3,4 +RESPONSE_45,QUESTION_2,Vulcan,3.536,2,3 +RESPONSE_46,QUESTION_2,Esperanto,3.536,2,2 +RESPONSE_47,QUESTION_2,Klingon,3.536,4,4 +RESPONSE_48,QUESTION_2,Esperanto,3.536,2,2 +RESPONSE_49,QUESTION_2,Navi,3.536,5,4 +RESPONSE_50,QUESTION_2,Vulcan,3.536,3, +RESPONSE_51,QUESTION_2,Klingon,3.536,4, +RESPONSE_52,QUESTION_2,Klingon,3.536,5, +RESPONSE_53,QUESTION_2,Klingon,3.536,4, +RESPONSE_54,QUESTION_2,Vulcan,3.536,4, +RESPONSE_55,QUESTION_2,Klingon,3.536,3, +RESPONSE_56,QUESTION_2,Vulcan,3.536,4, +RESPONSE_57,QUESTION_2,Esperanto,3.536,4, +RESPONSE_58,QUESTION_2,Esperanto,3.536,3, +RESPONSE_59,QUESTION_2,Vulcan,3.536,2, +RESPONSE_60,QUESTION_2,Navi,3.536,4, +RESPONSE_61,QUESTION_2,Esperanto,3.536,4, +RESPONSE_62,QUESTION_2,Vulcan,3.536,5, +RESPONSE_63,QUESTION_2,Klingon,3.536,3, +RESPONSE_64,QUESTION_2,Klingon,3.536,3, +RESPONSE_65,QUESTION_2,Klingon,3.536,4, +RESPONSE_66,QUESTION_2,Navi,3.536,4, +RESPONSE_67,QUESTION_2,Vulcan,3.536,4, +RESPONSE_68,QUESTION_2,Klingon,3.536,4, +RESPONSE_69,QUESTION_2,Navi,3.536,3, \ No newline at end of file From d1af8da2e2dbf55876e6b7f4c7e96f99cb623518 Mon Sep 17 00:00:00 2001 From: Anastassia Loukina Date: Thu, 19 Mar 2020 15:08:14 -0400 Subject: [PATCH 11/13] Remove fairness test files --- .../lr_eval_system_score_constant_estimates_csd_by_L1.csv | 5 ----- ..._eval_system_score_constant_estimates_csd_by_QUESTION.csv | 3 --- .../lr_eval_system_score_constant_estimates_osa_by_L1.csv | 5 ----- ..._eval_system_score_constant_estimates_osa_by_QUESTION.csv | 3 --- .../lr_eval_system_score_constant_estimates_osd_by_L1.csv | 5 ----- ..._eval_system_score_constant_estimates_osd_by_QUESTION.csv | 3 --- 6 files changed, 24 deletions(-) delete mode 100644 tests/data/experiments/lr-eval-system-score-constant/output/lr_eval_system_score_constant_estimates_csd_by_L1.csv delete mode 100644 tests/data/experiments/lr-eval-system-score-constant/output/lr_eval_system_score_constant_estimates_csd_by_QUESTION.csv delete mode 100644 tests/data/experiments/lr-eval-system-score-constant/output/lr_eval_system_score_constant_estimates_osa_by_L1.csv delete mode 100644 tests/data/experiments/lr-eval-system-score-constant/output/lr_eval_system_score_constant_estimates_osa_by_QUESTION.csv delete mode 100644 tests/data/experiments/lr-eval-system-score-constant/output/lr_eval_system_score_constant_estimates_osd_by_L1.csv delete mode 100644 tests/data/experiments/lr-eval-system-score-constant/output/lr_eval_system_score_constant_estimates_osd_by_QUESTION.csv diff --git a/tests/data/experiments/lr-eval-system-score-constant/output/lr_eval_system_score_constant_estimates_csd_by_L1.csv b/tests/data/experiments/lr-eval-system-score-constant/output/lr_eval_system_score_constant_estimates_csd_by_L1.csv deleted file mode 100644 index b4a2497ca..000000000 --- a/tests/data/experiments/lr-eval-system-score-constant/output/lr_eval_system_score_constant_estimates_csd_by_L1.csv +++ /dev/null @@ -1,5 +0,0 @@ -,estimate,P>[t],[0.025,0.975] -Intercept (Vulcan),-0.4640000000000002,0.0,-0.4640000000000012,-0.4639999999999992 -Klingon,1.2385925618474403e-15,0.06294831850852728,-6.89666061407639e-17,2.5461517298356442e-15 -Esperanto,3.0531133177191805e-16,0.6631333657891718,-1.089425140581065e-15,1.7000478041249011e-15 -Navi,1.2073675392798577e-15,0.09440671222808052,-2.1359961004765078e-16,2.628334688607366e-15 diff --git a/tests/data/experiments/lr-eval-system-score-constant/output/lr_eval_system_score_constant_estimates_csd_by_QUESTION.csv b/tests/data/experiments/lr-eval-system-score-constant/output/lr_eval_system_score_constant_estimates_csd_by_QUESTION.csv deleted file mode 100644 index 12fea80fd..000000000 --- a/tests/data/experiments/lr-eval-system-score-constant/output/lr_eval_system_score_constant_estimates_csd_by_QUESTION.csv +++ /dev/null @@ -1,3 +0,0 @@ -,estimate,P>[t],[0.025,0.975] -Intercept (QUESTION_1),-0.4640000000000001,0.0,-0.4640000000000002,-0.46399999999999997 -QUESTION_2,1.0061396160665481e-16,0.22901607233412186,-6.491000204344407e-17,2.661379252567537e-16 diff --git a/tests/data/experiments/lr-eval-system-score-constant/output/lr_eval_system_score_constant_estimates_osa_by_L1.csv b/tests/data/experiments/lr-eval-system-score-constant/output/lr_eval_system_score_constant_estimates_osa_by_L1.csv deleted file mode 100644 index 2da86fc8a..000000000 --- a/tests/data/experiments/lr-eval-system-score-constant/output/lr_eval_system_score_constant_estimates_osa_by_L1.csv +++ /dev/null @@ -1,5 +0,0 @@ -,estimate,P>[t],[0.025,0.975] -Intercept (Vulcan),1.113581714285715,1.8337863271908833e-05,0.6327358647132078,1.5944275638582222 -Klingon,-0.4469172932330826,0.2053355684667301,-1.1446017808907327,0.2507671944245675 -Esperanto,-0.33135238095238073,0.37762537621397396,-1.0762755679472913,0.4135708060425298 -Navi,-0.5868571428571425,0.12803165113359088,-1.3471411869010543,0.1734269011867694 diff --git a/tests/data/experiments/lr-eval-system-score-constant/output/lr_eval_system_score_constant_estimates_osa_by_QUESTION.csv b/tests/data/experiments/lr-eval-system-score-constant/output/lr_eval_system_score_constant_estimates_osa_by_QUESTION.csv deleted file mode 100644 index 359081b96..000000000 --- a/tests/data/experiments/lr-eval-system-score-constant/output/lr_eval_system_score_constant_estimates_osa_by_QUESTION.csv +++ /dev/null @@ -1,3 +0,0 @@ -,estimate,P>[t],[0.025,0.975] -Intercept (QUESTION_1),0.8512959999999999,7.482596343683655e-06,0.5014640810687143,1.2011279189312856 -QUESTION_2,-0.12344827586206905,0.6494154859715497,-0.6630644751574266,0.41616792343328846 diff --git a/tests/data/experiments/lr-eval-system-score-constant/output/lr_eval_system_score_constant_estimates_osd_by_L1.csv b/tests/data/experiments/lr-eval-system-score-constant/output/lr_eval_system_score_constant_estimates_osd_by_L1.csv deleted file mode 100644 index 55152bc56..000000000 --- a/tests/data/experiments/lr-eval-system-score-constant/output/lr_eval_system_score_constant_estimates_osd_by_L1.csv +++ /dev/null @@ -1,5 +0,0 @@ -,estimate,P>[t],[0.025,0.975] -Intercept (Vulcan),0.10742857142857143,0.5934146767315952,-0.29244520951067887,0.5073023523678217 -Klingon,-0.15037593984962397,0.6064812844970744,-0.7305738020758628,0.4298219223766149 -Esperanto,-0.10476190476190476,0.7366472332129983,-0.7242437024269881,0.5147198929031787 -Navi,-0.21428571428571414,0.5008872399945372,-0.8465416764613416,0.41797024788991327 diff --git a/tests/data/experiments/lr-eval-system-score-constant/output/lr_eval_system_score_constant_estimates_osd_by_QUESTION.csv b/tests/data/experiments/lr-eval-system-score-constant/output/lr_eval_system_score_constant_estimates_osd_by_QUESTION.csv deleted file mode 100644 index 2521c223b..000000000 --- a/tests/data/experiments/lr-eval-system-score-constant/output/lr_eval_system_score_constant_estimates_osd_by_QUESTION.csv +++ /dev/null @@ -1,3 +0,0 @@ -,estimate,P>[t],[0.025,0.975] -Intercept (QUESTION_1),0.03600000000000004,0.8024137046773936,-0.2500305373954332,0.32203053739543325 -QUESTION_2,-0.0862068965517241,0.6977724389285981,-0.5274093801422773,0.35499558703882916 From d4c75de8a61e78f4b38a4be815d5b19d5031497a Mon Sep 17 00:00:00 2001 From: Anastassia Loukina Date: Thu, 19 Mar 2020 15:23:23 -0400 Subject: [PATCH 12/13] Remove few more fairness files --- .../lr_eval_system_score_constant_fairness_metrics_by_L1.csv | 3 --- ...eval_system_score_constant_fairness_metrics_by_QUESTION.csv | 3 --- 2 files changed, 6 deletions(-) delete mode 100644 tests/data/experiments/lr-eval-system-score-constant/output/lr_eval_system_score_constant_fairness_metrics_by_L1.csv delete mode 100644 tests/data/experiments/lr-eval-system-score-constant/output/lr_eval_system_score_constant_fairness_metrics_by_QUESTION.csv diff --git a/tests/data/experiments/lr-eval-system-score-constant/output/lr_eval_system_score_constant_fairness_metrics_by_L1.csv b/tests/data/experiments/lr-eval-system-score-constant/output/lr_eval_system_score_constant_fairness_metrics_by_L1.csv deleted file mode 100644 index 186e4a94d..000000000 --- a/tests/data/experiments/lr-eval-system-score-constant/output/lr_eval_system_score_constant_fairness_metrics_by_L1.csv +++ /dev/null @@ -1,3 +0,0 @@ -,Overall score accuracy,Overall score difference,Conditional score difference,base_category -R2,-0.002410098148508011,-0.03785877645323943,0.0,Vulcan -sig,0.42391077866056215,0.9141656650925646,1.0,Vulcan diff --git a/tests/data/experiments/lr-eval-system-score-constant/output/lr_eval_system_score_constant_fairness_metrics_by_QUESTION.csv b/tests/data/experiments/lr-eval-system-score-constant/output/lr_eval_system_score_constant_fairness_metrics_by_QUESTION.csv deleted file mode 100644 index b3a1232cb..000000000 --- a/tests/data/experiments/lr-eval-system-score-constant/output/lr_eval_system_score_constant_fairness_metrics_by_QUESTION.csv +++ /dev/null @@ -1,3 +0,0 @@ -,Overall score accuracy,Overall score difference,Conditional score difference,base_category -R2,-0.01177664867159045,-0.01262654058967505,0.0,QUESTION_1 -sig,0.6494154859715568,0.6977724389286026,1.5088907880804768e-22,QUESTION_1 From b65a0acfdbc398e0232ae1545bef5e4188f8dc69 Mon Sep 17 00:00:00 2001 From: Anastassia Loukina Date: Fri, 20 Mar 2020 10:42:14 -0400 Subject: [PATCH 13/13] Update rsmtool/analyzer.py Co-Authored-By: Nitin Madnani --- rsmtool/analyzer.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/rsmtool/analyzer.py b/rsmtool/analyzer.py index c3e316f9b..f2f81af31 100644 --- a/rsmtool/analyzer.py +++ b/rsmtool/analyzer.py @@ -1205,9 +1205,9 @@ def compute_metrics_by_group(self, np.isclose(sd, 0, atol=1e-07)] if len(zero_sd_scores) > 0: warnings.warn("The standard deviation for {} scores " - "is zero (all scores are the same). You " + "is zero (all values are the same). You " "will see multiple warnings about DSM computation " - "since this metrics is computed separately for " + "since this metric is computed separately for " "each subgroup.".format(', '.join(zero_sd_scores))) # create a duplicate data frame to compute evaluations