From 34ae6411654eaacddc32a0265db8a2af82e7d208 Mon Sep 17 00:00:00 2001
From: Anastassia Loukina <aloukina@ets.org>
Date: Thu, 19 Mar 2020 07:45:35 -0400
Subject: [PATCH 01/13] Fixed errors in documentation

---
 rsmtool/analyzer.py | 10 +++++-----
 1 file changed, 5 insertions(+), 5 deletions(-)

diff --git a/rsmtool/analyzer.py b/rsmtool/analyzer.py
index ddf5016f6..f9c52b71f 100644
--- a/rsmtool/analyzer.py
+++ b/rsmtool/analyzer.py
@@ -601,8 +601,8 @@ def metrics_helper(human_scores,
                        smd_method='unpooled',
                        use_diff_std_means=False):
         """
-        This is a helper function that computes some basic agreement
-        and association metrics between the system scores and the
+        This is a helper function that computes several basic
+        association metrics between the system scores and the
         human scores.
 
         Parameters
@@ -620,16 +620,16 @@ def metrics_helper(human_scores,
         population_system_score_sd : float, optional
             Reference standard deviation for system scores. If `smd_method='williamson'`, this is
             used to compute SMD and should be the standard deviation for the whole population.If
-            `use_diff_std_means=True`, this must be used with `population_human_score_mn`.
+            `use_diff_std_means=True`, this must be used with `population_system_score_mn`.
             Otherwise, it is ignored.
             Defaults to None.
         population_human_score_mn : float, optional
             Reference mean for human scores. If `use_diff_std_means=True`, this must be used with
-            `population_human_score_mn`. Otherwise, it is ignored.
+            `population_human_score_sd`. Otherwise, it is ignored.
             Defaults to None.
         population_system_score_mn : float, optional
             Reference mean for system scores. If  `use_diff_std_means=True`, this must be used with
-            `population_human_score_mn`. Otherwise, it is ignored.
+            `population_system_score_sd`. Otherwise, it is ignored.
             Defaults to None.
         smd_method : {'williamson', 'johnson', pooled', 'unpooled'}, optional
             The SMD method to use, only used if `use_diff_std_means=False`.

From 8207e8b98457df241d3c5c7989d3f4b48a98b91d Mon Sep 17 00:00:00 2001
From: Anastassia Loukina <aloukina@ets.org>
Date: Thu, 19 Mar 2020 09:16:19 -0400
Subject: [PATCH 02/13] Correct treatment of zero standard deviation

---
 rsmtool/utils/metrics.py | 13 +++++++++++--
 tests/test_analyzer.py   | 40 ++++++++++++++++++++++++++++++++++++++++
 tests/test_utils.py      | 12 ++++++++++++
 3 files changed, 63 insertions(+), 2 deletions(-)

diff --git a/rsmtool/utils/metrics.py b/rsmtool/utils/metrics.py
index 1f44c4fed..1ccc19058 100644
--- a/rsmtool/utils/metrics.py
+++ b/rsmtool/utils/metrics.py
@@ -356,11 +356,13 @@ def difference_of_standardized_means(y_true_observed,
     y_pred_population_params = [population_y_pred_mn,
                                 population_y_pred_sd]
 
-    if any(y_true_observed_population_params) and not all(y_true_observed_population_params):
+    if len([param for param in y_true_observed_population_params
+            if param is None]) == 1:
         raise ValueError('You must pass both `population_y_true_observed_mn` and '
                          '`population_y_true_observed_sd` or neither.')
 
-    if any(y_pred_population_params) and not all(y_pred_population_params):
+    if len([param for param in y_pred_population_params
+            if param is None]) == 1:
         raise ValueError('You must pass both `population_y_pred_mn` and '
                          '`population_y_pred_sd` or neither.')
 
@@ -382,6 +384,13 @@ def difference_of_standardized_means(y_true_observed,
          population_y_pred_mn) = (np.std(y_pred, ddof=ddof),
                                   np.mean(y_pred))
 
+    # if any of the standard deviations raise a warning and return None
+    if population_y_pred_sd == 0 or population_y_true_observed_sd == 0:
+        warnings.warn("Population standard deviations for the computation of "
+                      "DSM are zero. No value will be computed")
+        return None
+
+
     # calculate the z-scores for observed and predicted
     y_true_observed_subgroup_z = ((y_true_observed - population_y_true_observed_mn) /
                                   population_y_true_observed_sd)
diff --git a/tests/test_analyzer.py b/tests/test_analyzer.py
index 7ead956cb..fe86a8677 100644
--- a/tests/test_analyzer.py
+++ b/tests/test_analyzer.py
@@ -182,6 +182,46 @@ def test_metrics_helper_population_sds(self):
         assert_series_equal(computed_metrics1.sort_index(), expected_metrics1.sort_index())
         assert_series_equal(computed_metrics2.sort_index(), expected_metrics2.sort_index())
 
+
+    def test_metrics_helper_zero_system_sd(self):
+        human_scores = [1, 3, 4, 2, 3, 1, 3, 4, 2, 1]
+        system_score = [2.54]*10
+        computed_metrics1 = Analyzer.metrics_helper(human_scores,
+                                                    system_score)
+        expected_metrics1 = pd.Series({'N': 10,
+                                       'R2': -0.015806451612903283,
+                                       'RMSE': 1.122319027727856,
+                                       'SMD':0.11927198519188371,
+                                       'adj_agr': 50.0,
+                                       'corr': None,
+                                       'exact_agr': 0,
+                                       'h_max': 4,
+                                       'h_mean': 2.4,
+                                       'h_min': 1.0,
+                                       'h_sd': 1.1737877907772674,
+                                       'kappa': 0,
+                                       'sys_max': 2.54,
+                                       'sys_mean': 2.54,
+                                       'sys_min': 2.54,
+                                       'sys_sd': 0,
+                                       'wtkappa': 0})
+        # now compute DSM
+        computed_metrics2 = Analyzer.metrics_helper(human_scores,
+                                                    system_score,
+                                                    use_diff_std_means=True)
+
+        # the only number that should change is the SMD
+        expected_metrics2 = expected_metrics1.copy()
+        expected_metrics2.drop("SMD", inplace=True)
+        expected_metrics2['DSM'] = None
+        assert_series_equal(computed_metrics1.sort_index(),
+                            expected_metrics1.sort_index(),
+                            check_dtype=False)
+        assert_series_equal(computed_metrics2.sort_index(),
+                            expected_metrics2.sort_index(),
+                            check_dtype=False)
+
+
     def test_compute_pca_less_samples_than_features(self):
         # test pca when we have less samples than
         # features. In this case the number of components
diff --git a/tests/test_utils.py b/tests/test_utils.py
index d0481741b..115684a31 100644
--- a/tests/test_utils.py
+++ b/tests/test_utils.py
@@ -423,6 +423,18 @@ def test_difference_of_standardized_means_with_no_population_info():
     assert issubclass(warning_list[1].category, UserWarning)
 
 
+def test_difference_of_standardized_means_zero_population_sd():
+    y_true, y_pred = (np.array([3, 5, 1, 2, 2, 3, 1, 4, 1, 2]),
+                      np.array([2, 2, 2, 2, 2, 2, 2, 2, 2, 2]))
+    expected = None
+    diff_std_means = difference_of_standardized_means(y_true, y_pred,
+                                                     population_y_true_observed_mn=2.44,
+                                                     population_y_true_observed_sd=0.54,
+                                                     population_y_pred_mn=2.44,
+                                                     population_y_pred_sd=0)
+    eq_(diff_std_means, expected)
+
+
 def test_quadratic_weighted_kappa():
 
     expected_qwk = -0.09210526315789469

From fe68b222fc0be6c29aab3d94897f85824d955871 Mon Sep 17 00:00:00 2001
From: Anastassia Loukina <aloukina@ets.org>
Date: Thu, 19 Mar 2020 09:17:02 -0400
Subject: [PATCH 03/13] Correct treatment of zero standard deviation

---
 tests/test_utils.py | 14 +++++++++++++-
 1 file changed, 13 insertions(+), 1 deletion(-)

diff --git a/tests/test_utils.py b/tests/test_utils.py
index 115684a31..89d3ae3f0 100644
--- a/tests/test_utils.py
+++ b/tests/test_utils.py
@@ -423,7 +423,7 @@ def test_difference_of_standardized_means_with_no_population_info():
     assert issubclass(warning_list[1].category, UserWarning)
 
 
-def test_difference_of_standardized_means_zero_population_sd():
+def test_difference_of_standardized_means_zero_population_sd_pred():
     y_true, y_pred = (np.array([3, 5, 1, 2, 2, 3, 1, 4, 1, 2]),
                       np.array([2, 2, 2, 2, 2, 2, 2, 2, 2, 2]))
     expected = None
@@ -435,6 +435,18 @@ def test_difference_of_standardized_means_zero_population_sd():
     eq_(diff_std_means, expected)
 
 
+def test_difference_of_standardized_means_zero_population_sd_human():
+    y_pred, y_true = (np.array([3, 5, 1, 2, 2, 3, 1, 4, 1, 2]),
+                      np.array([2, 2, 2, 2, 2, 2, 2, 2, 2, 2]))
+    expected = None
+    diff_std_means = difference_of_standardized_means(y_true, y_pred,
+                                                     population_y_pred_observed_mn=2.44,
+                                                     population_y_pred_observed_sd=0.54,
+                                                     population_y_true_mn=2.44,
+                                                     population_y_true_sd=0)
+    eq_(diff_std_means, expected)
+
+
 def test_quadratic_weighted_kappa():
 
     expected_qwk = -0.09210526315789469

From 3eb75652b4b56550b3f2777cea5fe7bc91a735e8 Mon Sep 17 00:00:00 2001
From: Anastassia Loukina <aloukina@ets.org>
Date: Thu, 19 Mar 2020 09:21:31 -0400
Subject: [PATCH 04/13] Few more fixes

---
 rsmtool/utils/metrics.py |  8 +++++---
 tests/test_utils.py      | 21 +++++++++++++++------
 2 files changed, 20 insertions(+), 9 deletions(-)

diff --git a/rsmtool/utils/metrics.py b/rsmtool/utils/metrics.py
index 1ccc19058..1bfb0cb8a 100644
--- a/rsmtool/utils/metrics.py
+++ b/rsmtool/utils/metrics.py
@@ -370,14 +370,16 @@ def difference_of_standardized_means(y_true_observed,
                    'thus, the calculated z-scores will be zero.')
 
     # if the population means and standard deviations were not provided, calculate from the data
-    if not population_y_true_observed_mn or not population_y_true_observed_sd:
+    # We only check for mean since the function requires
+    # both of these to be set or both to be None
+    if population_y_true_observed_mn is None:
 
         warnings.warn(warning_msg.format('y_true_observed'))
         (population_y_true_observed_sd,
          population_y_true_observed_mn) = (np.std(y_true_observed, ddof=ddof),
                                            np.mean(y_true_observed))
 
-    if not population_y_pred_mn or not population_y_pred_sd:
+    if population_y_pred_mn is None:
 
         warnings.warn(warning_msg.format('y_pred'))
         (population_y_pred_sd,
@@ -387,7 +389,7 @@ def difference_of_standardized_means(y_true_observed,
     # if any of the standard deviations raise a warning and return None
     if population_y_pred_sd == 0 or population_y_true_observed_sd == 0:
         warnings.warn("Population standard deviations for the computation of "
-                      "DSM are zero. No value will be computed")
+                      "DSM is zero. No value will be computed.")
         return None
 
 
diff --git a/tests/test_utils.py b/tests/test_utils.py
index 89d3ae3f0..5b0c8417c 100644
--- a/tests/test_utils.py
+++ b/tests/test_utils.py
@@ -425,7 +425,7 @@ def test_difference_of_standardized_means_with_no_population_info():
 
 def test_difference_of_standardized_means_zero_population_sd_pred():
     y_true, y_pred = (np.array([3, 5, 1, 2, 2, 3, 1, 4, 1, 2]),
-                      np.array([2, 2, 2, 2, 2, 2, 2, 2, 2, 2]))
+                      np.array([2, 1, 4, 1, 5, 2, 2, 2, 2, 2]))
     expected = None
     diff_std_means = difference_of_standardized_means(y_true, y_pred,
                                                      population_y_true_observed_mn=2.44,
@@ -436,14 +436,23 @@ def test_difference_of_standardized_means_zero_population_sd_pred():
 
 
 def test_difference_of_standardized_means_zero_population_sd_human():
+    y_true, y_pred = (np.array([3, 5, 1, 2, 2, 3, 1, 4, 1, 2]),
+                      np.array([2, 1, 4, 1, 5, 2, 2, 2, 2, 2]))
+    expected = None
+    diff_std_means = difference_of_standardized_means(y_true, y_pred,
+                                                     population_y_pred_mn=2.44,
+                                                     population_y_pred_sd=0.54,
+                                                     population_y_true_observed_mn=2.44,
+                                                     population_y_true_observed_sd=0)
+    eq_(diff_std_means, expected)
+
+
+def test_difference_of_standardized_means_zero_population_computed():
+    # sd is computed from the data and is zero
     y_pred, y_true = (np.array([3, 5, 1, 2, 2, 3, 1, 4, 1, 2]),
                       np.array([2, 2, 2, 2, 2, 2, 2, 2, 2, 2]))
     expected = None
-    diff_std_means = difference_of_standardized_means(y_true, y_pred,
-                                                     population_y_pred_observed_mn=2.44,
-                                                     population_y_pred_observed_sd=0.54,
-                                                     population_y_true_mn=2.44,
-                                                     population_y_true_sd=0)
+    diff_std_means = difference_of_standardized_means(y_true, y_pred)
     eq_(diff_std_means, expected)
 
 

From 21134fe55b4e6e7752e679be6814dec1d755d85a Mon Sep 17 00:00:00 2001
From: Anastassia Loukina <aloukina@ets.org>
Date: Thu, 19 Mar 2020 12:19:29 -0400
Subject: [PATCH 05/13] PEP8 fixes

---
 tests/test_analyzer.py | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/tests/test_analyzer.py b/tests/test_analyzer.py
index fe86a8677..c33a9aa72 100644
--- a/tests/test_analyzer.py
+++ b/tests/test_analyzer.py
@@ -185,13 +185,13 @@ def test_metrics_helper_population_sds(self):
 
     def test_metrics_helper_zero_system_sd(self):
         human_scores = [1, 3, 4, 2, 3, 1, 3, 4, 2, 1]
-        system_score = [2.54]*10
+        system_score = [2.54] * 10
         computed_metrics1 = Analyzer.metrics_helper(human_scores,
                                                     system_score)
         expected_metrics1 = pd.Series({'N': 10,
                                        'R2': -0.015806451612903283,
                                        'RMSE': 1.122319027727856,
-                                       'SMD':0.11927198519188371,
+                                       'SMD': 0.11927198519188371,
                                        'adj_agr': 50.0,
                                        'corr': None,
                                        'exact_agr': 0,

From dbbab029baa19232c398389a766b200ce26cc460 Mon Sep 17 00:00:00 2001
From: Anastassia Loukina <aloukina@ets.org>
Date: Thu, 19 Mar 2020 12:21:36 -0400
Subject: [PATCH 06/13] Minor docstring changes

---
 rsmtool/utils/metrics.py | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/rsmtool/utils/metrics.py b/rsmtool/utils/metrics.py
index 1bfb0cb8a..26ba78480 100644
--- a/rsmtool/utils/metrics.py
+++ b/rsmtool/utils/metrics.py
@@ -386,7 +386,8 @@ def difference_of_standardized_means(y_true_observed,
          population_y_pred_mn) = (np.std(y_pred, ddof=ddof),
                                   np.mean(y_pred))
 
-    # if any of the standard deviations raise a warning and return None
+    # if any of the standard deviations equal zero
+    # raise a warning and return None
     if population_y_pred_sd == 0 or population_y_true_observed_sd == 0:
         warnings.warn("Population standard deviations for the computation of "
                       "DSM is zero. No value will be computed.")

From 85d6b04f02203ff406982edaca69e95a9d15e246 Mon Sep 17 00:00:00 2001
From: Anastassia Loukina <aloukina@ets.org>
Date: Thu, 19 Mar 2020 13:53:45 -0400
Subject: [PATCH 07/13] New test files

---
 rsmtool/utils/metrics.py                      | 10 ++-
 .../lr_eval_system_score_constant.json        | 12 ++++
 ..._eval_system_score_constant_confMatrix.csv |  6 ++
 ...eval_system_score_constant_consistency.csv |  2 +
 ...ystem_score_constant_consistency_by_L1.csv |  6 ++
 ...score_constant_consistency_by_QUESTION.csv |  4 ++
 ...system_score_constant_data_composition.csv |  2 +
 ..._score_constant_data_composition_by_L1.csv |  5 ++
 ..._constant_data_composition_by_QUESTION.csv |  3 +
 ...eval_system_score_constant_degradation.csv |  4 ++
 ...re_constant_disattenuated_correlations.csv |  4 ++
 ...stant_disattenuated_correlations_by_L1.csv |  6 ++
 ...disattenuated_correlations_by_QUESTION.csv |  4 ++
 ...tem_score_constant_estimates_csd_by_L1.csv |  5 ++
 ...ore_constant_estimates_csd_by_QUESTION.csv |  3 +
 ...tem_score_constant_estimates_osa_by_L1.csv |  5 ++
 ...ore_constant_estimates_osa_by_QUESTION.csv |  3 +
 ...tem_score_constant_estimates_osd_by_L1.csv |  5 ++
 ...ore_constant_estimates_osd_by_QUESTION.csv |  3 +
 .../lr_eval_system_score_constant_eval.csv    |  4 ++
 ..._eval_system_score_constant_eval_by_L1.csv |  6 ++
 ...system_score_constant_eval_by_QUESTION.csv |  4 ++
 ..._eval_system_score_constant_eval_short.csv |  2 +
 ..._score_constant_fairness_metrics_by_L1.csv |  3 +
 ..._constant_fairness_metrics_by_QUESTION.csv |  3 +
 ...l_system_score_constant_pred_processed.csv | 70 +++++++++++++++++++
 ..._eval_system_score_constant_score_dist.csv |  6 ++
 ...ore_constant_test_excluded_composition.csv |  4 ++
 ...ystem_score_constant_test_human_scores.csv | 70 +++++++++++++++++++
 ...al_system_score_constant_test_metadata.csv | 70 +++++++++++++++++++
 ...stem_score_constant_test_other_columns.csv | 70 +++++++++++++++++++
 ..._system_score_constant_true_score_eval.csv |  4 ++
 tests/test_experiment_rsmeval.py              |  2 +
 33 files changed, 408 insertions(+), 2 deletions(-)
 create mode 100644 tests/data/experiments/lr-eval-system-score-constant/lr_eval_system_score_constant.json
 create mode 100644 tests/data/experiments/lr-eval-system-score-constant/output/lr_eval_system_score_constant_confMatrix.csv
 create mode 100644 tests/data/experiments/lr-eval-system-score-constant/output/lr_eval_system_score_constant_consistency.csv
 create mode 100644 tests/data/experiments/lr-eval-system-score-constant/output/lr_eval_system_score_constant_consistency_by_L1.csv
 create mode 100644 tests/data/experiments/lr-eval-system-score-constant/output/lr_eval_system_score_constant_consistency_by_QUESTION.csv
 create mode 100644 tests/data/experiments/lr-eval-system-score-constant/output/lr_eval_system_score_constant_data_composition.csv
 create mode 100644 tests/data/experiments/lr-eval-system-score-constant/output/lr_eval_system_score_constant_data_composition_by_L1.csv
 create mode 100644 tests/data/experiments/lr-eval-system-score-constant/output/lr_eval_system_score_constant_data_composition_by_QUESTION.csv
 create mode 100644 tests/data/experiments/lr-eval-system-score-constant/output/lr_eval_system_score_constant_degradation.csv
 create mode 100644 tests/data/experiments/lr-eval-system-score-constant/output/lr_eval_system_score_constant_disattenuated_correlations.csv
 create mode 100644 tests/data/experiments/lr-eval-system-score-constant/output/lr_eval_system_score_constant_disattenuated_correlations_by_L1.csv
 create mode 100644 tests/data/experiments/lr-eval-system-score-constant/output/lr_eval_system_score_constant_disattenuated_correlations_by_QUESTION.csv
 create mode 100644 tests/data/experiments/lr-eval-system-score-constant/output/lr_eval_system_score_constant_estimates_csd_by_L1.csv
 create mode 100644 tests/data/experiments/lr-eval-system-score-constant/output/lr_eval_system_score_constant_estimates_csd_by_QUESTION.csv
 create mode 100644 tests/data/experiments/lr-eval-system-score-constant/output/lr_eval_system_score_constant_estimates_osa_by_L1.csv
 create mode 100644 tests/data/experiments/lr-eval-system-score-constant/output/lr_eval_system_score_constant_estimates_osa_by_QUESTION.csv
 create mode 100644 tests/data/experiments/lr-eval-system-score-constant/output/lr_eval_system_score_constant_estimates_osd_by_L1.csv
 create mode 100644 tests/data/experiments/lr-eval-system-score-constant/output/lr_eval_system_score_constant_estimates_osd_by_QUESTION.csv
 create mode 100644 tests/data/experiments/lr-eval-system-score-constant/output/lr_eval_system_score_constant_eval.csv
 create mode 100644 tests/data/experiments/lr-eval-system-score-constant/output/lr_eval_system_score_constant_eval_by_L1.csv
 create mode 100644 tests/data/experiments/lr-eval-system-score-constant/output/lr_eval_system_score_constant_eval_by_QUESTION.csv
 create mode 100644 tests/data/experiments/lr-eval-system-score-constant/output/lr_eval_system_score_constant_eval_short.csv
 create mode 100644 tests/data/experiments/lr-eval-system-score-constant/output/lr_eval_system_score_constant_fairness_metrics_by_L1.csv
 create mode 100644 tests/data/experiments/lr-eval-system-score-constant/output/lr_eval_system_score_constant_fairness_metrics_by_QUESTION.csv
 create mode 100644 tests/data/experiments/lr-eval-system-score-constant/output/lr_eval_system_score_constant_pred_processed.csv
 create mode 100644 tests/data/experiments/lr-eval-system-score-constant/output/lr_eval_system_score_constant_score_dist.csv
 create mode 100644 tests/data/experiments/lr-eval-system-score-constant/output/lr_eval_system_score_constant_test_excluded_composition.csv
 create mode 100644 tests/data/experiments/lr-eval-system-score-constant/output/lr_eval_system_score_constant_test_human_scores.csv
 create mode 100644 tests/data/experiments/lr-eval-system-score-constant/output/lr_eval_system_score_constant_test_metadata.csv
 create mode 100644 tests/data/experiments/lr-eval-system-score-constant/output/lr_eval_system_score_constant_test_other_columns.csv
 create mode 100644 tests/data/experiments/lr-eval-system-score-constant/output/lr_eval_system_score_constant_true_score_eval.csv

diff --git a/rsmtool/utils/metrics.py b/rsmtool/utils/metrics.py
index 26ba78480..b4bb5affd 100644
--- a/rsmtool/utils/metrics.py
+++ b/rsmtool/utils/metrics.py
@@ -387,8 +387,14 @@ def difference_of_standardized_means(y_true_observed,
                                   np.mean(y_pred))
 
     # if any of the standard deviations equal zero
-    # raise a warning and return None
-    if population_y_pred_sd == 0 or population_y_true_observed_sd == 0:
+    # raise a warning and return None.
+    # We use np.isclose since sometimes sd for float
+    # values is a value very close to 0.
+    # We use the same tolerance as used for identifying
+    # features with zero standard deviation
+
+    if np.isclose(population_y_pred_sd, 0, atol=1e-07) \
+        or np.isclose(population_y_true_observed_sd, 0, atol=1e-07):
         warnings.warn("Population standard deviations for the computation of "
                       "DSM is zero. No value will be computed.")
         return None
diff --git a/tests/data/experiments/lr-eval-system-score-constant/lr_eval_system_score_constant.json b/tests/data/experiments/lr-eval-system-score-constant/lr_eval_system_score_constant.json
new file mode 100644
index 000000000..be6ab5228
--- /dev/null
+++ b/tests/data/experiments/lr-eval-system-score-constant/lr_eval_system_score_constant.json
@@ -0,0 +1,12 @@
+{
+    "predictions_file": "../../files/predictions_same_system_score_with_subgroups_subset_double_scored.csv",
+    "system_score_column": "score",
+    "description": "An evaluation of LinearRegression predictions.",
+    "human_score_column": "h1",
+    "second_human_score_column": "h2",
+    "id_column": "id",
+    "experiment_id": "lr_eval_system_score_constant",
+    "subgroups": "QUESTION, L1",
+    "trim_min": 1,
+    "trim_max": 6
+}
diff --git a/tests/data/experiments/lr-eval-system-score-constant/output/lr_eval_system_score_constant_confMatrix.csv b/tests/data/experiments/lr-eval-system-score-constant/output/lr_eval_system_score_constant_confMatrix.csv
new file mode 100644
index 000000000..d12819015
--- /dev/null
+++ b/tests/data/experiments/lr-eval-system-score-constant/output/lr_eval_system_score_constant_confMatrix.csv
@@ -0,0 +1,6 @@
+,1,2,3,4,5
+1,0,0,0,0,0
+2,0,0,0,0,0
+3,0,0,0,0,0
+4,1,9,18,34,7
+5,0,0,0,0,0
diff --git a/tests/data/experiments/lr-eval-system-score-constant/output/lr_eval_system_score_constant_consistency.csv b/tests/data/experiments/lr-eval-system-score-constant/output/lr_eval_system_score_constant_consistency.csv
new file mode 100644
index 000000000..4ffd4c7b9
--- /dev/null
+++ b/tests/data/experiments/lr-eval-system-score-constant/output/lr_eval_system_score_constant_consistency.csv
@@ -0,0 +1,2 @@
+,N,h1_mean,h1_sd,h1_min,h1_max,h2_mean,h2_sd,h2_min,h2_max,corr,wtkappa,kappa,exact_agr,adj_agr,SMD
+,49,3.4693877551020407,0.9596449173246002,1.0,5.0,3.4081632653061225,0.8878430639040038,1.0,5.0,0.8218820738677033,0.8175725986597168,0.5490797546012269,69.38775510204081,100.0,-0.06622863508167891
diff --git a/tests/data/experiments/lr-eval-system-score-constant/output/lr_eval_system_score_constant_consistency_by_L1.csv b/tests/data/experiments/lr-eval-system-score-constant/output/lr_eval_system_score_constant_consistency_by_L1.csv
new file mode 100644
index 000000000..17956bae6
--- /dev/null
+++ b/tests/data/experiments/lr-eval-system-score-constant/output/lr_eval_system_score_constant_consistency_by_L1.csv
@@ -0,0 +1,6 @@
+,N,h1_mean,h1_sd,h1_min,h1_max,h2_mean,h2_sd,h2_min,h2_max,corr,wtkappa,kappa,exact_agr,adj_agr,DSM
+All data,49,3.4693877551020407,0.9596449173246002,1.0,5.0,3.4081632653061225,0.8878430639040038,1.0,5.0,0.8218820738677033,0.8175725986597168,0.5490797546012269,69.38775510204081,100.0,0.07421776607639329
+Esperanto,12,3.5,1.0,2.0,5.0,3.25,0.8660254037844386,2.0,5.0,0.8922685978385125,0.85,0.6470588235294117,75.0,100.0,-0.13791461209462752
+Klingon,11,3.4545454545454546,0.9341987329938276,2.0,5.0,3.6363636363636362,0.8090398349558905,2.0,5.0,0.9021097956087901,0.8720930232558141,0.721518987341772,81.81818181818183,100.0,0.34772515873397636
+Navi,11,3.6363636363636362,0.8090398349558905,2.0,5.0,3.6363636363636362,0.6741998624632421,2.0,4.0,0.8333333333333335,0.8196721311475409,0.6666666666666666,81.81818181818183,100.0,0.14585044666295055
+Vulcan,15,3.3333333333333335,1.1126972805283737,1.0,5.0,3.2,1.082325538564332,1.0,5.0,0.771046169254443,0.7647058823529412,0.2857142857142857,46.666666666666664,100.0,-0.009179051765826295
diff --git a/tests/data/experiments/lr-eval-system-score-constant/output/lr_eval_system_score_constant_consistency_by_QUESTION.csv b/tests/data/experiments/lr-eval-system-score-constant/output/lr_eval_system_score_constant_consistency_by_QUESTION.csv
new file mode 100644
index 000000000..7487df3ae
--- /dev/null
+++ b/tests/data/experiments/lr-eval-system-score-constant/output/lr_eval_system_score_constant_consistency_by_QUESTION.csv
@@ -0,0 +1,4 @@
+,N,h1_mean,h1_sd,h1_min,h1_max,h2_mean,h2_sd,h2_min,h2_max,corr,wtkappa,kappa,exact_agr,adj_agr,DSM
+All data,49,3.4693877551020407,0.9596449173246002,1.0,5.0,3.4081632653061225,0.8878430639040038,1.0,5.0,0.8218820738677033,0.8175725986597168,0.5490797546012269,69.38775510204081,100.0,0.07421776607639329
+QUESTION_1,40,3.5,0.9336995618478525,1.0,5.0,3.425,0.9026314805852884,1.0,5.0,0.8366627923660599,0.8333333333333334,0.5914577530176417,72.5,100.0,0.05919229464910923
+QUESTION_2,9,3.3333333333333335,1.118033988749895,2.0,5.0,3.3333333333333335,0.8660254037844386,2.0,4.0,0.7745966692414833,0.75,0.3207547169811321,55.55555555555556,100.0,0.14099763908654464
diff --git a/tests/data/experiments/lr-eval-system-score-constant/output/lr_eval_system_score_constant_data_composition.csv b/tests/data/experiments/lr-eval-system-score-constant/output/lr_eval_system_score_constant_data_composition.csv
new file mode 100644
index 000000000..fab4f010b
--- /dev/null
+++ b/tests/data/experiments/lr-eval-system-score-constant/output/lr_eval_system_score_constant_data_composition.csv
@@ -0,0 +1,2 @@
+partition,responses,QUESTION,L1
+Evaluation,69,2,4
diff --git a/tests/data/experiments/lr-eval-system-score-constant/output/lr_eval_system_score_constant_data_composition_by_L1.csv b/tests/data/experiments/lr-eval-system-score-constant/output/lr_eval_system_score_constant_data_composition_by_L1.csv
new file mode 100644
index 000000000..c3ba54f7f
--- /dev/null
+++ b/tests/data/experiments/lr-eval-system-score-constant/output/lr_eval_system_score_constant_data_composition_by_L1.csv
@@ -0,0 +1,5 @@
+L1,N responses
+Esperanto,15
+Klingon,19
+Navi,14
+Vulcan,21
diff --git a/tests/data/experiments/lr-eval-system-score-constant/output/lr_eval_system_score_constant_data_composition_by_QUESTION.csv b/tests/data/experiments/lr-eval-system-score-constant/output/lr_eval_system_score_constant_data_composition_by_QUESTION.csv
new file mode 100644
index 000000000..9699d98cc
--- /dev/null
+++ b/tests/data/experiments/lr-eval-system-score-constant/output/lr_eval_system_score_constant_data_composition_by_QUESTION.csv
@@ -0,0 +1,3 @@
+QUESTION,N responses
+QUESTION_1,40
+QUESTION_2,29
diff --git a/tests/data/experiments/lr-eval-system-score-constant/output/lr_eval_system_score_constant_degradation.csv b/tests/data/experiments/lr-eval-system-score-constant/output/lr_eval_system_score_constant_degradation.csv
new file mode 100644
index 000000000..f84bb7da4
--- /dev/null
+++ b/tests/data/experiments/lr-eval-system-score-constant/output/lr_eval_system_score_constant_degradation.csv
@@ -0,0 +1,4 @@
+,corr,kappa,wtkappa,exact_agr,adj_agr,SMD
+raw,,-0.5490797546012269,-0.8175725986597168,-69.38775510204081,-24.637681159420282,0.06597117168077708
+raw_trim,,-0.5490797546012269,-0.8175725986597168,-69.38775510204081,-24.637681159420282,0.06597117168077708
+raw_trim_round,,-0.5490797546012269,-0.8175725986597168,-20.112392783200228,-14.492753623188406,0.5811554368860343
diff --git a/tests/data/experiments/lr-eval-system-score-constant/output/lr_eval_system_score_constant_disattenuated_correlations.csv b/tests/data/experiments/lr-eval-system-score-constant/output/lr_eval_system_score_constant_disattenuated_correlations.csv
new file mode 100644
index 000000000..135af468a
--- /dev/null
+++ b/tests/data/experiments/lr-eval-system-score-constant/output/lr_eval_system_score_constant_disattenuated_correlations.csv
@@ -0,0 +1,4 @@
+,corr_HM,corr_HH,sqrt_HH,corr_disattenuated
+raw,,0.8218820738677033,0.9065771196471392,
+raw_trim,,0.8218820738677033,0.9065771196471392,
+raw_trim_round,,0.8218820738677033,0.9065771196471392,
diff --git a/tests/data/experiments/lr-eval-system-score-constant/output/lr_eval_system_score_constant_disattenuated_correlations_by_L1.csv b/tests/data/experiments/lr-eval-system-score-constant/output/lr_eval_system_score_constant_disattenuated_correlations_by_L1.csv
new file mode 100644
index 000000000..5b78776d8
--- /dev/null
+++ b/tests/data/experiments/lr-eval-system-score-constant/output/lr_eval_system_score_constant_disattenuated_correlations_by_L1.csv
@@ -0,0 +1,6 @@
+,corr_HM,corr_HH,sqrt_HH,corr_disattenuated
+All data,,0.8218820738677033,0.9065771196471392,
+Esperanto,,0.8922685978385125,0.9445997024340589,
+Klingon,,0.9021097956087901,0.9497946070644906,
+Navi,,0.8333333333333335,0.9128709291752769,
+Vulcan,,0.771046169254443,0.8780923466552041,
diff --git a/tests/data/experiments/lr-eval-system-score-constant/output/lr_eval_system_score_constant_disattenuated_correlations_by_QUESTION.csv b/tests/data/experiments/lr-eval-system-score-constant/output/lr_eval_system_score_constant_disattenuated_correlations_by_QUESTION.csv
new file mode 100644
index 000000000..bc9e96db5
--- /dev/null
+++ b/tests/data/experiments/lr-eval-system-score-constant/output/lr_eval_system_score_constant_disattenuated_correlations_by_QUESTION.csv
@@ -0,0 +1,4 @@
+,corr_HM,corr_HH,sqrt_HH,corr_disattenuated
+All data,,0.8218820738677033,0.9065771196471392,
+QUESTION_1,,0.8366627923660599,0.9146927311212547,
+QUESTION_2,,0.7745966692414833,0.8801117367933934,
diff --git a/tests/data/experiments/lr-eval-system-score-constant/output/lr_eval_system_score_constant_estimates_csd_by_L1.csv b/tests/data/experiments/lr-eval-system-score-constant/output/lr_eval_system_score_constant_estimates_csd_by_L1.csv
new file mode 100644
index 000000000..b4a2497ca
--- /dev/null
+++ b/tests/data/experiments/lr-eval-system-score-constant/output/lr_eval_system_score_constant_estimates_csd_by_L1.csv
@@ -0,0 +1,5 @@
+,estimate,P>[t],[0.025,0.975]
+Intercept (Vulcan),-0.4640000000000002,0.0,-0.4640000000000012,-0.4639999999999992
+Klingon,1.2385925618474403e-15,0.06294831850852728,-6.89666061407639e-17,2.5461517298356442e-15
+Esperanto,3.0531133177191805e-16,0.6631333657891718,-1.089425140581065e-15,1.7000478041249011e-15
+Navi,1.2073675392798577e-15,0.09440671222808052,-2.1359961004765078e-16,2.628334688607366e-15
diff --git a/tests/data/experiments/lr-eval-system-score-constant/output/lr_eval_system_score_constant_estimates_csd_by_QUESTION.csv b/tests/data/experiments/lr-eval-system-score-constant/output/lr_eval_system_score_constant_estimates_csd_by_QUESTION.csv
new file mode 100644
index 000000000..12fea80fd
--- /dev/null
+++ b/tests/data/experiments/lr-eval-system-score-constant/output/lr_eval_system_score_constant_estimates_csd_by_QUESTION.csv
@@ -0,0 +1,3 @@
+,estimate,P>[t],[0.025,0.975]
+Intercept (QUESTION_1),-0.4640000000000001,0.0,-0.4640000000000002,-0.46399999999999997
+QUESTION_2,1.0061396160665481e-16,0.22901607233412186,-6.491000204344407e-17,2.661379252567537e-16
diff --git a/tests/data/experiments/lr-eval-system-score-constant/output/lr_eval_system_score_constant_estimates_osa_by_L1.csv b/tests/data/experiments/lr-eval-system-score-constant/output/lr_eval_system_score_constant_estimates_osa_by_L1.csv
new file mode 100644
index 000000000..2da86fc8a
--- /dev/null
+++ b/tests/data/experiments/lr-eval-system-score-constant/output/lr_eval_system_score_constant_estimates_osa_by_L1.csv
@@ -0,0 +1,5 @@
+,estimate,P>[t],[0.025,0.975]
+Intercept (Vulcan),1.113581714285715,1.8337863271908833e-05,0.6327358647132078,1.5944275638582222
+Klingon,-0.4469172932330826,0.2053355684667301,-1.1446017808907327,0.2507671944245675
+Esperanto,-0.33135238095238073,0.37762537621397396,-1.0762755679472913,0.4135708060425298
+Navi,-0.5868571428571425,0.12803165113359088,-1.3471411869010543,0.1734269011867694
diff --git a/tests/data/experiments/lr-eval-system-score-constant/output/lr_eval_system_score_constant_estimates_osa_by_QUESTION.csv b/tests/data/experiments/lr-eval-system-score-constant/output/lr_eval_system_score_constant_estimates_osa_by_QUESTION.csv
new file mode 100644
index 000000000..359081b96
--- /dev/null
+++ b/tests/data/experiments/lr-eval-system-score-constant/output/lr_eval_system_score_constant_estimates_osa_by_QUESTION.csv
@@ -0,0 +1,3 @@
+,estimate,P>[t],[0.025,0.975]
+Intercept (QUESTION_1),0.8512959999999999,7.482596343683655e-06,0.5014640810687143,1.2011279189312856
+QUESTION_2,-0.12344827586206905,0.6494154859715497,-0.6630644751574266,0.41616792343328846
diff --git a/tests/data/experiments/lr-eval-system-score-constant/output/lr_eval_system_score_constant_estimates_osd_by_L1.csv b/tests/data/experiments/lr-eval-system-score-constant/output/lr_eval_system_score_constant_estimates_osd_by_L1.csv
new file mode 100644
index 000000000..55152bc56
--- /dev/null
+++ b/tests/data/experiments/lr-eval-system-score-constant/output/lr_eval_system_score_constant_estimates_osd_by_L1.csv
@@ -0,0 +1,5 @@
+,estimate,P>[t],[0.025,0.975]
+Intercept (Vulcan),0.10742857142857143,0.5934146767315952,-0.29244520951067887,0.5073023523678217
+Klingon,-0.15037593984962397,0.6064812844970744,-0.7305738020758628,0.4298219223766149
+Esperanto,-0.10476190476190476,0.7366472332129983,-0.7242437024269881,0.5147198929031787
+Navi,-0.21428571428571414,0.5008872399945372,-0.8465416764613416,0.41797024788991327
diff --git a/tests/data/experiments/lr-eval-system-score-constant/output/lr_eval_system_score_constant_estimates_osd_by_QUESTION.csv b/tests/data/experiments/lr-eval-system-score-constant/output/lr_eval_system_score_constant_estimates_osd_by_QUESTION.csv
new file mode 100644
index 000000000..2521c223b
--- /dev/null
+++ b/tests/data/experiments/lr-eval-system-score-constant/output/lr_eval_system_score_constant_estimates_osd_by_QUESTION.csv
@@ -0,0 +1,3 @@
+,estimate,P>[t],[0.025,0.975]
+Intercept (QUESTION_1),0.03600000000000004,0.8024137046773936,-0.2500305373954332,0.32203053739543325
+QUESTION_2,-0.0862068965517241,0.6977724389285981,-0.5274093801422773,0.35499558703882916
diff --git a/tests/data/experiments/lr-eval-system-score-constant/output/lr_eval_system_score_constant_eval.csv b/tests/data/experiments/lr-eval-system-score-constant/output/lr_eval_system_score_constant_eval.csv
new file mode 100644
index 000000000..67d1afe53
--- /dev/null
+++ b/tests/data/experiments/lr-eval-system-score-constant/output/lr_eval_system_score_constant_eval.csv
@@ -0,0 +1,4 @@
+,N,h_mean,h_sd,h_min,h_max,sys_mean,sys_sd,sys_min,sys_max,corr,wtkappa,R2,kappa,exact_agr,adj_agr,SMD,RMSE
+raw,69,3.536231884057971,0.9006486248471405,1.0,5.0,3.5360000000000005,4.473426541494861e-16,3.536,3.536,,1.787683880684223e-31,-6.726221779551622e-08,0.0,0.0,75.36231884057972,-0.0002574634009018279,0.8940983961673266
+raw_trim,69,3.536231884057971,0.9006486248471405,1.0,5.0,3.5360000000000005,4.473426541494861e-16,3.536,3.536,,1.787683880684223e-31,-6.726221779551622e-08,0.0,0.0,75.36231884057972,-0.0002574634009018279,0.8940983961673266
+raw_trim_round,69,3.536231884057971,0.9006486248471405,1.0,5.0,4.0,0.0,4.0,4.0,,0.0,-0.2690488702049396,0.0,49.275362318840585,85.5072463768116,0.5149268018043555,1.0072203103706698
diff --git a/tests/data/experiments/lr-eval-system-score-constant/output/lr_eval_system_score_constant_eval_by_L1.csv b/tests/data/experiments/lr-eval-system-score-constant/output/lr_eval_system_score_constant_eval_by_L1.csv
new file mode 100644
index 000000000..32d98faac
--- /dev/null
+++ b/tests/data/experiments/lr-eval-system-score-constant/output/lr_eval_system_score_constant_eval_by_L1.csv
@@ -0,0 +1,6 @@
+,N,h_mean,h_sd,sys_mean.raw_trim,sys_sd.raw_trim,wtkappa.raw_trim,corr.raw_trim,DSM.raw_trim,RMSE.raw_trim,R2.raw_trim,sys_mean.raw_trim_round,sys_sd.raw_trim_round,kappa.raw_trim_round,exact_agr.raw_trim_round,adj_agr.raw_trim_round,DSM.raw_trim_round
+All data,69.0,3.536231884057971,0.9006486248471405,3.5360000000000005,4.473426541494861e-16,1.787683880684223e-31,,,0.8940983961673266,-6.726221779551622e-08,4.0,0.0,0.0,49.275362318840585,85.5072463768116,
+Esperanto,15.0,3.533333333333333,0.9154754164341269,3.5360000000000005,4.596760034896314e-16,-1.3446370462899402e-31,,,0.884437297570231,-9.090909091069577e-06,4.0,0.0,0.0,40.0,86.66666666666667,
+Klingon,19.0,3.5789473684210527,0.837707816583391,3.536,0.0,0.0,,,0.8164952057744317,-0.0027744000000000657,4.0,0.0,0.0,47.368421052631575,89.47368421052632,
+Navi,14.0,3.642857142857143,0.744946343668492,3.5360000000000005,4.608531526730982e-16,-1.0697660707751459e-31,,,0.7257579289464026,-0.022158574257425956,4.0,0.0,0.0,57.14285714285714,92.85714285714286,
+Vulcan,21.0,3.4285714285714284,1.0757057484009542,3.536,0.0,0.0,,,1.0552638126486258,-0.010472296296296202,4.0,0.0,0.0,52.38095238095239,76.19047619047619,
diff --git a/tests/data/experiments/lr-eval-system-score-constant/output/lr_eval_system_score_constant_eval_by_QUESTION.csv b/tests/data/experiments/lr-eval-system-score-constant/output/lr_eval_system_score_constant_eval_by_QUESTION.csv
new file mode 100644
index 000000000..ad201a0b8
--- /dev/null
+++ b/tests/data/experiments/lr-eval-system-score-constant/output/lr_eval_system_score_constant_eval_by_QUESTION.csv
@@ -0,0 +1,4 @@
+,N,h_mean,h_sd,sys_mean.raw_trim,sys_sd.raw_trim,wtkappa.raw_trim,corr.raw_trim,DSM.raw_trim,RMSE.raw_trim,R2.raw_trim,sys_mean.raw_trim_round,sys_sd.raw_trim_round,kappa.raw_trim_round,exact_agr.raw_trim_round,adj_agr.raw_trim_round,DSM.raw_trim_round
+All data,69.0,3.536231884057971,0.9006486248471405,3.5360000000000005,4.473426541494861e-16,1.787683880684223e-31,,,0.8940983961673266,-6.726221779551622e-08,4.0,0.0,0.0,49.275362318840585,85.5072463768116,
+QUESTION_1,40.0,3.5,0.9336995618478525,3.536,0.0,0.0,,,0.9226570327050024,-0.0015247058823528725,4.0,0.0,0.0,47.5,85.0,
+QUESTION_2,29.0,3.586206896551724,0.8667361346416773,3.5360000000000005,4.519498061120574e-16,7.474667922401844e-32,,,0.8531399206097032,-0.003475304918032718,4.0,0.0,0.0,51.724137931034484,86.20689655172413,
diff --git a/tests/data/experiments/lr-eval-system-score-constant/output/lr_eval_system_score_constant_eval_short.csv b/tests/data/experiments/lr-eval-system-score-constant/output/lr_eval_system_score_constant_eval_short.csv
new file mode 100644
index 000000000..89060c047
--- /dev/null
+++ b/tests/data/experiments/lr-eval-system-score-constant/output/lr_eval_system_score_constant_eval_short.csv
@@ -0,0 +1,2 @@
+,N,h_mean,h_sd,sys_mean.raw_trim,sys_sd.raw_trim,wtkappa.raw_trim,corr.raw_trim,SMD.raw_trim,RMSE.raw_trim,R2.raw_trim,sys_mean.raw_trim_round,sys_sd.raw_trim_round,kappa.raw_trim_round,exact_agr.raw_trim_round,adj_agr.raw_trim_round,SMD.raw_trim_round
+0,69,3.536231884057971,0.9006486248471405,3.5360000000000005,4.473426541494861e-16,1.787683880684223e-31,,-0.0002574634009018279,0.8940983961673266,-6.726221779551622e-08,4.0,0.0,0.0,49.275362318840585,85.5072463768116,0.5149268018043555
diff --git a/tests/data/experiments/lr-eval-system-score-constant/output/lr_eval_system_score_constant_fairness_metrics_by_L1.csv b/tests/data/experiments/lr-eval-system-score-constant/output/lr_eval_system_score_constant_fairness_metrics_by_L1.csv
new file mode 100644
index 000000000..186e4a94d
--- /dev/null
+++ b/tests/data/experiments/lr-eval-system-score-constant/output/lr_eval_system_score_constant_fairness_metrics_by_L1.csv
@@ -0,0 +1,3 @@
+,Overall score accuracy,Overall score difference,Conditional score difference,base_category
+R2,-0.002410098148508011,-0.03785877645323943,0.0,Vulcan
+sig,0.42391077866056215,0.9141656650925646,1.0,Vulcan
diff --git a/tests/data/experiments/lr-eval-system-score-constant/output/lr_eval_system_score_constant_fairness_metrics_by_QUESTION.csv b/tests/data/experiments/lr-eval-system-score-constant/output/lr_eval_system_score_constant_fairness_metrics_by_QUESTION.csv
new file mode 100644
index 000000000..b3a1232cb
--- /dev/null
+++ b/tests/data/experiments/lr-eval-system-score-constant/output/lr_eval_system_score_constant_fairness_metrics_by_QUESTION.csv
@@ -0,0 +1,3 @@
+,Overall score accuracy,Overall score difference,Conditional score difference,base_category
+R2,-0.01177664867159045,-0.01262654058967505,0.0,QUESTION_1
+sig,0.6494154859715568,0.6977724389286026,1.5088907880804768e-22,QUESTION_1
diff --git a/tests/data/experiments/lr-eval-system-score-constant/output/lr_eval_system_score_constant_pred_processed.csv b/tests/data/experiments/lr-eval-system-score-constant/output/lr_eval_system_score_constant_pred_processed.csv
new file mode 100644
index 000000000..429301b68
--- /dev/null
+++ b/tests/data/experiments/lr-eval-system-score-constant/output/lr_eval_system_score_constant_pred_processed.csv
@@ -0,0 +1,70 @@
+spkitemid,sc1,raw,raw_trim,raw_trim_round
+RESPONSE_1,4.0,3.536,3.536,4
+RESPONSE_2,4.0,3.536,3.536,4
+RESPONSE_3,4.0,3.536,3.536,4
+RESPONSE_4,3.0,3.536,3.536,4
+RESPONSE_5,2.0,3.536,3.536,4
+RESPONSE_6,2.0,3.536,3.536,4
+RESPONSE_7,4.0,3.536,3.536,4
+RESPONSE_8,2.0,3.536,3.536,4
+RESPONSE_9,5.0,3.536,3.536,4
+RESPONSE_10,3.0,3.536,3.536,4
+RESPONSE_11,4.0,3.536,3.536,4
+RESPONSE_12,5.0,3.536,3.536,4
+RESPONSE_13,4.0,3.536,3.536,4
+RESPONSE_14,4.0,3.536,3.536,4
+RESPONSE_15,3.0,3.536,3.536,4
+RESPONSE_16,4.0,3.536,3.536,4
+RESPONSE_17,4.0,3.536,3.536,4
+RESPONSE_18,3.0,3.536,3.536,4
+RESPONSE_19,2.0,3.536,3.536,4
+RESPONSE_20,4.0,3.536,3.536,4
+RESPONSE_21,4.0,3.536,3.536,4
+RESPONSE_22,5.0,3.536,3.536,4
+RESPONSE_23,3.0,3.536,3.536,4
+RESPONSE_24,3.0,3.536,3.536,4
+RESPONSE_25,4.0,3.536,3.536,4
+RESPONSE_26,4.0,3.536,3.536,4
+RESPONSE_27,4.0,3.536,3.536,4
+RESPONSE_28,4.0,3.536,3.536,4
+RESPONSE_29,3.0,3.536,3.536,4
+RESPONSE_30,3.0,3.536,3.536,4
+RESPONSE_31,3.0,3.536,3.536,4
+RESPONSE_32,3.0,3.536,3.536,4
+RESPONSE_33,1.0,3.536,3.536,4
+RESPONSE_34,4.0,3.536,3.536,4
+RESPONSE_35,4.0,3.536,3.536,4
+RESPONSE_36,3.0,3.536,3.536,4
+RESPONSE_37,4.0,3.536,3.536,4
+RESPONSE_38,5.0,3.536,3.536,4
+RESPONSE_39,2.0,3.536,3.536,4
+RESPONSE_40,4.0,3.536,3.536,4
+RESPONSE_41,4.0,3.536,3.536,4
+RESPONSE_42,4.0,3.536,3.536,4
+RESPONSE_43,4.0,3.536,3.536,4
+RESPONSE_44,3.0,3.536,3.536,4
+RESPONSE_45,2.0,3.536,3.536,4
+RESPONSE_46,2.0,3.536,3.536,4
+RESPONSE_47,4.0,3.536,3.536,4
+RESPONSE_48,2.0,3.536,3.536,4
+RESPONSE_49,5.0,3.536,3.536,4
+RESPONSE_50,3.0,3.536,3.536,4
+RESPONSE_51,4.0,3.536,3.536,4
+RESPONSE_52,5.0,3.536,3.536,4
+RESPONSE_53,4.0,3.536,3.536,4
+RESPONSE_54,4.0,3.536,3.536,4
+RESPONSE_55,3.0,3.536,3.536,4
+RESPONSE_56,4.0,3.536,3.536,4
+RESPONSE_57,4.0,3.536,3.536,4
+RESPONSE_58,3.0,3.536,3.536,4
+RESPONSE_59,2.0,3.536,3.536,4
+RESPONSE_60,4.0,3.536,3.536,4
+RESPONSE_61,4.0,3.536,3.536,4
+RESPONSE_62,5.0,3.536,3.536,4
+RESPONSE_63,3.0,3.536,3.536,4
+RESPONSE_64,3.0,3.536,3.536,4
+RESPONSE_65,4.0,3.536,3.536,4
+RESPONSE_66,4.0,3.536,3.536,4
+RESPONSE_67,4.0,3.536,3.536,4
+RESPONSE_68,4.0,3.536,3.536,4
+RESPONSE_69,3.0,3.536,3.536,4
diff --git a/tests/data/experiments/lr-eval-system-score-constant/output/lr_eval_system_score_constant_score_dist.csv b/tests/data/experiments/lr-eval-system-score-constant/output/lr_eval_system_score_constant_score_dist.csv
new file mode 100644
index 000000000..91f78b5fc
--- /dev/null
+++ b/tests/data/experiments/lr-eval-system-score-constant/output/lr_eval_system_score_constant_score_dist.csv
@@ -0,0 +1,6 @@
+,score,human,sys_raw,difference
+1.0,1.0,1.4492753623188406,0.0,-1.4492753623188406
+2.0,2.0,13.043478260869565,0.0,-13.043478260869565
+3.0,3.0,26.08695652173913,0.0,-26.08695652173913
+4.0,4.0,49.275362318840585,100.0,50.724637681159415
+5.0,5.0,10.144927536231885,0.0,-10.144927536231885
diff --git a/tests/data/experiments/lr-eval-system-score-constant/output/lr_eval_system_score_constant_test_excluded_composition.csv b/tests/data/experiments/lr-eval-system-score-constant/output/lr_eval_system_score_constant_test_excluded_composition.csv
new file mode 100644
index 000000000..cb4ef7ec4
--- /dev/null
+++ b/tests/data/experiments/lr-eval-system-score-constant/output/lr_eval_system_score_constant_test_excluded_composition.csv
@@ -0,0 +1,4 @@
+numeric system score,non-numeric system score
+-,0
+0,0
+0,0
diff --git a/tests/data/experiments/lr-eval-system-score-constant/output/lr_eval_system_score_constant_test_human_scores.csv b/tests/data/experiments/lr-eval-system-score-constant/output/lr_eval_system_score_constant_test_human_scores.csv
new file mode 100644
index 000000000..728c608d1
--- /dev/null
+++ b/tests/data/experiments/lr-eval-system-score-constant/output/lr_eval_system_score_constant_test_human_scores.csv
@@ -0,0 +1,70 @@
+spkitemid,sc1,sc2
+RESPONSE_1,4.0,4.0
+RESPONSE_2,4.0,4.0
+RESPONSE_3,4.0,3.0
+RESPONSE_4,3.0,4.0
+RESPONSE_5,2.0,3.0
+RESPONSE_6,2.0,2.0
+RESPONSE_7,4.0,4.0
+RESPONSE_8,2.0,2.0
+RESPONSE_9,5.0,4.0
+RESPONSE_10,3.0,3.0
+RESPONSE_11,4.0,4.0
+RESPONSE_12,5.0,5.0
+RESPONSE_13,4.0,3.0
+RESPONSE_14,4.0,4.0
+RESPONSE_15,3.0,4.0
+RESPONSE_16,4.0,4.0
+RESPONSE_17,4.0,4.0
+RESPONSE_18,3.0,3.0
+RESPONSE_19,2.0,3.0
+RESPONSE_20,4.0,4.0
+RESPONSE_21,4.0,3.0
+RESPONSE_22,5.0,5.0
+RESPONSE_23,3.0,3.0
+RESPONSE_24,3.0,3.0
+RESPONSE_25,4.0,4.0
+RESPONSE_26,4.0,4.0
+RESPONSE_27,4.0,4.0
+RESPONSE_28,4.0,4.0
+RESPONSE_29,3.0,3.0
+RESPONSE_30,3.0,3.0
+RESPONSE_31,3.0,3.0
+RESPONSE_32,3.0,3.0
+RESPONSE_33,1.0,1.0
+RESPONSE_34,4.0,4.0
+RESPONSE_35,4.0,3.0
+RESPONSE_36,3.0,3.0
+RESPONSE_37,4.0,4.0
+RESPONSE_38,5.0,5.0
+RESPONSE_39,2.0,1.0
+RESPONSE_40,4.0,3.0
+RESPONSE_41,4.0,4.0
+RESPONSE_42,4.0,4.0
+RESPONSE_43,4.0,3.0
+RESPONSE_44,3.0,4.0
+RESPONSE_45,2.0,3.0
+RESPONSE_46,2.0,2.0
+RESPONSE_47,4.0,4.0
+RESPONSE_48,2.0,2.0
+RESPONSE_49,5.0,4.0
+RESPONSE_50,3.0,
+RESPONSE_51,4.0,
+RESPONSE_52,5.0,
+RESPONSE_53,4.0,
+RESPONSE_54,4.0,
+RESPONSE_55,3.0,
+RESPONSE_56,4.0,
+RESPONSE_57,4.0,
+RESPONSE_58,3.0,
+RESPONSE_59,2.0,
+RESPONSE_60,4.0,
+RESPONSE_61,4.0,
+RESPONSE_62,5.0,
+RESPONSE_63,3.0,
+RESPONSE_64,3.0,
+RESPONSE_65,4.0,
+RESPONSE_66,4.0,
+RESPONSE_67,4.0,
+RESPONSE_68,4.0,
+RESPONSE_69,3.0,
diff --git a/tests/data/experiments/lr-eval-system-score-constant/output/lr_eval_system_score_constant_test_metadata.csv b/tests/data/experiments/lr-eval-system-score-constant/output/lr_eval_system_score_constant_test_metadata.csv
new file mode 100644
index 000000000..3aeaf1ef5
--- /dev/null
+++ b/tests/data/experiments/lr-eval-system-score-constant/output/lr_eval_system_score_constant_test_metadata.csv
@@ -0,0 +1,70 @@
+spkitemid,QUESTION,L1
+RESPONSE_1,QUESTION_1,Vulcan
+RESPONSE_2,QUESTION_1,Esperanto
+RESPONSE_3,QUESTION_1,Vulcan
+RESPONSE_4,QUESTION_1,Navi
+RESPONSE_5,QUESTION_1,Vulcan
+RESPONSE_6,QUESTION_1,Klingon
+RESPONSE_7,QUESTION_1,Klingon
+RESPONSE_8,QUESTION_1,Navi
+RESPONSE_9,QUESTION_1,Esperanto
+RESPONSE_10,QUESTION_1,Esperanto
+RESPONSE_11,QUESTION_1,Klingon
+RESPONSE_12,QUESTION_1,Esperanto
+RESPONSE_13,QUESTION_1,Esperanto
+RESPONSE_14,QUESTION_1,Navi
+RESPONSE_15,QUESTION_1,Vulcan
+RESPONSE_16,QUESTION_1,Vulcan
+RESPONSE_17,QUESTION_1,Navi
+RESPONSE_18,QUESTION_1,Esperanto
+RESPONSE_19,QUESTION_1,Klingon
+RESPONSE_20,QUESTION_1,Navi
+RESPONSE_21,QUESTION_1,Vulcan
+RESPONSE_22,QUESTION_1,Klingon
+RESPONSE_23,QUESTION_1,Esperanto
+RESPONSE_24,QUESTION_1,Klingon
+RESPONSE_25,QUESTION_1,Navi
+RESPONSE_26,QUESTION_1,Klingon
+RESPONSE_27,QUESTION_1,Navi
+RESPONSE_28,QUESTION_1,Vulcan
+RESPONSE_29,QUESTION_1,Esperanto
+RESPONSE_30,QUESTION_1,Navi
+RESPONSE_31,QUESTION_1,Vulcan
+RESPONSE_32,QUESTION_1,Klingon
+RESPONSE_33,QUESTION_1,Vulcan
+RESPONSE_34,QUESTION_1,Vulcan
+RESPONSE_35,QUESTION_1,Vulcan
+RESPONSE_36,QUESTION_1,Navi
+RESPONSE_37,QUESTION_1,Klingon
+RESPONSE_38,QUESTION_1,Vulcan
+RESPONSE_39,QUESTION_1,Vulcan
+RESPONSE_40,QUESTION_1,Vulcan
+RESPONSE_41,QUESTION_2,Navi
+RESPONSE_42,QUESTION_2,Esperanto
+RESPONSE_43,QUESTION_2,Esperanto
+RESPONSE_44,QUESTION_2,Klingon
+RESPONSE_45,QUESTION_2,Vulcan
+RESPONSE_46,QUESTION_2,Esperanto
+RESPONSE_47,QUESTION_2,Klingon
+RESPONSE_48,QUESTION_2,Esperanto
+RESPONSE_49,QUESTION_2,Navi
+RESPONSE_50,QUESTION_2,Vulcan
+RESPONSE_51,QUESTION_2,Klingon
+RESPONSE_52,QUESTION_2,Klingon
+RESPONSE_53,QUESTION_2,Klingon
+RESPONSE_54,QUESTION_2,Vulcan
+RESPONSE_55,QUESTION_2,Klingon
+RESPONSE_56,QUESTION_2,Vulcan
+RESPONSE_57,QUESTION_2,Esperanto
+RESPONSE_58,QUESTION_2,Esperanto
+RESPONSE_59,QUESTION_2,Vulcan
+RESPONSE_60,QUESTION_2,Navi
+RESPONSE_61,QUESTION_2,Esperanto
+RESPONSE_62,QUESTION_2,Vulcan
+RESPONSE_63,QUESTION_2,Klingon
+RESPONSE_64,QUESTION_2,Klingon
+RESPONSE_65,QUESTION_2,Klingon
+RESPONSE_66,QUESTION_2,Navi
+RESPONSE_67,QUESTION_2,Vulcan
+RESPONSE_68,QUESTION_2,Klingon
+RESPONSE_69,QUESTION_2,Navi
diff --git a/tests/data/experiments/lr-eval-system-score-constant/output/lr_eval_system_score_constant_test_other_columns.csv b/tests/data/experiments/lr-eval-system-score-constant/output/lr_eval_system_score_constant_test_other_columns.csv
new file mode 100644
index 000000000..32e11246f
--- /dev/null
+++ b/tests/data/experiments/lr-eval-system-score-constant/output/lr_eval_system_score_constant_test_other_columns.csv
@@ -0,0 +1,70 @@
+spkitemid
+RESPONSE_1
+RESPONSE_2
+RESPONSE_3
+RESPONSE_4
+RESPONSE_5
+RESPONSE_6
+RESPONSE_7
+RESPONSE_8
+RESPONSE_9
+RESPONSE_10
+RESPONSE_11
+RESPONSE_12
+RESPONSE_13
+RESPONSE_14
+RESPONSE_15
+RESPONSE_16
+RESPONSE_17
+RESPONSE_18
+RESPONSE_19
+RESPONSE_20
+RESPONSE_21
+RESPONSE_22
+RESPONSE_23
+RESPONSE_24
+RESPONSE_25
+RESPONSE_26
+RESPONSE_27
+RESPONSE_28
+RESPONSE_29
+RESPONSE_30
+RESPONSE_31
+RESPONSE_32
+RESPONSE_33
+RESPONSE_34
+RESPONSE_35
+RESPONSE_36
+RESPONSE_37
+RESPONSE_38
+RESPONSE_39
+RESPONSE_40
+RESPONSE_41
+RESPONSE_42
+RESPONSE_43
+RESPONSE_44
+RESPONSE_45
+RESPONSE_46
+RESPONSE_47
+RESPONSE_48
+RESPONSE_49
+RESPONSE_50
+RESPONSE_51
+RESPONSE_52
+RESPONSE_53
+RESPONSE_54
+RESPONSE_55
+RESPONSE_56
+RESPONSE_57
+RESPONSE_58
+RESPONSE_59
+RESPONSE_60
+RESPONSE_61
+RESPONSE_62
+RESPONSE_63
+RESPONSE_64
+RESPONSE_65
+RESPONSE_66
+RESPONSE_67
+RESPONSE_68
+RESPONSE_69
diff --git a/tests/data/experiments/lr-eval-system-score-constant/output/lr_eval_system_score_constant_true_score_eval.csv b/tests/data/experiments/lr-eval-system-score-constant/output/lr_eval_system_score_constant_true_score_eval.csv
new file mode 100644
index 000000000..3e6a3b5d4
--- /dev/null
+++ b/tests/data/experiments/lr-eval-system-score-constant/output/lr_eval_system_score_constant_true_score_eval.csv
@@ -0,0 +1,4 @@
+,N,N_single,N_double,h1_var_single,h1_var_double,h2_var_double,true_var,mse_true,prmse_true,sys_var_double,sys_var_single
+raw,69,20,49,0.5368421052631579,0.9209183673469389,0.788265306122449,0.6514153713216517,0.6418279958491871,0.014717760578803696,0.0,0.0
+raw_trim,69,20,49,0.5368421052631579,0.9209183673469389,0.788265306122449,0.6514153713216517,0.6418279958491871,0.014717760578803696,0.0,0.0
+raw_trim_round,69,20,49,0.5368421052631579,0.9209183673469389,0.788265306122449,0.6514153713216517,0.9062608094085092,-0.3912180297032344,0.0,0.0
diff --git a/tests/test_experiment_rsmeval.py b/tests/test_experiment_rsmeval.py
index 4fba5cc91..054490927 100644
--- a/tests/test_experiment_rsmeval.py
+++ b/tests/test_experiment_rsmeval.py
@@ -50,6 +50,8 @@
     param('lr-eval-with-subset-double-scored', 'lr_eval_with_subset_double_scored', consistency=True),
     param('lr-eval-with-trim-tolerance', 'lr_evaluation_with_trim_tolerance'),
     param('lr-eval-with-numeric-threshold', 'lr_evaluation_with_numeric_threshold', subgroups=['QUESTION']),
+    param('lr-eval-system-score-constant', 'lr_eval_system_score_constant',
+          subgroups=['QUESTION', 'L1'], consistency=True, suppress_warnings_for=[UserWarning])
 ])
 def test_run_experiment_parameterized(*args, **kwargs):
     if TEST_DIR:

From 8626fff2070b446f64f3641891d7db5c4713a06f Mon Sep 17 00:00:00 2001
From: Anastassia Loukina <aloukina@ets.org>
Date: Thu, 19 Mar 2020 14:12:52 -0400
Subject: [PATCH 08/13] Added top level warning about the number of warnings

---
 rsmtool/analyzer.py | 12 ++++++++++++
 1 file changed, 12 insertions(+)

diff --git a/rsmtool/analyzer.py b/rsmtool/analyzer.py
index f9c52b71f..c3e316f9b 100644
--- a/rsmtool/analyzer.py
+++ b/rsmtool/analyzer.py
@@ -10,6 +10,7 @@
 
 import numpy as np
 import pandas as pd
+import warnings
 
 from functools import partial
 
@@ -1198,6 +1199,17 @@ def compute_metrics_by_group(self,
                               for col in df_test.columns if col not in ['spkitemid',
                                                                         grouping_variable]}
 
+        # check if any of the standard deviations is zero and
+        # tell user to expect to see many warnings.
+        zero_sd_scores = [score for (score, sd) in population_sd_dict.items() if
+                          np.isclose(sd, 0, atol=1e-07)]
+        if len(zero_sd_scores) > 0:
+            warnings.warn("The standard deviation for {} scores "
+                          "is zero (all scores are the same). You "
+                          "will see multiple warnings about DSM computation "
+                          "since this metric is computed separately for "
+                          "each subgroup.".format(', '.join(zero_sd_scores)))
+
         # create a duplicate data frame to compute evaluations
         # over the whole data, i.e., across groups
         df_preds_all = df_test.copy()

From ec61cd727a21b0d7140f0df04677982d50529387 Mon Sep 17 00:00:00 2001
From: Anastassia Loukina <aloukina@users.noreply.github.com>
Date: Thu, 19 Mar 2020 14:14:20 -0400
Subject: [PATCH 09/13] Apply suggestions from code review

Co-Authored-By: Matt Mulholland <mulhodm@gmail.com>
Co-Authored-By: Nitin Madnani <nmadnani@ets.org>
---
 rsmtool/utils/metrics.py |  2 +-
 tests/test_utils.py      | 16 ++++++++--------
 2 files changed, 9 insertions(+), 9 deletions(-)

diff --git a/rsmtool/utils/metrics.py b/rsmtool/utils/metrics.py
index b4bb5affd..960beb5cd 100644
--- a/rsmtool/utils/metrics.py
+++ b/rsmtool/utils/metrics.py
@@ -371,7 +371,7 @@ def difference_of_standardized_means(y_true_observed,
 
     # if the population means and standard deviations were not provided, calculate from the data
     # We only check for mean since the function requires
-    # both of these to be set of both to be None
+    # both of these to be set or both to be None
     if population_y_true_observed_mn is None:
 
         warnings.warn(warning_msg.format('y_true_observed'))
diff --git a/tests/test_utils.py b/tests/test_utils.py
index 5b0c8417c..4a6bdc9c9 100644
--- a/tests/test_utils.py
+++ b/tests/test_utils.py
@@ -428,10 +428,10 @@ def test_difference_of_standardized_means_zero_population_sd_pred():
                       np.array([2, 1, 4, 1, 5, 2, 2, 2, 2, 2]))
     expected = None
     diff_std_means = difference_of_standardized_means(y_true, y_pred,
-                                                     population_y_true_observed_mn=2.44,
-                                                     population_y_true_observed_sd=0.54,
-                                                     population_y_pred_mn=2.44,
-                                                     population_y_pred_sd=0)
+                                                      population_y_true_observed_mn=2.44,
+                                                      population_y_true_observed_sd=0.54,
+                                                      population_y_pred_mn=2.44,
+                                                      population_y_pred_sd=0)
     eq_(diff_std_means, expected)
 
 
@@ -440,10 +440,10 @@ def test_difference_of_standardized_means_zero_population_sd_human():
                       np.array([2, 1, 4, 1, 5, 2, 2, 2, 2, 2]))
     expected = None
     diff_std_means = difference_of_standardized_means(y_true, y_pred,
-                                                     population_y_pred_mn=2.44,
-                                                     population_y_pred_sd=0.54,
-                                                     population_y_true_observed_mn=2.44,
-                                                     population_y_true_observed_sd=0)
+                                                      population_y_pred_mn=2.44,
+                                                      population_y_pred_sd=0.54,
+                                                      population_y_true_observed_mn=2.44,
+                                                      population_y_true_observed_sd=0)
     eq_(diff_std_means, expected)
 
 

From ab96e80f078a70ec49385308dae85c2628794242 Mon Sep 17 00:00:00 2001
From: Anastassia Loukina <aloukina@ets.org>
Date: Thu, 19 Mar 2020 14:15:28 -0400
Subject: [PATCH 10/13] Missing test data file

---
 ...re_with_subgroups_subset_double_scored.csv | 70 +++++++++++++++++++
 1 file changed, 70 insertions(+)
 create mode 100644 tests/data/files/predictions_same_system_score_with_subgroups_subset_double_scored.csv

diff --git a/tests/data/files/predictions_same_system_score_with_subgroups_subset_double_scored.csv b/tests/data/files/predictions_same_system_score_with_subgroups_subset_double_scored.csv
new file mode 100644
index 000000000..ab7d9fb03
--- /dev/null
+++ b/tests/data/files/predictions_same_system_score_with_subgroups_subset_double_scored.csv
@@ -0,0 +1,70 @@
+id,QUESTION,L1,score,h1,h2
+RESPONSE_1,QUESTION_1,Vulcan,3.536,4,4
+RESPONSE_2,QUESTION_1,Esperanto,3.536,4,4
+RESPONSE_3,QUESTION_1,Vulcan,3.536,4,3
+RESPONSE_4,QUESTION_1,Navi,3.536,3,4
+RESPONSE_5,QUESTION_1,Vulcan,3.536,2,3
+RESPONSE_6,QUESTION_1,Klingon,3.536,2,2
+RESPONSE_7,QUESTION_1,Klingon,3.536,4,4
+RESPONSE_8,QUESTION_1,Navi,3.536,2,2
+RESPONSE_9,QUESTION_1,Esperanto,3.536,5,4
+RESPONSE_10,QUESTION_1,Esperanto,3.536,3,3
+RESPONSE_11,QUESTION_1,Klingon,3.536,4,4
+RESPONSE_12,QUESTION_1,Esperanto,3.536,5,5
+RESPONSE_13,QUESTION_1,Esperanto,3.536,4,3
+RESPONSE_14,QUESTION_1,Navi,3.536,4,4
+RESPONSE_15,QUESTION_1,Vulcan,3.536,3,4
+RESPONSE_16,QUESTION_1,Vulcan,3.536,4,4
+RESPONSE_17,QUESTION_1,Navi,3.536,4,4
+RESPONSE_18,QUESTION_1,Esperanto,3.536,3,3
+RESPONSE_19,QUESTION_1,Klingon,3.536,2,3
+RESPONSE_20,QUESTION_1,Navi,3.536,4,4
+RESPONSE_21,QUESTION_1,Vulcan,3.536,4,3
+RESPONSE_22,QUESTION_1,Klingon,3.536,5,5
+RESPONSE_23,QUESTION_1,Esperanto,3.536,3,3
+RESPONSE_24,QUESTION_1,Klingon,3.536,3,3
+RESPONSE_25,QUESTION_1,Navi,3.536,4,4
+RESPONSE_26,QUESTION_1,Klingon,3.536,4,4
+RESPONSE_27,QUESTION_1,Navi,3.536,4,4
+RESPONSE_28,QUESTION_1,Vulcan,3.536,4,4
+RESPONSE_29,QUESTION_1,Esperanto,3.536,3,3
+RESPONSE_30,QUESTION_1,Navi,3.536,3,3
+RESPONSE_31,QUESTION_1,Vulcan,3.536,3,3
+RESPONSE_32,QUESTION_1,Klingon,3.536,3,3
+RESPONSE_33,QUESTION_1,Vulcan,3.536,1,1
+RESPONSE_34,QUESTION_1,Vulcan,3.536,4,4
+RESPONSE_35,QUESTION_1,Vulcan,3.536,4,3
+RESPONSE_36,QUESTION_1,Navi,3.536,3,3
+RESPONSE_37,QUESTION_1,Klingon,3.536,4,4
+RESPONSE_38,QUESTION_1,Vulcan,3.536,5,5
+RESPONSE_39,QUESTION_1,Vulcan,3.536,2,1
+RESPONSE_40,QUESTION_1,Vulcan,3.536,4,3
+RESPONSE_41,QUESTION_2,Navi,3.536,4,4
+RESPONSE_42,QUESTION_2,Esperanto,3.536,4,4
+RESPONSE_43,QUESTION_2,Esperanto,3.536,4,3
+RESPONSE_44,QUESTION_2,Klingon,3.536,3,4
+RESPONSE_45,QUESTION_2,Vulcan,3.536,2,3
+RESPONSE_46,QUESTION_2,Esperanto,3.536,2,2
+RESPONSE_47,QUESTION_2,Klingon,3.536,4,4
+RESPONSE_48,QUESTION_2,Esperanto,3.536,2,2
+RESPONSE_49,QUESTION_2,Navi,3.536,5,4
+RESPONSE_50,QUESTION_2,Vulcan,3.536,3,
+RESPONSE_51,QUESTION_2,Klingon,3.536,4,
+RESPONSE_52,QUESTION_2,Klingon,3.536,5,
+RESPONSE_53,QUESTION_2,Klingon,3.536,4,
+RESPONSE_54,QUESTION_2,Vulcan,3.536,4,
+RESPONSE_55,QUESTION_2,Klingon,3.536,3,
+RESPONSE_56,QUESTION_2,Vulcan,3.536,4,
+RESPONSE_57,QUESTION_2,Esperanto,3.536,4,
+RESPONSE_58,QUESTION_2,Esperanto,3.536,3,
+RESPONSE_59,QUESTION_2,Vulcan,3.536,2,
+RESPONSE_60,QUESTION_2,Navi,3.536,4,
+RESPONSE_61,QUESTION_2,Esperanto,3.536,4,
+RESPONSE_62,QUESTION_2,Vulcan,3.536,5,
+RESPONSE_63,QUESTION_2,Klingon,3.536,3,
+RESPONSE_64,QUESTION_2,Klingon,3.536,3,
+RESPONSE_65,QUESTION_2,Klingon,3.536,4,
+RESPONSE_66,QUESTION_2,Navi,3.536,4,
+RESPONSE_67,QUESTION_2,Vulcan,3.536,4,
+RESPONSE_68,QUESTION_2,Klingon,3.536,4,
+RESPONSE_69,QUESTION_2,Navi,3.536,3,
\ No newline at end of file

From d1af8da2e2dbf55876e6b7f4c7e96f99cb623518 Mon Sep 17 00:00:00 2001
From: Anastassia Loukina <aloukina@ets.org>
Date: Thu, 19 Mar 2020 15:08:14 -0400
Subject: [PATCH 11/13] Remove fairness test files

---
 .../lr_eval_system_score_constant_estimates_csd_by_L1.csv    | 5 -----
 ..._eval_system_score_constant_estimates_csd_by_QUESTION.csv | 3 ---
 .../lr_eval_system_score_constant_estimates_osa_by_L1.csv    | 5 -----
 ..._eval_system_score_constant_estimates_osa_by_QUESTION.csv | 3 ---
 .../lr_eval_system_score_constant_estimates_osd_by_L1.csv    | 5 -----
 ..._eval_system_score_constant_estimates_osd_by_QUESTION.csv | 3 ---
 6 files changed, 24 deletions(-)
 delete mode 100644 tests/data/experiments/lr-eval-system-score-constant/output/lr_eval_system_score_constant_estimates_csd_by_L1.csv
 delete mode 100644 tests/data/experiments/lr-eval-system-score-constant/output/lr_eval_system_score_constant_estimates_csd_by_QUESTION.csv
 delete mode 100644 tests/data/experiments/lr-eval-system-score-constant/output/lr_eval_system_score_constant_estimates_osa_by_L1.csv
 delete mode 100644 tests/data/experiments/lr-eval-system-score-constant/output/lr_eval_system_score_constant_estimates_osa_by_QUESTION.csv
 delete mode 100644 tests/data/experiments/lr-eval-system-score-constant/output/lr_eval_system_score_constant_estimates_osd_by_L1.csv
 delete mode 100644 tests/data/experiments/lr-eval-system-score-constant/output/lr_eval_system_score_constant_estimates_osd_by_QUESTION.csv

diff --git a/tests/data/experiments/lr-eval-system-score-constant/output/lr_eval_system_score_constant_estimates_csd_by_L1.csv b/tests/data/experiments/lr-eval-system-score-constant/output/lr_eval_system_score_constant_estimates_csd_by_L1.csv
deleted file mode 100644
index b4a2497ca..000000000
--- a/tests/data/experiments/lr-eval-system-score-constant/output/lr_eval_system_score_constant_estimates_csd_by_L1.csv
+++ /dev/null
@@ -1,5 +0,0 @@
-,estimate,P>[t],[0.025,0.975]
-Intercept (Vulcan),-0.4640000000000002,0.0,-0.4640000000000012,-0.4639999999999992
-Klingon,1.2385925618474403e-15,0.06294831850852728,-6.89666061407639e-17,2.5461517298356442e-15
-Esperanto,3.0531133177191805e-16,0.6631333657891718,-1.089425140581065e-15,1.7000478041249011e-15
-Navi,1.2073675392798577e-15,0.09440671222808052,-2.1359961004765078e-16,2.628334688607366e-15
diff --git a/tests/data/experiments/lr-eval-system-score-constant/output/lr_eval_system_score_constant_estimates_csd_by_QUESTION.csv b/tests/data/experiments/lr-eval-system-score-constant/output/lr_eval_system_score_constant_estimates_csd_by_QUESTION.csv
deleted file mode 100644
index 12fea80fd..000000000
--- a/tests/data/experiments/lr-eval-system-score-constant/output/lr_eval_system_score_constant_estimates_csd_by_QUESTION.csv
+++ /dev/null
@@ -1,3 +0,0 @@
-,estimate,P>[t],[0.025,0.975]
-Intercept (QUESTION_1),-0.4640000000000001,0.0,-0.4640000000000002,-0.46399999999999997
-QUESTION_2,1.0061396160665481e-16,0.22901607233412186,-6.491000204344407e-17,2.661379252567537e-16
diff --git a/tests/data/experiments/lr-eval-system-score-constant/output/lr_eval_system_score_constant_estimates_osa_by_L1.csv b/tests/data/experiments/lr-eval-system-score-constant/output/lr_eval_system_score_constant_estimates_osa_by_L1.csv
deleted file mode 100644
index 2da86fc8a..000000000
--- a/tests/data/experiments/lr-eval-system-score-constant/output/lr_eval_system_score_constant_estimates_osa_by_L1.csv
+++ /dev/null
@@ -1,5 +0,0 @@
-,estimate,P>[t],[0.025,0.975]
-Intercept (Vulcan),1.113581714285715,1.8337863271908833e-05,0.6327358647132078,1.5944275638582222
-Klingon,-0.4469172932330826,0.2053355684667301,-1.1446017808907327,0.2507671944245675
-Esperanto,-0.33135238095238073,0.37762537621397396,-1.0762755679472913,0.4135708060425298
-Navi,-0.5868571428571425,0.12803165113359088,-1.3471411869010543,0.1734269011867694
diff --git a/tests/data/experiments/lr-eval-system-score-constant/output/lr_eval_system_score_constant_estimates_osa_by_QUESTION.csv b/tests/data/experiments/lr-eval-system-score-constant/output/lr_eval_system_score_constant_estimates_osa_by_QUESTION.csv
deleted file mode 100644
index 359081b96..000000000
--- a/tests/data/experiments/lr-eval-system-score-constant/output/lr_eval_system_score_constant_estimates_osa_by_QUESTION.csv
+++ /dev/null
@@ -1,3 +0,0 @@
-,estimate,P>[t],[0.025,0.975]
-Intercept (QUESTION_1),0.8512959999999999,7.482596343683655e-06,0.5014640810687143,1.2011279189312856
-QUESTION_2,-0.12344827586206905,0.6494154859715497,-0.6630644751574266,0.41616792343328846
diff --git a/tests/data/experiments/lr-eval-system-score-constant/output/lr_eval_system_score_constant_estimates_osd_by_L1.csv b/tests/data/experiments/lr-eval-system-score-constant/output/lr_eval_system_score_constant_estimates_osd_by_L1.csv
deleted file mode 100644
index 55152bc56..000000000
--- a/tests/data/experiments/lr-eval-system-score-constant/output/lr_eval_system_score_constant_estimates_osd_by_L1.csv
+++ /dev/null
@@ -1,5 +0,0 @@
-,estimate,P>[t],[0.025,0.975]
-Intercept (Vulcan),0.10742857142857143,0.5934146767315952,-0.29244520951067887,0.5073023523678217
-Klingon,-0.15037593984962397,0.6064812844970744,-0.7305738020758628,0.4298219223766149
-Esperanto,-0.10476190476190476,0.7366472332129983,-0.7242437024269881,0.5147198929031787
-Navi,-0.21428571428571414,0.5008872399945372,-0.8465416764613416,0.41797024788991327
diff --git a/tests/data/experiments/lr-eval-system-score-constant/output/lr_eval_system_score_constant_estimates_osd_by_QUESTION.csv b/tests/data/experiments/lr-eval-system-score-constant/output/lr_eval_system_score_constant_estimates_osd_by_QUESTION.csv
deleted file mode 100644
index 2521c223b..000000000
--- a/tests/data/experiments/lr-eval-system-score-constant/output/lr_eval_system_score_constant_estimates_osd_by_QUESTION.csv
+++ /dev/null
@@ -1,3 +0,0 @@
-,estimate,P>[t],[0.025,0.975]
-Intercept (QUESTION_1),0.03600000000000004,0.8024137046773936,-0.2500305373954332,0.32203053739543325
-QUESTION_2,-0.0862068965517241,0.6977724389285981,-0.5274093801422773,0.35499558703882916

From d4c75de8a61e78f4b38a4be815d5b19d5031497a Mon Sep 17 00:00:00 2001
From: Anastassia Loukina <aloukina@ets.org>
Date: Thu, 19 Mar 2020 15:23:23 -0400
Subject: [PATCH 12/13] Remove a few more fairness files

---
 .../lr_eval_system_score_constant_fairness_metrics_by_L1.csv   | 3 ---
 ...eval_system_score_constant_fairness_metrics_by_QUESTION.csv | 3 ---
 2 files changed, 6 deletions(-)
 delete mode 100644 tests/data/experiments/lr-eval-system-score-constant/output/lr_eval_system_score_constant_fairness_metrics_by_L1.csv
 delete mode 100644 tests/data/experiments/lr-eval-system-score-constant/output/lr_eval_system_score_constant_fairness_metrics_by_QUESTION.csv

diff --git a/tests/data/experiments/lr-eval-system-score-constant/output/lr_eval_system_score_constant_fairness_metrics_by_L1.csv b/tests/data/experiments/lr-eval-system-score-constant/output/lr_eval_system_score_constant_fairness_metrics_by_L1.csv
deleted file mode 100644
index 186e4a94d..000000000
--- a/tests/data/experiments/lr-eval-system-score-constant/output/lr_eval_system_score_constant_fairness_metrics_by_L1.csv
+++ /dev/null
@@ -1,3 +0,0 @@
-,Overall score accuracy,Overall score difference,Conditional score difference,base_category
-R2,-0.002410098148508011,-0.03785877645323943,0.0,Vulcan
-sig,0.42391077866056215,0.9141656650925646,1.0,Vulcan
diff --git a/tests/data/experiments/lr-eval-system-score-constant/output/lr_eval_system_score_constant_fairness_metrics_by_QUESTION.csv b/tests/data/experiments/lr-eval-system-score-constant/output/lr_eval_system_score_constant_fairness_metrics_by_QUESTION.csv
deleted file mode 100644
index b3a1232cb..000000000
--- a/tests/data/experiments/lr-eval-system-score-constant/output/lr_eval_system_score_constant_fairness_metrics_by_QUESTION.csv
+++ /dev/null
@@ -1,3 +0,0 @@
-,Overall score accuracy,Overall score difference,Conditional score difference,base_category
-R2,-0.01177664867159045,-0.01262654058967505,0.0,QUESTION_1
-sig,0.6494154859715568,0.6977724389286026,1.5088907880804768e-22,QUESTION_1

From b65a0acfdbc398e0232ae1545bef5e4188f8dc69 Mon Sep 17 00:00:00 2001
From: Anastassia Loukina <aloukina@users.noreply.github.com>
Date: Fri, 20 Mar 2020 10:42:14 -0400
Subject: [PATCH 13/13] Update rsmtool/analyzer.py

Co-Authored-By: Nitin Madnani <nmadnani@ets.org>
---
 rsmtool/analyzer.py | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/rsmtool/analyzer.py b/rsmtool/analyzer.py
index c3e316f9b..f2f81af31 100644
--- a/rsmtool/analyzer.py
+++ b/rsmtool/analyzer.py
@@ -1205,9 +1205,9 @@ def compute_metrics_by_group(self,
                           np.isclose(sd, 0, atol=1e-07)]
         if len(zero_sd_scores) > 0:
             warnings.warn("The standard deviation for {} scores "
-                          "is zero (all scores are the same). You "
+                          "is zero (all values are the same). You "
                           "will see multiple warnings about DSM computation "
-                          "since this metrics is computed separately for "
+                          "since this metric is computed separately for "
                           "each subgroup.".format(', '.join(zero_sd_scores)))
 
         # create a duplicate data frame to compute evaluations