Skip to content

Commit

Permalink
Merge 860a23b into e1ee0b7
Browse files Browse the repository at this point in the history
  • Loading branch information
aloukina committed Aug 3, 2020
2 parents e1ee0b7 + 860a23b commit 405a45a
Show file tree
Hide file tree
Showing 64 changed files with 4,083 additions and 83 deletions.
158 changes: 86 additions & 72 deletions rsmtool/utils/prmse.py
@@ -1,4 +1,6 @@
"""
PRMSE utilities.
Utility classes and functions related to computing
test-theory-based evaluations.
Expand All @@ -13,6 +15,8 @@
import pandas as pd
import numpy as np

import warnings


def get_n_human_scores(human_scores):
"""
Expand Down Expand Up @@ -46,7 +50,6 @@ def variance_of_errors(human_scores):
variance_of_errors : float
Estimated variance of errors in human scores.
"""

# we first compute the total number of scores
# available for each response

Expand All @@ -56,35 +59,35 @@ def variance_of_errors(human_scores):
# than one score
multiple_mask = n_scores > 1

# raise an error if we don't have any such responses
# show a warning and return None
# if we don't have valid human scores
if multiple_mask.sum() == 0:
raise ValueError("Variance of human errors "
"necessary for true score "
"evaluations requires "
"at least a subset of responses "
"to be scored by 2 or more "
"raters.")
warnings.warn("True score evaluations cannot be "
"computed because none of the responses in the "
"evaluation set has valid "
"system scores and 2 human scores.")
return None

# only select the responses with multiple scores
multiple_scores = human_scores[multiple_mask]
else:
# only select the responses with multiple scores
multiple_scores = human_scores[multiple_mask]

n_scores = n_scores[multiple_mask]
n_scores = n_scores[multiple_mask]

# now let's compute the rater error variance for each
# response
response_variances = np.nanvar(multiple_scores, ddof=1, axis=1)
# now let's compute the rater error variance for each
# response
response_variances = np.nanvar(multiple_scores, ddof=1, axis=1)

# finally, let's compute the variance of errors as a weighted average
# of response variances
# finally, let's compute the variance of errors as a weighted average
# of response variances

variance_of_errors = np.average(response_variances, weights=n_scores - 1)
variance_of_errors = np.average(response_variances, weights=n_scores - 1)

return variance_of_errors
return variance_of_errors


def true_score_variance(human_scores,
variance_errors_human=None):

"""
Compute variance of true scores for multiple raters.
Expand All @@ -106,57 +109,62 @@ def true_score_variance(human_scores,
variance_true_scores : float
Variance of true scores.
"""

# if we don't have variance of errors, compute it
# from the data

if variance_errors_human is None:
variance_errors_human = variance_of_errors(human_scores)

# compute mean human score and total number of scores
# for each response
mean_scores = np.nanmean(human_scores, axis=1)
n_scores = get_n_human_scores(human_scores)
# if it's still None, return None
if variance_errors_human is None:
return None

else:
# compute mean human score and total number of scores
# for each response
mean_scores = np.nanmean(human_scores, axis=1)
n_scores = get_n_human_scores(human_scores)

# compute overall mean
mean_human_score = np.nanmean(human_scores)
# compute overall mean
mean_human_score = np.nanmean(human_scores)

# let N be total number of responses
N = len(human_scores)
# let N be total number of responses
N = len(human_scores)

# let M be total number of human ratings
M = n_scores.sum()
# let M be total number of human ratings
M = n_scores.sum()

# compute squared deviations
squared_devs = (mean_scores - mean_human_score)**2
# compute squared deviations
squared_devs = (mean_scores - mean_human_score)**2

# adjust them by the number of human scores available
# for each response: deviations with a higher number of
# human scores are assigned a greater weight
adjusted_squared_devs = n_scores * squared_devs
# adjust them by the number of human scores available
# for each response: deviations with a higher number of
# human scores are assigned a greater weight
adjusted_squared_devs = n_scores * squared_devs

# compute sum of squares
sum_of_squares = adjusted_squared_devs.sum()
# compute sum of squares
sum_of_squares = adjusted_squared_devs.sum()

# now compute the numerator as sum of squares
# adjusted for the variance of human errors
numerator = sum_of_squares - (N-1) * variance_errors_human
# now compute the numerator as sum of squares
# adjusted for the variance of human errors
numerator = sum_of_squares - (N - 1) * variance_errors_human

# compute the denominator as the adjusted total number of scores
denominator = M - ((n_scores**2).sum() / M)
# compute the denominator as the adjusted total number of scores
denominator = M - ((n_scores**2).sum() / M)

# finally compute variance of true scores
variance_true_scores = numerator / denominator
# finally compute variance of true scores
variance_true_scores = numerator / denominator

return variance_true_scores
return variance_true_scores


def mse_true(system,
human_scores,
variance_errors_human=None):

"""
Compute mean squared error (MSE) when predicting true score
Mean squared error (MSE).
Compute MSE when predicting true score
from system score.
Parameters
Expand All @@ -178,31 +186,37 @@ def mse_true(system,
variance_true_scores : float
Variance of true scores.
"""

# if we don't have variance of errors, compute it
# from the data

if variance_errors_human is None:
variance_errors_human = variance_of_errors(human_scores)

# if it's still None, return None
if variance_errors_human is None:
return None

else:

# get total number of scores for each response
n_scores = get_n_human_scores(human_scores)
mean_scores = np.nanmean(human_scores, axis=1)
# get total number of scores for each response
n_scores = get_n_human_scores(human_scores)
mean_scores = np.nanmean(human_scores, axis=1)

N = len(system)
N = len(system)

se = ((mean_scores - system)**2) * n_scores
se = ((mean_scores - system)**2) * n_scores

# Compute mean squared error when predicting true score
mse = (se.sum() - N * variance_errors_human) / n_scores.sum()
# Compute mean squared error when predicting true score
mse = (se.sum() - N * variance_errors_human) / n_scores.sum()
return mse


def prmse_true(system,
human_scores,
variance_errors_human=None):
"""
PRMSE.
Compute Proportional Reduction in Mean Squared Error (PRMSE)
when predicting true score from system scores.
The formula to compute PRMSE implemented in RSMTool
Expand All @@ -228,7 +242,6 @@ def prmse_true(system,
prmse : float
Proportional reduction in mean squared error
"""

# check that human_scores is a two-dimensional array
# and reshape if necessary
if len(human_scores.shape) == 1:
Expand All @@ -240,25 +253,30 @@ def prmse_true(system,
except AttributeError:
human_scores = human_scores.reshape(current_length, 1)


if variance_errors_human is None:
variance_errors_human = variance_of_errors(human_scores)

variance_true = true_score_variance(human_scores, variance_errors_human)
# if it's still None, return None
if variance_errors_human is None:
return None

else:

variance_true = true_score_variance(human_scores, variance_errors_human)

mse = mse_true(system, human_scores, variance_errors_human)
mse = mse_true(system, human_scores, variance_errors_human)

prmse = 1 - (mse / variance_true)
prmse = 1 - (mse / variance_true)

return prmse
return prmse


def get_true_score_evaluations(df,
system_score_columns,
human_score_columns,
variance_errors_human=None):
"""
Get true score evaluations for reporting
Get true score evaluations for reporting.
Parameters
----------
Expand All @@ -281,8 +299,6 @@ def get_true_score_evaluations(df,
some responses must have more than one
human rating.
Returns
-------
prmse_metrics: pandas DataFrame
Expand All @@ -297,17 +313,16 @@ def get_true_score_evaluations(df,
- ``mse_true``: mean squared error when predicting true score from machine score
- ``prmse``: proportional reduction in mean squared error when predicting true score
"""

# check that if we only have one human column, we were also given
# variance of errors
if isinstance(human_score_columns, str):
if variance_errors_human is None:
raise(ValueError("True score evaluations require estimating "
"variance of human errors, "
"which can only be computed when a subset "
"of responses has two or more human ratings. "
"If a single human_score_column "
"is supplied, one must also specify variance_errors_human"))
"variance of human errors, "
"which can only be computed when a subset "
"of responses has two or more human ratings. "
"If a single human_score_column "
"is supplied, one must also specify variance_errors_human"))

if isinstance(system_score_columns, str):
system_score_columns = [system_score_columns]
Expand All @@ -329,7 +344,6 @@ def get_true_score_evaluations(df,

score_counts = get_n_human_scores(df[human_score_columns])


# compute values that are the same for all scores
df_prmse.insert(0, 'N', len(df))
df_prmse.insert(1, 'N raters', score_counts.max())
Expand Down
@@ -0,0 +1,9 @@
feature,sign,transform
FEATURE1,1,raw
FEATURE2,1,raw
FEATURE3,1,raw
FEATURE4,1,raw
FEATURE5,1,raw
FEATURE6,1,raw
FEATURE7,1,raw
FEATURE8,1,raw
@@ -0,0 +1,19 @@
{
"train_file": "../../files/train.csv",
"id_column": "ID",
"use_scaled_predictions": true,
"test_label_column": "score",
"train_label_column": "score",
"test_file": "../../files/test_with_no_double_scored_responses_for_valid_system_scores.csv",
"second_human_score_column": "score2",
"subgroups": [
"QUESTION",
"L1"
],
"trim_max": 6,
"features": "features.csv",
"trim_min": 1,
"model": "LinearRegression",
"experiment_id": "lr_subgroups_with_h2_but_only_for_nonscoreable",
"description": "Using all features with an LinearRegression model."
}
@@ -0,0 +1,9 @@
feature,standardized,relative
FEATURE1,0.4121353697173701,0.36173324876232976
FEATURE2,0.042308527446622196,0.037134403422138064
FEATURE3,0.2087431643498745,0.18321490593974268
FEATURE4,0.1465987367652527,0.1286704350342815
FEATURE5,0.04932715247603867,0.0432946852622444
FEATURE6,0.16537948164673932,0.14515438753953605
FEATURE7,-0.05562887738427005,-0.04882574032656531
FEATURE8,-0.05921374201648318,-0.05197219371316232
@@ -0,0 +1,10 @@
feature,coefficient
Intercept,3.4200000000000026
FEATURE1,0.3360684984636124
FEATURE2,0.03449974046377896
FEATURE3,0.17021592166601351
FEATURE4,0.11954134724022067
FEATURE5,0.040222954116935355
FEATURE6,0.13485577350913724
FEATURE7,-0.045361584244925625
FEATURE8,-0.04828479870955494
@@ -0,0 +1,10 @@
feature,coefficient
Intercept,3.4200000000000026
FEATURE1,0.4766778562253703
FEATURE2,0.0489342571523002
FEATURE3,0.24143340124443724
FEATURE4,0.16955684151673245
FEATURE5,0.057052034413121774
FEATURE6,0.19127874617772242
FEATURE7,-0.0643406413624301
FEATURE8,-0.06848691395464501
@@ -0,0 +1,6 @@
,1,2,3,4,5
1,2,5,0,0,0
2,2,9,0,0,0
3,0,2,31,16,0
4,0,1,12,48,6
5,0,0,0,8,9
@@ -0,0 +1,2 @@
,N,h1_mean,h1_sd,h1_min,h1_max,h2_mean,h2_sd,h2_min,h2_max,corr,wtkappa,kappa,exact_agr,adj_agr,SMD
,0,,,,,,,,,,,,,,
@@ -0,0 +1,6 @@
,N,h1_mean,h1_sd,h1_min,h1_max,h2_mean,h2_sd,h2_min,h2_max,corr,wtkappa,kappa,exact_agr,adj_agr,DSM
All data,0,,,,,,,,,,,,,,
Esperanto,0,,,,,,,,,,,,,,
Klingon,0,,,,,,,,,,,,,,
Navi,0,,,,,,,,,,,,,,
Vulcan,0,,,,,,,,,,,,,,
@@ -0,0 +1,6 @@
,N,h1_mean,h1_sd,h1_min,h1_max,h2_mean,h2_sd,h2_min,h2_max,corr,wtkappa,kappa,exact_agr,adj_agr,DSM
All data,0,,,,,,,,,,,,,,
QUESTION_2,0,,,,,,,,,,,,,,
QUESTION_3,0,,,,,,,,,,,,,,
QUESTION_4,0,,,,,,,,,,,,,,
QUESTION_5,0,,,,,,,,,,,,,,
@@ -0,0 +1,10 @@
,FEATURE1,FEATURE2,FEATURE3,FEATURE4,FEATURE5,FEATURE6,FEATURE7,FEATURE8,sc1
FEATURE1,1.0,0.25046600963484905,0.40799026832024865,0.464160236171879,0.21761011536933708,-0.07898604351813233,-0.04283682647661093,-0.02006754059967186,0.5763971598016531
FEATURE2,0.25046600963484905,1.0,0.358482299657574,0.32486005717920496,0.4019871198703128,0.2365570243500153,-0.007175576295387261,-0.07892924727661298,0.33756930655835915
FEATURE3,0.40799026832024865,0.358482299657574,1.0,0.34108689219085514,0.36679572765697566,0.2866651664877243,0.02484802374346811,-0.04718278554012104,0.509507228821961
FEATURE4,0.464160236171879,0.32486005717920496,0.34108689219085514,1.0,0.3252672146756019,0.2590527982381009,-0.015737371226134094,-0.09071074511789955,0.48752876987183547
FEATURE5,0.21761011536933708,0.4019871198703128,0.36679572765697566,0.3252672146756019,1.0,0.2679408186139726,-0.006374590888293129,-0.0829554096784736,0.32954217319350565
FEATURE6,-0.07898604351813233,0.2365570243500153,0.2866651664877243,0.2590527982381009,0.2679408186139726,1.0,0.02318904560344916,-0.0337084764523672,0.25329384574569086
FEATURE7,-0.04283682647661093,-0.007175576295387261,0.02484802374346811,-0.015737371226134094,-0.006374590888293129,0.02318904560344916,1.0,0.035118707729159815,-0.06912827010818752
FEATURE8,-0.02006754059967186,-0.07892924727661298,-0.04718278554012104,-0.09071074511789955,-0.0829554096784736,-0.0337084764523672,0.035118707729159815,1.0,-0.10548661845698235
sc1,0.5763971598016531,0.33756930655835915,0.509507228821961,0.48752876987183547,0.32954217319350565,0.25329384574569086,-0.06912827010818752,-0.10548661845698235,1.0

0 comments on commit 405a45a

Please sign in to comment.