Skip to content

Commit

Permalink
Merge 860a23b into e1ee0b7
Browse files Browse the repository at this point in the history
  • Loading branch information
aloukina committed Aug 3, 2020
2 parents e1ee0b7 + 860a23b commit 405a45a
Show file tree
Hide file tree
Showing 64 changed files with 4,083 additions and 83 deletions.
158 changes: 86 additions & 72 deletions rsmtool/utils/prmse.py
@@ -1,4 +1,6 @@
"""
PRMSE utilities.
Utility classes and functions related to computing
test-theory-based evaluations.
Expand All @@ -13,6 +15,8 @@
import pandas as pd
import numpy as np

import warnings


def get_n_human_scores(human_scores):
"""
Expand Down Expand Up @@ -46,7 +50,6 @@ def variance_of_errors(human_scores):
variance_of_errors : float
Estimated variance of errors in human scores.
"""

# we first compute the total number of scores
# available for each response

Expand All @@ -56,35 +59,35 @@ def variance_of_errors(human_scores):
# than one score
multiple_mask = n_scores > 1

# raise an error if we don't have any such responses
# show a warning and return None
# if we don't have valid human scores
if multiple_mask.sum() == 0:
raise ValueError("Variance of human errors "
"necessary for true score "
"evaluations requires "
"at least a subset of responses "
"to be scored by 2 or more "
"raters.")
warnings.warn("True score evaluations cannot be "
"computed because none of the responses in the "
"evaluation set has valid "
"system scores and 2 human scores.")
return None

# only select the responses with multiple scores
multiple_scores = human_scores[multiple_mask]
else:
# only select the responses with multiple scores
multiple_scores = human_scores[multiple_mask]

n_scores = n_scores[multiple_mask]
n_scores = n_scores[multiple_mask]

# now let's compute the rater error variance for each
# response
response_variances = np.nanvar(multiple_scores, ddof=1, axis=1)
# now let's compute the rater error variance for each
# response
response_variances = np.nanvar(multiple_scores, ddof=1, axis=1)

# finally, let's compute the variance of errors as a weighted average
# of response variances
# finally, let's compute the variance of errors as a weighted average
# of response variances

variance_of_errors = np.average(response_variances, weights=n_scores - 1)
variance_of_errors = np.average(response_variances, weights=n_scores - 1)

return variance_of_errors
return variance_of_errors


def true_score_variance(human_scores,
variance_errors_human=None):

"""
Compute variance of true scores for multiple raters.
Expand All @@ -106,57 +109,62 @@ def true_score_variance(human_scores,
variance_true_scores : float
Variance of true scores.
"""

# if we don't have variance of errors, compute it
# from the data

if variance_errors_human is None:
variance_errors_human = variance_of_errors(human_scores)

# compute mean human score and total number of scores
# for each response
mean_scores = np.nanmean(human_scores, axis=1)
n_scores = get_n_human_scores(human_scores)
# if it's still None, return None
if variance_errors_human is None:
return None

else:
# compute mean human score and total number of scores
# for each response
mean_scores = np.nanmean(human_scores, axis=1)
n_scores = get_n_human_scores(human_scores)

# compute overall mean
mean_human_score = np.nanmean(human_scores)
# compute overall mean
mean_human_score = np.nanmean(human_scores)

# let N be total number of responses
N = len(human_scores)
# let N be total number of responses
N = len(human_scores)

# let M be total number of human ratings
M = n_scores.sum()
# let M be total number of human ratings
M = n_scores.sum()

# compute squared deviations
squared_devs = (mean_scores - mean_human_score)**2
# compute squared deviations
squared_devs = (mean_scores - mean_human_score)**2

# adjust them by the number of human scores available
# for each response: deviations with a higher number of
# human scores are assigned a greater weight
adjusted_squared_devs = n_scores * squared_devs
# adjust them by the number of human scores available
# for each response: deviations with a higher number of
# human scores are assigned a greater weight
adjusted_squared_devs = n_scores * squared_devs

# compute sum of squares
sum_of_squares = adjusted_squared_devs.sum()
# compute sum of squares
sum_of_squares = adjusted_squared_devs.sum()

# now compute the numerator as sum of squares
# adjusted for the variance of human errors
numerator = sum_of_squares - (N-1) * variance_errors_human
# now compute the numerator as sum of squares
# adjusted for the variance of human errors
numerator = sum_of_squares - (N - 1) * variance_errors_human

# compute the denominator as the adjusted total number of scores
denominator = M - ((n_scores**2).sum() / M)
# compute the denominator as the adjusted total number of scores
denominator = M - ((n_scores**2).sum() / M)

# finally compute variance of true scores
variance_true_scores = numerator / denominator
# finally compute variance of true scores
variance_true_scores = numerator / denominator

return variance_true_scores
return variance_true_scores


def mse_true(system,
human_scores,
variance_errors_human=None):

"""
Compute mean squared error (MSE) when predicting true score
Mean squared error (MSE).
Compute MSE when predicting true score
from system score.
Parameters
Expand All @@ -178,31 +186,37 @@ def mse_true(system,
variance_true_scores : float
Variance of true scores.
"""

# if we don't have variance of errors, compute it
# from the data

if variance_errors_human is None:
variance_errors_human = variance_of_errors(human_scores)

# if it's still None, return None
if variance_errors_human is None:
return None

else:

# get total number of scores for each response
n_scores = get_n_human_scores(human_scores)
mean_scores = np.nanmean(human_scores, axis=1)
# get total number of scores for each response
n_scores = get_n_human_scores(human_scores)
mean_scores = np.nanmean(human_scores, axis=1)

N = len(system)
N = len(system)

se = ((mean_scores - system)**2) * n_scores
se = ((mean_scores - system)**2) * n_scores

# Compute mean squared error when predicting true score
mse = (se.sum() - N * variance_errors_human) / n_scores.sum()
# Compute mean squared error when predicting true score
mse = (se.sum() - N * variance_errors_human) / n_scores.sum()
return mse


def prmse_true(system,
human_scores,
variance_errors_human=None):
"""
PRMSE.
Compute Proportional Reduction in Mean Squared Error (PRMSE)
when predicting true score from system scores.
The formula to compute PRMSE implemented in RSMTool
Expand All @@ -228,7 +242,6 @@ def prmse_true(system,
prmse : float
Proportional reduction in mean squared error
"""

# check that human_scores is a two-dimensional array
# and reshape if necessary
if len(human_scores.shape) == 1:
Expand All @@ -240,25 +253,30 @@ def prmse_true(system,
except AttributeError:
human_scores = human_scores.reshape(current_length, 1)


if variance_errors_human is None:
variance_errors_human = variance_of_errors(human_scores)

variance_true = true_score_variance(human_scores, variance_errors_human)
# if it's still None, return None
if variance_errors_human is None:
return None

else:

variance_true = true_score_variance(human_scores, variance_errors_human)

mse = mse_true(system, human_scores, variance_errors_human)
mse = mse_true(system, human_scores, variance_errors_human)

prmse = 1 - (mse / variance_true)
prmse = 1 - (mse / variance_true)

return prmse
return prmse


def get_true_score_evaluations(df,
system_score_columns,
human_score_columns,
variance_errors_human=None):
"""
Get true score evaluations for reporting
Get true score evaluations for reporting.
Parameters
----------
Expand All @@ -281,8 +299,6 @@ def get_true_score_evaluations(df,
some responses must have more than one
human rating.
Returns
-------
prmse_metrics: pandas DataFrame
Expand All @@ -297,17 +313,16 @@ def get_true_score_evaluations(df,
- ``mse_true``: mean squared error when predicting true score from machine score
- ``prmse``: proportional reduction in mean squared error when predicting true score
"""

# check that if we only have one human column, we were also given
# variance of errors
if isinstance(human_score_columns, str):
if variance_errors_human is None:
raise(ValueError("True score evaluations require estimating "
"variance of human errors, "
"which can only be computed when a subset "
"of responses has two or more human ratings. "
"If a single human_score_column "
"is supplied, one must also specify variance_errors_human"))
"variance of human errors, "
"which can only be computed when a subset "
"of responses has two or more human ratings. "
"If a single human_score_column "
"is supplied, one must also specify variance_errors_human"))

if isinstance(system_score_columns, str):
system_score_columns = [system_score_columns]
Expand All @@ -329,7 +344,6 @@ def get_true_score_evaluations(df,

score_counts = get_n_human_scores(df[human_score_columns])


# compute values that are the same for all scores
df_prmse.insert(0, 'N', len(df))
df_prmse.insert(1, 'N raters', score_counts.max())
Expand Down
@@ -0,0 +1,9 @@
feature,sign,transform
FEATURE1,1,raw
FEATURE2,1,raw
FEATURE3,1,raw
FEATURE4,1,raw
FEATURE5,1,raw
FEATURE6,1,raw
FEATURE7,1,raw
FEATURE8,1,raw
@@ -0,0 +1,19 @@
{
"train_file": "../../files/train.csv",
"id_column": "ID",
"use_scaled_predictions": true,
"test_label_column": "score",
"train_label_column": "score",
"test_file": "../../files/test_with_no_double_scored_responses_for_valid_system_scores.csv",
"second_human_score_column": "score2",
"subgroups": [
"QUESTION",
"L1"
],
"trim_max": 6,
"features": "features.csv",
"trim_min": 1,
"model": "LinearRegression",
"experiment_id": "lr_subgroups_with_h2_but_only_for_nonscoreable",
"description": "Using all features with an LinearRegression model."
}
@@ -0,0 +1,9 @@
feature,standardized,relative
FEATURE1,0.4121353697173701,0.36173324876232976
FEATURE2,0.042308527446622196,0.037134403422138064
FEATURE3,0.2087431643498745,0.18321490593974268
FEATURE4,0.1465987367652527,0.1286704350342815
FEATURE5,0.04932715247603867,0.0432946852622444
FEATURE6,0.16537948164673932,0.14515438753953605
FEATURE7,-0.05562887738427005,-0.04882574032656531
FEATURE8,-0.05921374201648318,-0.05197219371316232
@@ -0,0 +1,10 @@
feature,coefficient
Intercept,3.4200000000000026
FEATURE1,0.3360684984636124
FEATURE2,0.03449974046377896
FEATURE3,0.17021592166601351
FEATURE4,0.11954134724022067
FEATURE5,0.040222954116935355
FEATURE6,0.13485577350913724
FEATURE7,-0.045361584244925625
FEATURE8,-0.04828479870955494
@@ -0,0 +1,10 @@
feature,coefficient
Intercept,3.4200000000000026
FEATURE1,0.4766778562253703
FEATURE2,0.0489342571523002
FEATURE3,0.24143340124443724
FEATURE4,0.16955684151673245
FEATURE5,0.057052034413121774
FEATURE6,0.19127874617772242
FEATURE7,-0.0643406413624301
FEATURE8,-0.06848691395464501
@@ -0,0 +1,6 @@
,1,2,3,4,5
1,2,5,0,0,0
2,2,9,0,0,0
3,0,2,31,16,0
4,0,1,12,48,6
5,0,0,0,8,9
@@ -0,0 +1,2 @@
,N,h1_mean,h1_sd,h1_min,h1_max,h2_mean,h2_sd,h2_min,h2_max,corr,wtkappa,kappa,exact_agr,adj_agr,SMD
,0,,,,,,,,,,,,,,
@@ -0,0 +1,6 @@
,N,h1_mean,h1_sd,h1_min,h1_max,h2_mean,h2_sd,h2_min,h2_max,corr,wtkappa,kappa,exact_agr,adj_agr,DSM
All data,0,,,,,,,,,,,,,,
Esperanto,0,,,,,,,,,,,,,,
Klingon,0,,,,,,,,,,,,,,
Navi,0,,,,,,,,,,,,,,
Vulcan,0,,,,,,,,,,,,,,
@@ -0,0 +1,6 @@
,N,h1_mean,h1_sd,h1_min,h1_max,h2_mean,h2_sd,h2_min,h2_max,corr,wtkappa,kappa,exact_agr,adj_agr,DSM
All data,0,,,,,,,,,,,,,,
QUESTION_2,0,,,,,,,,,,,,,,
QUESTION_3,0,,,,,,,,,,,,,,
QUESTION_4,0,,,,,,,,,,,,,,
QUESTION_5,0,,,,,,,,,,,,,,
@@ -0,0 +1,10 @@
,FEATURE1,FEATURE2,FEATURE3,FEATURE4,FEATURE5,FEATURE6,FEATURE7,FEATURE8,sc1
FEATURE1,1.0,0.25046600963484905,0.40799026832024865,0.464160236171879,0.21761011536933708,-0.07898604351813233,-0.04283682647661093,-0.02006754059967186,0.5763971598016531
FEATURE2,0.25046600963484905,1.0,0.358482299657574,0.32486005717920496,0.4019871198703128,0.2365570243500153,-0.007175576295387261,-0.07892924727661298,0.33756930655835915
FEATURE3,0.40799026832024865,0.358482299657574,1.0,0.34108689219085514,0.36679572765697566,0.2866651664877243,0.02484802374346811,-0.04718278554012104,0.509507228821961
FEATURE4,0.464160236171879,0.32486005717920496,0.34108689219085514,1.0,0.3252672146756019,0.2590527982381009,-0.015737371226134094,-0.09071074511789955,0.48752876987183547
FEATURE5,0.21761011536933708,0.4019871198703128,0.36679572765697566,0.3252672146756019,1.0,0.2679408186139726,-0.006374590888293129,-0.0829554096784736,0.32954217319350565
FEATURE6,-0.07898604351813233,0.2365570243500153,0.2866651664877243,0.2590527982381009,0.2679408186139726,1.0,0.02318904560344916,-0.0337084764523672,0.25329384574569086
FEATURE7,-0.04283682647661093,-0.007175576295387261,0.02484802374346811,-0.015737371226134094,-0.006374590888293129,0.02318904560344916,1.0,0.035118707729159815,-0.06912827010818752
FEATURE8,-0.02006754059967186,-0.07892924727661298,-0.04718278554012104,-0.09071074511789955,-0.0829554096784736,-0.0337084764523672,0.035118707729159815,1.0,-0.10548661845698235
sc1,0.5763971598016531,0.33756930655835915,0.509507228821961,0.48752876987183547,0.32954217319350565,0.25329384574569086,-0.06912827010818752,-0.10548661845698235,1.0

0 comments on commit 405a45a

Please sign in to comment.