Skip to content

Commit

Permalink
Merge d4c75de into 3d0c691
Browse files Browse the repository at this point in the history
  • Loading branch information
aloukina committed Mar 19, 2020
2 parents 3d0c691 + d4c75de commit 020dde0
Show file tree
Hide file tree
Showing 29 changed files with 552 additions and 9 deletions.
22 changes: 17 additions & 5 deletions rsmtool/analyzer.py
Expand Up @@ -10,6 +10,7 @@

import numpy as np
import pandas as pd
import warnings

from functools import partial

Expand Down Expand Up @@ -601,8 +602,8 @@ def metrics_helper(human_scores,
smd_method='unpooled',
use_diff_std_means=False):
"""
This is a helper function that computes some basic agreement
and association metrics between the system scores and the
This is a helper function that computes several basic
association metrics between the system scores and the
human scores.
Parameters
Expand All @@ -620,16 +621,16 @@ def metrics_helper(human_scores,
population_system_score_sd : float, optional
Reference standard deviation for system scores. If `smd_method='williamson'`, this is
used to compute SMD and should be the standard deviation for the whole population. If
`use_diff_std_means=True`, this must be used with `population_human_score_mn`.
`use_diff_std_means=True`, this must be used with `population_system_score_mn`.
Otherwise, it is ignored.
Defaults to None.
population_human_score_mn : float, optional
Reference mean for human scores. If `use_diff_std_means=True`, this must be used with
`population_human_score_mn`. Otherwise, it is ignored.
`population_human_score_sd`. Otherwise, it is ignored.
Defaults to None.
population_system_score_mn : float, optional
Reference mean for system scores. If `use_diff_std_means=True`, this must be used with
`population_human_score_mn`. Otherwise, it is ignored.
`population_system_score_sd`. Otherwise, it is ignored.
Defaults to None.
smd_method : {'williamson', 'johnson', 'pooled', 'unpooled'}, optional
The SMD method to use, only used if `use_diff_std_means=False`.
Expand Down Expand Up @@ -1198,6 +1199,17 @@ def compute_metrics_by_group(self,
for col in df_test.columns if col not in ['spkitemid',
grouping_variable]}

# check if any of the standard deviations is zero and
# tell user to expect to see many warnings.
zero_sd_scores = [score for (score, sd) in population_sd_dict.items() if
np.isclose(sd, 0, atol=1e-07)]
if len(zero_sd_scores) > 0:
warnings.warn("The standard deviation for {} scores "
"is zero (all scores are the same). You "
"will see multiple warnings about DSM computation "
"since this metrics is computed separately for "
"each subgroup.".format(', '.join(zero_sd_scores)))

# create a duplicate data frame to compute evaluations
# over the whole data, i.e., across groups
df_preds_all = df_test.copy()
Expand Down
26 changes: 22 additions & 4 deletions rsmtool/utils/metrics.py
Expand Up @@ -356,32 +356,50 @@ def difference_of_standardized_means(y_true_observed,
y_pred_population_params = [population_y_pred_mn,
population_y_pred_sd]

if any(y_true_observed_population_params) and not all(y_true_observed_population_params):
if len([param for param in y_true_observed_population_params
if param is None]) == 1:
raise ValueError('You must pass both `population_y_true_observed_mn` and '
'`population_y_true_observed_sd` or neither.')

if any(y_pred_population_params) and not all(y_pred_population_params):
if len([param for param in y_pred_population_params
if param is None]) == 1:
raise ValueError('You must pass both `population_y_pred_mn` and '
'`population_y_pred_sd` or neither.')

warning_msg = ('You did not pass population mean and std. for `{}`; '
'thus, the calculated z-scores will be zero.')

# if the population means and standard deviations were not provided, calculate from the data
if not population_y_true_observed_mn or not population_y_true_observed_sd:
# We only check for mean since the function requires
# both of these to be set or both to be None
if population_y_true_observed_mn is None:

warnings.warn(warning_msg.format('y_true_observed'))
(population_y_true_observed_sd,
population_y_true_observed_mn) = (np.std(y_true_observed, ddof=ddof),
np.mean(y_true_observed))

if not population_y_pred_mn or not population_y_pred_sd:
if population_y_pred_mn is None:

warnings.warn(warning_msg.format('y_pred'))
(population_y_pred_sd,
population_y_pred_mn) = (np.std(y_pred, ddof=ddof),
np.mean(y_pred))

# if any of the standard deviations equal zero
# raise a warning and return None.
# We use np.isclose since sometimes sd for float
# values is a value very close to 0.
# We use the same tolerance as used for identifying
# features with zero standard deviation

if np.isclose(population_y_pred_sd, 0, atol=1e-07) \
or np.isclose(population_y_true_observed_sd, 0, atol=1e-07):
warnings.warn("Population standard deviations for the computation of "
"DSM is zero. No value will be computed.")
return None


# calculate the z-scores for observed and predicted
y_true_observed_subgroup_z = ((y_true_observed - population_y_true_observed_mn) /
population_y_true_observed_sd)
Expand Down
@@ -0,0 +1,12 @@
{
"predictions_file": "../../files/predictions_same_system_score_with_subgroups_subset_double_scored.csv",
"system_score_column": "score",
"description": "An evaluation of LinearRegression predictions.",
"human_score_column": "h1",
"second_human_score_column": "h2",
"id_column": "id",
"experiment_id": "lr_eval_system_score_constant",
"subgroups": "QUESTION, L1",
"trim_min": 1,
"trim_max": 6
}
@@ -0,0 +1,6 @@
,1,2,3,4,5
1,0,0,0,0,0
2,0,0,0,0,0
3,0,0,0,0,0
4,1,9,18,34,7
5,0,0,0,0,0
@@ -0,0 +1,2 @@
,N,h1_mean,h1_sd,h1_min,h1_max,h2_mean,h2_sd,h2_min,h2_max,corr,wtkappa,kappa,exact_agr,adj_agr,SMD
,49,3.4693877551020407,0.9596449173246002,1.0,5.0,3.4081632653061225,0.8878430639040038,1.0,5.0,0.8218820738677033,0.8175725986597168,0.5490797546012269,69.38775510204081,100.0,-0.06622863508167891
@@ -0,0 +1,6 @@
,N,h1_mean,h1_sd,h1_min,h1_max,h2_mean,h2_sd,h2_min,h2_max,corr,wtkappa,kappa,exact_agr,adj_agr,DSM
All data,49,3.4693877551020407,0.9596449173246002,1.0,5.0,3.4081632653061225,0.8878430639040038,1.0,5.0,0.8218820738677033,0.8175725986597168,0.5490797546012269,69.38775510204081,100.0,0.07421776607639329
Esperanto,12,3.5,1.0,2.0,5.0,3.25,0.8660254037844386,2.0,5.0,0.8922685978385125,0.85,0.6470588235294117,75.0,100.0,-0.13791461209462752
Klingon,11,3.4545454545454546,0.9341987329938276,2.0,5.0,3.6363636363636362,0.8090398349558905,2.0,5.0,0.9021097956087901,0.8720930232558141,0.721518987341772,81.81818181818183,100.0,0.34772515873397636
Navi,11,3.6363636363636362,0.8090398349558905,2.0,5.0,3.6363636363636362,0.6741998624632421,2.0,4.0,0.8333333333333335,0.8196721311475409,0.6666666666666666,81.81818181818183,100.0,0.14585044666295055
Vulcan,15,3.3333333333333335,1.1126972805283737,1.0,5.0,3.2,1.082325538564332,1.0,5.0,0.771046169254443,0.7647058823529412,0.2857142857142857,46.666666666666664,100.0,-0.009179051765826295
@@ -0,0 +1,4 @@
,N,h1_mean,h1_sd,h1_min,h1_max,h2_mean,h2_sd,h2_min,h2_max,corr,wtkappa,kappa,exact_agr,adj_agr,DSM
All data,49,3.4693877551020407,0.9596449173246002,1.0,5.0,3.4081632653061225,0.8878430639040038,1.0,5.0,0.8218820738677033,0.8175725986597168,0.5490797546012269,69.38775510204081,100.0,0.07421776607639329
QUESTION_1,40,3.5,0.9336995618478525,1.0,5.0,3.425,0.9026314805852884,1.0,5.0,0.8366627923660599,0.8333333333333334,0.5914577530176417,72.5,100.0,0.05919229464910923
QUESTION_2,9,3.3333333333333335,1.118033988749895,2.0,5.0,3.3333333333333335,0.8660254037844386,2.0,4.0,0.7745966692414833,0.75,0.3207547169811321,55.55555555555556,100.0,0.14099763908654464
@@ -0,0 +1,2 @@
partition,responses,QUESTION,L1
Evaluation,69,2,4
@@ -0,0 +1,5 @@
L1,N responses
Esperanto,15
Klingon,19
Navi,14
Vulcan,21
@@ -0,0 +1,3 @@
QUESTION,N responses
QUESTION_1,40
QUESTION_2,29
@@ -0,0 +1,4 @@
,corr,kappa,wtkappa,exact_agr,adj_agr,SMD
raw,,-0.5490797546012269,-0.8175725986597168,-69.38775510204081,-24.637681159420282,0.06597117168077708
raw_trim,,-0.5490797546012269,-0.8175725986597168,-69.38775510204081,-24.637681159420282,0.06597117168077708
raw_trim_round,,-0.5490797546012269,-0.8175725986597168,-20.112392783200228,-14.492753623188406,0.5811554368860343
@@ -0,0 +1,4 @@
,corr_HM,corr_HH,sqrt_HH,corr_disattenuated
raw,,0.8218820738677033,0.9065771196471392,
raw_trim,,0.8218820738677033,0.9065771196471392,
raw_trim_round,,0.8218820738677033,0.9065771196471392,
@@ -0,0 +1,6 @@
,corr_HM,corr_HH,sqrt_HH,corr_disattenuated
All data,,0.8218820738677033,0.9065771196471392,
Esperanto,,0.8922685978385125,0.9445997024340589,
Klingon,,0.9021097956087901,0.9497946070644906,
Navi,,0.8333333333333335,0.9128709291752769,
Vulcan,,0.771046169254443,0.8780923466552041,
@@ -0,0 +1,4 @@
,corr_HM,corr_HH,sqrt_HH,corr_disattenuated
All data,,0.8218820738677033,0.9065771196471392,
QUESTION_1,,0.8366627923660599,0.9146927311212547,
QUESTION_2,,0.7745966692414833,0.8801117367933934,
@@ -0,0 +1,4 @@
,N,h_mean,h_sd,h_min,h_max,sys_mean,sys_sd,sys_min,sys_max,corr,wtkappa,R2,kappa,exact_agr,adj_agr,SMD,RMSE
raw,69,3.536231884057971,0.9006486248471405,1.0,5.0,3.5360000000000005,4.473426541494861e-16,3.536,3.536,,1.787683880684223e-31,-6.726221779551622e-08,0.0,0.0,75.36231884057972,-0.0002574634009018279,0.8940983961673266
raw_trim,69,3.536231884057971,0.9006486248471405,1.0,5.0,3.5360000000000005,4.473426541494861e-16,3.536,3.536,,1.787683880684223e-31,-6.726221779551622e-08,0.0,0.0,75.36231884057972,-0.0002574634009018279,0.8940983961673266
raw_trim_round,69,3.536231884057971,0.9006486248471405,1.0,5.0,4.0,0.0,4.0,4.0,,0.0,-0.2690488702049396,0.0,49.275362318840585,85.5072463768116,0.5149268018043555,1.0072203103706698
@@ -0,0 +1,6 @@
,N,h_mean,h_sd,sys_mean.raw_trim,sys_sd.raw_trim,wtkappa.raw_trim,corr.raw_trim,DSM.raw_trim,RMSE.raw_trim,R2.raw_trim,sys_mean.raw_trim_round,sys_sd.raw_trim_round,kappa.raw_trim_round,exact_agr.raw_trim_round,adj_agr.raw_trim_round,DSM.raw_trim_round
All data,69.0,3.536231884057971,0.9006486248471405,3.5360000000000005,4.473426541494861e-16,1.787683880684223e-31,,,0.8940983961673266,-6.726221779551622e-08,4.0,0.0,0.0,49.275362318840585,85.5072463768116,
Esperanto,15.0,3.533333333333333,0.9154754164341269,3.5360000000000005,4.596760034896314e-16,-1.3446370462899402e-31,,,0.884437297570231,-9.090909091069577e-06,4.0,0.0,0.0,40.0,86.66666666666667,
Klingon,19.0,3.5789473684210527,0.837707816583391,3.536,0.0,0.0,,,0.8164952057744317,-0.0027744000000000657,4.0,0.0,0.0,47.368421052631575,89.47368421052632,
Navi,14.0,3.642857142857143,0.744946343668492,3.5360000000000005,4.608531526730982e-16,-1.0697660707751459e-31,,,0.7257579289464026,-0.022158574257425956,4.0,0.0,0.0,57.14285714285714,92.85714285714286,
Vulcan,21.0,3.4285714285714284,1.0757057484009542,3.536,0.0,0.0,,,1.0552638126486258,-0.010472296296296202,4.0,0.0,0.0,52.38095238095239,76.19047619047619,
@@ -0,0 +1,4 @@
,N,h_mean,h_sd,sys_mean.raw_trim,sys_sd.raw_trim,wtkappa.raw_trim,corr.raw_trim,DSM.raw_trim,RMSE.raw_trim,R2.raw_trim,sys_mean.raw_trim_round,sys_sd.raw_trim_round,kappa.raw_trim_round,exact_agr.raw_trim_round,adj_agr.raw_trim_round,DSM.raw_trim_round
All data,69.0,3.536231884057971,0.9006486248471405,3.5360000000000005,4.473426541494861e-16,1.787683880684223e-31,,,0.8940983961673266,-6.726221779551622e-08,4.0,0.0,0.0,49.275362318840585,85.5072463768116,
QUESTION_1,40.0,3.5,0.9336995618478525,3.536,0.0,0.0,,,0.9226570327050024,-0.0015247058823528725,4.0,0.0,0.0,47.5,85.0,
QUESTION_2,29.0,3.586206896551724,0.8667361346416773,3.5360000000000005,4.519498061120574e-16,7.474667922401844e-32,,,0.8531399206097032,-0.003475304918032718,4.0,0.0,0.0,51.724137931034484,86.20689655172413,
@@ -0,0 +1,2 @@
,N,h_mean,h_sd,sys_mean.raw_trim,sys_sd.raw_trim,wtkappa.raw_trim,corr.raw_trim,SMD.raw_trim,RMSE.raw_trim,R2.raw_trim,sys_mean.raw_trim_round,sys_sd.raw_trim_round,kappa.raw_trim_round,exact_agr.raw_trim_round,adj_agr.raw_trim_round,SMD.raw_trim_round
0,69,3.536231884057971,0.9006486248471405,3.5360000000000005,4.473426541494861e-16,1.787683880684223e-31,,-0.0002574634009018279,0.8940983961673266,-6.726221779551622e-08,4.0,0.0,0.0,49.275362318840585,85.5072463768116,0.5149268018043555
@@ -0,0 +1,70 @@
spkitemid,sc1,raw,raw_trim,raw_trim_round
RESPONSE_1,4.0,3.536,3.536,4
RESPONSE_2,4.0,3.536,3.536,4
RESPONSE_3,4.0,3.536,3.536,4
RESPONSE_4,3.0,3.536,3.536,4
RESPONSE_5,2.0,3.536,3.536,4
RESPONSE_6,2.0,3.536,3.536,4
RESPONSE_7,4.0,3.536,3.536,4
RESPONSE_8,2.0,3.536,3.536,4
RESPONSE_9,5.0,3.536,3.536,4
RESPONSE_10,3.0,3.536,3.536,4
RESPONSE_11,4.0,3.536,3.536,4
RESPONSE_12,5.0,3.536,3.536,4
RESPONSE_13,4.0,3.536,3.536,4
RESPONSE_14,4.0,3.536,3.536,4
RESPONSE_15,3.0,3.536,3.536,4
RESPONSE_16,4.0,3.536,3.536,4
RESPONSE_17,4.0,3.536,3.536,4
RESPONSE_18,3.0,3.536,3.536,4
RESPONSE_19,2.0,3.536,3.536,4
RESPONSE_20,4.0,3.536,3.536,4
RESPONSE_21,4.0,3.536,3.536,4
RESPONSE_22,5.0,3.536,3.536,4
RESPONSE_23,3.0,3.536,3.536,4
RESPONSE_24,3.0,3.536,3.536,4
RESPONSE_25,4.0,3.536,3.536,4
RESPONSE_26,4.0,3.536,3.536,4
RESPONSE_27,4.0,3.536,3.536,4
RESPONSE_28,4.0,3.536,3.536,4
RESPONSE_29,3.0,3.536,3.536,4
RESPONSE_30,3.0,3.536,3.536,4
RESPONSE_31,3.0,3.536,3.536,4
RESPONSE_32,3.0,3.536,3.536,4
RESPONSE_33,1.0,3.536,3.536,4
RESPONSE_34,4.0,3.536,3.536,4
RESPONSE_35,4.0,3.536,3.536,4
RESPONSE_36,3.0,3.536,3.536,4
RESPONSE_37,4.0,3.536,3.536,4
RESPONSE_38,5.0,3.536,3.536,4
RESPONSE_39,2.0,3.536,3.536,4
RESPONSE_40,4.0,3.536,3.536,4
RESPONSE_41,4.0,3.536,3.536,4
RESPONSE_42,4.0,3.536,3.536,4
RESPONSE_43,4.0,3.536,3.536,4
RESPONSE_44,3.0,3.536,3.536,4
RESPONSE_45,2.0,3.536,3.536,4
RESPONSE_46,2.0,3.536,3.536,4
RESPONSE_47,4.0,3.536,3.536,4
RESPONSE_48,2.0,3.536,3.536,4
RESPONSE_49,5.0,3.536,3.536,4
RESPONSE_50,3.0,3.536,3.536,4
RESPONSE_51,4.0,3.536,3.536,4
RESPONSE_52,5.0,3.536,3.536,4
RESPONSE_53,4.0,3.536,3.536,4
RESPONSE_54,4.0,3.536,3.536,4
RESPONSE_55,3.0,3.536,3.536,4
RESPONSE_56,4.0,3.536,3.536,4
RESPONSE_57,4.0,3.536,3.536,4
RESPONSE_58,3.0,3.536,3.536,4
RESPONSE_59,2.0,3.536,3.536,4
RESPONSE_60,4.0,3.536,3.536,4
RESPONSE_61,4.0,3.536,3.536,4
RESPONSE_62,5.0,3.536,3.536,4
RESPONSE_63,3.0,3.536,3.536,4
RESPONSE_64,3.0,3.536,3.536,4
RESPONSE_65,4.0,3.536,3.536,4
RESPONSE_66,4.0,3.536,3.536,4
RESPONSE_67,4.0,3.536,3.536,4
RESPONSE_68,4.0,3.536,3.536,4
RESPONSE_69,3.0,3.536,3.536,4
@@ -0,0 +1,6 @@
,score,human,sys_raw,difference
1.0,1.0,1.4492753623188406,0.0,-1.4492753623188406
2.0,2.0,13.043478260869565,0.0,-13.043478260869565
3.0,3.0,26.08695652173913,0.0,-26.08695652173913
4.0,4.0,49.275362318840585,100.0,50.724637681159415
5.0,5.0,10.144927536231885,0.0,-10.144927536231885
@@ -0,0 +1,4 @@
numeric system score,non-numeric system score
-,0
0,0
0,0
@@ -0,0 +1,70 @@
spkitemid,sc1,sc2
RESPONSE_1,4.0,4.0
RESPONSE_2,4.0,4.0
RESPONSE_3,4.0,3.0
RESPONSE_4,3.0,4.0
RESPONSE_5,2.0,3.0
RESPONSE_6,2.0,2.0
RESPONSE_7,4.0,4.0
RESPONSE_8,2.0,2.0
RESPONSE_9,5.0,4.0
RESPONSE_10,3.0,3.0
RESPONSE_11,4.0,4.0
RESPONSE_12,5.0,5.0
RESPONSE_13,4.0,3.0
RESPONSE_14,4.0,4.0
RESPONSE_15,3.0,4.0
RESPONSE_16,4.0,4.0
RESPONSE_17,4.0,4.0
RESPONSE_18,3.0,3.0
RESPONSE_19,2.0,3.0
RESPONSE_20,4.0,4.0
RESPONSE_21,4.0,3.0
RESPONSE_22,5.0,5.0
RESPONSE_23,3.0,3.0
RESPONSE_24,3.0,3.0
RESPONSE_25,4.0,4.0
RESPONSE_26,4.0,4.0
RESPONSE_27,4.0,4.0
RESPONSE_28,4.0,4.0
RESPONSE_29,3.0,3.0
RESPONSE_30,3.0,3.0
RESPONSE_31,3.0,3.0
RESPONSE_32,3.0,3.0
RESPONSE_33,1.0,1.0
RESPONSE_34,4.0,4.0
RESPONSE_35,4.0,3.0
RESPONSE_36,3.0,3.0
RESPONSE_37,4.0,4.0
RESPONSE_38,5.0,5.0
RESPONSE_39,2.0,1.0
RESPONSE_40,4.0,3.0
RESPONSE_41,4.0,4.0
RESPONSE_42,4.0,4.0
RESPONSE_43,4.0,3.0
RESPONSE_44,3.0,4.0
RESPONSE_45,2.0,3.0
RESPONSE_46,2.0,2.0
RESPONSE_47,4.0,4.0
RESPONSE_48,2.0,2.0
RESPONSE_49,5.0,4.0
RESPONSE_50,3.0,
RESPONSE_51,4.0,
RESPONSE_52,5.0,
RESPONSE_53,4.0,
RESPONSE_54,4.0,
RESPONSE_55,3.0,
RESPONSE_56,4.0,
RESPONSE_57,4.0,
RESPONSE_58,3.0,
RESPONSE_59,2.0,
RESPONSE_60,4.0,
RESPONSE_61,4.0,
RESPONSE_62,5.0,
RESPONSE_63,3.0,
RESPONSE_64,3.0,
RESPONSE_65,4.0,
RESPONSE_66,4.0,
RESPONSE_67,4.0,
RESPONSE_68,4.0,
RESPONSE_69,3.0,

0 comments on commit 020dde0

Please sign in to comment.