In [None]:
# Report section: true score evaluations.
#
# Reads the `<experiment_id>_true_score_eval.<file_format>` table from
# `output_dir` (if it exists) and renders two tables with explanatory
# markdown: the variance of human scores and the PRMSE of system scores.
# Nothing is displayed when the file is absent.
raw_or_scaled = "scaled" if use_scaled_predictions else "raw"
true_eval_file = join(output_dir,
                      '{}_true_score_eval.{}'.format(experiment_id, file_format))

if exists(true_eval_file):
    df_true_eval = DataReader.read_from_file(true_eval_file, index_col=0)
    # Show missing values as a dash in the rendered tables.
    df_true_eval.replace({np.nan: '-'}, inplace=True)

    variance_columns = [
        'N', 'N_single', 'N_double',
        'h1_var_single', 'h1_var_double', 'h2_var_double',
        'true_var',
    ]
    prmse_columns = [
        'N', 'N_single', 'N_double',
        'sys_var_single', 'sys_var_double',
        'mse_true', 'prmse_true',
    ]

    # Only the first row is kept for the human-variance table and it is
    # relabelled as 'human'.
    # NOTE(review): presumably these columns do not vary across rows -- confirm.
    df_human_variance = df_true_eval[variance_columns].iloc[0:1]
    df_human_variance.index = ['human']
    df_prmse = df_true_eval[prmse_columns]

    # The name of the configuration field holding the human score depends on
    # which tool is producing the report.
    label_column = "test_label_column" if context == 'rsmtool' else "human_score_column"

    # --- Heading, introduction, and the human score variance table ---
    markdown_strs = [
        '### True score evaluations',
        ("The tables in this section show how well system scores can "
         "predict *true* scores. According to Test theory, a *true* score "
         "is a score that would have been obtained if there were no errors "
         "in measurement. While true scores cannot be observed, the variance "
         "of true scores and the prediction error can be estimated using observed "
         "human scores when multiple human ratings are available for a subset of "
         "responses. In this notebook these are estimated using human scores for "
         "responses in the evaluation set."),
        ("\n Note that the analyses in this section assume that the values "
         "in `{}` and `second_human_score_column` are independent scores "
         "from different raters or groups of raters. These analyses are "
         "not applicable to a situation where `{}` contains an average "
         "score from multiple raters").format(label_column, label_column),
        "#### Variance of human scores",
        ("The table below shows variance of both sets of human scores "
         "for the whole evaluation set and for the subset of responses "
         "that were double-scored. Large differences in variance between "
         "the two human scores require further investigation. The last column "
         "shows estimated true score variance. "),
    ]
    display(Markdown('\n'.join(markdown_strs)))

    pd.options.display.width = 10
    display(HTML(''.join([
        '<span style="font-size:95%">',
        df_human_variance.to_html(classes=['sortable'],
                                  escape=False,
                                  float_format=float_format_func),
        '</span>',
    ])))

    # --- PRMSE description and table ---
    markdown_strs = [
        "#### Proportional reduction in mean squared error (PRMSE)",
        ("The table shows the variance of system scores for single-scored "
         "and double-scored responses, and mean squared error (MSE) and "
         "proportional reduction in mean squared error (PRMSE) for "
         "predicting a true score with system score. As for other evaluations, "
         "these results are computed on the evaluation set. `raw_trim` scores "
         "are truncated to [{}, {}]. `raw_trim_round` scores are computed "
         "by first truncating and then rounding the predicted score. Scaled scores "
         "are computed by re-scaling the predicted scores using mean and standard "
         "deviation of human scores as observed on the training data and mean and "
         "standard deviation of machine scores as predicted for the training set."
         ).format(min_score, max_score),
    ]
    display(Markdown('\n'.join(markdown_strs)))

    pd.options.display.width = 10
    display(HTML(''.join([
        '<span style="font-size:95%">',
        df_prmse.to_html(classes=['sortable'],
                         escape=False,
                         float_format=float_format_func),
        '</span>',
    ])))
    