In [None]:
# Notebook cell: render the "True score evaluations" section of the report.
# When the true-score evaluation file exists, the cell shows (optionally) the
# variance of the human scores and then the PRMSE table; otherwise it shows
# a message naming the configuration fields that are required.
markdown_strs = ['### True score evaluations']

# NOTE(review): `raw_or_scaled` is not read anywhere in this cell; it is kept
# because other cells may rely on it -- confirm before removing.
raw_or_scaled = "scaled" if use_scaled_predictions else "raw"
true_eval_file = join(output_dir, '{}_true_score_eval.{}'.format(experiment_id, file_format))

if not exists(true_eval_file):
    # Nothing to tabulate: only the explanatory message is displayed.
    markdown_strs.append("The configuration file did not specify "
                         "`second_human_score_column` or `rater_error_variance`. "
                         "At least one of these must be specified to compute "
                         "evaluations against true scores.")
    display(Markdown('\n'.join(markdown_strs)))
else:
    df_true_eval = DataReader.read_from_file(true_eval_file, index_col=0)
    # missing values are shown as dashes in the rendered table
    df_true_eval.replace({np.nan: '-'}, inplace=True)

    # columns shown in the PRMSE table, in display order
    prmse_columns = ['N', 'N raters', 'N single', 'N multiple',
                     'Variance of errors', 'True score var',
                     'MSE true', 'PRMSE true']
    df_prmse = df_true_eval[prmse_columns]

    markdown_strs.append("The tables in this section show how well system scores can "
                         "predict *true* scores. According to Test theory, a *true* score "
                         "is a score that would have been obtained if there were no errors "
                         "in measurement. While true scores cannot be observed, the variance "
                         "of true scores and the prediction error can be estimated using observed "
                         "human scores when multiple human ratings are available for a subset of "
                         "responses.")

    if rater_error_variance is not None:
        # The user supplied the rater error variance: report its value and
        # skip the human-score variance table.
        rater_variance_source = 'supplied'
        markdown_strs.append("In this notebook the variance of true scores was "
                             "estimated using the value of rater error variance "
                             "supplied by the user ({})".format(rater_error_variance))
        display(Markdown('\n'.join(markdown_strs)))
    else:
        # No rater error variance was supplied, so the text describes it as
        # estimated. The variances of the two sets of human scores are shown
        # so that the reader can compare them; they are derived from the
        # standard deviations already present in `df_consistency` and
        # `df_eval`.
        rater_variance_source = 'estimated'

        df_human_variance = df_consistency[['N', 'h1_sd', 'h2_sd']].copy()
        df_human_variance['N_double'] = df_human_variance['N']
        # variance = squared standard deviation
        df_human_variance['h1_var (double)'] = df_human_variance['h1_sd'] ** 2
        df_human_variance['h2_var (double)'] = df_human_variance['h2_sd'] ** 2
        # whole-set figures are taken from the first row of `df_eval`
        first_eval_row = df_eval.iloc[0]
        df_human_variance['N_total'] = first_eval_row['N']
        df_human_variance['h1_var (single)'] = first_eval_row['h_sd'] ** 2
        df_human_variance.index = ['human']

        # the configuration field named in the text depends on the context
        label_column = "test_label_column" if context == 'rsmtool' else "human_score_column"

        markdown_strs.extend([
            "In this notebook the variance of true scores is estimated using "
            "the human ratings available for "
            "responses in the evaluation set. Note that the analyses in this "
            "section assume that the values "
            "in `{}` and `second_human_score_column` are independent scores "
            "from different raters or groups of raters. These analyses are "
            "not applicable to a situation where `{}` contains an average "
            "score from multiple raters".format(label_column, label_column),
            "#### Variance of human scores",
            "The table below shows variance of both sets of human scores "
            "for the whole evaluation set and for the subset of responses "
            "that were double-scored. Large differences in variance between "
            "the two human scores require further investigation.",
        ])
        display(Markdown('\n'.join(markdown_strs)))

        pd.set_option('display.width', 10)
        column_order = ['N_total', 'N_double', 'h1_var (single)', 'h1_var (double)', 'h2_var (double)']
        human_variance_html = df_human_variance[column_order].to_html(
            classes=['sortable'],
            escape=False,
            float_format=float_format_func,
        )
        display(HTML(''.join(['<span style="font-size:95%">', human_variance_html, '</span>'])))

    # PRMSE sub-section: shown for both the estimated and the supplied case.
    prmse_description = ("The table shows {} variance of human rater errors, "
                         "true score variance, mean squared error (MSE) and "
                         "proportional reduction in mean squared error (PRMSE) for "
                         "predicting a true score with system score. As for other evaluations, "
                         "these results are computed on the evaluation set. `raw_trim` scores "
                         "are truncated to [{}, {}]. `raw_trim_round` scores are computed "
                         "by first truncating and then rounding the predicted score. Scaled scores "
                         "are computed by re-scaling the predicted scores using mean and standard "
                         "deviation of human scores as observed on the training data and mean and "
                         "standard deviation of machine scores as predicted for the training set."
                         ).format(rater_variance_source, min_score, max_score)
    markdown_strs = ["#### Proportional reduction in mean squared error (PRMSE)",
                     prmse_description]
    display(Markdown('\n'.join(markdown_strs)))

    pd.set_option('display.width', 10)
    prmse_html = df_prmse.to_html(
        classes=['sortable'],
        escape=False,
        float_format=float_format_func,
    )
    display(HTML(''.join(['<span style="font-size:95%">', prmse_html, '</span>'])))