In [None]:
# Render the "Consistency" report section: a human-human agreement table,
# followed by a table comparing human-human (H-H) against human-machine (H-M)
# metrics together with their difference (diff). The section is emitted only
# when both the consistency and the degradation CSV files exist.
consistency_file = join(output_dir, '{}_consistency.csv'.format(experiment_id))
degradation_file = join(output_dir, '{}_degradation.csv'.format(experiment_id))
if exists(consistency_file) and exists(degradation_file):
    df_consistency = pd.read_csv(consistency_file, index_col=0)
    df_degradation = pd.read_csv(degradation_file, index_col=0)
    # NOTE(review): the eval file is not part of the exists() guard above, so
    # a missing `<experiment_id>_eval.csv` makes read_csv raise here.
    df_eval = pd.read_csv(join(output_dir, '{}_eval.csv'.format(experiment_id)), index_col=0)

    # ---- Human-human agreement ----
    markdown_strs = ['## Consistency']
    markdown_strs.append('### Human-human agreement')
    markdown_strs.append("This table shows the human-human agreement on the "
                         "double-scored evaluation data. The following are "
                         "highlighted in <span style='color: red'>red</span>: ")
    markdown_strs.append(' - Exact agreement (`exact_agr`) < 50%')
    markdown_strs.append(' - Adjacent agreement (`adj_agr`) < 95%')
    markdown_strs.append(' - Quadratic weighted kappa (`wtkappa`) < 0.7')
    markdown_strs.append(' - Pearson correlation (`corr`) < 0.7')
    display(Markdown('\n'.join(markdown_strs)))

    # display the HTML for the table with the various formatters;
    # the `low` bounds mirror the thresholds listed in the markdown above
    formatter_exact_agr = partial(color_highlighter, low=50, high=100)
    formatter_adj_agr = partial(color_highlighter, low=95, high=100)
    formatter_wtkappa_corr = partial(color_highlighter, low=0.7)
    formatter_dict = {'exact_agr': formatter_exact_agr,
                      'adj_agr': formatter_adj_agr,
                      'wtkappa': formatter_wtkappa_corr,
                      'corr': formatter_wtkappa_corr}
    display(HTML(df_consistency.to_html(index=False,
                                        escape=False,
                                        float_format=float_format_func,
                                        formatters=formatter_dict)))

    # ---- Degradation ----
    markdown_strs = ['### Degradation']
    markdown_strs.append('The next table shows the degradation in the evaluation metrics '
                         '(`diff`) when comparing the machine (`H-M`) to a second human (`H-H`). '
                         'A positive degradation value indicates better human-machine performance. '
                         'Note that the human-machine agreement is computed on the full '
                         'dataset (to get a reliable estimate) whereas the human-human '
                         'agreement is computed on the subset of responses that were double-scored.')
    markdown_strs.append("\nThe following degradation values are highlighted in "
                         "<span style='color: red'>red</span>: ")
    markdown_strs.append(' - `corr` < -0.1')
    markdown_strs.append(' - `wtkappa` < -0.1')
    display(Markdown('\n'.join(markdown_strs)))

    # Restrict the H-M metrics to the columns present in the degradation frame.
    df_eval_for_degradation = df_eval[df_degradation.columns].copy()

    # Repeat the consistency rows once per row of df_eval, keep the same
    # columns, and reuse the eval index so the H-H rows group with the
    # matching H-M rows below.
    df_consistency_for_degradation = pd.concat([df_consistency]*len(df_eval))
    df_consistency_for_degradation = df_consistency_for_degradation[df_degradation.columns].copy()
    df_consistency_for_degradation.index = df_eval_for_degradation.index

    # Tag every row with its source so it can become the second index level.
    df_consistency_for_degradation['type'] = 'H-H'
    df_eval_for_degradation['type'] = 'H-M'
    df_degradation['type'] = 'diff'

    df = pd.concat([df_consistency_for_degradation, df_eval_for_degradation, df_degradation])
    df = df[['type','corr', 'kappa', 'wtkappa', 'exact_agr', 'adj_agr', 'SMD']]
    df = df.reset_index()
    # `DataFrame.sortlevel` was removed from pandas; `sort_index(level=...)`
    # is the supported equivalent and yields the same ordering.
    df = df.set_index(['index', 'type']).sort_index(level='index')
    # Blank out the index level names so they do not show up in the HTML.
    df.index.names = [None, None]

    # display the HTML for the table with the various formatters
    formatter_corr = partial(color_highlighter, low=-0.1, high=100)
    formatter_wtkappa = partial(color_highlighter, low=-0.1, high=100)
    formatter_dict = {'corr': formatter_corr, 'wtkappa': formatter_wtkappa}
    display(HTML(df.to_html(float_format=float_format_func,
                            formatters=formatter_dict, escape=False)))