In [None]:
# Setting options for the plots
# Render matplotlib figures inline in the notebook output.
%matplotlib inline
# Have the inline backend produce figures in both the 'retina' and 'svg' formats.
%config InlineBackend.figure_formats={'retina', 'svg'}
# Set the 'savefig.dpi' rc parameter used by the inline backend to 150.
%config InlineBackend.rc={'savefig.dpi': 150}

# Comparison Report 

In [None]:
import argparse
import base64
import itertools
import json
import logging
import numpy as np
import os
import pandas as pd
import re
import sys
import time

from os.path import exists, join

from IPython import sys_info
from IPython.display import display, HTML, Image, Markdown, SVG

<style>
.alternate_colors3 tr:nth-of-type(3n+1) {background-color: #ffffff;}
.alternate_colors3 tr:nth-of-type(3n+2){background-color: #dddddd;}
.alternate_colors3 tr:nth-of-type(3n){background-color: #cccccc;}

.alternate_colors3_groups tr:nth-of-type(6n+1){background-color: #ffffff;}
.alternate_colors3_groups tr:nth-of-type(6n+2){background-color: #ffffff;}
.alternate_colors3_groups tr:nth-of-type(6n+3){background-color: #ffffff;}
.alternate_colors3_groups tr:nth-of-type(6n+4){background-color: #dddddd;}
.alternate_colors3_groups tr:nth-of-type(6n+5){background-color: #dddddd;}
.alternate_colors3_groups tr:nth-of-type(6n){background-color: #dddddd;}

.alternate_colors2 tr:nth-of-type(2n+1){background-color: #ffffff;}
.alternate_colors2 tr:nth-of-type(2n){background-color: #dddddd;}

td, th {
        padding-left: 5px;
        padding-right: 5px;
        border-top:0;
        border-bottom: 0;
    }

table {
    border: 0;
    border-collapse: collapse;
    text-align: right;
    font-size: 11pt;
}

.chunk {
    page-break-inside: avoid;
    position: relative;
    margin-top: 4em;
}

div.prompt.output_prompt { color: white; }
</style>

In [None]:
# NOTE: you will need to set the following manually
# if you are using this notebook interactively.

# settings of the OLD experiment
experiment_id_old = os.environ.get('EXPERIMENT_ID_OLD')
description_old = os.environ.get('DESCRIPTION_OLD')
output_dir_old = os.environ.get('OUTPUT_DIR_OLD')
figure_dir_old = os.environ.get('FIGURE_DIR_OLD')
scaled_old = os.environ.get('SCALED_OLD')
# only the exact string '1' selects the scaled scores
score_prefix_old = 'scale' if scaled_old == '1' else 'raw'

# settings of the NEW experiment
experiment_id_new = os.environ.get('EXPERIMENT_ID_NEW')
description_new = os.environ.get('DESCRIPTION_NEW')
output_dir_new = os.environ.get('OUTPUT_DIR_NEW')
figure_dir_new = os.environ.get('FIGURE_DIR_NEW')
scaled_new = os.environ.get('SCALED_NEW')
score_prefix_new = 'scale' if scaled_new == '1' else 'raw'

# groups for subgroup analysis.
# example: 'prompt%%subgroup1%%subgroup2'
# An unset variable is treated like an empty string (i.e. no groups) instead
# of failing with an AttributeError on `None.split`.
groups_desc_string = os.environ.get('GROUPS_FOR_DESCRIPTIVES', '')
groups_eval_string = os.environ.get('GROUPS_FOR_EVALUATIONS', '')

# an empty string means "no groups" rather than one group named ''
groups_desc = groups_desc_string.split('%%') if groups_desc_string else []
groups_eval = groups_eval_string.split('%%') if groups_eval_string else []

In [None]:
# Introductory text naming the two experiments that are being compared.
# The empty strings become blank lines that separate the markdown paragraphs.
markdown_strs = [
    'This report presents a comparison of the following two experiments',
    '',
    '  Old Experiment ID: **{}**'.format(experiment_id_old),
    '',
    '  Description: {}'.format(description_old),
    '',
    '',
    '  New Experiment ID: **{}**'.format(experiment_id_new),
    '',
    '  Description: {}'.format(description_new),
    '',
]
Markdown('\n'.join(markdown_strs))

In [None]:
# Show the local date and time at which this cell was run, formatted with
# the '%c' strftime directive.
HTML(time.strftime('%c'))

In [None]:
%%html
<div id="toc"></div>

In [None]:
# Pairs of (existing column name, renamed column name) for the evaluation
# metrics. The '{}' placeholder in an existing name is filled in with the
# score prefix ('raw' or 'scale'); names without a placeholder are the same
# for both prefixes.
_eval_column_name_pairs = [
    ("N", "N"),
    ("h_mean", "H1 mean"),
    ("h_sd", "H1 SD"),
    ("sys_mean.{}_trim", "score mean(b)"),
    ("sys_sd.{}_trim", "score SD(b)"),
    ("corr.{}_trim", "Pearson(b)"),
    ("SMD.{}_trim", "SMD(b)"),
    ("sys_mean.{}_trim_round", "score mean(br)"),
    ("sys_sd.{}_trim_round", "score SD(br)"),
    ("exact_agr.{}_trim_round", "Agmt.(br)"),
    ("kappa.{}_trim_round", "K(br)"),
    ("wtkappa.{}_trim_round", "QWK(br)"),
    ("adj_agr.{}_trim_round", "Adj. Agmt.(br)"),
    ("SMD.{}_trim_round", "SMD(br)"),
    ("r2.{}_trim", "R2(b)"),
    ("RMSE.{}_trim", "RMSE(b)"),
]

# existing column names for the raw and for the scaled scores
_df_eval_columns_existing_raw = [existing.format('raw')
                                 for existing, _ in _eval_column_name_pairs]
_df_eval_columns_existing_scale = [existing.format('scale')
                                   for existing, _ in _eval_column_name_pairs]

# the names under which the columns are shown in the report
_df_eval_columns_renamed = [renamed for _, renamed in _eval_column_name_pairs]

# existing name -> renamed name, one mapping per score prefix
raw_renamedict = dict(zip(_df_eval_columns_existing_raw, _df_eval_columns_renamed))
scale_renamedict = dict(zip(_df_eval_columns_existing_scale, _df_eval_columns_renamed))

In [None]:
def float_format_func(x):
    """Return `x` as a string with exactly three digits after the decimal point."""
    return format(x, '.3f')
 
def int_or_float_format_func2(x):
    """
    Format a number as an integer if it is whole, else with two decimals.

    Parameters
    ----------
    x : int or float
        The value to format. Anything that `float()` accepts works, so
        integer values are handled as well as floats.

    Returns
    -------
    ans : str
        `x` without a fractional part if it is a whole number, otherwise
        `x` with two digits after the decimal point.
    """
    # `float.is_integer(x)` raises a TypeError for anything that is not a
    # float (e.g. an int), so convert explicitly before asking.
    if float(x).is_integer():
        return '{}'.format(int(x))
    return '{:.2f}'.format(x)

def int_or_float_format_func3(x):
    """
    Format a number as an integer if it is whole, else with three decimals.

    Parameters
    ----------
    x : int or float
        The value to format. Anything that `float()` accepts works, so
        integer values are handled as well as floats.

    Returns
    -------
    ans : str
        `x` without a fractional part if it is a whole number, otherwise
        `x` with three digits after the decimal point.
    """
    # `float.is_integer(x)` raises a TypeError for anything that is not a
    # float (e.g. an int), so convert explicitly before asking.
    if float(x).is_integer():
        return '{}'.format(int(x))
    return '{:.3f}'.format(x)

def corr_formatter2(x):
    """
    Format `x` with two decimals; values that are not below 0.9 are
    wrapped in a red (#FF0000) HTML span.
    """
    formatted = '{:.2f}'.format(x)
    if x < 0.9:
        return formatted
    return '<span style="color: #FF0000">{}</span>'.format(formatted)

def corr_formatter3(x):
    """
    Format `x` with three decimals; values that are not below 0.9 are
    wrapped in a red (#FF0000) HTML span.
    """
    formatted = '{:.3f}'.format(x)
    if x < 0.9:
        return formatted
    return '<span style="color: #FF0000">{}</span>'.format(formatted)

def factor_formatter3(x):
    """
    Format `x` with three decimals; values whose absolute value is not
    below 0.1 are wrapped in a bold HTML span.
    """
    formatted = '{:.3f}'.format(x)
    if abs(x) < 0.1:
        return formatted
    return '<span style="font-weight: bold;">{}</span>'.format(formatted)

def make_summary_stat_df(df):
    """
    Summarize every column of `df` with five statistics.

    Each of `np.mean`, `np.std`, `np.median`, `np.min` and `np.max` is
    passed to `df.apply`, and the five results are put side by side.

    Returns
    -------
    res : pandas DataFrame
        One row per column of `df`, with the columns
        'MEAN', 'SD', 'MEDIAN', 'MIN' and 'MAX'.
    """
    # column label -> function that computes the statistic (order matters)
    summary_funcs = {'MEAN': np.mean,
                     'SD': np.std,
                     'MEDIAN': np.median,
                     'MIN': np.min,
                     'MAX': np.max}
    res = pd.concat([df.apply(func) for func in summary_funcs.values()], axis=1)
    res.columns = list(summary_funcs)
    return res

In [None]:
def _read_file_as_base64(path):
    """Return the contents of the file at `path` as a base64-encoded string."""
    with open(path, 'rb') as f:
        return base64.b64encode(f.read()).decode('utf-8')


def load_rsmtool_output(csvdir, figdir, experiment_id, prefix):
    """
    Load the tables and figures of one experiment into a dictionary.

    Parameters
    ----------
    csvdir : str
        Directory that contains the CSV files of the experiment.
    figdir : str
        Directory that contains the figures of the experiment.
    experiment_id : str
        ID of the experiment. The name of every file that is read
        starts with '<experiment_id>_'.
    prefix : str
        Score prefix. It is the name of the score column taken from the
        predictions file and the `sys_<prefix>` column of the score
        distribution file. The raw evaluation columns are used if it is
        'raw', the scale columns for any other value.

    Returns
    -------
    res : dict
        Data frames (keys starting with 'df_') and base64-encoded figures.
        The coefficients, model fit, consistency and degradation frames,
        the betas figure and the feature boxplots are only present if
        the corresponding files exist.

    Notes
    -----
    The subgroup-specific files are read for every group in the
    module-level list `groups_eval`. Files that are not described as
    optional above are read unconditionally, so a missing file raises
    the error of `open` / `pd.read_csv`.
    """

    def csv_file(suffix):
        # path of '<experiment_id>_<suffix>.csv' in the CSV directory
        return join(csvdir, '{}_{}.csv'.format(experiment_id, suffix))

    def figure_file(suffix, extension='svg'):
        # path of '<experiment_id>_<suffix>.<extension>' in the figure directory
        return join(figdir, '{}_{}.{}'.format(experiment_id, suffix, extension))

    res = {}

    # feature distributions and the inter-feature correlations
    res['feature_distplots'] = _read_file_as_base64(figure_file('distrib'))
    res['df_feature_cors'] = pd.read_csv(csv_file('cors_processed'), index_col=0)

    # df_scores; the IDs are always read in as strings
    df_scores = pd.read_csv(csv_file('pred_processed'),
                            converters={'spkitemid': str})
    res['df_scores'] = df_scores[['spkitemid', 'sc1', prefix]]

    # model coefficients if present
    betas_file = csv_file('betas')
    if exists(betas_file):
        res['df_coef'] = pd.read_csv(betas_file, index_col=0)
        res['df_coef'].index.name = None

    # read in the model fit files if present
    model_fit_file = csv_file('model_fit')
    if exists(model_fit_file):
        res['df_model_fit'] = pd.read_csv(model_fit_file)

    # human human agreement; load if consistency file is present
    consistency_file = csv_file('consistency')
    if exists(consistency_file):
        res['df_consistency'] = pd.read_csv(consistency_file, index_col=0)

    # degradation; load if degradation file is present
    degradation_file = csv_file('degradation')
    if exists(degradation_file):
        res['df_degradation'] = pd.read_csv(degradation_file, index_col=0)

        # df_eval without renaming for degradation
        res['df_eval_for_degradation'] = pd.read_csv(csv_file('eval'), index_col=0)

    # use the raw columns or the scale columns depending on the prefix
    existing_eval_cols = _df_eval_columns_existing_raw if prefix == 'raw' else _df_eval_columns_existing_scale
    renamedict = raw_renamedict if prefix == 'raw' else scale_renamedict

    short_metrics_list = ["N", "Adj. Agmt.(br)", "Agmt.(br)", "K(br)",
                          "Pearson(b)", "QWK(br)", "R2(b)", "RMSE(b)"]

    # the ordering of mean/SD/SMD statistics
    mean_sd_columns = ['N', 'H1 mean', 'H1 SD',
                       'score mean(br)', 'score SD(br)',
                       'score mean(b)', 'score SD(b)',
                       'SMD(br)', 'SMD(b)']

    def read_eval(suffix):
        # read an evaluation file, keep the columns for the chosen prefix
        # and give them their report names
        df = pd.read_csv(csv_file(suffix), index_col=0)
        return df[existing_eval_cols].rename(columns=renamedict)

    # read in the short version of the evaluation metrics for all data
    df_eval = read_eval('eval_short')
    res['df_eval'] = df_eval[short_metrics_list]
    res['df_eval'].index.name = None

    # read in the evaluation metrics by subgroup, if we are asked to
    for group in groups_eval:
        df_eval_by_group = read_eval('eval_by_{}'.format(group))
        eval_key = 'df_eval_by_{}'.format(group)
        res[eval_key] = df_eval_by_group[short_metrics_list]
        res[eval_key].index.name = None
        res['{}_overview'.format(eval_key)] = make_summary_stat_df(res[eval_key])

        res['{}_m_sd'.format(eval_key)] = df_eval_by_group[mean_sd_columns]
        res['{}_m_sd'.format(eval_key)].index.name = None

    # read in the partial correlations vs. score for all data
    res['df_pcor_sc1'] = pd.read_csv(csv_file('pcor_score_all_data'), index_col=0)
    res['df_pcor_sc1_overview'] = make_summary_stat_df(res['df_pcor_sc1'])

    # read in the partial correlations by subgroups, if we are asked to
    for group in groups_eval:
        pcor_key = 'df_pcor_sc1_by_{}'.format(group)
        res[pcor_key] = pd.read_csv(csv_file('pcor_score_by_{}'.format(group)), index_col=0)
        res['df_pcor_sc1_{}_overview'.format(group)] = make_summary_stat_df(res[pcor_key])

    # read in the marginal correlations vs. score for all data
    res['df_mcor_sc1'] = pd.read_csv(csv_file('margcor_score_all_data'), index_col=0)
    res['df_mcor_sc1_overview'] = make_summary_stat_df(res['df_mcor_sc1'])

    # read in the marginal correlations by subgroups, if we are asked to
    for group in groups_eval:
        mcor_key = 'df_mcor_sc1_by_{}'.format(group)
        res[mcor_key] = pd.read_csv(csv_file('margcor_score_by_{}'.format(group)), index_col=0)
        res['df_mcor_sc1_{}_overview'.format(group)] = make_summary_stat_df(res[mcor_key])

    res['df_pca'] = pd.read_csv(csv_file('pca'), index_col=0)
    res['df_pcavar'] = pd.read_csv(csv_file('pcavar'), index_col=0)
    df_descriptives = pd.read_csv(csv_file('feature_descriptives'), index_col=0)

    # N, min and max for each feature; this is merged into the outlier
    # and the percentile tables below
    df_features_n_values = df_descriptives[['N', 'min', 'max']]

    res['df_descriptives'] = df_descriptives[['N', 'mean', 'std. dev.', 'skewness', 'kurtosis']]

    df_outliers = pd.read_csv(csv_file('feature_outliers'), index_col=0)
    df_outliers = df_outliers.rename(columns={'upper': 'Upper',
                                              'lower': 'Lower',
                                              'both': 'Both',
                                              'upperperc': 'Upper %',
                                              'lowerperc': 'Lower %',
                                              'bothperc': 'Both %'})
    df_outliers_columns = df_outliers.columns.tolist()

    # join with df_features_n_values to get the value of N and
    # put N in front of the outlier columns
    res['df_outliers'] = pd.merge(df_outliers,
                                  df_features_n_values,
                                  left_index=True,
                                  right_index=True)[['N'] + df_outliers_columns]

    # join with df_features_n_values to get the value of N
    df_percentiles = pd.read_csv(csv_file('feature_descriptivesExtra'), index_col=0)
    df_percentiles = pd.merge(df_percentiles,
                              df_features_n_values,
                              left_index=True,
                              right_index=True)

    # express the outlier counts as a percentage of N
    n_values = df_percentiles["N"].astype(float)
    df_percentiles["Mild outliers (%)"] = df_percentiles["Mild outliers"] / n_values * 100
    df_percentiles["Extreme outliers (%)"] = df_percentiles["Extreme outliers"] / n_values * 100

    res['df_percentiles'] = df_percentiles[['N', 'min', 'max',
                                            '1%', '5%', '25%', '50%', '75%', '95%', '99%',
                                            'IQR',
                                            'Mild outliers', 'Mild outliers (%)',
                                            'Extreme outliers', 'Extreme outliers (%)']]

    # confusion matrix; rows and columns are relabelled 1..n, where n is
    # the number of rows
    res['df_confmatrix'] = pd.read_csv(csv_file('confMatrix'), index_col=0)
    confmatrix_size = res['df_confmatrix'].shape[0]
    res['df_confmatrix'].index = ['machine {}'.format(n) for n in range(1, confmatrix_size + 1)]
    res['df_confmatrix'].columns = ['human {}'.format(x) for x in range(1, confmatrix_size + 1)]

    # score distribution; the second column of the file is the index
    df_score_dist = pd.read_csv(csv_file('score_dist'), index_col=1)
    df_score_dist = df_score_dist.rename(columns={'sys_{}'.format(prefix): 'sys'})
    res['df_score_dist'] = df_score_dist[['human', 'sys', 'difference']]

    # read in the feature boxplots by subgroup, if we were asked to;
    # the SVG version is preferred over the PNG version
    for group in groups_eval:
        boxplot_name = 'feature_boxplot_by_{}'.format(group)
        svg_file = figure_file(boxplot_name, 'svg')
        png_file = figure_file(boxplot_name, 'png')
        if exists(svg_file):
            res['feature_boxplots_by_{}_svg'.format(group)] = _read_file_as_base64(svg_file)
        elif exists(png_file):
            res['feature_boxplots_by_{}_png'.format(group)] = _read_file_as_base64(png_file)

    # read in the betas image if exists
    betas_svg = figure_file('betas')
    if exists(betas_svg):
        res['betas'] = _read_file_as_base64(betas_svg)

    # read in the evaluation barplots by subgroup, if we were asked to
    for group in groups_eval:
        eval_barplot_svg_file = figure_file('eval_by_{}'.format(group))
        res['eval_barplot_by_{}'.format(group)] = _read_file_as_base64(eval_barplot_svg_file)

    res['pca_scree_plot'] = _read_file_as_base64(figure_file('pca'))

    return res

In [None]:
# load the two sets of RSMTool outputs
outputs_old = load_rsmtool_output(output_dir_old, figure_dir_old, experiment_id_old, prefix=score_prefix_old)
outputs_new = load_rsmtool_output(output_dir_new, figure_dir_new, experiment_id_new, prefix=score_prefix_new)

In [None]:
def combine_old_new_results(df_new, df_old, name):
    """
    Stack the old values, the new values and their difference.

    Parameters
    ----------
    df_new : pandas DataFrame
        Results of the new experiment. NOTE: this is the FIRST argument.
    df_old : pandas DataFrame
        Results of the old experiment.
    name : str
        Name of the table. For the feature-related tables listed in the
        body, a row filled with '-' is added to the old frame for every
        index label only present in the new frame, and vice versa.

    Returns
    -------
    tmp_df : pandas DataFrame
        For every index label the 'old' row, the 'new' row and the
        'change' row (new - old), in this order. The first column,
        'version', says which of the three a row is.

    Notes
    -----
    The two input frames are not modified, so the function can be called
    repeatedly with the same frames.
    """
    # work on copies: adding rows and the 'version' column to the frames of
    # the caller would make a second call fail on the subtraction below
    df_old = df_old.copy()
    df_new = df_new.copy()

    # compute the difference between the two dataframes
    df_diff = df_new - df_old

    # if the dataframe pertains to features, then add a fake row
    # to the old dataframe if the feature was added and, conversely,
    # to the new dataframe if the feature was removed.
    if name in ['descriptives', 'outliers', 'percentiles', 'coefs', 'feature_cors', 'eval_by_prompt_overview', 'pcor_sc1_overview', 'mcor_sc1_overview', 'pcor_log_dta_dtu_overview']:
        added_features = list(set(df_new.index).difference(df_old.index))
        removed_features = list(set(df_old.index).difference(df_new.index))
        for af in added_features:
            df_old.loc[af] = '-'
        for rf in removed_features:
            df_new.loc[rf] = '-'

    df_old['version'] = 'old'
    df_new['version'] = 'new'
    df_diff['version'] = 'change'

    # `DataFrame.append` no longer exists in pandas 2.0 and later;
    # `pd.concat` stacks the frames in the same way
    tmp_df = pd.concat([df_old, df_new, df_diff])

    # group the three rows of each index label; sorting 'version' in
    # descending order yields 'old', 'new', 'change'
    tmp_df.index.name = 'for'
    tmp_df = tmp_df.reset_index().sort_values(by=['for', 'version'], ascending=[True, False]).set_index('for')
    tmp_df.index.name = None

    # put version first
    tmp_df = tmp_df[['version'] + [x for x in tmp_df.columns if x != 'version']]
    return tmp_df
    

In [None]:
# combined old/new/change table for each of the outputs, by name
out_dfs = {}

# (name, old data frame, new data frame) for every table to compare
name_old_new = [('descriptives', outputs_old['df_descriptives'], outputs_new['df_descriptives']),
                ('outliers', outputs_old['df_outliers'], outputs_new['df_outliers']),
                ('feature_cors', outputs_old['df_feature_cors'], outputs_new['df_feature_cors']),
                ('percentiles', outputs_old['df_percentiles'], outputs_new['df_percentiles']),
                ('eval_overview', outputs_old['df_eval'], outputs_new['df_eval']),
                ('mcor_sc1', outputs_old['df_mcor_sc1'], outputs_new['df_mcor_sc1']),
                ('mcor_sc1_overview', outputs_old['df_mcor_sc1_overview'], outputs_new['df_mcor_sc1_overview']),
                ('pcor_sc1', outputs_old['df_pcor_sc1'], outputs_new['df_pcor_sc1']),
                ('pcor_sc1_overview', outputs_old['df_pcor_sc1_overview'], outputs_new['df_pcor_sc1_overview']),
                ('score_dist', outputs_old['df_score_dist'], outputs_new['df_score_dist'])]

# add the subgroup differences
for group in groups_eval:
    name_old_new.extend([('eval_by_{}'.format(group), outputs_old['df_eval_by_{}'.format(group)], outputs_new['df_eval_by_{}'.format(group)]),
                         ('eval_by_{}_m_sd'.format(group), outputs_old['df_eval_by_{}_m_sd'.format(group)], outputs_new['df_eval_by_{}_m_sd'.format(group)]),
                         ('eval_by_{}_overview'.format(group), outputs_old['df_eval_by_{}_overview'.format(group)], outputs_new['df_eval_by_{}_overview'.format(group)]),
                         ('mcor_sc1_by_{}'.format(group), outputs_old['df_mcor_sc1_by_{}'.format(group)], outputs_new['df_mcor_sc1_by_{}'.format(group)]),
                         ('pcor_sc1_by_{}'.format(group), outputs_old['df_pcor_sc1_by_{}'.format(group)], outputs_new['df_pcor_sc1_by_{}'.format(group)])])

for name, df_old, df_new in name_old_new:
    # `combine_old_new_results` takes the NEW frame as its first argument.
    # Passing the frames by keyword keeps old and new from being swapped,
    # which would mislabel the rows and flip the sign of the change.
    out_dfs[name] = combine_old_new_results(df_new=df_new, df_old=df_old, name=name)

# WARN IF THE OLD AND NEW DATASETS ARE NOT THE SAME SIZE
log_msgs = []
# take the N from the descriptive stats for the first feature; `.iloc` is
# needed since the frames are indexed by label, not by position
oldTrainN = outputs_old['df_descriptives']['N'].iloc[0]
newTrainN = outputs_new['df_descriptives']['N'].iloc[0]
if oldTrainN != newTrainN:
    log_msg = "WARNING: the training sets were different sizes.  old N: {}, new N: {}.".format(oldTrainN, newTrainN)
    log_msgs.append(log_msg)

if 'prompt' in groups_eval:
    oldTestN = np.sum(outputs_old['df_eval_by_prompt']['N'])  # sum N across prompts
    newTestN = np.sum(outputs_new['df_eval_by_prompt']['N'])
    if oldTestN != newTestN:
        log_msg = "WARNING: the testing sets were different sizes.  old N: {}, new N: {}.".format(oldTestN, newTestN)
        log_msgs.append(log_msg)