In [None]:
# Setting options for the plots
# Render matplotlib figures inline in the notebook output.
%matplotlib inline
# Request both the 'retina' and 'svg' formats for inline figures.
%config InlineBackend.figure_formats={'retina', 'svg'}
# Override the `savefig.dpi` rc parameter (150) for the inline backend.
%config InlineBackend.rc={'savefig.dpi': 150}

# Summary Report 

In [None]:
import itertools
import json
import os
import re
import pickle
import platform
import time

from collections import defaultdict as dd
from functools import partial
from os.path import abspath, dirname, exists, join
from string import Template

import numpy as np
import pandas as pd
import seaborn as sns
import scipy.stats as stats
from matplotlib import pyplot as plt

from IPython import sys_info
from IPython.display import display, HTML, Image, Javascript, Markdown, SVG

from rsmtool.utils.files import (get_output_directory_extension,
                                 parse_json_with_comments)
from rsmtool.utils.notebook import (float_format_func,
                                    int_or_float_format_func,
                                    bold_highlighter,
                                    color_highlighter,
                                    show_thumbnail)

from rsmtool.reader import DataReader
from rsmtool.writer import DataWriter
from rsmtool.version import VERSION as rsmtool_version

# turn off interactive plotting
plt.ioff()

In [None]:
# Locate the report directory: use the `RSM_REPORT_DIR` environment
# variable when it is set, otherwise the current working directory.
if 'RSM_REPORT_DIR' in os.environ:
    rsm_report_dir = os.environ['RSM_REPORT_DIR']
else:
    rsm_report_dir = os.getcwd()

# The notebook settings are stored in `.environ.json` inside that directory;
# fail early with an explanatory message if the file is not there.
rsm_environ_config = join(rsm_report_dir, '.environ.json')
if not exists(rsm_environ_config):
    missing_config_message = (
        'The file {} cannot be located. '
        'Please make sure that either (1) '
        'you have set the correct directory with the `RSM_REPORT_DIR` '
        'environment variable, or (2) that your `.environ.json` '
        'file is in the same directory as your notebook.'
    ).format(rsm_environ_config)
    raise FileNotFoundError(missing_config_message)

# Parse the settings file; the values are read out in a later cell.
environ_config = parse_json_with_comments(rsm_environ_config)

<style type="text/css">
  div.prompt.output_prompt { 
    color: white; 
  }
  
  span.highlight_color {
    color: red;
  }
  
  span.highlight_bold {
    font-weight: bold;  
  }
    
  @media print {
    @page {
      size: landscape;
      margin: 0cm 0cm 0cm 0cm;
    }

    * {
      margin: 0px;
      padding: 0px;
    }

    #toc {
      display: none;
    }

    span.highlight_color, span.highlight_bold {
        font-weight: bolder;
        text-decoration: underline;
    }

    div.prompt.output_prompt {
      display: none;
    }
    
    h3#Python-packages, div#packages {
      display: none;
    }
  }
</style>

In [None]:
# NOTE: you will need to set the following manually
# if you are using this notebook interactively.
# Every value below is looked up in `environ_config`, which was parsed from
# the `.environ.json` file. `.get()` is used throughout, so - assuming
# `environ_config` is a plain dict (TODO confirm) - a missing key yields
# `None` instead of raising.
summary_id = environ_config.get('SUMMARY_ID')
description = environ_config.get('DESCRIPTION')
# iterated later as (json_file, experiment_name) pairs when loading the models
jsons = environ_config.get('JSONS')
output_dir = environ_config.get('OUTPUT_DIR')
use_thumbnails = environ_config.get('USE_THUMBNAILS')
# file format used when writing the summary's own output files
file_format_summarize = environ_config.get('FILE_FORMAT')

# groups for subgroup analysis.
groups_desc = environ_config.get('GROUPS_FOR_DESCRIPTIVES') 
groups_eval = environ_config.get('GROUPS_FOR_EVALUATIONS') 

# javascript path
# (directory from which `sort.js` is loaded in a later cell)
javascript_path = environ_config.get("JAVASCRIPT_PATH")

In [None]:
# Counter that hands out consecutive integer ids (1, 2, 3, ...) for thumbnails.
id_generator = itertools.count(start=1)

In [None]:
# Read the table-sorting script from the javascript directory and
# inject it into the notebook output.
sort_js_path = join(javascript_path, "sort.js")
with open(sort_js_path, "r", encoding="utf-8") as sortf:
    sort_js_source = sortf.read()
display(Javascript(data=sort_js_source))

In [None]:
# load the information about all models
# `jsons` is iterated as (json_file, experiment_name) pairs. For each pair
# we record a 5-tuple:
#   (model_id, model_name, model_config, model_csvdir, model_file_format)
# which is the structure every summary function below unpacks.
model_list = []
for (json_file, experiment_name) in jsons:
    # use a context manager so the configuration file handle is closed
    # right away instead of being left open until garbage collection
    with open(json_file) as json_fh:
        model_config = json.load(json_fh)
    model_id = model_config['experiment_id']
    # fall back to the experiment id when no display name was given
    model_name = experiment_name if experiment_name else model_id
    # the experiment's output files are looked up next to its JSON file
    model_csvdir = dirname(json_file)
    # presumably the file extension (e.g. csv) used by that experiment's
    # output files - TODO confirm against `get_output_directory_extension`
    model_file_format = get_output_directory_extension(model_csvdir, model_id)
    model_list.append((model_id, model_name, model_config, model_csvdir, model_file_format))


In [None]:
# One-sentence introduction naming the summary and its description;
# the Markdown object is the cell's displayed value.
report_intro = "This report presents the analysis for **{}**: {} \n ".format(summary_id, description)
Markdown(report_intro)


In [None]:
# Show the local date and time at which the report was generated
# ('%c' is the locale's date/time representation).
report_timestamp = time.strftime('%c')
HTML(report_timestamp)

In [None]:
# Pair every model's display name with the description from its configuration.
model_display_names = [entry[1] for entry in model_list]
model_descriptions = [entry[2]['description'] for entry in model_list]
models_and_desc = zip(model_display_names, model_descriptions)

# One bolded "name: description" paragraph per model.
model_desc_list = '\n\n'.join('**{}**: {}'.format(name, desc)
                              for (name, desc) in models_and_desc)

Markdown("The report compares the following models: \n\n {}".format(model_desc_list))

In [None]:
# Tell the reader that the figures are thumbnails, but only when
# thumbnails were requested in the settings.
if use_thumbnails:
    thumbnail_note = ("***Note: Images in this report have been converted to "
                      "clickable thumbnails***")
    display(Markdown(thumbnail_note))

In [None]:
%%html
<div id="toc"></div>

## Feature descriptives

In [None]:
def summarize_feature_correlations(model_list, file_suffix, header, file_format_summarize):
    """
    Combine one correlation file per model into a single table.

    For every model, the file ``<csvdir>/<model_id>_<file_suffix>.<file_format>``
    is read if it exists and its index is replaced by the model name; models
    without such a file are skipped. When at least one file was found,
    ``header`` and the combined table are displayed, and the table is written
    out with ``DataWriter`` using the notebook-level ``summary_id`` and
    ``output_dir``. When no file was found, nothing is displayed or written.

    Parameters
    ----------
    model_list : list of tuple
        ``(model_id, model_name, config, csvdir, file_format)`` tuples.
    file_suffix : str
        Suffix of the per-model file to read; also used as the name
        under which the combined table is written.
    header : object
        Displayable object shown above the table.
    file_format_summarize : str
        File format passed to the writer for the combined table.
    """
    per_model_frames = []
    for (model_id, model_name, _config, csvdir, file_format) in model_list:
        corr_path = join(csvdir, '{}_{}.{}'.format(model_id, file_suffix, file_format))
        if not exists(corr_path):
            continue
        df_model = DataReader.read_from_file(corr_path, index_col=0)
        # label the row(s) with the model name so models can be told apart
        df_model.index = [model_name]
        per_model_frames.append(df_model)

    # nothing to report if none of the models has this file
    if len(per_model_frames) == 0:
        return

    df_summ = pd.concat(per_model_frames, sort=True)
    display(header)
    table_html = df_summ.to_html(index=True,
                                 classes=['sortable'],
                                 escape=False,
                                 float_format=int_or_float_format_func)
    display(HTML(table_html))

    writer = DataWriter(summary_id)
    writer.write_experiment_output(output_dir,
                                   {file_suffix: df_summ},
                                   index=True,
                                   file_format=file_format_summarize)

## Marginal and partial correlations

The tables below show correlations between truncated and standardized (if applicable) values of each feature and the human score for each model. All correlations are computed on the training sets.

In [None]:
# Heading and caption shown above the marginal correlations table.
# The space after `####` is required for CommonMark-compliant renderers
# to treat the line as a heading.
header = Markdown("#### Marginal correlations against score\n\n\n "
                  "The table shows marginal correlations between each feature "
                  "and the human score.")

summarize_feature_correlations(model_list, 'margcor_score_all_data', header, file_format_summarize)

In [None]:
# Heading and caption shown above the table of partial correlations that
# control for all other features.
# The space after `####` is required for CommonMark-compliant renderers
# to treat the line as a heading.
header = Markdown("#### Partial correlations after controlling for all other variables\n\n\n "
                  "This table shows Pearson's correlation between each feature and human score after "
                  "controlling for all other features")

summarize_feature_correlations(model_list, 'pcor_score_all_data', header, file_format_summarize)

In [None]:
# Heading and caption shown above the table of partial correlations that
# control for length.
# The space after `####` is required for CommonMark-compliant renderers
# to treat the line as a heading.
header = Markdown("#### Partial correlations after controlling for length\n\n\n "
                  "This table shows Pearson's correlation between each feature and human score after "
                  "controlling for length")

summarize_feature_correlations(model_list, 'pcor_score_no_length_all_data', header, file_format_summarize)

## Model

The table shows main model parameters for each experiment: the total number of features used in the model (linear models only), the number of features with negative coefficients (linear models only), the learner, and the label used to train the model.  For linear models, the second table shows standardized coefficients for all features. 

In [None]:
def summarize_models(model_list, file_format_summarize):
    """
    Display and save the model summary and standardized coefficients.

    For each model, ``<csvdir>/<model_id>_betas.<file_format>`` is read when
    it exists: the model then contributes a summary row (number of features,
    number of negative standardized coefficients, learner, train label) and
    a column of standardized coefficients indexed by feature. A model without
    that file still gets a summary row, with ``'-'`` for the two counts, as
    long as its configuration has a ``'model'`` entry.

    Both tables are displayed and written via ``DataWriter`` using the
    notebook-level ``summary_id`` and ``output_dir``; a table with no
    entries is skipped.

    Parameters
    ----------
    model_list : list of tuple
        ``(model_id, model_name, config, csvdir, file_format)`` tuples.
    file_format_summarize : str
        File format passed to the writer for the output tables.
    """
    writer = DataWriter(summary_id)

    summary_frames = []
    beta_frames = []
    for (model_id, model_name, config, csvdir, file_format) in model_list:
        betas_path = join(csvdir, '{}_betas.{}'.format(model_id, file_format))

        if not exists(betas_path):
            # no coefficients file: list the model anyway if the learner is known
            if 'model' in config:
                placeholder_row = pd.DataFrame({'N features': '-',
                                                'N negative': '-',
                                                'learner': config['model'],
                                                'train_label': config['train_label_column']},
                                               index=[model_name])
                summary_frames.append(placeholder_row)
            continue

        df_coefs = DataReader.read_from_file(betas_path)
        standardized = df_coefs['standardized']
        summary_row = pd.DataFrame({'N features': [len(df_coefs)],
                                    'N negative': len(df_coefs[standardized < 0]),
                                    'learner': config['model'],
                                    'train_label': config['train_label_column']},
                                   index=[model_name])
        summary_frames.append(summary_row)

        # one column per model, one row per feature
        beta_frames.append(pd.DataFrame({model_name: standardized.values},
                                        index=df_coefs['feature'].values))

    if len(summary_frames) != 0:
        df_summ = pd.concat(summary_frames, sort=True)
        display(Markdown("## Model summary"))
        summary_columns = ['N features', 'N negative', 'learner', 'train_label']
        summary_html = df_summ[summary_columns].to_html(index=True,
                                                        classes=['sortable'],
                                                        escape=False,
                                                        float_format=int_or_float_format_func)
        display(HTML(summary_html))

        writer.write_experiment_output(output_dir,
                                       {'model_summary': df_summ},
                                       index=True,
                                       file_format=file_format_summarize)

    if len(beta_frames) != 0:
        # features missing from a model show up as NaN after the join;
        # show them as '-' instead
        df_betas_all = pd.concat(beta_frames, axis=1, sort=True).fillna('-')
        display(Markdown("## Standardized coefficients"))
        betas_html = df_betas_all.to_html(index=True,
                                          classes=['sortable'],
                                          escape=False,
                                          float_format=int_or_float_format_func)
        display(HTML(betas_html))

        writer.write_experiment_output(output_dir,
                                       {'betas': df_betas_all},
                                       index=True,
                                       file_format=file_format_summarize)


summarize_models(model_list, file_format_summarize)

In [None]:
def summarize_model_fit(file_format_summarize):
    """
    Display and save the model fit statistics of all models.

    Iterates over the notebook-level ``model_list`` and reads
    ``<csvdir>/<model_id>_model_fit.<file_format>`` for each model that has
    one, relabelling it with the model name. If any file was found, the
    combined table is displayed (columns ``N responses``, ``N features``,
    ``R2`` and ``R2_adjusted``) and written via ``DataWriter`` using the
    notebook-level ``summary_id`` and ``output_dir``.

    Parameters
    ----------
    file_format_summarize : str
        File format passed to the writer for the combined table.
    """
    fit_frames = []
    for (model_id, model_name, _config, csvdir, file_format) in model_list:
        fit_path = join(csvdir, '{}_model_fit.{}'.format(model_id, file_format))
        if not exists(fit_path):
            continue
        df_model_fit = DataReader.read_from_file(fit_path)
        df_model_fit.index = [model_name]
        fit_frames.append(df_model_fit)

    # nothing to show when no model has a fit file
    if not fit_frames:
        return

    df_fit = pd.concat(fit_frames, sort=True)
    display(Markdown("## Model fit"))
    fit_columns = ['N responses', 'N features', 'R2', 'R2_adjusted']
    fit_html = df_fit[fit_columns].to_html(index=True,
                                           classes=['sortable'],
                                           escape=False,
                                           float_format=int_or_float_format_func)
    display(HTML(fit_html))

    writer = DataWriter(summary_id)
    writer.write_experiment_output(output_dir,
                                   {'model_fit': df_fit},
                                   index=True,
                                   file_format=file_format_summarize)


summarize_model_fit(file_format_summarize)

## Evaluation results

### Overall association statistics

The tables in this section show the standard association metrics between human scores and different types of machine scores. These results are computed on the evaluation set. The scores for each model have been truncated to values indicated in `truncation range`. When indicated, scaled scores are computed by re-scaling the predicted scores using mean and standard deviation of human scores as observed on the training data and mean and standard deviation of machine scores as predicted for the training set. 


In [None]:
def _is_missing_trim(value):
    """Return True if a trim value is absent, i.e. ``None`` or a float NaN."""
    # a blank cell in a tabular file is read back as NaN rather than None;
    # NaN is the only float that is not equal to itself
    return value is None or (isinstance(value, float) and value != value)


def read_evals(model_list, file_format_summarize):
    """
    Read the short evaluation file of each model into one data frame.

    For each model, ``<csvdir>/<model_id>_eval_short.<file_format>`` is read
    if it exists and relabelled with the model name. Two columns are added:
    ``system score type`` (``'scale'`` or ``'raw'``, based on the model
    configuration) and ``truncation range`` (the trim values widened by the
    trim tolerance). Column names are then shortened to the part before the
    first ``.``, with `` (rounded)`` appended for columns containing
    ``round``.

    The trim values are taken from
    ``<csvdir>/<model_id>_postprocessing_params.<file_format>`` when that
    file exists, and from the configuration otherwise. A value that is
    ``None`` or NaN counts as missing.

    Parameters
    ----------
    model_list : list of tuple
        ``(model_id, model_name, config, csvdir, file_format)`` tuples.
    file_format_summarize : str
        Not used by this function; kept for interface compatibility.

    Returns
    -------
    df_evals : pandas.DataFrame
        Combined evaluations, one row per model; empty if no model has an
        evaluation file.
    has_missing_trims : bool
        True if a trim value could not be determined for at least one model,
        in which case that bound is shown as ``?``.
    """
    has_missing_trims = False

    evals = []
    for (model_id, model_name, config, csvdir, file_format) in model_list:
        csv_file = os.path.join(csvdir, '{}_eval_short.{}'.format(model_id, file_format))
        if not os.path.exists(csv_file):
            continue

        df_eval = DataReader.read_from_file(csv_file, index_col=0)
        df_eval.index = [model_name]

        # figure out whether the score was scaled
        # (`== True` is kept on purpose so that only values equal to True count)
        is_scaled = (config.get('use_scaled_predictions') == True or
                     config.get('scale_with') is not None)
        df_eval['system score type'] = 'scale' if is_scaled else 'raw'

        # we want to display the truncation range, but this is slightly complicated
        # we first check to see if the post-processing params file exists; if it does,
        # we grab the trim_min and trim_max values from that file (which still could
        # be missing, in which case they are read back as NaN rather than None)
        trim_min, trim_max = None, None
        postproc_file = os.path.join(csvdir, '{}_postprocessing_params.{}'.format(model_id, file_format))
        if os.path.exists(postproc_file):
            df_postproc = DataReader.read_from_file(postproc_file)
            trim_min = df_postproc['trim_min'].values[0]
            trim_max = df_postproc['trim_max'].values[0]

        # if the trim_min or trim_max is still missing, we then grab whatever is in the config
        if _is_missing_trim(trim_min):
            trim_min = config.get('trim_min')
        if _is_missing_trim(trim_max):
            trim_max = config.get('trim_max')

        # finally, we calculate the max and min scores; if we couldn't get any trim values,
        # then we default these to `?` and set `has_missing_trims=True`
        if _is_missing_trim(trim_min):
            min_score, has_missing_trims = '?', True
        else:
            min_score = float(trim_min) - config.get('trim_tolerance', 0.4998)
        if _is_missing_trim(trim_max):
            max_score, has_missing_trims = '?', True
        else:
            max_score = float(trim_max) + config.get('trim_tolerance', 0.4998)

        df_eval['truncation range'] = "[{}, {}]".format(min_score, max_score)

        # rename the columns to remove reference to scale/raw scores
        new_column_names = [col.split('.')[0] if not 'round' in col
                            else '{} (rounded)'.format(col.split('.')[0])
                            for col in df_eval.columns]
        df_eval.columns = new_column_names
        evals.append(df_eval)

    if len(evals) > 0:
        df_evals = pd.concat(evals, sort=True)
    else:
        df_evals = pd.DataFrame()
    return df_evals, has_missing_trims

# Collect the short evaluations of all models; `has_missing_trims` tells us
# whether any truncation bound had to be shown as `?`.
df_eval, has_missing_trims = read_evals(model_list, file_format_summarize)

if has_missing_trims:
    # explain the `?` placeholders to the reader
    # (the original message had a doubled word: "could not be be computed")
    display(Markdown('**Note:** The minimum and/or maximum scores after truncation could not be '
                     'computed in some cases. This is because `trim_min` and/or `trim_max` '
                     'could not be found in either the configuration file or the postprocessing '
                     'parameters file. Scores that could not be computed are shown as `?`.'))
# save the combined evaluations only when there is something to save
if not df_eval.empty:
    writer = DataWriter(summary_id)
    writer.write_experiment_output(output_dir,
                                   {'eval_short': df_eval},
                                   index=True,
                                   file_format=file_format_summarize)

#### Descriptive holistic score statistics

The table shows distributional properties of human and system scores. SMD values lower than -0.15 or higher than 0.15 are <span class="highlight_color">highlighted</span>.

In [None]:
# narrow pandas display width for the tables in this report
pd.options.display.width = 10

# formatter for the SMD column, configured with the -0.15 / 0.15 thresholds
formatter = partial(color_highlighter, low=-0.15, high=0.15)

if df_eval.empty:
    display(Markdown("No information available for any of the models"))
else:
    descriptive_columns = ['N', 'system score type', "truncation range",
                           'h_mean', 'h_sd', 'sys_mean', 'sys_sd', 'SMD']
    descriptives_html = df_eval[descriptive_columns].to_html(index=True,
                                                             classes=['sortable'],
                                                             escape=False,
                                                             formatters={'SMD': formatter},
                                                             float_format=int_or_float_format_func)
    display(HTML(descriptives_html))

#### Association statistics

The table shows the standard association metrics between human scores and machine scores. Note that some evaluations (`*_trim_round`) are based on rounded scores computed by first truncating and then rounding the predicted score.

In [None]:
if df_eval.empty:
    display(Markdown("No information available for any of the models"))
else:
    # use the unrounded weighted kappa column when present,
    # otherwise the rounded one
    if 'wtkappa' in df_eval:
        wtkappa_col = 'wtkappa'
    else:
        wtkappa_col = 'wtkappa (rounded)'

    association_columns = ['N',
                           'system score type',
                           'corr', 'R2', 'RMSE',
                           wtkappa_col,
                           'kappa (rounded)',
                           'exact_agr (rounded)',
                           'adj_agr (rounded)']
    association_html = df_eval[association_columns].to_html(index=True,
                                                            classes=['sortable'],
                                                            escape=False,
                                                            float_format=int_or_float_format_func)
    display(HTML(association_html))

### True score evaluations

The tables in this section show how well system scores can predict *true* scores. According to test theory, a *true* score is a score that would have been obtained if there were no errors in measurement. While true scores cannot be observed, the variance of true scores and the prediction error can be estimated using observed human scores when multiple human ratings are available for a subset of responses. In this notebook, this variance and prediction error are estimated using human scores for responses in the evaluation set. 

In [None]:
# columns shown in the PRMSE table, in display order
prmse_columns = ['N',
                 'N raters',
                 'N single',
                 'N multiple',
                 'Variance of errors',
                 'True score var',
                 'MSE true',
                 'PRMSE true']


def read_true_score_evals(model_list, file_format_summarize):
    """
    Read the true score evaluations of each model into one data frame.

    For each model, ``<csvdir>/<model_id>_true_score_eval.<file_format>`` is
    read if it exists. Only the ``scale_trim`` or ``raw_trim`` row is kept,
    depending on whether the model configuration indicates scaled scores.
    The row is relabelled with the model name and gets a
    ``system score type`` column holding ``'scale'`` or ``'raw'``.

    Parameters
    ----------
    model_list : list of tuple
        ``(model_id, model_name, config, csvdir, file_format)`` tuples.
    file_format_summarize : str
        Not used by this function.

    Returns
    -------
    pandas.DataFrame
        One row per model with a true score evaluation file; an empty
        frame if no model has one.
    """
    selected_rows = []
    for (model_id, model_name, config, csvdir, file_format) in model_list:
        eval_path = join(csvdir, '{}_true_score_eval.{}'.format(model_id, file_format))
        if not exists(eval_path):
            continue

        df_all_score_types = DataReader.read_from_file(eval_path, index_col=0)

        # figure out whether the score was scaled
        if config.get('use_scaled_predictions') == True or config.get('scale_with') is not None:
            prefix = 'scale'
        else:
            prefix = 'raw'

        # keep only the row for the appropriate score type (scaled or raw)
        row_label = '{}_trim'.format(prefix)
        df_selected = df_all_score_types.loc[[row_label]].copy()
        df_selected['system score type'] = prefix
        df_selected.index = [model_name]
        selected_rows.append(df_selected)

    if len(selected_rows) > 0:
        return pd.concat(selected_rows, sort=True)
    return pd.DataFrame()

# Gather the true score evaluations for all models and, if there are any,
# save them alongside the other summary output files.
df_true_score_eval = read_true_score_evals(model_list, file_format_summarize)
if not df_true_score_eval.empty:
    true_score_frames = {'true_score_eval': df_true_score_eval}
    writer = DataWriter(summary_id)
    writer.write_experiment_output(output_dir,
                                   true_score_frames,
                                   index=True,
                                   file_format=file_format_summarize)

In [None]:
if df_true_score_eval.empty:
    display(Markdown("No information available for any of the models"))
else:
    # heading followed by a one-sentence explanation of the table
    prmse_heading = "#### Proportional reduction in mean squared error (PRMSE)"
    prmse_caption = ("The table shows variance of human rater errors, "
                     "true score variance, mean squared error (MSE) and "
                     "proportional reduction in mean squared error (PRMSE) for "
                     "predicting a true score with system score.")
    display(Markdown('\n'.join([prmse_heading, prmse_caption])))

    pd.options.display.width = 10

    # show missing values as '-' in the rendered table
    df_prmse = df_true_score_eval[prmse_columns].replace({np.nan: '-'})
    prmse_html = df_prmse.to_html(classes=['sortable'],
                                  escape=False,
                                  float_format=float_format_func)
    display(HTML('<span style="font-size:95%">' + prmse_html + '</span>'))
    

## Links to intermediate files

Click on the hyperlinks below to see the intermediate experiment files generated as part of this summary. 

**Note**: This only includes the intermediate files generated by `rsmsummarize`. It does not include links to the intermediate files generated by the original experiment(s).

In [1]:
from rsmtool.utils.notebook import show_files

In [None]:
# Show the links to the intermediate files for this summary: `show_files`
# is given the output directory, the summary id and the summary file format.
# NOTE(review): presumably it lists the files in `output_dir` that belong to
# `summary_id` - confirm against `rsmtool.utils.notebook.show_files`.
show_files(output_dir, summary_id, file_format_summarize)

## System information

In [None]:
# People might not know what 'Darwin' is, so we should replace that with 'Mac OS X'
system_name = platform.system()
system_name = 'Mac OS X' if system_name == 'Darwin' else system_name

# get the architecture (first element of `platform.architecture()`)
architecture = platform.architecture()[0]

# get the rsmtool version as a dot-separated string
rsmtool_version_str = '.'.join(str(part) for part in rsmtool_version)

system_info_text = ('This report was generated using rsmtool v{} on a '
                    '{} computer running {}.').format(rsmtool_version_str,
                                                      architecture,
                                                      system_name)
display(Markdown(system_info_text))

### Python packages

In [None]:
import pkg_resources

# One "name==version" line per distribution in the working set, sorted
# alphabetically, shown inside the `packages` div (which the print
# stylesheet of this notebook hides).
installed_packages = ["{}=={}".format(dist.key, dist.version)
                      for dist in pkg_resources.working_set]
package_names = '\n'.join(sorted(installed_packages))
display(HTML('<div id="packages"><pre>{}</pre></div>'.format(package_names)))

In [None]:
%%javascript

// Code to dynamically generate table of contents at the top of the HTML file.
// Every heading that carries an `a.anchor-link` is inspected: H2 headings
// become top-level entries and H3 headings become entries of a nested list.
// Other heading levels are ignored. The result is written into `#toc`.
var tocEntries = ['<ul>'];
var anchors = $('a.anchor-link');
var headingTypes = $(anchors).parent().map(function() { return $(this).prop('tagName'); });
var headingTexts = $(anchors).parent().map(function() { return $(this).text(); });
// true while a nested (H3) list is open
var subList = false;

$.each(anchors, function(i, anch) {
    var hType = headingTypes[i];
    var hText = headingTexts[i];
    // drop the last character of the heading text
    // (presumably the anchor-link symbol appended to each heading)
    hText = hText.substr(0, hText.length - 1);
    // well-formed link: the heading text goes inside the <a> element
    // (`anch` is concatenated as a string, i.e. the anchor's URL)
    var entry = '<li><a href="' + anch + '">' + hText + '</a></li>';
    if (hType == 'H2') {
        if (subList) {
            tocEntries.push('</ul>');
            subList = false;
        }
        tocEntries.push(entry);
    }
    else if (hType == 'H3') {
        if (!subList) {
            subList = true;
            tocEntries.push('<ul>');
        }
        tocEntries.push(entry);
    }
});
// close a nested list that is still open because the last heading was an H3
if (subList) {
    tocEntries.push('</ul>');
}
tocEntries.push('</ul>');
$('#toc').html(tocEntries.join(' '));