In [None]:
# Setting options for the plots
# Render matplotlib figures inline in the notebook output.
%matplotlib inline
# Emit inline figures in both the 'retina' and 'svg' formats.
%config InlineBackend.figure_formats={'retina', 'svg'}
# Override the inline backend's rc settings: save figures at 150 dpi.
%config InlineBackend.rc={'savefig.dpi': 150}

# Experiment Report 

In [None]:
import itertools
import os
import re
import pickle
import platform
import time

from functools import partial
from os.path import abspath, exists, join
from string import Template

import numpy as np
import pandas as pd
import seaborn as sns
import scipy.stats as stats
import statsmodels.api as sm
from matplotlib import pyplot as plt

from IPython import sys_info
from IPython.display import display, HTML, Image, Markdown, SVG

from rsmtool.version import VERSION as rsmtool_version

In [None]:
%%javascript

/* sorttable v2 from http://www.kryogenix.org/code/browser/sorttable */
function dean_addEvent(t,e,r){if(t.addEventListener)t.addEventListener(e,r,!1);else{r.$$guid||(r.$$guid=dean_addEvent.guid++),t.events||(t.events={});var o=t.events[e];o||(o=t.events[e]={},t["on"+e]&&(o[0]=t["on"+e])),o[r.$$guid]=r,t["on"+e]=handleEvent}}function removeEvent(t,e,r){t.removeEventListener?t.removeEventListener(e,r,!1):t.events&&t.events[e]&&delete t.events[e][r.$$guid]}function handleEvent(t){var e=!0;t=t||fixEvent(((this.ownerDocument||this.document||this).parentWindow||window).event);var r=this.events[t.type];for(var o in r)this.$$handleEvent=r[o],this.$$handleEvent(t)===!1&&(e=!1);return e}function fixEvent(t){return t.preventDefault=fixEvent.preventDefault,t.stopPropagation=fixEvent.stopPropagation,t}var stIsIE=!1;if(sorttable={init:function(){arguments.callee.done||(arguments.callee.done=!0,_timer&&clearInterval(_timer),document.createElement&&document.getElementsByTagName&&(sorttable.DATE_RE=/^(\d\d?)[\/\.-](\d\d?)[\/\.-]((\d\d)?\d\d)$/,forEach(document.getElementsByTagName("table"),function(t){-1!=t.className.search(/\bsortable\b/)&&sorttable.makeSortable(t)})))},makeSortable:function(t){if(0==t.getElementsByTagName("thead").length&&(the=document.createElement("thead"),the.appendChild(t.rows[0]),t.insertBefore(the,t.firstChild)),null==t.tHead&&(t.tHead=t.getElementsByTagName("thead")[0]),1==t.tHead.rows.length){sortbottomrows=[];for(var e=0;e<t.rows.length;e++)-1!=t.rows[e].className.search(/\bsortbottom\b/)&&(sortbottomrows[sortbottomrows.length]=t.rows[e]);if(sortbottomrows){null==t.tFoot&&(tfo=document.createElement("tfoot"),t.appendChild(tfo));for(var e=0;e<sortbottomrows.length;e++)tfo.appendChild(sortbottomrows[e]);delete sortbottomrows}headrow=t.tHead.rows[0].cells;for(var e=0;e<headrow.length;e++)headrow[e].className.match(/\bsorttable_nosort\b/)||(mtch=headrow[e].className.match(/\bsorttable_([a-z0-9]+)\b/),mtch&&(override=mtch[1]),headrow[e].sorttable_sortfunction=mtch&&"function"==typeof 
sorttable["sort_"+override]?sorttable["sort_"+override]:sorttable.guessType(t,e),headrow[e].sorttable_columnindex=e,headrow[e].sorttable_tbody=t.tBodies[0],dean_addEvent(headrow[e],"click",sorttable.innerSortFunction=function(){if(-1!=this.className.search(/\bsorttable_sorted\b/))return sorttable.reverse(this.sorttable_tbody),this.className=this.className.replace("sorttable_sorted","sorttable_sorted_reverse"),this.removeChild(document.getElementById("sorttable_sortfwdind")),sortrevind=document.createElement("span"),sortrevind.id="sorttable_sortrevind",sortrevind.innerHTML=stIsIE?'&nbsp<font face="webdings">5</font>':"&nbsp;&#x25B4;",void this.appendChild(sortrevind);if(-1!=this.className.search(/\bsorttable_sorted_reverse\b/))return sorttable.reverse(this.sorttable_tbody),this.className=this.className.replace("sorttable_sorted_reverse","sorttable_sorted"),this.removeChild(document.getElementById("sorttable_sortrevind")),sortfwdind=document.createElement("span"),sortfwdind.id="sorttable_sortfwdind",sortfwdind.innerHTML=stIsIE?'&nbsp<font face="webdings">6</font>':"&nbsp;&#x25BE;",void this.appendChild(sortfwdind);theadrow=this.parentNode,forEach(theadrow.childNodes,function(t){1==t.nodeType&&(t.className=t.className.replace("sorttable_sorted_reverse",""),t.className=t.className.replace("sorttable_sorted",""))}),sortfwdind=document.getElementById("sorttable_sortfwdind"),sortfwdind&&sortfwdind.parentNode.removeChild(sortfwdind),sortrevind=document.getElementById("sorttable_sortrevind"),sortrevind&&sortrevind.parentNode.removeChild(sortrevind),this.className+=" sorttable_sorted",sortfwdind=document.createElement("span"),sortfwdind.id="sorttable_sortfwdind",sortfwdind.innerHTML=stIsIE?'&nbsp<font face="webdings">6</font>':"&nbsp;&#x25BE;",this.appendChild(sortfwdind),row_array=[],col=this.sorttable_columnindex,rows=this.sorttable_tbody.rows;for(var 
t=0;t<rows.length;t++)row_array[row_array.length]=[sorttable.getInnerText(rows[t].cells[col]),rows[t]];row_array.sort(this.sorttable_sortfunction),tb=this.sorttable_tbody;for(var t=0;t<row_array.length;t++)tb.appendChild(row_array[t][1]);delete row_array}))}},guessType:function(t,e){sortfn=sorttable.sort_alpha;for(var r=0;r<t.tBodies[0].rows.length;r++)if(text=sorttable.getInnerText(t.tBodies[0].rows[r].cells[e]),""!=text){if(text.match(/^-?[£$¤]?[\d,.]+%?$/))return sorttable.sort_numeric;if(possdate=text.match(sorttable.DATE_RE)){if(first=parseInt(possdate[1]),second=parseInt(possdate[2]),first>12)return sorttable.sort_ddmm;if(second>12)return sorttable.sort_mmdd;sortfn=sorttable.sort_ddmm}}return sortfn},getInnerText:function(t){if(!t)return"";if(hasInputs="function"==typeof t.getElementsByTagName&&t.getElementsByTagName("input").length,null!=t.getAttribute("sorttable_customkey"))return t.getAttribute("sorttable_customkey");if("undefined"!=typeof t.textContent&&!hasInputs)return t.textContent.replace(/^\s+|\s+$/g,"");if("undefined"!=typeof t.innerText&&!hasInputs)return t.innerText.replace(/^\s+|\s+$/g,"");if("undefined"!=typeof t.text&&!hasInputs)return t.text.replace(/^\s+|\s+$/g,"");switch(t.nodeType){case 3:if("input"==t.nodeName.toLowerCase())return t.value.replace(/^\s+|\s+$/g,"");case 4:return t.nodeValue.replace(/^\s+|\s+$/g,"");case 1:case 11:for(var e="",r=0;r<t.childNodes.length;r++)e+=sorttable.getInnerText(t.childNodes[r]);return e.replace(/^\s+|\s+$/g,"");default:return""}},reverse:function(t){newrows=[];for(var e=0;e<t.rows.length;e++)newrows[newrows.length]=t.rows[e];for(var e=newrows.length-1;e>=0;e--)t.appendChild(newrows[e]);delete newrows},sort_numeric:function(t,e){return aa=parseFloat(t[0].replace(/[^0-9.-]/g,"")),isNaN(aa)&&(aa=0),bb=parseFloat(e[0].replace(/[^0-9.-]/g,"")),isNaN(bb)&&(bb=0),aa-bb},sort_alpha:function(t,e){return t[0]==e[0]?0:t[0]<e[0]?-1:1},sort_ddmm:function(t,e){return 
mtch=t[0].match(sorttable.DATE_RE),y=mtch[3],m=mtch[2],d=mtch[1],1==m.length&&(m="0"+m),1==d.length&&(d="0"+d),dt1=y+m+d,mtch=e[0].match(sorttable.DATE_RE),y=mtch[3],m=mtch[2],d=mtch[1],1==m.length&&(m="0"+m),1==d.length&&(d="0"+d),dt2=y+m+d,dt1==dt2?0:dt2>dt1?-1:1},sort_mmdd:function(t,e){return mtch=t[0].match(sorttable.DATE_RE),y=mtch[3],d=mtch[2],m=mtch[1],1==m.length&&(m="0"+m),1==d.length&&(d="0"+d),dt1=y+m+d,mtch=e[0].match(sorttable.DATE_RE),y=mtch[3],d=mtch[2],m=mtch[1],1==m.length&&(m="0"+m),1==d.length&&(d="0"+d),dt2=y+m+d,dt1==dt2?0:dt2>dt1?-1:1},shaker_sort:function(t,e){for(var r=0,o=t.length-1,n=!0;n;){n=!1;for(var s=r;o>s;++s)if(e(t[s],t[s+1])>0){var a=t[s];t[s]=t[s+1],t[s+1]=a,n=!0}if(o--,!n)break;for(var s=o;s>r;--s)if(e(t[s],t[s-1])<0){var a=t[s];t[s]=t[s-1],t[s-1]=a,n=!0}r++}}},document.addEventListener&&document.addEventListener("DOMContentLoaded",sorttable.init,!1),/WebKit/i.test(navigator.userAgent))var _timer=setInterval(function(){/loaded|complete/.test(document.readyState)&&sorttable.init()},10);window.onload=sorttable.init,dean_addEvent.guid=1,fixEvent.preventDefault=function(){this.returnValue=!1},fixEvent.stopPropagation=function(){this.cancelBubble=!0},Array.forEach||(Array.forEach=function(t,e,r){for(var o=0;o<t.length;o++)e.call(r,t[o],o,t)}),Function.prototype.forEach=function(t,e,r){for(var o in t)"undefined"==typeof this.prototype[o]&&e.call(r,t[o],o,t)},String.forEach=function(t,e,r){Array.forEach(t.split(""),function(o,n){e.call(r,o,n,t)})};var forEach=function(t,e,r){if(t){var o=Object;if(t instanceof Function)o=Function;else{if(t.forEach instanceof Function)return void t.forEach(e,r);"string"==typeof t?o=String:"number"==typeof t.length&&(o=Array)}o.forEach(t,e,r)}};

<style>
    div.prompt.output_prompt { color: white; }
</style>

In [None]:
# NOTE: you will need to set the following manually
# if you are using this notebook interactively.
# Every value is read from the environment; a variable that is not set
# yields `None` unless a default is given below.
experiment_id = os.environ.get('EXPERIMENT_ID')
description = os.environ.get('DESCRIPTION')
context = os.environ.get('CONTEXT')
train_file_location = os.environ.get('TRAIN_FILE_LOCATION')
test_file_location = os.environ.get('TEST_FILE_LOCATION')
output_dir = os.environ.get('OUTPUT_DIR')
figure_dir = os.environ.get('FIGURE_DIR')
model_name = os.environ.get('MODEL_NAME')
model_type = os.environ.get('MODEL_TYPE')
length_column = os.environ.get('LENGTH_COLUMN')
second_human_score_column = os.environ.get('H2_COLUMN')
scaled = os.environ.get('SCALED')
# the two flags below are true only when the variable is exactly '1'
use_scaled_predictions = scaled == '1'
exclude_zero_scores = os.environ.get('EXCLUDE_ZEROS') == '1'
feature_subset_file = os.environ.get('FEATURE_SUBSET_FILE')

# groups for analysis by prompt or subgroup, given as a '%%'-separated string:
# set to 'prompt' for the standard analysis or to
# 'prompt%%subgroup1%%subgroup2' for subgroup analysis.
# Default to an empty string so that an unset variable gives `['']`
# instead of failing with an AttributeError on `None.split`.
groups_desc_string = os.environ.get('GROUPS_FOR_DESCRIPTIVES', '')
groups_desc = groups_desc_string.split('%%')
groups_eval_string = os.environ.get('GROUPS_FOR_EVALUATIONS', '')
groups_eval = groups_eval_string.split('%%')

In [None]:
# One-line introduction naming the experiment and its description.
report_intro = 'This report presents the analysis for **{}**: {}'.format(experiment_id, description)
Markdown(report_intro)

In [None]:
# Current time in the locale's date and time representation.
report_timestamp = time.strftime('%c')
HTML(report_timestamp)

In [None]:
%%html
<div id="toc"></div>

In [None]:
# Read in the training and testing features, both raw and pre-processed
# Make sure that the `spkitemid` column is read as a string
#
# NOTE: each data frame below is only defined when its file exists.
# Later cells detect a missing file by catching `NameError`, so these
# names must NOT be pre-initialized (e.g. to `None`).

# --- training data ---
if exists(train_file_location):
    df_train_orig = pd.read_csv(train_file_location)

train_file = join(output_dir, '{}_train_features.csv'.format(experiment_id))
if exists(train_file):
    df_train = pd.read_csv(train_file, converters={'spkitemid': str})

train_metadata_file = join(output_dir, '{}_train_metadata.csv'.format(experiment_id))
if exists(train_metadata_file):
    df_train_metadata = pd.read_csv(train_metadata_file, converters={'spkitemid': str})

train_other_columns_file = join(output_dir, '{}_train_other_columns.csv'.format(experiment_id))
if exists(train_other_columns_file):
    df_train_other_columns = pd.read_csv(train_other_columns_file, converters={'spkitemid': str})

train_length_file = join(output_dir, '{}_train_response_lengths.csv'.format(experiment_id))
if exists(train_length_file):
    df_train_length = pd.read_csv(train_length_file, converters={'spkitemid': str})

train_excluded_file = join(output_dir, '{}_train_excluded_responses.csv'.format(experiment_id))
if exists(train_excluded_file):
    df_train_excluded = pd.read_csv(train_excluded_file, converters={'spkitemid': str})

train_responses_with_excluded_flags_file = join(output_dir, '{}_train_responses_with_excluded_flags.csv'.format(experiment_id))
if exists(train_responses_with_excluded_flags_file):
    df_train_responses_with_excluded_flags = pd.read_csv(train_responses_with_excluded_flags_file, converters={'spkitemid': str})

train_preproc_file = join(output_dir, '{}_train_preprocessed_features.csv'.format(experiment_id))
if exists(train_preproc_file):
    df_train_preproc = pd.read_csv(train_preproc_file, converters={'spkitemid': str})

# --- evaluation (test) data ---
if exists(test_file_location):
    df_test_orig = pd.read_csv(test_file_location)

test_file = join(output_dir, '{}_test_features.csv'.format(experiment_id))
if exists(test_file):
    df_test = pd.read_csv(test_file, converters={'spkitemid': str})

test_metadata_file = join(output_dir, '{}_test_metadata.csv'.format(experiment_id))
if exists(test_metadata_file):
    df_test_metadata = pd.read_csv(test_metadata_file, converters={'spkitemid': str})

test_other_columns_file = join(output_dir, '{}_test_other_columns.csv'.format(experiment_id))
if exists(test_other_columns_file):
    df_test_other_columns = pd.read_csv(test_other_columns_file, converters={'spkitemid': str})

test_human_scores_file = join(output_dir, '{}_test_human_scores.csv'.format(experiment_id))
if exists(test_human_scores_file):
    df_test_human_scores = pd.read_csv(test_human_scores_file, converters={'spkitemid': str})

test_excluded_file = join(output_dir, '{}_test_excluded_responses.csv'.format(experiment_id))
if exists(test_excluded_file):
    df_test_excluded = pd.read_csv(test_excluded_file, converters={'spkitemid': str})

test_responses_with_excluded_flags_file = join(output_dir, '{}_test_responses_with_excluded_flags.csv'.format(experiment_id))
if exists(test_responses_with_excluded_flags_file):
    df_test_responses_with_excluded_flags = pd.read_csv(test_responses_with_excluded_flags_file, converters={'spkitemid': str})

test_preproc_file = join(output_dir, '{}_test_preprocessed_features.csv'.format(experiment_id))
if exists(test_preproc_file):
    df_test_preproc = pd.read_csv(test_preproc_file, converters={'spkitemid': str})

# --- predictions, feature list and model coefficients ---
pred_preproc_file = join(output_dir, '{}_pred_processed.csv'.format(experiment_id))
if exists(pred_preproc_file):
    df_pred_preproc = pd.read_csv(pred_preproc_file, converters={'spkitemid': str})

feature_file = join(output_dir, '{}_feature.csv'.format(experiment_id))
if exists(feature_file):
    df_features = pd.read_csv(feature_file, converters={'spkitemid': str})
    # the names in the `feature` column, as a plain list
    features_used = list(df_features.feature.values)

betas_file = join(output_dir, '{}_betas.csv'.format(experiment_id))
if exists(betas_file):
    df_betas = pd.read_csv(betas_file)

# The feature subset file comes from an environment variable that may be
# unset; `exists(None)` raises a TypeError, so check for a value first.
if feature_subset_file and exists(feature_subset_file):
    df_feature_subset_specs = pd.read_csv(feature_subset_file)
else:
    df_feature_subset_specs = None

# define float formatting functions
def float_format_func(x, prec=3):
    """Return `x` formatted in fixed-point notation with `prec` decimals."""
    format_spec = '.{}f'.format(prec)
    return format(x, format_spec)

def int_or_float_format_func(x, prec=3):
    """
    Format `x` as an integer if it has no fractional part and
    as a float with `prec` decimals (via `float_format_func`) otherwise.
    """
    # Coerce to a builtin float before the check: calling the unbound
    # `float.is_integer(x)` raises a TypeError for anything that is not
    # a `float` instance (e.g. a plain int).
    if float(x).is_integer():
        return '{}'.format(int(x))
    return float_format_func(x, prec=prec)

def bold_highlighter(x, low=0, high=1, prec=3, absolute=False):
    """
    Format `x` with `prec` decimals and wrap it in a bold HTML span
    if it is below `low` or above `high`. If `absolute` is true, the
    comparison uses `abs(x)`; the displayed value is always `x` itself.
    """
    compared = abs(x) if absolute else x
    formatted = float_format_func(x, prec=prec)
    out_of_range = compared < low or compared > high
    if not out_of_range:
        return formatted
    return '<span style="font-weight: bold;">{}</span>'.format(formatted)

def color_highlighter(x, low=0, high=1, prec=3, color='red', absolute=False):
    """
    Format `x` with `prec` decimals and wrap it in an HTML span with the
    given text `color` if it is below `low` or above `high`. If `absolute`
    is true, the comparison uses `abs(x)`; the displayed value is always
    `x` itself.
    """
    compared = abs(x) if absolute else x
    formatted = float_format_func(x, prec=prec)
    out_of_range = compared < low or compared > high
    if not out_of_range:
        return formatted
    return '<span style="color: {}">{}</span>'.format(color, formatted)

## Description of the data

In [None]:
# Count the responses excluded due to flags. The data frames only exist
# if they were defined by an earlier cell, hence the `NameError` handling.
try:
    num_excluded_train = len(df_train_responses_with_excluded_flags)
except NameError:
    num_excluded_train = 0

try:
    num_excluded_test = len(df_test_responses_with_excluded_flags)
except NameError:
    num_excluded_test = 0

# percentages relative to the sizes of the original data frames
num_train_orig = len(df_train_orig)
pct_excluded_train = round(100*num_excluded_train/num_train_orig, 2)
num_test_orig = len(df_test_orig)
pct_excluded_test = round(100*num_excluded_test/num_test_orig, 2)

# only show this section if anything was actually excluded
if num_excluded_train != 0 or num_excluded_test != 0:
    flag_exclusion_messages = [
        "### Responses excluded due to flags",
        "Total number of responses excluded due to flags:",
        "Training set: {} responses ({:.1f}% of the original {} responses)".format(
            num_excluded_train, pct_excluded_train, num_train_orig),
        "Evaluation set: {} responses ({:.1f}% of the original {} responses)".format(
            num_excluded_test, pct_excluded_test, num_test_orig),
    ]
    for message in flag_exclusion_messages:
        display(Markdown(message))


### Responses excluded due to non-numeric feature values or scores

In [None]:
# Count the responses in the excluded-responses data frames. Those frames
# only exist if they were defined by an earlier cell, so a `NameError`
# means that nothing was excluded.
try:
    num_missing_rows_train = len(df_train_excluded)
except NameError:
    num_missing_rows_train = 0
pct_missing_rows_train = 100*num_missing_rows_train/len(df_train_orig)

# Catch only `NameError` here as well: a bare `except` would silently
# hide any other, unexpected error.
try:
    num_missing_rows_test = len(df_test_excluded)
except NameError:
    num_missing_rows_test = 0
pct_missing_rows_test = 100*num_missing_rows_test/len(df_test_orig)

#### Training set

In [None]:
# Report how many training responses were excluded and, if there were
# any, show the table read from the excluded-composition file.
train_excluded_message = 'Total number of excluded responses: {} ({:.1f}% of the original {})'.format(
    num_missing_rows_train, pct_missing_rows_train, len(df_train_orig))
display(Markdown(train_excluded_message))
if num_missing_rows_train != 0:
    train_excluded_composition_file = join(output_dir, '{}_train_excluded_composition.csv'.format(experiment_id))
    df_train_excluded_analysis = pd.read_csv(train_excluded_composition_file)
    train_excluded_html = df_train_excluded_analysis.to_html(classes=['sortable'],
                                                             float_format=float_format_func,
                                                             index=False)
    display(HTML(train_excluded_html))

#### Evaluation set

In [None]:
# Report how many evaluation responses were excluded and, if there were
# any, show the table read from the excluded-composition file.
test_excluded_message = 'Total number of excluded responses: {} ({:.1f}% of the original {})'.format(
    num_missing_rows_test, pct_missing_rows_test, len(df_test_orig))
display(Markdown(test_excluded_message))
if num_missing_rows_test != 0:
    test_excluded_composition_file = join(output_dir, '{}_test_excluded_composition.csv'.format(experiment_id))
    df_test_excluded_analysis = pd.read_csv(test_excluded_composition_file)
    test_excluded_html = df_test_excluded_analysis.to_html(classes=['sortable'],
                                                           float_format=float_format_func,
                                                           index=False)
    display(HTML(test_excluded_html))

The rest of this report is based only on the responses used to build and train the model.

### Composition of the training and evaluation sets

In [None]:
# show the table showing candidate (speaker), prompt
# and responses stats for training and test,
# read from the data composition file
data_composition_file = join(output_dir, '{}_data_composition.csv'.format(experiment_id))
df_data_desc = pd.read_csv(data_composition_file)
data_composition_html = df_data_desc.to_html(classes=['sortable'],
                                             float_format=float_format_func,
                                             index=False)
display(HTML(data_composition_html))

# `df_test_human_scores` only exists if it was defined by an earlier cell;
# without it there is nothing to say about double-scored responses.
try:
    has_second_score = df_test_human_scores['sc2'].notnull()
    num_double_scored_responses = len(df_test_human_scores[has_second_score])
except NameError:
    pass
else:
    if exclude_zero_scores:
        zeros_included_or_excluded = 'excluded'
    else:
        zeros_included_or_excluded = 'included'
    double_scored_message = "Total number of double scored responses used: {} (zeros {})".format(
        num_double_scored_responses, zeros_included_or_excluded)
    display(Markdown(double_scored_message))

## Overall descriptive feature statistics

These values are reported before transformations.

In [None]:
# Feature descriptives table; the first CSV column is used as the index.
feature_descriptives_file = join(output_dir, '{}_feature_descriptives.csv'.format(experiment_id))
df_desc = pd.read_csv(feature_descriptives_file, index_col=0)
HTML(df_desc.to_html(classes=['sortable'], float_format=float_format_func))

The following table shows additional statistics for the data. Quantiles are computed using the type=3 method used in SAS. Mild outliers are defined as data points between [1.5, 3) \* IQR away from the nearest quartile. Extreme outliers are data points >= 3 \* IQR away from the nearest quartile.

### Prevalence of recoded cases

This section shows the number and percentage of cases truncated to mean +/- 4 SD for each feature.

In [None]:
# Read the outlier table and reshape it to long format
# (one row per feature/variable pair).
df_outliers = pd.read_csv(join(output_dir, '{}_feature_outliers.csv'.format(experiment_id)), index_col=0)
df_outliers.index.name = 'feature'
df_outliers = df_outliers.reset_index()
df_outliers = pd.melt(df_outliers, id_vars=['feature'])
# keep only the variables whose names match the pattern below
# (presumably the upper/lower/both percentage columns -- confirm
# against the file that is written out)
df_outliers = df_outliers[df_outliers.variable.str.contains(r'[ulb].*?perc')]

# we need a higher aspect if we have more than 40 features
aspect = 3 if len(features_used) > 40 else 2

# colors for the plot
colors = sns.color_palette("Greys", 3)

# what's the largest value in the data frame
maxperc = df_outliers['value'].max()

# compute the limits for the graph: always show at least up to 2.5
# so that the reference line at 2% is visible
limits = (0, max(2.5, maxperc))

with sns.axes_style('whitegrid'):
    # create a barplot without a legend since we will manually
    # add one later
    p = sns.factorplot("feature", "value", "variable", kind="bar",
                       palette=colors, data=df_outliers, size=3,
                       aspect=aspect, legend=False)
    p.set_axis_labels('', '% cases truncated to mean +/- 4*sd')
    p.set_xticklabels(rotation=90)
    p.set(ylim=limits)

    # add a line at 2%
    axis = p.axes[0][0]
    axis.axhline(y=2.0, linestyle='--', linewidth=1.5, color='black')

    # add a legend with the right colors; recolor every handle so that
    # all three entries are styled the same way (previously the third
    # handle was left untouched)
    legend = axis.legend(('both', 'lower', 'upper'), title='', frameon=True, fancybox=True, ncol=3)
    for handle, color in zip(legend.legendHandles, colors):
        handle.set_color(color)
    plt.savefig(join(figure_dir, '{}_outliers.svg'.format(experiment_id)))

### Feature value distribution

In [None]:
# Extra feature descriptives table; the first CSV column is used as the index.
feature_descriptives_extra_file = join(output_dir, '{}_feature_descriptivesExtra.csv'.format(experiment_id))
df_desce = pd.read_csv(feature_descriptives_extra_file, index_col=0)
HTML(df_desce.to_html(classes=['sortable'], float_format=float_format_func))

##  Feature Distributions and Inter-feature Correlations

### Training set distributions

The following plot shows the distributions of the feature values in 
the training set, after transformation (if applicable), truncation 
and standardization. The line shows the kernel density estimate. The 
human score (`sc1`) is also included. 

Response length (`length`) is included if you specified `length_column` in the config file, unless
the column had missing values or a standard deviation <= 0.

In [None]:
# Select the model features, the human score and the response ID
# from the pre-processed training data.
selected_columns = features_used + ['sc1', 'spkitemid']
df_train_preproc_selected_features = df_train_preproc[selected_columns]

# One panel per feature (sorted by name) followed by `sc1`; `length` is
# appended only if `df_train_length` exists and could be merged in.
column_order = sorted(features_used) + ['sc1']
try:
    df_train_preproc_selected_features = df_train_preproc_selected_features.merge(df_train_length, on='spkitemid')
except NameError:
    # `df_train_length` was never defined, so there is no `length` panel
    pass
else:
    column_order.append('length')

# long format with just the variable names and their values
df_train_preproc_melted = pd.melt(df_train_preproc_selected_features,
                                  id_vars=['spkitemid'])[['variable', 'value']]

with sns.axes_style('white'):
    g = sns.FacetGrid(data=df_train_preproc_melted, col='variable',
                      col_order=column_order, col_wrap=3,
                      sharex=False, sharey=False,
                      size=2, aspect=1)
    g.map(sns.distplot, "value", color="grey")
    for ax, cname in zip(g.axes, g.col_names):
        # re-use the current tick positions as the (rotated) tick labels
        tick_values = ax.get_xticks()
        ax.set_xlabel('')
        ax.set_xticklabels(tick_values, rotation=90)
        ax.set_title(cname)
    plt.tight_layout(h_pad=1.0)
    plt.savefig(join(figure_dir, '{}_distrib.svg'.format(experiment_id)))

### Inter-feature correlations

The following table shows the Pearson correlations between all the training features
after transformation (if applicable), truncation and standardization. The human score 
(`sc1`) is also included. 

Response length (`length`) is included if 
you specified `length_column` in the config file, unless the column had missing 
values or a standard deviation <= 0. 

The following values are highlighted in <span style="color: red">red</span>:
- inter-feature correlations above 0.7, and
- `sc1`-feature correlations lower than 0.1 or higher than 0.7

In [None]:
cors_file = join(output_dir, '{}_cors_processed.csv'.format(experiment_id))
df_cors = pd.read_csv(cors_file, index_col=0)

# Order the rows and columns: `sc1` first, then `length` (if present),
# then all of the remaining columns sorted by name.
leading_columns = ['sc1', 'length'] if 'length' in df_cors.columns else ['sc1']
feature_columns = sorted(c for c in df_cors.columns if c not in leading_columns)
order = leading_columns + feature_columns
df_cors = df_cors.reindex(index=order, columns=order)

# Apply two different formatters to the columns according to
# two different thresholds. The first one highlights values
# above 0.7 in the feature (and length) columns, and the second
# one highlights values lower than 0.1 or higher than 0.7 in the
# `sc1` column. Both use the default highlight color (red).
formatter1 = partial(color_highlighter, low=-1, high=0.7)
formatter2 = partial(color_highlighter, low=0.1, high=0.7)

formatter_dict = dict.fromkeys(feature_columns + ['length'], formatter1)
formatter_dict['sc1'] = formatter2

HTML(df_cors.to_html(classes=['sortable'], formatters=formatter_dict, escape=False))

### Marginal and partial correlations

The plot below shows correlations between truncated and standardized values of each feature against human score. The first bar (`Marginal`) in each case shows Pearson's correlation. The second bar (`Partial - all`) shows partial correlations after controlling for all other variables. If you specified `length_column` in the config file, a third bar (`Partial - length`) will show partial correlations of each feature against the human score after controlling for length.

In [None]:
# read in and merge the score correlations
df_margcor = pd.read_csv(join(output_dir, '{}_margcor_score_all_data.csv'.format(experiment_id)), index_col=0)
df_pcor = pd.read_csv(join(output_dir, '{}_pcor_score_all_data.csv'.format(experiment_id)), index_col=0)

# check if we have length partial correlations
pcor_no_length_file = join(output_dir, '{}_pcor_score_no_length_all_data.csv'.format(experiment_id))
with_length = exists(pcor_no_length_file)
if with_length:
    df_pcor_no_length = pd.read_csv(pcor_no_length_file, index_col=0)
    # one row per feature, one column per type of correlation
    df_mpcor = pd.DataFrame([df_margcor.loc['All data'],
                             df_pcor.loc['All data'],
                             df_pcor_no_length.loc['All data']]).transpose()
    df_mpcor.columns = ['marginal', 'partial_all', 'partial_length']
    num_entries = 3
    labels = ('Marginal', 'Partial - all', 'Partial - length')

else:
    df_mpcor = pd.DataFrame([df_margcor.loc['All data'],
                             df_pcor.loc['All data']]).transpose()
    df_mpcor.columns = ['marginal', 'partial_all']
    num_entries = 2
    # use the same `Partial - all` label as the branch above and as the
    # text that describes this plot (this used to read 'Partial (all)')
    labels = ('Marginal', 'Partial - all')

# reshape to long format for plotting
df_mpcor.index.name = 'feature'
df_mpcor = df_mpcor.reset_index()
df_mpcor = pd.melt(df_mpcor, id_vars=['feature'])

# we need a higher aspect if we have more than 40 features
aspect = 3 if len(features_used) > 40 else 2

# get the colors for the plot
colors = sns.color_palette("Greys", num_entries)

# check for any negative correlations; only extend the y-axis
# below zero if there are any
limits = (0, 1)
if len(df_mpcor[df_mpcor.value < 0]):
    limits = (-1, 1)

with sns.axes_style('whitegrid'):

    # generate a bar plot but without the legend since we will
    # manually add one later
    p = sns.factorplot("feature", "value", "variable", kind="bar",
                       palette=colors, data=df_mpcor, size=3,
                       aspect=aspect, legend=False)
    p.set_axis_labels('', 'Correlation with score')
    p.set_xticklabels(rotation=90)
    p.set(ylim=limits)

    # add a line at 0.1 and 0.7
    axis = p.axes[0][0]
    axis.axhline(y=0.1, linestyle='--', linewidth=0.5, color='black')
    axis.axhline(y=0.7, linestyle='--', linewidth=0.5, color='black')

    # create the legend manually with the right colors
    legend = axis.legend(labels=labels, title='', frameon=True,
                         fancybox=True, ncol=num_entries)
    for i in range(num_entries):
        legend.legendHandles[i].set_color(colors[i])
    plt.savefig(join(figure_dir, '{}_cors_score.svg'.format(experiment_id)))

In [None]:
# The length correlations are only plotted if both files exist.
len_margcor_file = join(output_dir, '{}_margcor_length_all_data.csv'.format(experiment_id))
len_pcor_file = join(output_dir, '{}_pcor_length_all_data.csv'.format(experiment_id))
if exists(len_margcor_file) and exists(len_pcor_file):
    display(Markdown("The plot below shows the same correlations between truncated and standardized values of each feature against length."))

    # one row per feature, one column per type of correlation,
    # then reshaped to long format for plotting
    df_margcor = pd.read_csv(len_margcor_file, index_col=0)
    df_pcor = pd.read_csv(len_pcor_file, index_col=0)
    df_mpcor = pd.DataFrame([df_margcor.loc['All data'], df_pcor.loc['All data']]).transpose()
    df_mpcor.index.name = 'feature'
    df_mpcor.columns = ['marginal', 'partial']
    df_mpcor = df_mpcor.reset_index()
    df_mpcor = pd.melt(df_mpcor, id_vars=['feature'])

    # we need a higher aspect if we have more than 40 features
    aspect = 3 if len(features_used) > 40 else 2

    # check for any negative correlations; only extend the y-axis
    # below zero if there are any
    limits = (0, 1)
    if len(df_mpcor[df_mpcor.value < 0]):
        limits = (-1, 1)

    # get the colors for the plot
    colors = sns.color_palette("Greys", 2)

    with sns.axes_style('whitegrid'):

        # create a barplot but without the legend since
        # we will manually add one later
        p = sns.factorplot("feature", "value", "variable", kind="bar",
                           palette=colors, data=df_mpcor, size=3,
                           aspect=aspect, legend=False)
        p.set_axis_labels('', 'Correlation with length')
        p.set_xticklabels(rotation=90)
        p.set(ylim=limits)

        # create the legend manually with the right colors
        # (the second label used to contain a stray double space)
        axis = p.axes[0][0]
        legend = axis.legend(labels=('Marginal', 'Partial - all'), title='',
                             frameon=True, fancybox=True, ncol=2)
        for handle, color in zip(legend.legendHandles, colors):
            handle.set_color(color)
        plt.savefig(join(figure_dir, '{}_cors_length.svg'.format(experiment_id)))

In [None]:
# The consistency section is only shown if both the consistency
# and the degradation files exist.
consistency_file = join(output_dir, '{}_consistency.csv'.format(experiment_id))
degradation_file = join(output_dir, '{}_degradation.csv'.format(experiment_id))
if exists(consistency_file) and exists(degradation_file):
    df_consistency = pd.read_csv(consistency_file, index_col=0)
    df_degradation = pd.read_csv(degradation_file, index_col=0)
    df_eval = pd.read_csv(join(output_dir, '{}_eval.csv'.format(experiment_id)), index_col=0)
    markdown_strs = ['## Consistency']
    markdown_strs.append('### Human-human agreement')
    markdown_strs.append("This table shows the human-human agreement on the "
                         "double-scored evaluation data. The following are "
                         "highlighted in <span style='color: red'>red</span>: ")
    markdown_strs.append(' - Exact agreement (`exact_agr`) < 50%')
    markdown_strs.append(' - Adjacent agreement (`adj_agr`) < 95%')
    markdown_strs.append(' - Quadratic weighted kappa (`wtkappa`) < 0.7')
    markdown_strs.append(' - Pearson correlation (`corr`) < 0.7')
    display(Markdown('\n'.join(markdown_strs)))

    # display the HTML for the table with the various formatters
    formatter_exact_agr = partial(color_highlighter, low=50, high=100)
    formatter_adj_agr = partial(color_highlighter, low=95, high=100)
    formatter_wtkappa_corr = partial(color_highlighter, low=0.7)
    formatter_dict = {'exact_agr': formatter_exact_agr,
                      'adj_agr': formatter_adj_agr,
                      'wtkappa': formatter_wtkappa_corr,
                      'corr': formatter_wtkappa_corr}
    display(HTML(df_consistency.to_html(index=False,
                                        escape=False,
                                        float_format=float_format_func,
                                        formatters=formatter_dict)))

    markdown_strs = ['### Degradation']
    markdown_strs.append('The next table shows the degradation in the evaluation metrics '
                         '(`diff`) when comparing the machine (`H-M`) to a second human (`H-H`). '
                         'A positive degradation value indicates better human-machine performance. '
                         'Note that the human-machine agreement is computed on the full '
                         'dataset (to get a reliable estimate) whereas the human-human '
                         'agreement is computed on the subset of responses that were double-scored.')
    markdown_strs.append("\nThe following degradation values are highlighted in "
                         "<span style='color: red'>red</span>: ")
    markdown_strs.append(' - `corr` < -0.1')
    markdown_strs.append(' - `wtkappa` < -0.1')
    display(Markdown('\n'.join(markdown_strs)))

    # Restrict the H-M and H-H tables to the degradation columns. This
    # must happen before the `type` column is added to `df_degradation`.
    df_eval_for_degradation = df_eval[df_degradation.columns].copy()
    # repeat the consistency rows once per row of `df_eval` so that the
    # H-H values can take on the same index as the H-M values
    df_consistency_for_degradation = pd.concat([df_consistency]*len(df_eval))
    df_consistency_for_degradation = df_consistency_for_degradation[df_degradation.columns].copy()
    df_consistency_for_degradation.index = df_eval_for_degradation.index

    df_consistency_for_degradation['type'] = 'H-H'
    df_eval_for_degradation['type'] = 'H-M'
    df_degradation['type'] = 'diff'

    # stack the three tables and group the rows by their original index
    df = pd.concat([df_consistency_for_degradation, df_eval_for_degradation, df_degradation])
    df = df[['type','corr', 'kappa', 'wtkappa', 'exact_agr', 'adj_agr', 'SMD']]
    df = df.reset_index()
    # `sort_index(level=...)` is the equivalent replacement for the
    # deprecated `sortlevel()`, which newer pandas versions no longer have
    df = df.set_index(['index', 'type']).sort_index(level='index')
    df.index.names = [None, None]

    # display the HTML for the table with the various formatters
    formatter_corr = partial(color_highlighter, low=-0.1, high=100)
    formatter_wtkappa = partial(color_highlighter, low=-0.1, high=100)
    formatter_dict = {'corr': formatter_corr, 'wtkappa': formatter_wtkappa}
    display(HTML(df.to_html(float_format=float_format_func,
                            formatters=formatter_dict, escape=False)))

## Model

In [None]:
# render the name of the model used for this experiment in bold
model_used_text = 'Model used: **{}**'.format(model_name)
Markdown(model_used_text)

In [None]:
# render the number of entries in `features_used` in bold
num_features_used = len(features_used)
Markdown('Number of features in model: **{}**'.format(num_features_used))

In [None]:
# Model names for which the cells below show the OLS summary file and
# the OLS diagnostic plots.
builtin_ols_models = [
    'LinearRegression',
    'EqualWeightsLR',
    'RebalancedLR',
    'NNLR',
    'LassoFixedLambdaThenNNLR',
    'LassoFixedLambdaThenLR',
    'PositiveLassoCVThenLR',
]

# Model names for which the cells below show the note about
# LASSO-estimated coefficients.
builtin_lasso_models = [
    'LassoFixedLambda',
    'PositiveLassoCV',
]

In [None]:
# For the OLS-based models, print the text summary that was saved
# to `<experiment_id>_ols_summary.txt` in the output directory.
if model_name in builtin_ols_models:
    display(Markdown('### Model summary'))
    summary_file = join(output_dir, '{}_ols_summary.txt'.format(experiment_id))
    with open(summary_file, 'r') as summary_handle:
        model_summary = summary_handle.read()
    # the file contents are already in memory, so print after closing the file
    print(model_summary)

### Standardized and Relative Regression Coefficients (Betas)

The relative coefficients are intended to show the relative contribution of the different features, and their primary purpose is to identify whether one of the features has a disproportionate effect on the final score. They are computed as standardized/(sum of absolute values of standardized coefficients). 

Negative standardized coefficients are highlighted in <span style="color: red">red</span>.

**Note**: if the model contains negative coefficients, relative values will not sum up to one and their interpretation is generally questionable. 

In [None]:
# Caveat shown only for the LASSO-based models (see `builtin_lasso_models`).
markdown_str = ("\n"
                "**Note**: The coefficients were estimated using LASSO regression. "
                "Unlike OLS (standard) linear regression, lasso estimation is based "
                "on an optimization routine and therefore the exact estimates may "
                "differ across different systems. ")

show_lasso_note = model_name in builtin_lasso_models
if show_lasso_note:
    display(Markdown(markdown_str))

In [None]:
# Sort the coefficients table by feature name (in place) and render it as a
# sortable HTML table; the `standardized` column goes through `color_highlighter`.
df_betas.sort_values(by='feature', inplace=True)
betas_formatters = {'standardized': color_highlighter}
betas_html = df_betas.to_html(classes=['sortable'],
                              index=False,
                              escape=False,
                              float_format=float_format_func,
                              formatters=betas_formatters)
display(HTML(betas_html))

Here are the same values, shown graphically.

In [None]:
# Plot the standardized coefficients as a bar chart (largest first) and,
# when there are at most 15 features, the absolute relative coefficients
# as a pie chart next to it. The figure is saved as `<experiment_id>_betas.svg`.
df_betas_sorted = df_betas.sort_values(by='standardized', ascending=False)
df_betas_sorted.reset_index(drop=True, inplace=True)
fig = plt.figure()
fig.set_size_inches(8, 3)
fig.subplots_adjust(bottom=0.5)
# reversed so that the first (largest) wedge gets the darkest grey
grey_colors = sns.color_palette('Greys', len(features_used))[::-1]
with sns.axes_style('whitegrid'):
    ax1 = fig.add_subplot(121)
    # pass `x` and `y` by keyword: seaborn deprecated positional data
    # arguments, while the keyword form works on old and new versions alike
    sns.barplot(x="feature", y="standardized", data=df_betas_sorted,
                order=df_betas_sorted['feature'].values,
                palette=sns.color_palette("Greys", 1), ax=ax1)
    ax1.set_xticklabels(df_betas_sorted['feature'].values, rotation=90)
    ax1.set_title('Values of standardized coefficients')
    ax1.set_xlabel('')
    ax1.set_ylabel('')
    # no pie chart if we have more than 15 features
    if len(features_used) <= 15:
        ax2 = fig.add_subplot(133, aspect=True)
        ax2.pie(abs(df_betas_sorted['relative'].values), colors=grey_colors,
                labels=df_betas_sorted['feature'].values)
        ax2.set_title('Proportional contribution of each feature')
    else:
        # widen the figure so that the many bar labels stay readable
        fig.set_size_inches(0.35*len(features_used), 3)
plt.savefig(join(figure_dir, '{}_betas.svg'.format(experiment_id)))

In [None]:
# The diagnostics section header and its description are only shown for OLS-based models.
if model_name in builtin_ols_models:
    diagnostics_intro = ("These are standard plots for model diagnostics for the main model. "
                         "All information is computed based on the training set.")
    display(Markdown('### Model diagnostics'))
    display(Markdown(diagnostics_intro))

In [None]:
# Read in the pickled OLS model file and create the diagnostic plots:
# a normal Q-Q plot of the residuals and a residuals-vs-fitted scatter plot.
# The figure is saved as a PNG and displayed from that file.
if model_name in builtin_ols_models:
    ols_file = join(output_dir, '{}.ols'.format(experiment_id))
    # NOTE(review): unpickling can execute arbitrary code, so this assumes
    # the .ols file was produced by a trusted run -- confirm.
    # The context manager makes sure the file handle is closed after loading.
    with open(ols_file, 'rb') as olsf:
        model = pickle.load(olsf)
    model_predictions = model.predict()

    with sns.axes_style('white'):
        f, (ax1, ax2) = plt.subplots(1, 2)
        f.set_size_inches((10, 4))
        
        ###
        # for now, we do not show the influence plot since it can be slow to generate
        ###
        # sm.graphics.influence_plot(model.sm_ols, criterion="cooks", size=10, ax=ax1)
        # ax1.set_title('Residuals vs. Leverage', fontsize=16)
        # ax1.set_xlabel('Leverage', fontsize=16)
        # ax1.set_ylabel('Standardized Residuals', fontsize=16)

        sm.qqplot(model.resid, stats.norm, fit=True, line='q', ax=ax1)
        ax1.set_title('Normal Q-Q Plot', fontsize=16)
        ax1.set_xlabel('Theoretical Quantiles', fontsize=16)
        ax1.set_ylabel('Sample Quantiles', fontsize=16)

        ax2.scatter(model_predictions, model.resid)
        ax2.set_xlabel('Fitted values', fontsize=16)
        ax2.set_ylabel('Residuals', fontsize=16)
        ax2.set_title('Residuals vs. Fitted', fontsize=16)

        imgfile = join(figure_dir, '{}_ols_diagnostic_plots.png'.format(experiment_id))
        plt.savefig(imgfile)
        display(Image(imgfile))
        # close the figure so it is not rendered a second time by the inline backend
        plt.close();

## Evaluation results

### Association statistics

The table shows the standard association metrics between human scores and different types of machine scores. These results are computed on the evaluation set. `Trim` (`bound`) scores are truncated to [min-0.4998, max+0.4998]. `Trim-round` scores are computed by first truncating and then rounding the predicted score. Scaled scores are computed by re-scaling the predicted scores using mean and standard deviation of human scores as observed on the training data and mean and standard deviation of machine scores as predicted for the training set. Note that for the computation of kappas, scores are always rounded.

SMD values lower than -0.15 or higher than 0.15 are highlighted in <span style="color: red">red</span>.

In [None]:
# Load the evaluation metrics and render them as a sortable HTML table;
# the SMD column goes through `color_highlighter` with bounds -0.15 and 0.15.
if use_scaled_predictions:
    raw_or_scaled = "scaled"
else:
    raw_or_scaled = "raw"

eval_file = join(output_dir, '{}_eval.csv'.format(experiment_id))
df_eval = pd.read_csv(eval_file, index_col=0)
pd.options.display.width = 10
formatter = partial(color_highlighter, low=-0.15, high=0.15)
eval_table_html = df_eval.to_html(classes=['sortable'],
                                  escape=False,
                                  formatters={'SMD': formatter},
                                  float_format=float_format_func)
HTML('<span style="font-size:95%">' + eval_table_html + '</span>')

### Confusion Matrix

In [None]:
# caption for the confusion matrix; mentions whether raw or scaled scores are used
confmat_caption = "Confusion matrix using {} trimmed rounded scores (rows=system, columns=human)."
Markdown(confmat_caption.format(raw_or_scaled))

In [None]:
# read the confusion matrix from the output directory and display it as the cell output
confmat_file = join(output_dir, '{}_confMatrix.csv'.format(experiment_id))
df_confmat = pd.read_csv(confmat_file, index_col=0)
df_confmat

### Distribution of human and machine scores

In [None]:
# Introductory text for the score distribution section; it mentions whether
# the machine scores are raw or scaled.
# (fixes the "distibution" typo in the rendered report text)
markdown_strs = ["The histogram and the table below show the distribution of "
                 "human scores and {} trimmed rounded machine scores "
                 "(as % of all responses).".format(raw_or_scaled)]
markdown_strs.append("Differences in the table between human and machine distributions "
                     "larger than 5 percentage points are highlighted in <span style='color:red'>red</span>.")
display(Markdown('\n'.join(markdown_strs)))

In [None]:
# Read the score distribution table and plot the human and machine
# percentages side by side for each score point.
scoredist_file = join(output_dir, '{}_score_dist.csv'.format(experiment_id))
df_scoredist = pd.read_csv(scoredist_file, index_col=0)

# long format for plotting; the `difference` rows are not plotted
df_scoredist_melted = pd.melt(df_scoredist, id_vars=['score'])
keep_rows = df_scoredist_melted['variable'] != 'difference'
df_scoredist_melted = df_scoredist_melted[keep_rows]

# get the colors for the plot
colors = sns.color_palette("Greys", 2)

with sns.axes_style('whitegrid'):

    # make a barplot without a legend since we will
    # add one manually later
    p = sns.factorplot("score", "value", "variable", kind="bar",
                       palette=colors, data=df_scoredist_melted,
                       size=3, aspect=2, legend=False)
    p.set_axis_labels('score', '% of responses')

    # add a legend with the right colors
    axis = p.axes[0][0]
    legend = axis.legend(labels=('Human', 'Machine'), title='', frameon=True, fancybox=True)
    for handle_idx in (0, 1):
        legend.legendHandles[handle_idx].set_color(colors[handle_idx])

    plt.savefig(join(figure_dir, '{}_score_dist.svg'.format(experiment_id)))

In [None]:
# Render the score distribution table; the `difference` column goes through
# `color_highlighter` with low=0, high=5 and absolute=True.
formatter = partial(color_highlighter, low=0, high=5, absolute=True)
scoredist_formatters = {'difference': formatter}
df_html = df_scoredist.to_html(classes=['sortable'],
                               index=False,
                               escape=False,
                               formatters=scoredist_formatters)
display(HTML(df_html))

## Principal component analysis

PCA using scaled data and singular value decomposition. This is computed using processed features after the truncation of outliers and other transformations specified in the feature config file.

In [None]:
# read the PCA table, sort its rows by index label and show it as a sortable table
pca_file = join(output_dir, '{}_pca.csv'.format(experiment_id))
df_pca = pd.read_csv(pca_file, index_col=0)
df_pca.sort_index(inplace=True)
pca_html = df_pca.to_html(classes=['sortable'], float_format=float_format_func)
HTML(pca_html)

In [None]:
# read the PCA variance table, sort its rows by index label and show it as a sortable table
pcavar_file = join(output_dir, '{}_pcavar.csv'.format(experiment_id))
df_pcavar = pd.read_csv(pcavar_file, index_col=0)
df_pcavar.sort_index(inplace=True)
pcavar_html = df_pcavar.to_html(classes=['sortable'], float_format=float_format_func)
HTML(pcavar_html)

In [None]:
# Generate the Scree plot: the `Eigenvalues` row of the PCA variance table,
# plotted as a dashed line with one x tick per column of that table.
with sns.axes_style('white'):
    labels = list(df_pcavar.columns)
    num_components = len(df_pcavar.columns)
    scree_plot_options = dict(y='Eigenvalues', kind='line',
                              color='black', linestyle='dashed', marker='o', legend=False,
                              linewidth=1, use_index=True, xticks=range(num_components),
                              figsize=(11, 5), title='Scree Plot: Principal Component Analysis')
    # transpose so that the columns of the table end up on the x axis
    ax = df_pcavar.transpose().plot(**scree_plot_options)
    ax.set_ylabel('Variances')
    ax.set_xticklabels(labels, rotation=90)
    plt.savefig(join(figure_dir, '{}_pca.svg'.format(experiment_id)))

## System information

In [None]:
# People might not know what 'Darwin' is, so we report it as 'Mac OS X'
detected_system = platform.system()
system_name = 'Mac OS X' if detected_system == 'Darwin' else detected_system

# get the architecture (first element of the tuple returned by `platform.architecture()`)
architecture = platform.architecture()[0]

# get the rsmtool version as a dot-separated string
rsmtool_version_str = '.'.join(str(version_part) for version_part in rsmtool_version)

report_line = 'This report was generated using rsmtool v{} on a {} computer running {}.'
print(report_line.format(rsmtool_version_str, architecture, system_name))

### Python packages

In [None]:
# List the installed Python packages as sorted "name==version" strings.
# `pip.get_installed_distributions()` was removed in pip 10, so prefer the
# standard library's `importlib.metadata` (Python 3.8+) and only fall back
# to the legacy pip API on interpreters that do not provide it.
try:
    from importlib.metadata import distributions
except ImportError:
    import pip
    installed_packages = {"%s==%s" % (i.key, i.version)
                          for i in pip.get_installed_distributions()}
else:
    names_and_versions = ((dist.metadata.get('Name'), dist.version)
                          for dist in distributions())
    # lower-case the name to mirror the `key` attribute used with the pip API;
    # distributions whose metadata has no name are skipped
    installed_packages = {"%s==%s" % (name.lower(), version)
                          for name, version in names_and_versions if name}

# a set is used above so that a distribution found more than once is listed once
installed_packages = sorted(installed_packages)
installed_packages

In [None]:
%%javascript

// Code to dynamically generate table of contents at the top of the HTML file.
// Every heading that contains an `a.anchor-link` is visited in document order:
// H2 headings become top-level entries and H3 headings are collected into a
// nested list. The resulting markup is written into the `#toc` element.
var tocEntries = ['<ul>'];
var anchors = $('a.anchor-link');
var headingTypes = $(anchors).parent().map(function() { return $(this).prop('tagName')});
var headingTexts = $(anchors).parent().map(function() { return $(this).text()});
var subList = false;

$.each(anchors, function(i, anch) {
    var hType = headingTypes[i];
    // drop the last character of the heading text
    // (presumably the marker added by the anchor link itself -- verify)
    var hText = headingTexts[i].slice(0, -1);
    // the link text must sit between a properly closed <a ...> and </a>
    var entry = '<li><a href="' + anch + '">' + hText + '</a></li>';
    if (hType == 'H2') {
        if (subList) {
            tocEntries.push('</ul>');
            subList = false;
        }
        tocEntries.push(entry);
    }
    else if (hType == 'H3') {
        if (!subList) {
            subList = true;
            tocEntries.push('<ul>');
        }
        tocEntries.push(entry);
    }
});
// close a nested list that is still open when the last heading was an H3
if (subList) {
    tocEntries.push('</ul>');
}
tocEntries.push('</ul>');
$('#toc').html(tocEntries.join(' '));