In [None]:
# Setting options for the plots
# Render matplotlib figures inline in the notebook output.
%matplotlib inline
# Ask the inline backend to produce figures in both the 'retina' and 'svg' formats.
%config InlineBackend.figure_formats={'retina', 'svg'}
# Override the inline backend's `savefig.dpi` rc setting with 150.
%config InlineBackend.rc={'savefig.dpi': 150}

# Experiment Report 

In [None]:
import itertools
import os
import re
import pickle
import platform
import time

from functools import partial
from os.path import abspath, exists, join

import numpy as np
import pandas as pd
import seaborn as sns
import scipy.stats as stats
import statsmodels.api as sm
from matplotlib import pyplot as plt

from IPython import sys_info
from IPython.display import display, HTML, Image, Javascript, Markdown, SVG

from rsmtool.utils import (float_format_func,
                           int_or_float_format_func,
                           bold_highlighter,
                           color_highlighter)

from rsmtool.version import VERSION as rsmtool_version

<style type="text/css">
  div.prompt.output_prompt { 
    color: white; 
  }
  
  span.highlight_color {
    color: red;
  }
  
  span.highlight_bold {
    font-weight: bold;  
  }
    
  @media print {
    @page {
      size: landscape;
      margin: 0cm 0cm 0cm 0cm;
    }

    * {
      margin: 0px;
      padding: 0px;
    }

    #toc {
      display: none;
    }

    span.highlight_color, span.highlight_bold {
        font-weight: bolder;
        text-decoration: underline;
    }

    div.prompt.output_prompt {
      display: none;
    }
    
    h3#Python-packages, div#packages {
      display: none;
    }
  }
</style>

In [None]:
# NOTE: you will need to set the following manually
# if you are using this notebook interactively.
# Every setting is read from an environment variable; `os.environ.get`
# yields `None` for any variable that has not been set.
experiment_id = os.environ.get('EXPERIMENT_ID')
description = os.environ.get('DESCRIPTION')
context = os.environ.get('CONTEXT')
train_file_location = os.environ.get('TRAIN_FILE_LOCATION')
test_file_location = os.environ.get('TEST_FILE_LOCATION')
output_dir = os.environ.get('OUTPUT_DIR')
figure_dir = os.environ.get('FIGURE_DIR')
model_name = os.environ.get('MODEL_NAME')
model_type = os.environ.get('MODEL_TYPE')
length_column = os.environ.get('LENGTH_COLUMN')
second_human_score_column = os.environ.get('H2_COLUMN')

# Boolean flags: a flag is on only when its variable is exactly the string '1'.
scaled = os.environ.get('SCALED')
use_scaled_predictions = scaled == '1'
exclude_zero_scores = os.environ.get('EXCLUDE_ZEROS') == '1'

feature_subset_file = os.environ.get('FEATURE_SUBSET_FILE')

# `int(None)` raises TypeError, so fall back to 0 when MIN_ITEMS is unset
# or empty instead of failing before the report can start.
# NOTE(review): assumes 0 is the right "no minimum" default - TODO confirm.
min_items = int(os.environ.get('MIN_ITEMS') or 0)

# groups for analysis by prompt or subgroup.
# set to 'prompt' for the standard analysis or to
# 'prompt%%subgroup1%%subgroup2' for subgroup analysis.
# An unset variable is treated like an empty one (the split then yields
# ['']), rather than raising AttributeError on `None.split`.
groups_desc_string = os.environ.get('GROUPS_FOR_DESCRIPTIVES', '')
groups_desc = groups_desc_string.split('%%')
groups_eval_string = os.environ.get('GROUPS_FOR_EVALUATIONS', '')
groups_eval = groups_eval_string.split('%%')

# javascript path
javascript_path = os.environ.get("JAVASCRIPT_PATH")

In [None]:
# Load the `sort.js` file found under `javascript_path` and display it as a
# Javascript object in the notebook output.
# NOTE(review): presumably this enables client-side table sorting - confirm against sort.js.
display(Javascript(filename=join(javascript_path, "sort.js")))

In [None]:
# Render the opening sentence of the report as Markdown: the experiment ID
# in bold, followed by the experiment description.
Markdown('''This report presents the analysis for **{}**: {}'''.format(experiment_id, description))

In [None]:
# Show the current local time, formatted with the locale's date and time
# representation (`%c`).
HTML(time.strftime('%c'))

In [None]:
%%html
<div id="toc"></div>

In [None]:
# Read in the training and testing features, both raw and pre-processed
# Make sure that the `spkitemid` column is read as a string
#
# Each data frame below is only defined when its file exists, so later
# cells must not assume that a given frame is present. The one exception
# is `df_feature_subset_specs`, which is always defined (possibly as None).


def _read_csv_str_ids(path):
    """Read the CSV at `path`, parsing the `spkitemid` column as a string."""
    return pd.read_csv(path, converters={'spkitemid': str})


# The original input locations come from environment variables and are
# None when unset; `exists(None)` raises TypeError, so check for that first.
if train_file_location and exists(train_file_location):
    df_train_orig = pd.read_csv(train_file_location)

# --- files written for the training set ---
train_file = join(output_dir, '{}_train_features.csv'.format(experiment_id))
if exists(train_file):
    df_train = _read_csv_str_ids(train_file)

train_metadata_file = join(output_dir, '{}_train_metadata.csv'.format(experiment_id))
if exists(train_metadata_file):
    df_train_metadata = _read_csv_str_ids(train_metadata_file)

train_other_columns_file = join(output_dir, '{}_train_other_columns.csv'.format(experiment_id))
if exists(train_other_columns_file):
    df_train_other_columns = _read_csv_str_ids(train_other_columns_file)

train_length_file = join(output_dir, '{}_train_response_lengths.csv'.format(experiment_id))
if exists(train_length_file):
    df_train_length = _read_csv_str_ids(train_length_file)

train_excluded_file = join(output_dir, '{}_train_excluded_responses.csv'.format(experiment_id))
if exists(train_excluded_file):
    df_train_excluded = _read_csv_str_ids(train_excluded_file)

train_responses_with_excluded_flags_file = join(output_dir, '{}_train_responses_with_excluded_flags.csv'.format(experiment_id))
if exists(train_responses_with_excluded_flags_file):
    df_train_responses_with_excluded_flags = _read_csv_str_ids(train_responses_with_excluded_flags_file)

train_preproc_file = join(output_dir, '{}_train_preprocessed_features.csv'.format(experiment_id))
if exists(train_preproc_file):
    df_train_preproc = _read_csv_str_ids(train_preproc_file)

if test_file_location and exists(test_file_location):
    df_test_orig = pd.read_csv(test_file_location)

# --- files written for the test set ---
test_file = join(output_dir, '{}_test_features.csv'.format(experiment_id))
if exists(test_file):
    df_test = _read_csv_str_ids(test_file)

test_metadata_file = join(output_dir, '{}_test_metadata.csv'.format(experiment_id))
if exists(test_metadata_file):
    df_test_metadata = _read_csv_str_ids(test_metadata_file)

test_other_columns_file = join(output_dir, '{}_test_other_columns.csv'.format(experiment_id))
if exists(test_other_columns_file):
    df_test_other_columns = _read_csv_str_ids(test_other_columns_file)

test_human_scores_file = join(output_dir, '{}_test_human_scores.csv'.format(experiment_id))
if exists(test_human_scores_file):
    df_test_human_scores = _read_csv_str_ids(test_human_scores_file)

test_excluded_file = join(output_dir, '{}_test_excluded_responses.csv'.format(experiment_id))
if exists(test_excluded_file):
    df_test_excluded = _read_csv_str_ids(test_excluded_file)

test_responses_with_excluded_flags_file = join(output_dir, '{}_test_responses_with_excluded_flags.csv'.format(experiment_id))
if exists(test_responses_with_excluded_flags_file):
    df_test_responses_with_excluded_flags = _read_csv_str_ids(test_responses_with_excluded_flags_file)

test_preproc_file = join(output_dir, '{}_test_preprocessed_features.csv'.format(experiment_id))
if exists(test_preproc_file):
    df_test_preproc = _read_csv_str_ids(test_preproc_file)

pred_preproc_file = join(output_dir, '{}_pred_processed.csv'.format(experiment_id))
if exists(pred_preproc_file):
    df_pred_preproc = _read_csv_str_ids(pred_preproc_file)

# --- feature and coefficient files ---
feature_file = join(output_dir, '{}_feature.csv'.format(experiment_id))
if exists(feature_file):
    df_features = _read_csv_str_ids(feature_file)
    # the values of the `feature` column, as a plain list
    features_used = list(df_features['feature'].values)

betas_file = join(output_dir, '{}_betas.csv'.format(experiment_id))
if exists(betas_file):
    df_betas = pd.read_csv(betas_file)

# The feature subset file is optional: both an unset (None) path and a path
# that does not exist leave the specs as None.
if feature_subset_file and exists(feature_subset_file):
    df_feature_subset_specs = pd.read_csv(feature_subset_file)
else:
    df_feature_subset_specs = None

