In [None]:
# This notebook draws boxplots of the feature values for every group listed in the
# `groups_desc` variable.
# Note: the analysis uses the raw feature values that remain after the missing
# features and scores have been filtered out.

# Attach the metadata columns to the feature values, matching rows on the response ID.
df_train_merged = pd.merge(left=df_train,
                           right=df_train_metadata,
                           on='spkitemid')

In [None]:
# For each subgroup variable in `groups_desc`, draw one figure containing one boxplot
# panel per feature, with one box per level of the subgroup plus an 'All data' box.
# The whole section is skipped when there are more than 30 features.
if len(features_used) > 30:
    display(Markdown('Since the data has more than 30 features, boxplots with feature values for all groups '
                     'will be skipped. This experiment currently has {} features.'.format(len(features_used))))
else:

    # the number of panels is the same for every group, so compute it once
    num_features = len(features_used)

    for group in groups_desc:
        display(Markdown('### Feature values by {}'.format(group)))
        display(Markdown('In all plots in this subsection the values are reported before '
                         'transformations/truncation. The lines indicate the threshold for '
                         'truncation (mean +/- 4*SD)'))

        # keep only the columns that are actually plotted
        df_train_feats = df_train_merged[features_used + [group]]

        # Duplicate every row under the pseudo-level 'All data' so that the overall
        # distribution is drawn as one more box next to the per-level boxes.
        df_train_feats_all = df_train_feats.copy()
        df_train_feats_all[group] = 'All data'

        df_train_combined = pd.concat([df_train_feats, df_train_feats_all], sort=True)
        df_train_combined.reset_index(drop=True, inplace=True)

        # Define the order of the boxes: put 'All data' first and 'No info' last.
        group_levels = sorted(list(df_train_feats[group].unique()))
        if 'No info' in group_levels:
            box_names = ['All data'] + [level for level in group_levels if level != 'No info'] + ['No info']
        else:
            box_names = ['All data'] + group_levels

        # create the faceted boxplots; the figure size, the subplot grid and the
        # wrapped tick labels all come from `compute_subgroup_plot_params`
        fig = plt.figure()
        (figure_width,
         figure_height,
         num_rows,
         num_columns,
         wrapped_box_names) = compute_subgroup_plot_params(box_names, num_features)

        fig.set_size_inches(figure_width, figure_height)
        with sns.axes_style('white'), sns.plotting_context('notebook', font_scale=1.2):
            for i, varname in enumerate(sorted(features_used)):
                df_plot = df_train_combined[[group, varname]]

                # Compute the mean +/- 4*SD lines on the un-duplicated data. In
                # `df_train_combined` every response appears twice, which leaves
                # the mean unchanged but shrinks the sample SD (ddof=1), so the
                # lines would not sit at the stated threshold.
                feature_values = df_train_feats[varname]
                feature_mean = feature_values.mean()
                feature_sd = feature_values.std()
                min_value = feature_mean - 4 * feature_sd
                max_value = feature_mean + 4 * feature_sd

                ax = fig.add_subplot(num_rows, num_columns, i + 1)
                ax.axhline(y=float(min_value), linestyle='--', linewidth=0.5, color='r')
                ax.axhline(y=float(max_value), linestyle='--', linewidth=0.5, color='r')
                sns.boxplot(x=df_plot[group], y=df_plot[varname], color='#b3b3b3', ax=ax, order=box_names)
                ax.set_xticklabels(wrapped_box_names, rotation=90)
                ax.set_xlabel('')
                ax.set_ylabel('')
                ax.set_title('{} by {}'.format(varname, group))

        plt.tight_layout(h_pad=1.0)

        # save the figure as an SVG file.
        imgfile = join(figure_dir, '{}_feature_boxplot_by_{}.svg'.format(experiment_id, group))
        plt.savefig(imgfile)
        if use_thumbnails:
            show_thumbnail(imgfile, next(id_generator))
        else:
            # needed so that the figures are shown after the heading and not at the end of the cell
            plt.show()