In [None]:
# This notebook draws boxplots of the feature values for every group listed in the
# `groups_desc` variable.
# The analysis is based on the raw feature values that remain once the missing
# features and scores have been filtered out.

# attach the metadata columns to the feature values, matching rows on `spkitemid`
df_train_merged = pd.merge(left=df_train,
                           right=df_train_metadata,
                           how='inner',
                           on='spkitemid')

In [None]:
# Draw one faceted figure per group in `groups_desc`. Every facet shows one feature,
# with a box for all of the training data followed by one box per level of the group.
# Nothing is plotted when there are more than 150 features, or when there are more
# than 30 features and thumbnails are disabled.
num_features = len(features_used)

if (num_features > 150):
    display(Markdown('### Feature values by subgroup(s)'))
    display(Markdown('Since the data has {} (> 150) features, boxplots with feature values for all groups '
                     'will be skipped.'.format(num_features)))
elif (30 < num_features <= 150 and not use_thumbnails):
    display(Markdown('### Feature values by subgroup(s)'))
    display(Markdown('Since the data has {} (> 30 but <= 150) features, you need to set `"use_thumbnails"` to `true` in your '
                     'configuration file to generate boxplots with feature values for all groups.'.format(num_features)))
else:
    # The truncation thresholds (mean +/- 4*SD) do not depend on the group, so compute
    # them once. They are taken from the training data itself rather than from the
    # plotting frame, in which every response appears more than once.
    feature_thresholds = {}
    for varname in features_used:
        feature_mean = df_train_merged[varname].mean()
        feature_sd = df_train_merged[varname].std()
        feature_thresholds[varname] = (float(feature_mean - 4 * feature_sd),
                                       float(feature_mean + 4 * feature_sd))

    for group in groups_desc:
        display(Markdown('### Feature values by {}'.format(group)))
        display(Markdown('In all plots in this subsection the values are reported before '
                         'transformations/truncation. The lines indicate the threshold for '
                         'truncation (mean +/- 4*SD).'))

        # rows used for the per-level boxes
        df_train_feats = df_train_merged[features_used + [group]]

        # if we have threshold set for this group, drop the levels that are too small
        if group in min_n_per_group:
            display(Markdown("The report only shows the results for groups with "
                             "at least {} responses in the training set.".format(min_n_per_group[group])))

            category_counts = df_train_merged[group].value_counts()
            selected_categories = category_counts[category_counts >= min_n_per_group[group]].index

            df_train_feats = df_train_feats[df_train_feats[group].isin(selected_categories)]

        # nothing left to plot for this group: explain why and move on
        if len(df_train_feats) == 0:
            if group in min_n_per_group:
                display(Markdown("None of the groups in {} had {} or more responses.".format(group,
                                                                                             min_n_per_group[group])))
            else:
                # no threshold is configured, so the training data itself must be empty
                display(Markdown("No training data is available for {}.".format(group)))
            continue

        # the 'All data' box always covers the whole training set, including the
        # responses from levels that were too small to get a box of their own
        df_train_feats_all = df_train_merged[features_used + [group]].copy()
        df_train_feats_all[group] = 'All data'

        df_train_combined = pd.concat([df_train_feats, df_train_feats_all],
                                      sort=True).reset_index(drop=True)

        # Define the order of the boxes: put 'All data' first and 'No info' last.
        group_levels = sorted(list(df_train_feats[group].unique()))
        if 'No info' in group_levels:
            box_names = ['All data'] + [level for level in group_levels if level != 'No info'] + ['No info']
        else:
            box_names = ['All data'] + group_levels

        # create the faceted boxplots
        fig = plt.figure()
        (figure_width,
         figure_height,
         num_rows,
         num_columns,
         wrapped_box_names) = compute_subgroup_plot_params(box_names, num_features)

        fig.set_size_inches(figure_width, figure_height)
        with sns.axes_style('white'), sns.plotting_context('notebook', font_scale=1.2):
            for i, varname in enumerate(sorted(features_used)):
                df_plot = df_train_combined[[group, varname]]
                min_value, max_value = feature_thresholds[varname]
                ax = fig.add_subplot(num_rows, num_columns, i + 1)
                ax.axhline(y=min_value, linestyle='--', linewidth=0.5, color='r')
                ax.axhline(y=max_value, linestyle='--', linewidth=0.5, color='r')
                sns.boxplot(x=df_plot[group], y=df_plot[varname], color='#b3b3b3', ax=ax, order=box_names)
                ax.set_xticklabels(wrapped_box_names, rotation=90)
                ax.set_xlabel('')
                ax.set_ylabel('')
                plot_title = '{} by {}'.format('\n'.join(wrap(varname, 30)), group)
                ax.set_title(plot_title)

        with warnings.catch_warnings():
            warnings.simplefilter('ignore')
            plt.tight_layout(h_pad=1.0)

        # save the figure as an SVG file.
        imgfile = join(figure_dir, '{}_feature_boxplot_by_{}.svg'.format(experiment_id, group))
        fig.savefig(imgfile)
        if use_thumbnails:
            show_thumbnail(imgfile, next(id_generator))
        else:
            # needed so that the figures are shown after the heading and not at the end of the cell
            plt.show()

        # release the figure so that figures do not pile up in memory across groups
        plt.close(fig)