Skip to content

Commit

Permalink
Merge pull request #243 from EducationalTestingService/bugfix/fix-boxplots-in-features-by-groups
Browse files Browse the repository at this point in the history

Fix boxplots in features by groups
  • Loading branch information
jbiggsets committed Dec 10, 2018
2 parents af32ade + 1ef92ff commit 765ee68
Show file tree
Hide file tree
Showing 2 changed files with 60 additions and 59 deletions.
10 changes: 8 additions & 2 deletions rsmtool/notebooks/feature_descriptives.ipynb
Original file line number Diff line number Diff line change
Expand Up @@ -84,7 +84,13 @@
" legend.legendHandles[0].set_color(colors[0])\n",
" legend.legendHandles[1].set_color(colors[1])\n",
"\n",
" plt.tight_layout(h_pad=1.0)\n",
" # we want to try to force `tight_layout()`, but if this fails\n",
" # we don't want the entire notebook to fail\n",
" try:\n",
" plt.tight_layout(h_pad=1.0)\n",
" except ValueError:\n",
" pass\n",
"\n",
" imgfile = join(figure_dir, '{}_outliers.svg'.format(experiment_id))\n",
" plt.savefig(imgfile)\n",
" if use_thumbnails:\n",
Expand Down Expand Up @@ -139,7 +145,7 @@
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.6.3"
"version": "3.6.7"
}
},
"nbformat": 4,
Expand Down
109 changes: 52 additions & 57 deletions rsmtool/notebooks/features_by_group.ipynb
Original file line number Diff line number Diff line change
Expand Up @@ -19,70 +19,65 @@
"metadata": {},
"outputs": [],
"source": [
"for group in groups_desc:\n",
" display(Markdown('### Feature values by {}'.format(group)))\n",
" display(Markdown('In all plots in this subsection the values are reported before transformations/truncation. The lines indicate the threshold for truncation (mean +/- 4*SD)'))\n",
" if len(features_used) > 20:\n",
" display(Markdown('Since the data has more than 20 features a low-resolution plot is generated to conserve memory'))\n",
"if len(features_used) > 30:\n",
" display(Markdown('Since the data has more than 30 features, boxplots with feature values for all groups '\n",
" 'will be skipped. This experiment currently has {} features.'.format(len(features_used))))\n",
"else:\n",
"\n",
" df_train_feats = df_train_merged[features_used + [group]]\n",
" for group in groups_desc:\n",
" display(Markdown('### Feature values by {}'.format(group)))\n",
" display(Markdown('In all plots in this subsection the values are reported before '\n",
" 'transformations/truncation. The lines indicate the threshold for '\n",
" 'truncation (mean +/- 4*SD)'))\n",
"\n",
" df_train_feats_all = df_train_merged.copy()\n",
" df_train_feats_all[group] = 'All data'\n",
" df_train_feats = df_train_merged[features_used + [group]]\n",
"\n",
" df_train_combined = pd.concat([df_train_feats, df_train_feats_all])\n",
" df_train_combined.reset_index(drop=True, inplace=True)\n",
" df_train_feats_all = df_train_merged.copy()\n",
" df_train_feats_all[group] = 'All data'\n",
"\n",
" # decide on the the height per plot\n",
" num_features = len(features_used)\n",
" df_train_combined = pd.concat([df_train_feats, df_train_feats_all])\n",
" df_train_combined.reset_index(drop=True, inplace=True)\n",
"\n",
" # decide on the the height per plot\n",
" num_features = len(features_used)\n",
" \n",
" # Define the order of the boxes: put 'All data' first and 'No info' last.\n",
" group_levels = sorted(list(df_train_feats[group].unique()))\n",
" if 'No info' in group_levels:\n",
" box_names = ['All data'] + [level for level in group_levels if level != 'No info'] + ['No info']\n",
" else:\n",
" box_names = ['All data'] + group_levels\n",
" # Define the order of the boxes: put 'All data' first and 'No info' last.\n",
" group_levels = sorted(list(df_train_feats[group].unique()))\n",
" if 'No info' in group_levels:\n",
" box_names = ['All data'] + [level for level in group_levels if level != 'No info'] + ['No info']\n",
" else:\n",
" box_names = ['All data'] + group_levels\n",
"\n",
" # create the faceted boxplots\n",
" fig = plt.figure()\n",
" (figure_width, \n",
" figure_height, \n",
" num_rows, \n",
" num_columns, \n",
" wrapped_box_names) = compute_subgroup_plot_params(box_names, num_features)\n",
"\n",
" # create the faceted boxplots\n",
" fig = plt.figure()\n",
" (figure_width, \n",
" figure_height, \n",
" num_rows, \n",
" num_columns, \n",
" wrapped_box_names) = compute_subgroup_plot_params(box_names, num_features)\n",
" fig.set_size_inches(figure_width, figure_height)\n",
" with sns.axes_style('white'), sns.plotting_context('notebook', font_scale=1.2):\n",
" for i, varname in enumerate(sorted(features_used)):\n",
" df_plot = df_train_combined[[group, varname]]\n",
" min_value = df_plot.mean() - 4 * df_plot.std()\n",
" max_value = df_plot.mean() + 4 * df_plot.std()\n",
" ax = fig.add_subplot(num_rows, num_columns, i + 1)\n",
" ax.axhline(y=float(min_value), linestyle='--', linewidth=0.5, color='r')\n",
" ax.axhline(y=float(max_value), linestyle='--', linewidth=0.5, color='r')\n",
" sns.boxplot(x=df_plot[group], y=df_plot[varname], color='#b3b3b3', ax=ax, order=box_names)\n",
" ax.set_xticklabels(wrapped_box_names, rotation=90) \n",
" ax.set_xlabel('')\n",
" ax.set_ylabel('')\n",
" ax.set_title('{} by {}'.format(varname, group))\n",
" fig.set_size_inches(figure_width, figure_height)\n",
" with sns.axes_style('white'), sns.plotting_context('notebook', font_scale=1.2):\n",
" for i, varname in enumerate(sorted(features_used)):\n",
" df_plot = df_train_combined[[group, varname]]\n",
" min_value = df_plot[varname].mean() - 4 * df_plot[varname].std()\n",
" max_value = df_plot[varname].mean() + 4 * df_plot[varname].std()\n",
" ax = fig.add_subplot(num_rows, num_columns, i + 1)\n",
" ax.axhline(y=float(min_value), linestyle='--', linewidth=0.5, color='r')\n",
" ax.axhline(y=float(max_value), linestyle='--', linewidth=0.5, color='r')\n",
" sns.boxplot(x=df_plot[group], y=df_plot[varname], color='#b3b3b3', ax=ax, order=box_names)\n",
" ax.set_xticklabels(wrapped_box_names, rotation=90) \n",
" ax.set_xlabel('')\n",
" ax.set_ylabel('')\n",
" ax.set_title('{} by {}'.format(varname, group))\n",
"\n",
" plt.tight_layout(h_pad=1.0)\n",
" plt.tight_layout(h_pad=1.0)\n",
"\n",
" # if there are > 20 features, then save the \n",
" # figure as a low resolution .png file and display\n",
" # that instead of rendering and saving the SVG file.\n",
" if len(features_used) > 20:\n",
" image_extension = 'png'\n",
" else:\n",
" image_extension = 'svg'\n",
" imgfile = join(figure_dir, '{}_feature_boxplot_by_{}.{}'.format(experiment_id, group,\n",
" image_extension))\n",
" plt.savefig(imgfile)\n",
" if use_thumbnails:\n",
" show_thumbnail(imgfile, next(id_generator))\n",
" else:\n",
" if len(features_used) > 20:\n",
" display(Image(imgfile))\n",
" plt.close()\n",
" # save the figure as an SVG file.\n",
" imgfile = join(figure_dir, '{}_feature_boxplot_by_{}.svg'.format(experiment_id, group))\n",
" plt.savefig(imgfile)\n",
" if use_thumbnails:\n",
" show_thumbnail(imgfile, next(id_generator))\n",
" else:\n",
" # needed so that the figures are shown after the heading and not at the end of the cell\n",
" plt.show()"
Expand All @@ -106,7 +101,7 @@
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.6.3"
"version": "3.6.7"
}
},
"nbformat": 4,
Expand Down

0 comments on commit 765ee68

Please sign in to comment.