## Overall descriptive feature statistics

These values are reported before transformations.

In [None]:
# Feature descriptives table: build the path of the descriptives file
# for this experiment inside the output directory.
desc_name = '{}_feature_descriptives.{}'.format(experiment_id, file_format)
desc_file = join(output_dir, desc_name)

# Read the file using its first column as the index, then render the frame
# as an HTML table tagged with the 'sortable' CSS class.
df_desc = DataReader.read_from_file(desc_file, index_col=0)
desc_html = df_desc.to_html(classes=['sortable'], float_format=float_format_func)
HTML(desc_html)

### Prevalence of recoded cases

This section shows the number and percentage of cases truncated to mean +/- 4 SD for each feature.

In [None]:
outliers_file = join(output_dir, '{}_feature_outliers.{}'.format(experiment_id, file_format))

# Read the outlier statistics (first column as the index), expose the index
# as a 'feature' column, reshape to long format with one row per
# (feature, variable) pair, and keep only the rows whose variable name
# matches the percentage pattern.
df_outliers = (
    DataReader.read_from_file(outliers_file, index_col=0)
    .rename_axis('feature')
    .reset_index()
    .melt(id_vars=['feature'])
    .loc[lambda long_df: long_df['variable'].str.contains(r'[ulb].*?perc')]
)


# Plot height: start from 3 and, when the longest feature name exceeds
# 10 characters, add one unit for every (started) block of 10 extra
# characters so the rotated tick labels fit.
height = 3 + (math.ceil((longest_feature_name - 10)/10) if longest_feature_name > 10 else 0)

# seaborn derives the plot width as aspect*height. We want a fixed width
# (9 when there are more than 40 features, 6 otherwise), so the aspect is
# that target width divided by the height computed above.
aspect = (9 if len(features_used) > 40 else 6)/height

# three shades of grey, one per bar group
colors = sns.color_palette("Greys", n_colors=3)

# largest percentage present in the data
maxperc = df_outliers['value'].max()

# y-axis range: always reaches at least 2.5, and grows further when the
# data contain a larger percentage
limits = (0, maxperc if maxperc > 2.5 else 2.5)

with sns.axes_style('whitegrid'):
    # Draw the grouped bar plot (one bar per feature and variable) without
    # seaborn's automatic legend, since a custom one is attached below.
    p = sns.catplot(x="feature", y="value", hue="variable", kind="bar", 
                    palette=colors, data=df_outliers, height=height, 
                    aspect=aspect, legend=False)
    p.set_axis_labels('', '% cases truncated\nto mean +/- 4*sd')
    p.set_xticklabels(rotation=90)
    p.set(ylim=limits)

    # add a dashed reference line at 2%
    axis = p.axes[0][0]
    axis.axhline(y=2.0, linestyle='--', linewidth=1.5, color='black')

    # add a legend with the right colors
    legend = axis.legend(('both', 'lower', 'upper'), title='', frameon=True, fancybox=True, ncol=3)

    # `Legend.legendHandles` was deprecated in matplotlib 3.7 in favor of
    # `Legend.legend_handles` and removed in later releases. Prefer the new
    # attribute and fall back to the old one so that the cell runs on both
    # older and current matplotlib versions.
    legend_handles = getattr(legend, 'legend_handles', None)
    if legend_handles is None:
        legend_handles = legend.legendHandles
    legend_handles[0].set_color(colors[0])
    legend_handles[1].set_color(colors[1])

    # we want to try to force `tight_layout()`, but if this 
    # raises a warning, we don't want the entire notebook to fail
    with warnings.catch_warnings():
        warnings.simplefilter('ignore')
        plt.tight_layout(h_pad=1.0)

    # save the figure as SVG, then either embed it as a thumbnail or
    # display it inline, depending on the notebook configuration
    imgfile = join(figure_dir, '{}_outliers.svg'.format(experiment_id))
    plt.savefig(imgfile)
    if use_thumbnails:
        show_thumbnail(imgfile, next(id_generator))
    else:
        plt.show()

### Feature value distribution

The following table shows additional statistics for the data. Quantiles are computed using the type=3 method used in SAS. Mild outliers are defined as data points between [1.5, 3) \* IQR away from the nearest quartile. Extreme outliers are data points >= 3 \* IQR away from the nearest quartile.

In [None]:
# Extra feature descriptives table: build the path of the extra
# descriptives file for this experiment inside the output directory.
desce_name = '{}_feature_descriptivesExtra.{}'.format(experiment_id, file_format)
desce_file = join(output_dir, desce_name)

# Read the file using its first column as the index, then render the frame
# as an HTML table tagged with the 'sortable' CSS class.
df_desce = DataReader.read_from_file(desce_file, index_col=0)
desce_html = df_desce.to_html(classes=['sortable'], float_format=float_format_func)
HTML(desce_html)