Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

User-specified medians and conf. intervals in boxplots #906

Closed
wants to merge 22 commits into from
Closed
Show file tree
Hide file tree
Changes from 21 commits
Commits
Show all changes
22 commits
Select commit Hold shift + click to select a range
c96118f
users can specify the median and it's confidence interval when creati…
phobson May 27, 2012
6491ce0
added assert messages to help user
phobson May 27, 2012
4328635
fixed embarrassing tabs vs 4spaces
phobson May 29, 2012
1767e28
fixed bad indent on 1 line in axes.boxplot
phobson May 29, 2012
a0d30f5
formatted (indented) the docstring os axes.boxplot
phobson May 30, 2012
146a6e2
weird text issue - accidentally copied text from other buffer
phobson May 30, 2012
9efe05a
tried to standardized arg/kwarg doc format in axes.boxplot
phobson May 30, 2012
ae69ae1
added in "function arguments" header
phobson May 30, 2012
45711e5
modified an example to include the new functionality
phobson Jun 2, 2012
ba8890a
minor tweaks to clean up my boxplot example and the logic handling th…
phobson Jul 15, 2012
88d50ce
Got rid of a lot cruft in the example. the transform line wasn't doin…
phobson Jul 15, 2012
1c79eb8
no more subplot adjusting
phobson Jul 15, 2012
5464e34
added test for new boxplot functionality
phobson Jul 15, 2012
ffc8345
fixed my data construction with np.hstack
phobson Jul 15, 2012
d3b3d5c
np.hstack/np.linspace errors
phobson Jul 15, 2012
e6403b2
added baseline images and unit test for new boxplot functionality
phobson Jul 15, 2012
c052d07
switched notch and vert kwargs over to True/False instead of 0 or 1
phobson Jul 15, 2012
3db645a
switched from assert statements to raiseing value errors per ben root…
phobson Jul 21, 2012
e157dd1
cleaned up ambiguous indentation around a multiline if statement
phobson Jul 22, 2012
3172ef2
added entry to what's new
phobson Jul 22, 2012
0b65488
reran boilerplate.py to update pyplot
phobson Jul 23, 2012
a2085a6
fixed remaining conflict in test_axes.py
phobson Jul 24, 2012
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Jump to
Jump to file
Failed to load files.
Diff view
Diff view
10 changes: 10 additions & 0 deletions doc/users/whats_new.rst
Expand Up @@ -57,6 +57,16 @@ minimum and maximum colorbar extensions.

plt.show()

New Boxplot Functionality
-------------------------

Users can now incorporate their own methods for computing the median and its
confidence intervals into the boxplot method. For every column of data passed
to boxplot, the user can specify an accompanying median and confidence
interval.
See :meth:`matplotlib.axes.Axes.boxplot`.

.. plot:: examples/pylab_examples/boxplot_demo3.py

.. _whats-new-1-1:

new in matplotlib-1.1
Expand Down
42 changes: 32 additions & 10 deletions examples/pylab_examples/boxplot_demo3.py
Expand Up @@ -2,26 +2,48 @@
import matplotlib.transforms as mtransforms
import numpy as np

def fakeBootStrapper(n):
    """
    Stand-in for a user-supplied routine that bootstraps a median
    and its confidence interval.

    Returns a ``(median, (ci_low, ci_high))`` tuple of hard-coded
    values: one pair when *n* equals 1, another pair for any other *n*.
    """
    # Arbitrary demo values; a real implementation would resample data.
    if n != 1:
        return 0.2, (-0.35, 0.50)
    return 0.1, (-0.25, 0.25)



np.random.seed(2)
inc = 0.1
e1 = np.random.uniform(0,1, size=(500,))
e2 = np.random.uniform(0,1, size=(500,))
e3 = np.random.uniform(0,1 + inc, size=(500,))
e4 = np.random.uniform(0,1 + 2*inc, size=(500,))
e1 = np.random.normal(0, 1, size=(500,))
e2 = np.random.normal(0, 1, size=(500,))
e3 = np.random.normal(0, 1 + inc, size=(500,))
e4 = np.random.normal(0, 1 + 2*inc, size=(500,))

treatments = [e1,e2,e3,e4]
med1, CI1 = fakeBootStrapper(1)
med2, CI2 = fakeBootStrapper(2)
medians = [None, None, med1, med2]
conf_intervals = [None, None, CI1, CI2]

fig = plt.figure()
ax = fig.add_subplot(111)
pos = np.array(range(len(treatments)))+1
bp = ax.boxplot( treatments, sym='k+', patch_artist=True,
positions=pos, notch=1, bootstrap=5000 )
text_transform= mtransforms.blended_transform_factory(ax.transData,
ax.transAxes)
bp = ax.boxplot(treatments, sym='k+', positions=pos,
notch=1, bootstrap=5000,
usermedians=medians,
conf_intervals=conf_intervals)

ax.set_xlabel('treatment')
ax.set_ylabel('response')
ax.set_ylim(-0.2, 1.4)
plt.setp(bp['whiskers'], color='k', linestyle='-' )
plt.setp(bp['fliers'], markersize=3.0)
fig.subplots_adjust(right=0.99,top=0.99)
plt.show()
232 changes: 152 additions & 80 deletions lib/matplotlib/axes.py
Expand Up @@ -35,7 +35,6 @@
import matplotlib.ticker as mticker
import matplotlib.transforms as mtransforms
import matplotlib.tri as mtri

from matplotlib.container import BarContainer, ErrorbarContainer, StemContainer

iterable = cbook.iterable
Expand Down Expand Up @@ -5469,14 +5468,15 @@ def xywhere(xs, ys, mask):

return errorbar_container # (l0, caplines, barcols)

def boxplot(self, x, notch=0, sym='b+', vert=1, whis=1.5,
def boxplot(self, x, notch=False, sym='b+', vert=True, whis=1.5,
positions=None, widths=None, patch_artist=False,
bootstrap=None):
bootstrap=None, usermedians=None, conf_intervals=None):
"""
Call signature::

boxplot(x, notch=0, sym='+', vert=1, whis=1.5,
positions=None, widths=None, patch_artist=False)
boxplot(x, notch=False, sym='+', vert=True, whis=1.5,
positions=None, widths=None, patch_artist=False,
bootstrap=None, usermedians=None, conf_intervals=None)

Make a box and whisker plot for each column of *x* or each
vector in sequence *x*. The box extends from the lower to
Expand All @@ -5489,59 +5489,110 @@ def boxplot(self, x, notch=0, sym='b+', vert=1, whis=1.5,
*x* :
Array or a sequence of vectors.

*notch* : [ 0 (default) | 1]
If 0, produce a rectangular box plot.
If 1, produce a notched box plot
*notch* : [ False (default) | True ]
If False (default), produces a rectangular box plot.
If True, will produce a notched box plot

*sym* :
(default 'b+') is the default symbol for flier points.
*sym* : [ default 'b+' ]
The default symbol for flier points.
Enter an empty string ('') if you don't want to show fliers.

*vert* : [1 (default) | 0]
If 1, make the boxes vertical.
If 0, make horizontal boxes. (Odd, but kept for compatibility
with MATLAB boxplots)
*vert* : [ False | True (default) ]
If True (default), makes the boxes vertical.
If False, makes horizontal boxes.

*whis* : (default 1.5)
Defines the length of the whiskers as
a function of the inner quartile range. They extend to the
most extreme data point within ( ``whis*(75%-25%)`` ) data range.
*whis* : [ default 1.5 ]
Defines the length of the whiskers as a function of the inner
quartile range. They extend to the most extreme data point
within ( ``whis*(75%-25%)`` ) data range.

*bootstrap* : [ *None* (default) | integer ]
Specifies whether to bootstrap the confidence intervals
around the median for notched boxplots. If *None*, no
bootstrapping is performed, and notches are calculated
using a Gaussian-based asymptotic approximation
(see McGill, R., Tukey, J.W., and Larsen, W.A.,
1978, and Kendall and Stuart, 1967). Otherwise, bootstrap
specifies the number of times to bootstrap the median to
determine its 95% confidence intervals. Values between 1000
and 10000 are recommended.

*positions* : (default 1,2,...,n)
Sets the horizontal positions of
the boxes. The ticks and limits are automatically set to match
the positions.

*widths* : [ scalar | array ]
Either a scalar or a vector to set the width of each box.
The default is 0.5, or ``0.15*(distance between extreme
positions)`` if that is smaller.

*patch_artist* : boolean
If *False* (default), produce boxes with the
:class:`~matplotlib.lines.Line2D` artist.
If *True*, produce boxes with the
:class:`~matplotlib.patches.Patch` artist.
around the median for notched boxplots. If bootstrap==None,
no bootstrapping is performed, and notches are calculated
using a Gaussian-based asymptotic approximation (see McGill, R.,
Tukey, J.W., and Larsen, W.A., 1978, and Kendall and Stuart,
1967). Otherwise, bootstrap specifies the number of times to
bootstrap the median to determine its 95% confidence intervals.
Values between 1000 and 10000 are recommended.

*usermedians* : [ default None ]
An array or sequence whose first dimension (or length) is
compatible with *x*. This overrides the medians computed by
matplotlib for each element of *usermedians* that is not None.
When an element of *usermedians* == None, the median will be
computed directly as normal.

*conf_intervals* : [ default None ]
Array or sequence whose first dimension (or length) is compatible
with *x* and whose second dimension is 2. When the current element
of *conf_intervals* is not None, the notch locations computed by
matplotlib are overridden (assuming notch is True). When an element of
*conf_intervals* is None, boxplot computes notches using the method
specified by the other kwargs (e.g. *bootstrap*).

*positions* : [ default 1,2,...,n ]
Sets the horizontal positions of the boxes. The ticks and limits
are automatically set to match the positions.

*widths* : [ default 0.5 ]
Either a scalar or a vector that sets the width of each box. The
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I agree that the line (unless *patch_artist* was *True*. See above.) was incorrect and that it should be taken out.

Would you mind adding some information about the keys that exist in the returning dictionary?

default is 0.5, or ``0.15*(distance between extreme positions)``
if that is smaller.

*patch_artist* : [ False (default) | True ]
If False produces boxes with the Line2D artist
If True produces boxes with the Patch artist

Returns a dictionary mapping each component of the boxplot
to a list of the :class:`~matplotlib.lines.Line2D`
instances created (unless *patch_artist* was *True*. See above.).
to a list of the :class:`matplotlib.lines.Line2D`
instances created. That dictionary has the following keys
(assuming vertical boxplots):
boxes: the main body of the boxplot showing the quartiles
and the median's confidence intervals if enabled.
medians: horizontal lines at the median of each box.
whiskers: the vertical lines extending to the most extreme,
non-outlier data points.
caps: the horizontal lines at the ends of the whiskers.
fliers: points representing data that extend beyond the
whiskers (outliers).


**Example:**

.. plot:: pyplots/boxplot_demo.py
"""
def bootstrapMedian(data, N=5000):
    """
    Bootstrap the 95% confidence interval of the median of *data*.

    *data* is resampled with replacement *N* times (each resample has
    the same length as *data*), the median of every resample is
    recorded, and the 2.5th and 97.5th percentiles of those medians
    are returned as a length-2 array ``[lower, upper]``.
    """
    # Fancy indexing below needs an ndarray, so accept any sequence.
    data = np.asarray(data)
    M = len(data)
    percentile = [2.5, 97.5]
    estimate = np.zeros(N)
    for n in range(N):
        # randint's upper bound is exclusive, so this draws indices
        # from 0..M-1; it replaces np.random.random_integers, which
        # NumPy has deprecated in favour of randint.
        bsIndex = np.random.randint(0, M, M)
        estimate[n] = np.percentile(data[bsIndex], 50)
    # np.percentile (linear interpolation by default) is used instead
    # of mlab.prctile so the helper depends only on NumPy.
    return np.percentile(estimate, percentile)

def computeConfInterval(data, med, iq, bootstrap):
    """
    Return ``(notch_min, notch_max)`` for the median *med* of *data*.

    When *bootstrap* is None the notch is placed symmetrically around
    *med* using the inter-quartile range *iq* and the sample size.
    Otherwise *bootstrap* is passed to ``bootstrapMedian`` as the
    number of resamples and the interval it returns is used.
    """
    if bootstrap is None:
        # Gaussian-based asymptotic approximation of the notch.
        #
        # For discussion: McGill, R., Tukey, J.W.,
        # and Larsen, W.A. (1978) "Variations of
        # Boxplots", The American Statistician, 32:12-16.
        half_width = 1.57*iq/np.sqrt(len(data))
        return med - half_width, med + half_width

    # Bootstrap estimate of the interval around the median.
    interval = bootstrapMedian(data, N=bootstrap)
    return interval[0], interval[1]

if not self._hold: self.cla()
holdStatus = self._hold
whiskers, caps, boxes, medians, fliers = [], [], [], [], []
Expand All @@ -5567,6 +5618,38 @@ def boxplot(self, x, notch=0, sym='b+', vert=1, whis=1.5,
x = [x]
col = len(x)

# sanitize user-input medians
msg1 = "usermedians must either be a list/tuple or a 1d array"
msg2 = "usermedians' length must be compatible with x"
if usermedians is not None:
if hasattr(usermedians, 'shape'):
if len(usermedians.shape) != 1:
raise ValueError(msg1)
elif usermedians.shape[0] != col:
raise ValueError(msg2)
elif len(usermedians) != col:
raise ValueError(msg2)

#sanitize user-input confidence intervals
msg1 = "conf_intervals must either be a list of tuples or a 2d array"
msg2 = "conf_intervals' length must be compatible with x"
msg3 = "each conf_interval, if specificied, must have two values"
if conf_intervals is not None:
if hasattr(conf_intervals, 'shape'):
    # array input: must be 2-d, one row per column of x,
    # and exactly two values (low, high) per row
    if len(conf_intervals.shape) != 2:
        raise ValueError(msg1)
    elif conf_intervals.shape[0] != col:
        raise ValueError(msg2)
    elif conf_intervals.shape[1] != 2:
        # reject anything that is NOT two columns wide; the
        # previous '== 2' test rejected every valid array
        raise ValueError(msg3)
else:
if len(conf_intervals) != col:
raise ValueError(msg2)
for ci in conf_intervals:
Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Just occurred to me that we should use numpy.percentile now (assuming it's available in the minimum version of numpy that MPL supports).

if ci is not None and len(ci) != 2:
raise ValueError(msg3)


# get some plot info
if positions is None:
positions = range(1, col + 1)
Expand All @@ -5578,14 +5661,21 @@ def boxplot(self, x, notch=0, sym='b+', vert=1, whis=1.5,

# loop through columns, adding each to plot
self.hold(True)
for i,pos in enumerate(positions):
for i, pos in enumerate(positions):
d = np.ravel(x[i])
row = len(d)
if row==0:
# no data, skip this position
continue

# get median and quartiles
q1, med, q3 = mlab.prctile(d,[25,50,75])

# replace with input medians if available
if usermedians is not None:
if usermedians[i] is not None:
med = usermedians[i]

# get high extreme
iq = q3 - q1
hi_val = q3 + whis*iq
Expand Down Expand Up @@ -5625,42 +5715,16 @@ def boxplot(self, x, notch=0, sym='b+', vert=1, whis=1.5,
# get y location for median
med_y = [med, med]

# calculate 'regular' plot
if notch == 0:
# make our box vectors
box_x = [box_x_min, box_x_max, box_x_max, box_x_min, box_x_min ]
box_y = [q1, q1, q3, q3, q1 ]
# make our median line vectors
med_x = [box_x_min, box_x_max]
# calculate 'notch' plot
else:
if bootstrap is not None:
# Do a bootstrap estimate of notch locations.
def bootstrapMedian(data, N=5000):
# determine 95% confidence intervals of the median
M = len(data)
percentile = [2.5,97.5]
estimate = np.zeros(N)
for n in range(N):
bsIndex = np.random.random_integers(0,M-1,M)
bsData = data[bsIndex]
estimate[n] = mlab.prctile(bsData, 50)
CI = mlab.prctile(estimate, percentile)
return CI

# get conf. intervals around median
CI = bootstrapMedian(d, N=bootstrap)
notch_max = CI[1]
notch_min = CI[0]
if notch:
# conf. intervals from user, if available
if conf_intervals is not None and conf_intervals[i] is not None:
notch_max = np.max(conf_intervals[i])
notch_min = np.min(conf_intervals[i])
else:
# Estimate notch locations using Gaussian-based
# asymptotic approximation.
#
# For discussion: McGill, R., Tukey, J.W.,
# and Larsen, W.A. (1978) "Variations of
# Boxplots", The American Statistician, 32:12-16.
notch_max = med + 1.57*iq/np.sqrt(row)
notch_min = med - 1.57*iq/np.sqrt(row)
notch_min, notch_max = computeConfInterval(d, med, iq,
bootstrap)

# make our notched box vectors
box_x = [box_x_min, box_x_max, box_x_max, cap_x_max, box_x_max,
box_x_max, box_x_min, box_x_min, cap_x_min, box_x_min,
Expand All @@ -5670,6 +5734,13 @@ def bootstrapMedian(data, N=5000):
# make our median line vectors
med_x = [cap_x_min, cap_x_max]
med_y = [med, med]
# calculate 'regular' plot
else:
# make our box vectors
box_x = [box_x_min, box_x_max, box_x_max, box_x_min, box_x_min ]
box_y = [q1, q1, q3, q3, q1 ]
# make our median line vectors
med_x = [box_x_min, box_x_max]

def to_vc(xs,ys):
# convert arguments to verts and codes
Expand Down Expand Up @@ -5719,12 +5790,13 @@ def dopatch(xs,ys):
boxes.extend(dopatch(box_x, box_y))
else:
boxes.extend(doplot(box_x, box_y, 'b-'))

medians.extend(doplot(med_x, med_y, median_color+'-'))
fliers.extend(doplot(flier_hi_x, flier_hi, sym,
flier_lo_x, flier_lo, sym))

# fix our axes/ticks up a little
if 1 == vert:
if vert:
setticks, setlim = self.set_xticks, self.set_xlim
else:
setticks, setlim = self.set_yticks, self.set_ylim
Expand Down