Skip to content

Commit

Permalink
Add Q-Q plot to the ResidualsPlot (#853) (#1042)
Browse files Browse the repository at this point in the history
* Add Q-Q plot to the ResidualsPlot (#853)

Adds functionality to plot Q-Q plot side by side with residuals plot and verifies that it works as expected

* Add qqplot to docstring of ResidualsPlot.

* Add qqplot to the Quick Method of ResidualsPlot.

* Add qqplot to documentation of ResidualsPlot.

* remove finalize() in test_residuals_plot_QQ_plot

* Reset tests/baseline_images/test_regressor/test_residuals/test_residuals_plot_QQ_plot.png

* Reset test_residuals_plot_QQ_plot.png using different virtual environment.

* Reset test_residuals_plot_QQ_plot.png using mpl==3.2.1

Co-authored-by: Larry Gray <lwgray@gmail.com>
  • Loading branch information
VladSkripniuk and lwgray committed Apr 6, 2020
1 parent 19a8345 commit bc2c316
Show file tree
Hide file tree
Showing 4 changed files with 104 additions and 0 deletions.
13 changes: 13 additions & 0 deletions docs/api/regressor/residuals.rst
Original file line number Diff line number Diff line change
Expand Up @@ -52,6 +52,19 @@ Note that if the histogram is not desired, it can be turned off with the ``hist=

.. warning:: The histogram on the residuals plot requires matplotlib 2.0.2 or greater. If you are using an earlier version of matplotlib, simply set the ``hist=False`` flag so that the histogram is not drawn.

The histogram can be replaced with a Q-Q plot, which is a common way to check that residuals are normally distributed. If the residuals are normally distributed, then their quantiles, when plotted against the quantiles of a normal distribution, should form a straight line. The example below shows how a Q-Q plot can be drawn with the ``qqplot=True`` flag. Notice that ``hist`` has to be set to ``False`` in this case.

.. plot::
:context: close-figs
:alt: Residuals Plot on the Concrete dataset with a Q-Q plot

visualizer = ResidualsPlot(model, hist=False, qqplot=True)
visualizer.fit(X_train, y_train)
visualizer.score(X_test, y_test)
visualizer.show()



Quick Method
------------

Expand Down
Sorry, something went wrong. Reload?
Sorry, we cannot display this file.
Sorry, this file is invalid so it cannot be displayed.
26 changes: 26 additions & 0 deletions tests/test_regressor/test_residuals.py
Original file line number Diff line number Diff line change
Expand Up @@ -280,6 +280,32 @@ def test_residuals_plot(self):

self.assert_images_similar(visualizer)

@pytest.mark.xfail(
    IS_WINDOWS_OR_CONDA,
    reason="font rendering different in OS and/or Python; see #892",
)
def test_residuals_plot_QQ_plot(self):
    """
    Compare the rendered residuals plot with a Q-Q plot (OLS on random
    data) against the stored baseline image.
    """
    axes = plt.subplots()[1]

    # The histogram must be disabled for the Q-Q plot to be drawn
    oz = ResidualsPlot(LinearRegression(), ax=axes, hist=False, qqplot=True)

    # Fit on the training split, then score on the test split
    oz.fit(self.data.X.train, self.data.y.train)
    oz.score(self.data.X.test, self.data.y.test)

    self.assert_images_similar(oz)

def test_either_hist_or_QQ_plot(self):
    """
    Requesting the histogram and the Q-Q plot together raises an exception.
    """
    expected_message = "Set either hist or qqplot to False"
    conflicting_options = {"hist": True, "qqplot": True}

    with pytest.raises(YellowbrickValueError, match=expected_message):
        ResidualsPlot(LinearRegression(), **conflicting_options)

@pytest.mark.xfail(
sys.platform == "win32", reason="images not close on windows (RMSE=32)"
)
Expand Down
65 changes: 65 additions & 0 deletions yellowbrick/regressor/residuals.py
Original file line number Diff line number Diff line change
Expand Up @@ -21,6 +21,8 @@

import matplotlib.pyplot as plt

from scipy.stats import probplot

try:
# Only available in Matplotlib >= 2.0.2
from mpl_toolkits.axes_grid1 import make_axes_locatable
Expand Down Expand Up @@ -434,6 +436,12 @@ class ResidualsPlot(RegressionScoreVisualizer):
If set to 'density', the probability density function will be plotted.
If set to True or 'frequency' then the frequency will be plotted.
qqplot : {True, False}, default: False
Draw a Q-Q plot on the right side of the figure, comparing the quantiles
of the residuals against quantiles of a standard normal distribution.
Q-Q plot and histogram of residuals can not be plotted simultaneously,
either `hist` or `qqplot` has to be set to False.
train_color : color, default: 'b'
Residuals for training data are plotted with this color but also
given an opacity of 0.5 to ensure that the test data residuals
Expand Down Expand Up @@ -502,6 +510,7 @@ def __init__(
model,
ax=None,
hist=True,
qqplot=False,
train_color="b",
test_color="g",
line_color=LINE_COLOR,
Expand Down Expand Up @@ -531,9 +540,25 @@ def __init__(
"False, 'density', or 'frequency'".format(hist)
)

self.qqplot = qqplot
if self.qqplot not in {True, False}:
raise YellowbrickValueError(
"'{}' is an invalid argument for qqplot, use True, "
" or False".format(hist)
)

if self.hist in {True, "density", "frequency"} and self.qqplot in {True}:
raise YellowbrickValueError(
"Set either hist or qqplot to False, can not plot "
"both of them simultaneously."
)

if self.hist in {True, "density", "frequency"}:
self.hax # If hist is True, test the version availability

if self.qqplot in {True}:
self.qqax # If qqplot is True, test the version availability

# Store labels and colors for the legend ordered by call
self._labels, self._colors = [], []

Expand All @@ -560,6 +585,26 @@ def hax(self):

return hax

@memoized
def qqax(self):
    """
    Returns the Q-Q plot axes, creating it only on demand.

    The axes is appended to the right of the main residuals axes and
    shares its y-axis; its y tick marks are moved to the right-hand side.

    Raises
    ------
    YellowbrickValueError
        If ``make_axes_locatable`` is None (it is only available in
        matplotlib 2.0.2 or greater).
    """
    if make_axes_locatable is None:
        # The message names the Q-Q plot (not the histogram) so that the
        # user knows which flag to turn off.
        raise YellowbrickValueError(
            (
                "Q-Q plot requires matplotlib 2.0.2 or greater "
                "please upgrade matplotlib or set qqplot=False on the visualizer"
            )
        )

    divider = make_axes_locatable(self.ax)

    # Share the y-axis with the residuals axes so both show the same scale
    qqax = divider.append_axes("right", size=2, pad=0.25, sharey=self.ax)
    qqax.yaxis.tick_right()

    return qqax

def fit(self, X, y, **kwargs):
"""
Parameters
Expand Down Expand Up @@ -670,6 +715,12 @@ def draw(self, y_pred, residuals, train=False, **kwargs):
residuals, bins=50, orientation="horizontal", density=True, color=color
)

# Add residuals Q-Q plot
if self.qqplot in {True}:
osm, osr = probplot(residuals, dist='norm', fit=False)

self.qqax.scatter(osm, osr, c=color, alpha=alpha, label=label)

# Ensure the current axes is always the main residuals axes
plt.sca(self.ax)
return self.ax
Expand Down Expand Up @@ -705,6 +756,12 @@ def finalize(self, **kwargs):
self.hax.axhline(y=0, c=self.colors["line"])
self.hax.set_xlabel("Distribution")

# Finalize the Q-Q plot axes
if self.qqplot:
self.qqax.set_title("Q-Q plot")
self.qqax.set_xlabel("Theoretical quantiles")
self.qqax.set_ylabel("Observed quantiles")


##########################################################################
## Quick Method
Expand All @@ -719,6 +776,7 @@ def residuals_plot(
y_test=None,
ax=None,
hist=True,
qqplot=False,
train_color="b",
test_color="g",
line_color=LINE_COLOR,
Expand Down Expand Up @@ -772,6 +830,12 @@ def residuals_plot(
If set to 'density', the probability density function will be plotted.
If set to True or 'frequency' then the frequency will be plotted.
qqplot : {True, False}, default: False
Draw a Q-Q plot on the right side of the figure, comparing the quantiles
of the residuals against quantiles of a standard normal distribution.
Q-Q plot and histogram of residuals can not be plotted simultaneously,
either `hist` or `qqplot` has to be set to False.
train_color : color, default: 'b'
Residuals for training data are plotted with this color but also
given an opacity of 0.5 to ensure that the test data residuals
Expand Down Expand Up @@ -822,6 +886,7 @@ def residuals_plot(
model=model,
ax=ax,
hist=hist,
qqplot=qqplot,
train_color=train_color,
test_color=test_color,
line_color=line_color,
Expand Down

0 comments on commit bc2c316

Please sign in to comment.