Skip to content

Commit

Permalink
identity line in residuals, conditional freq dist
Browse files Browse the repository at this point in the history
  • Loading branch information
bbengfort committed Jun 25, 2017
1 parent 1001b6f commit 0271d38
Show file tree
Hide file tree
Showing 4 changed files with 241 additions and 71 deletions.
47 changes: 42 additions & 5 deletions yellowbrick/bestfit.py
Original file line number Diff line number Diff line change
Expand Up @@ -54,14 +54,43 @@ def draw_best_fit(X, y, ax, estimator='linear', **kwargs):
The estimator function can be one of the following:
'linear': Uses OLS to fit the regression
'quadratic': Uses OLS with Polynomial order 2
'exponential': Not implemented yet
'log': Not implemented yet
'select_best': Selects the best fit via MSE
- ``'linear'``: Uses OLS to fit the regression
- ``'quadratic'``: Uses OLS with Polynomial order 2
- ``'exponential'``: Not implemented yet
- ``'log'``: Not implemented yet
- ``'select_best'``: Selects the best fit via MSE
The remaining keyword arguments are passed to ax.plot to define and
describe the line of best fit.
Parameters
----------
X : ndarray or DataFrame of shape n x m
A matrix of n instances with m features
y : ndarray or Series of length n
An array or series of target or class values
ax : matplotlib Axes, default: None
The axis to plot the figure on. If None is passed in the current axes
will be used (or generated if required).
estimator : string, default: 'linear'
The name of the estimator function used to draw the best fit line.
The estimator can currently be one of linear, quadratic, exponential,
log, or select_best. The ``select_best`` method uses the minimum MSE to
select the best fit line.
kwargs : dict
Keyword arguments to pass to the matplotlib plot function to style and
label the line of best fit. By default, the standard line color is
used unless the color keyword argument is passed in.
Returns
-------
ax : matplotlib Axes
The axes with the line drawn on it.
"""

# Estimators are the types of best fit lines that can be drawn.
Expand Down Expand Up @@ -118,6 +147,9 @@ def draw_best_fit(X, y, ax, estimator='linear', **kwargs):
if 'c' not in kwargs and 'color' not in kwargs:
kwargs['color'] = LINE_COLOR

# Get the current working axes
ax = ax or plt.gca()

# Plot line of best fit onto the axes that were passed in.
# TODO: determine if xlim or X.min(), X.max() are better params
xr = np.linspace(*ax.get_xlim(), num=100)
Expand Down Expand Up @@ -206,6 +238,11 @@ def draw_identity_line(ax=None, dynamic=True, **kwargs):
ax : matplotlib Axes
The axes with the line drawn on it.
Notes
-----
.. seealso:: `StackOverflow discussion: Does matplotlib have a function for drawing diagonal lines in axis coordinates? <https://stackoverflow.com/questions/22104256/does-matplotlib-have-a-function-for-drawing-diagonal-lines-in-axis-coordinates>`_
"""

# Get the current working axes
Expand Down
8 changes: 4 additions & 4 deletions yellowbrick/features/rankd.py
Original file line number Diff line number Diff line change
Expand Up @@ -359,14 +359,14 @@ class Rank1D(RankDBase):
Attributes
----------
ranks_ : ndarray
``ranks_`` : ndarray
An array of rank scores with shape (n,), where n is the
number of features. It is computed during ``fit``.
Examples
--------
>>> visualizer = Rank2D()
>>> visualizer = Rank1D()
>>> visualizer.fit(X, y)
>>> visualizer.transform(X)
>>> visualizer.poof()
Expand Down Expand Up @@ -468,9 +468,9 @@ class Rank2D(RankDBase):
Attributes
----------
ranks_ : ndarray
``ranks_`` : ndarray
An array of rank scores with shape (n,n), where n is the
number of features. It is computed during `fit`.
number of features. It is computed during ``fit``.
Examples
--------
Expand Down
67 changes: 56 additions & 11 deletions yellowbrick/regressor/residuals.py
Original file line number Diff line number Diff line change
Expand Up @@ -64,6 +64,17 @@ class PredictionError(RegressionScoreVisualizer):
shared_limits to False, but note that this will distort the figure
and should be accounted for during analysis.
bestfit : bool, default: True
Draw a linear best fit line to estimate the correlation between the
predicted and measured value of the target variable. The color of
the bestfit line is determined by the ``line_color`` argument.
identity : bool, default: True
Draw the 45 degree identity line, y=x, in order to better show the
relationship or pattern of the residuals, e.g. to estimate whether the
model is over- or under-estimating the given values. The color of the
identity line is a muted version of the ``line_color`` argument.
point_color : color
Defines the color of the error points; can be any matplotlib color.
Expand Down Expand Up @@ -91,7 +102,8 @@ class PredictionError(RegressionScoreVisualizer):
its primary entry point is the `score()` method.
"""

def __init__(self, model, ax=None, shared_limits=True, **kwargs):
def __init__(self, model, ax=None, shared_limits=True,
bestfit=True, identity=True, **kwargs):
# Initialize the visualizer
super(PredictionError, self).__init__(model, ax=ax, **kwargs)

Expand All @@ -103,6 +115,8 @@ def __init__(self, model, ax=None, shared_limits=True, **kwargs):

# Drawing arguments
self.shared_limits = shared_limits
self.bestfit = bestfit
self.identity = identity

def score(self, X, y=None, **kwargs):
"""
Expand Down Expand Up @@ -146,10 +160,11 @@ def draw(self, y, y_pred):

# TODO If score is happening inside a loop, draw would get called multiple times.
# Ideally we'd want the best fit line to be drawn only once
draw_best_fit(
y, y_pred, self.ax, 'linear', ls='--', lw=2,
c=self.colors['line'], label='best fit'
)
if self.bestfit:
draw_best_fit(
y, y_pred, self.ax, 'linear', ls='--', lw=2,
c=self.colors['line'], label='best fit'
)

# Set the axes limits based on the range of X and Y data
# NOTE: shared_limits will be accounted for in finalize()
Expand All @@ -171,12 +186,6 @@ def finalize(self, **kwargs):
# Set the title on the plot
self.set_title('Prediction Error for {}'.format(self.name))

# Draw the 45 degree line
draw_identity_line(
ax=self.ax, ls='--', lw=2, c=self.colors['line'],
alpha=0.5, label="identity"
)

# Square the axes to ensure a 45 degree line
if self.shared_limits:
# Get the current limits
Expand All @@ -196,6 +205,13 @@ def finalize(self, **kwargs):
# Ensure the aspect ratio is square
self.ax.set_aspect('equal', adjustable='box')

# Draw the 45 degree line
if self.identity:
draw_identity_line(
ax=self.ax, ls='--', lw=2, c=self.colors['line'],
alpha=0.5, label="identity"
)

# Set the axes labels
self.ax.set_ylabel('Predicted')
self.ax.set_xlabel('Measured')
Expand Down Expand Up @@ -226,6 +242,35 @@ def prediction_error(model, X, y=None, ax=None, **kwargs):
ax : matplotlib Axes
The axes to plot the figure on.
shared_limits : bool, default: True
If shared_limits is True, the range of the X and Y axis limits will
be identical, creating a square graphic with a true 45 degree line.
In this form, it is easier to diagnose under- or over- prediction,
though the figure will become more sparse. To localize points, set
shared_limits to False, but note that this will distort the figure
and should be accounted for during analysis.
bestfit : bool, default: True
Draw a linear best fit line to estimate the correlation between the
predicted and measured value of the target variable. The color of
the bestfit line is determined by the ``line_color`` argument.
identity : bool, default: True
Draw the 45 degree identity line, y=x, in order to better show the
relationship or pattern of the residuals, e.g. to estimate whether the
model is over- or under-estimating the given values. The color of the
identity line is a muted version of the ``line_color`` argument.
point_color : color
Defines the color of the error points; can be any matplotlib color.
line_color : color
Defines the color of the best fit line; can be any matplotlib color.
kwargs : dict
Keyword arguments that are passed to the base class and may influence
the visualization as defined in other Visualizers.
Returns
-------
ax : matplotlib Axes
Expand Down

0 comments on commit 0271d38

Please sign in to comment.