Commit

Merge branch 'develop' into develop
tktran committed Oct 31, 2020
2 parents 3ccfd93 + 4650967 commit c53a835
Showing 9 changed files with 366 additions and 76 deletions.
44 changes: 44 additions & 0 deletions docs/api/model_selection/importances.rst
@@ -111,6 +111,50 @@ Taking the mean of the importances may be undesirable for several reasons. For e
viz.fit(X, y)
viz.show()

Top and Bottom Feature Importances
----------------------------------

It may be more illuminating to the feature engineering process to identify the most or least informative features. To view only the N most informative features, specify the ``topn`` argument to the visualizer. Similar to slicing a ranked list by importance, if ``topn`` is a positive integer, then the most highly ranked features are displayed. If ``topn`` is a negative integer, then the lowest ranked features are displayed instead.

.. plot::
:context: close-figs
:alt: Coefficient importances for LASSO regression

from sklearn.linear_model import Lasso
from yellowbrick.datasets import load_concrete
from yellowbrick.model_selection import FeatureImportances

# Load the regression dataset
dataset = load_concrete(return_dataset=True)
X, y = dataset.to_data()

# Title case the features for better display and create the visualizer
labels = list(map(lambda s: s.title(), dataset.meta['features']))
viz = FeatureImportances(Lasso(), labels=labels, relative=False, topn=3)

# Fit and show the feature importances
viz.fit(X, y)
viz.show()

Using ``topn=3``, we can identify the three most informative features in the concrete dataset as ``splast``, ``cement``, and ``water``. This approach to visualization may assist with *factor analysis* - the study of how variables contribute to an overall model. Note that although ``water`` has a negative coefficient, it is the magnitude (absolute value) of the coefficient that determines its rank; the sign simply indicates that ``water`` is negatively correlated with the strength of the concrete. Alternatively, ``topn=-3`` would reveal the three least informative features in the model (a sketch follows below). This approach is useful for model tuning, similar to :doc:`rfecv`, but instead of automatically removing features it allows you to track how the lowest-ranked features change across different model instantiations. In either case, if you have many features, using ``topn`` can significantly increase the visual and analytical clarity of your analysis.
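
A minimal sketch of the negative case, reusing the ``labels`` and concrete data loaded above:

.. plot::
    :context: close-figs
    :alt: Least informative coefficient importances for LASSO regression

    # A negative topn selects the N lowest-ranked features instead
    viz = FeatureImportances(Lasso(), labels=labels, relative=False, topn=-3)
    viz.fit(X, y)
    viz.show()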

The ``topn`` parameter can also be used when ``stack=True``. In the context of stacked feature importance graphs, the importance of a feature is the width of the entire bar, i.e. the sum of the absolute values of all per-class coefficients contained therein.

.. plot::
:context: close-figs
:alt: Stacked per-class importances with Logistic Regression

from yellowbrick.model_selection import FeatureImportances
from sklearn.linear_model import LogisticRegression
from sklearn.datasets import load_iris

data = load_iris()
X, y = data.data, data.target

model = LogisticRegression(multi_class="auto", solver="liblinear")
viz = FeatureImportances(model, stack=True, relative=False, topn=-3)
viz.fit(X, y)
viz.show()
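
Under the hood, the stacked ranking sums the absolute values of the per-class coefficients before slicing, mirroring the ``fit`` implementation later in this diff. A minimal numpy sketch with a hypothetical coefficient matrix (not the fitted iris values):

.. code-block:: python

    import numpy as np

    # Hypothetical (3 classes x 4 features) coefficient matrix
    coefs = np.array([
        [ 0.4, -1.2,  2.1, -0.3],
        [-0.6,  0.5, -1.8,  0.2],
        [ 0.1, -0.9,  1.0, -0.1],
    ])

    # A feature's total importance is the full width of its stacked bar:
    # the sum of the absolute values of its per-class coefficients
    totals = np.sum(np.absolute(coefs), axis=0)  # [1.1, 2.6, 4.9, 0.6]

    # argsort is ascending, so the last N indices correspond to topn=N
    # and the first N indices to topn=-N
    order = np.argsort(totals)
    print(order[-3:])  # three most informative features: [0 1 2]
    print(order[:3])   # three least informative features: [3 0 1]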

Discussion
----------
(4 binary or invalid files in this commit could not be displayed.)
66 changes: 66 additions & 0 deletions tests/test_model_selection/test_importances.py
@@ -434,6 +434,72 @@ def test_with_fitted(self):
oz.fit(X, y)
mockfit.assert_called_once_with(X, y)

def test_topn_stacked(self):
"""
Test stack plot with only the three most important features by sum of
each feature's importance across all classes
"""
X, y = load_iris(True)

viz = FeatureImportances(
LogisticRegression(solver="liblinear", random_state=222),
stack=True, topn=3
)
viz.fit(X, y)
viz.finalize()

npt.assert_equal(viz.feature_importances_.shape, (3, 3))
# Appveyor and Linux conda non-text-based differences
self.assert_images_similar(viz, tol=17.5)

def test_topn_negative_stacked(self):
"""
Test stack plot with only the three least important features by sum of
each feature's importance across all classes
"""
X, y = load_iris(True)

viz = FeatureImportances(
LogisticRegression(solver="liblinear", random_state=222),
stack=True, topn=-3
)
viz.fit(X, y)
viz.finalize()

npt.assert_equal(viz.feature_importances_.shape, (3, 3))
# Appveyor and Linux conda non-text-based differences
self.assert_images_similar(viz, tol=17.5)

def test_topn(self):
"""
Test plot with only top three important features by absolute value
"""
X, y = load_iris(True)

viz = FeatureImportances(
GradientBoostingClassifier(random_state=42), topn=3
)
viz.fit(X, y)
viz.finalize()

# Appveyor and Linux conda non-text-based differences
self.assert_images_similar(viz, tol=17.5)

def test_topn_negative(self):
"""
Test plot with only the three least important features by absolute value
"""
X, y = load_iris(True)

viz = FeatureImportances(
GradientBoostingClassifier(random_state=42), topn=-3
)
viz.fit(X, y)
viz.finalize()

# Appveyor and Linux conda non-text-based differences
self.assert_images_similar(viz, tol=17.5)


##########################################################################
## Mock Estimator
76 changes: 76 additions & 0 deletions tests/test_utils/test_kneed.py
@@ -40,6 +40,8 @@
with permission by the Yellowbrick contributors.
"""

import pytest
import matplotlib.pyplot as plt
import numpy as np
from yellowbrick.utils.kneed import KneeLocator

@@ -132,3 +134,77 @@ def test_convex_decreasing_truncated():
curve_direction="decreasing",
)
assert kn.knee == 0.2


def test_x_equals_y():
"""Test that a runtime warning is raised when no maxima are found"""
x = range(10)
y = [1] * len(x)
with pytest.warns(RuntimeWarning):
KneeLocator(x, y)


@pytest.mark.parametrize("online, expected", [(True, 482), (False, 22)])
def test_gamma_online_offline(online, expected):
"""Tests online and offline knee detection.
Notable that a large number of samples are highly sensitive to S parameter
"""
np.random.seed(23)
n = 1000
x = range(1, n + 1)
y = sorted(np.random.gamma(0.5, 1.0, n), reverse=True)
kl = KneeLocator(x, y, curve_nature="convex", curve_direction="decreasing", online=online)
assert kl.knee == expected


def test_properties():
"""Tests that elbow and knee can be used interchangeably."""
kn = KneeLocator(
x, y_concave_inc, curve_nature="concave", curve_direction="increasing"
)
assert kn.knee == kn.elbow
assert kn.norm_knee == kn.norm_elbow
# pytest compares all elements in each list.
assert kn.all_knees == kn.all_elbows
assert kn.all_norm_knees == kn.all_norm_elbows


def test_plot_knee_normalized():
"""Test that plotting is functional"""
with np.errstate(divide="ignore"):
x = np.linspace(0.0, 1, 10)
y = np.true_divide(-1, x + 0.1) + 5
kl = KneeLocator(x, y, S=1.0, curve_nature="concave")
num_figures_before = plt.gcf().number
kl.plot_knee_normalized()
num_figures_after = plt.gcf().number
assert num_figures_before < num_figures_after


def test_plot_knee():
"""Test that plotting is functional"""
with np.errstate(divide="ignore"):
x = np.linspace(0.0, 1, 10)
y = np.true_divide(-1, x + 0.1) + 5
kl = KneeLocator(x, y, S=1.0, curve_nature="concave")
num_figures_before = plt.gcf().number
kl.plot_knee()
num_figures_after = plt.gcf().number
assert num_figures_before < num_figures_after


def test_y():
"""Test the y value"""
with np.errstate(divide="ignore"):
x = np.linspace(0.0, 1, 10)
y = np.true_divide(-1, x + 0.1) + 5
kl = KneeLocator(x, y, S=1.0, curve_nature="concave")
assert kl.knee_y == pytest.approx(1.897, 0.03)
assert kl.all_knees_y[0] == pytest.approx(1.897, 0.03)
assert kl.norm_knee_y == pytest.approx(0.758, 0.03)
assert kl.all_norm_knees_y[0] == pytest.approx(0.758, 0.03)

assert kl.elbow_y == pytest.approx(1.897, 0.03)
assert kl.all_elbows_y[0] == pytest.approx(1.897, 0.03)
assert kl.norm_elbow_y == pytest.approx(0.758, 0.03)
assert kl.all_norm_elbows_y[0] == pytest.approx(0.758, 0.03)
63 changes: 60 additions & 3 deletions yellowbrick/model_selection/importances.py
@@ -27,7 +27,7 @@
from yellowbrick.base import ModelVisualizer
from yellowbrick.style.colors import resolve_colors
from yellowbrick.utils import is_dataframe, is_classifier
from yellowbrick.exceptions import YellowbrickTypeError, NotFitted, YellowbrickWarning
from yellowbrick.exceptions import YellowbrickTypeError, NotFitted, YellowbrickWarning, YellowbrickValueError

##########################################################################
## Feature Visualizer
@@ -92,6 +92,10 @@ class FeatureImportances(ModelVisualizer):
modified. If 'auto' (default), a helper method will check if the estimator
is fitted before fitting it again.
topn : int, default=None
Display only the top N results with a positive integer, or the bottom N
results with a negative integer. If None or 0, all results are shown.
kwargs : dict
Keyword arguments that are passed to the base class and may influence
the visualization as defined in other Visualizers.
@@ -128,6 +132,7 @@ def __init__(
colors=None,
colormap=None,
is_fitted="auto",
topn=None,
**kwargs
):
# Initialize the visualizer bases
Expand All @@ -144,6 +149,7 @@ def __init__(
stack=stack,
colors=colors,
colormap=colormap,
topn=topn
)

def fit(self, X, y=None, **kwargs):
@@ -218,12 +224,33 @@ def fit(self, X, y=None, **kwargs):
else:
self.features_ = np.array(self.labels)

if self.topn and self.topn > self.features_.shape[0]:
raise YellowbrickValueError(
"topn '{}' cannot be greater than the number of "
"features '{}'".format(self.topn, self.features_.shape[0])
)

# Sort the features and their importances
if self.stack:
sort_idx = np.argsort(np.mean(self.feature_importances_, 0))
if self.topn:
abs_sort_idx = np.argsort(
np.sum(np.absolute(self.feature_importances_), 0)
)
sort_idx = self._reduce_topn(abs_sort_idx)
else:
sort_idx = np.argsort(np.mean(self.feature_importances_, 0))

self.features_ = self.features_[sort_idx]
self.feature_importances_ = self.feature_importances_[:, sort_idx]
else:
if self.topn:
abs_sort_idx = np.argsort(np.absolute(self.feature_importances_))
abs_sort_idx = self._reduce_topn(abs_sort_idx)

self.features_ = self.features_[abs_sort_idx]
self.feature_importances_ = self.feature_importances_[abs_sort_idx]

# Sort features by value (sorting a second time if topn)
sort_idx = np.argsort(self.feature_importances_)
self.features_ = self.features_[sort_idx]
self.feature_importances_ = self.feature_importances_[sort_idx]
@@ -276,7 +303,7 @@ def finalize(self, **kwargs):
# Set the title
self.set_title(
"Feature Importances of {} Features using {}".format(
len(self.features_), self.name
self._get_topn_title(), self.name
)
)

@@ -346,6 +373,30 @@ def _is_fitted(self):
"""
return hasattr(self, "feature_importances_") and hasattr(self, "features_")

def _reduce_topn(self, arr):
"""
Return only the top or bottom N items within a sliceable array/list.
Assumes that arr is in ascending order.
"""
if self.topn > 0:
arr = arr[-self.topn:]
elif self.topn < 0:
arr = arr[:-self.topn]
return arr

def _get_topn_title(self):
"""
Return an appropriate title for the plot: Top N, Bottom N, or N
"""
if self.topn:
if self.topn > 0:
return "Top {}".format(len(self.features_))
else:
return "Bottom {}".format(len(self.features_))
else:
return str(len(self.features_))


##########################################################################
## Quick Method
@@ -365,6 +416,7 @@ def feature_importances(
colors=None,
colormap=None,
is_fitted="auto",
topn=None,
show=True,
**kwargs
):
@@ -431,6 +483,10 @@
call ``plt.savefig`` from this signature, nor ``clear_figure``. If False, simply
calls ``finalize()``
topn : int, default=None
Display only the top N results with a positive integer, or the bottom N
results with a negative integer. If None or 0, all results are shown.
kwargs : dict
Keyword arguments that are passed to the base class and may influence
the visualization as defined in other Visualizers.
@@ -452,6 +508,7 @@
colors=colors,
colormap=colormap,
is_fitted=is_fitted,
topn=topn,
**kwargs
)

