diff --git a/docs/api/model_selection/importances.rst b/docs/api/model_selection/importances.rst
index 325bf337c..d2f58441f 100644
--- a/docs/api/model_selection/importances.rst
+++ b/docs/api/model_selection/importances.rst
@@ -111,6 +111,50 @@ Taking the mean of the importances may be undesirable for several reasons. For e
     viz.fit(X, y)
     viz.show()
 
+Top and Bottom Feature Importances
+----------------------------------
+
+It may be more illuminating to the feature engineering process to identify the most or least informative features. To view only the N most informative features, specify the ``topn`` argument to the visualizer. Similar to slicing a list ranked by importance, if ``topn`` is a positive integer, the N most highly ranked features are displayed; if ``topn`` is a negative integer, the N lowest ranked features are displayed instead.
+
+.. plot::
+    :context: close-figs
+    :alt: Coefficient importances for LASSO regression
+
+    from sklearn.linear_model import Lasso
+    from yellowbrick.datasets import load_concrete
+    from yellowbrick.model_selection import FeatureImportances
+
+    # Load the regression dataset
+    dataset = load_concrete(return_dataset=True)
+    X, y = dataset.to_data()
+
+    # Title-case the features for better display and create the visualizer
+    labels = list(map(lambda s: s.title(), dataset.meta['features']))
+    viz = FeatureImportances(Lasso(), labels=labels, relative=False, topn=3)
+
+    # Fit and show the feature importances
+    viz.fit(X, y)
+    viz.show()
+
+Using ``topn=3``, we can identify the three most informative features in the concrete dataset as ``splast``, ``cement``, and ``water``. This approach to visualization may assist with *factor analysis* - the study of how variables contribute to an overall model. Note that although ``water`` has a negative coefficient, it is the magnitude (absolute value) of the coefficient that determines the ranking; the negative sign simply indicates that ``water`` is negatively correlated with the strength of the concrete. Alternatively, ``topn=-3`` would reveal the three least informative features in the model. This approach is useful for model tuning, similar to :doc:`rfecv`, but instead of automatically removing features it allows you to identify the lowest-ranked features as they change across different model instantiations. In either case, if you have many features, using ``topn`` can significantly increase the visual and analytical capacity of your analysis.
+
+The ``topn`` parameter can also be used when ``stack=True``. In the context of stacked feature importance graphs, the importance of a feature is the width of the entire bar, i.e. the sum of the absolute values of all of its per-class coefficients.
+
+.. plot::
+    :context: close-figs
+    :alt: Stacked per-class importances with Logistic Regression
+
+    from yellowbrick.model_selection import FeatureImportances
+    from sklearn.linear_model import LogisticRegression
+    from sklearn.datasets import load_iris
+
+    data = load_iris()
+    X, y = data.data, data.target
+
+    model = LogisticRegression(multi_class="auto", solver="liblinear")
+    viz = FeatureImportances(model, stack=True, relative=False, topn=-3)
+    viz.fit(X, y)
+    viz.show()
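+The associated quick method, ``feature_importances``, accepts the same ``topn`` argument. As a minimal sketch reusing the iris data from above, something like the following should produce a stacked plot of the three most informative features in a single call:
+
+.. plot::
+    :context: close-figs
+    :alt: Stacked importances via the quick method
+
+    from sklearn.datasets import load_iris
+    from sklearn.linear_model import LogisticRegression
+    from yellowbrick.model_selection import feature_importances
+
+    data = load_iris()
+    X, y = data.data, data.target
+
+    model = LogisticRegression(multi_class="auto", solver="liblinear")
+    # The quick method fits, draws, and shows the visualizer in one step
+    feature_importances(model, X, y, stack=True, relative=False, topn=3)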
 
 Discussion
 ----------
diff --git a/tests/baseline_images/test_model_selection/test_importances/test_topn.png b/tests/baseline_images/test_model_selection/test_importances/test_topn.png
new file mode 100644
index 000000000..75fe59b7e
Binary files /dev/null and b/tests/baseline_images/test_model_selection/test_importances/test_topn.png differ
diff --git a/tests/baseline_images/test_model_selection/test_importances/test_topn_negative.png b/tests/baseline_images/test_model_selection/test_importances/test_topn_negative.png
new file mode 100644
index 000000000..4983b418e
Binary files /dev/null and b/tests/baseline_images/test_model_selection/test_importances/test_topn_negative.png differ
diff --git a/tests/baseline_images/test_model_selection/test_importances/test_topn_negative_stacked.png b/tests/baseline_images/test_model_selection/test_importances/test_topn_negative_stacked.png
new file mode 100644
index 000000000..4d81f6997
Binary files /dev/null and b/tests/baseline_images/test_model_selection/test_importances/test_topn_negative_stacked.png differ
diff --git a/tests/baseline_images/test_model_selection/test_importances/test_topn_stacked.png b/tests/baseline_images/test_model_selection/test_importances/test_topn_stacked.png
new file mode 100644
index 000000000..991511811
Binary files /dev/null and b/tests/baseline_images/test_model_selection/test_importances/test_topn_stacked.png differ
diff --git a/tests/test_model_selection/test_importances.py b/tests/test_model_selection/test_importances.py
index a556edcbf..8db2d7a5a 100644
--- a/tests/test_model_selection/test_importances.py
+++ b/tests/test_model_selection/test_importances.py
@@ -434,6 +434,72 @@ def test_with_fitted(self):
         oz.fit(X, y)
         mockfit.assert_called_once_with(X, y)
 
+    def test_topn_stacked(self):
+        """
+        Test stack plot with only the three most important features by sum of
+        each feature's importance across all classes
+        """
+        X, y = load_iris(True)
+
+        viz = FeatureImportances(
+            LogisticRegression(solver="liblinear", random_state=222),
+            stack=True, topn=3
+        )
+        viz.fit(X, y)
+        viz.finalize()
+
+        npt.assert_equal(viz.feature_importances_.shape, (3, 3))
+        # Appveyor and Linux conda non-text-based differences
+        self.assert_images_similar(viz, tol=17.5)
+
+    def test_topn_negative_stacked(self):
+        """
+        Test stack plot with only the three least important features by sum of
+        each feature's importance across all classes
+        """
+        X, y = load_iris(True)
+
+        viz = FeatureImportances(
+            LogisticRegression(solver="liblinear", random_state=222),
+            stack=True, topn=-3
+        )
+        viz.fit(X, y)
+        viz.finalize()
+
+        npt.assert_equal(viz.feature_importances_.shape, (3, 3))
+        # Appveyor and Linux conda non-text-based differences
+        self.assert_images_similar(viz, tol=17.5)
+
+    def test_topn(self):
+        """
+        Test plot with only the top three most important features by absolute value
+        """
+        X, y = load_iris(True)
+
+        viz = FeatureImportances(
+            GradientBoostingClassifier(random_state=42), topn=3
+        )
+        viz.fit(X, y)
+        viz.finalize()
+
+        # Appveyor and Linux conda non-text-based differences
+        self.assert_images_similar(viz, tol=17.5)
+
+    def test_topn_negative(self):
+        """
+        Test plot with only the three least important features by absolute value
+        """
+        X, y = load_iris(True)
+
+        viz = FeatureImportances(
+            GradientBoostingClassifier(random_state=42), topn=-3
+        )
+        viz.fit(X, y)
+        viz.finalize()
+
+        # Appveyor and Linux conda non-text-based differences
+        self.assert_images_similar(viz, tol=17.5)
+
 
 ##########################################################################
 ## Mock Estimator
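For reference, the slicing semantics implemented by ``_reduce_topn`` in the patch below can be sketched in isolation (illustrative only; the standalone ``reduce_topn`` helper here is hypothetical): given an index array sorted in ascending order of importance, a positive ``topn`` keeps the tail of the array (the most important features) and a negative ``topn`` keeps the head (the least important features).

    # Illustrative sketch only: a standalone version of the slicing that
    # _reduce_topn (added below) applies to an ascending argsort of importances.
    import numpy as np

    def reduce_topn(arr, topn):
        if topn > 0:
            return arr[-topn:]  # tail of the ascending array: N most important
        elif topn < 0:
            return arr[:-topn]  # head of the ascending array: N least important
        return arr              # topn == 0: no reduction

    sort_idx = np.argsort([0.1, 0.9, 0.4, 0.7])  # ascending importance order
    print(reduce_topn(sort_idx, 3))   # [2 3 1] -> indices of 0.4, 0.7, 0.9
    print(reduce_topn(sort_idx, -3))  # [0 2 3] -> indices of 0.1, 0.4, 0.7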
diff --git a/yellowbrick/model_selection/importances.py b/yellowbrick/model_selection/importances.py
index b9e3ad1af..6719af91b 100644
--- a/yellowbrick/model_selection/importances.py
+++ b/yellowbrick/model_selection/importances.py
@@ -27,7 +27,7 @@
 from yellowbrick.base import ModelVisualizer
 from yellowbrick.style.colors import resolve_colors
 from yellowbrick.utils import is_dataframe, is_classifier
-from yellowbrick.exceptions import YellowbrickTypeError, NotFitted, YellowbrickWarning
+from yellowbrick.exceptions import (
+    YellowbrickTypeError, NotFitted, YellowbrickWarning, YellowbrickValueError
+)
 
 ##########################################################################
 ## Feature Visualizer
@@ -92,6 +92,10 @@ class FeatureImportances(ModelVisualizer):
         modified. If 'auto' (default), a helper method will check if the
         estimator is fitted before fitting it again.
 
+    topn : int, default=None
+        Display only the top N results with a positive integer, or the bottom N
+        results with a negative integer. If None or 0, all results are shown.
+
     kwargs : dict
         Keyword arguments that are passed to the base class and may influence
         the visualization as defined in other Visualizers.
@@ -128,6 +132,7 @@ def __init__(
         colors=None,
         colormap=None,
         is_fitted="auto",
+        topn=None,
         **kwargs
     ):
         # Initialize the visualizer bases
@@ -144,6 +149,7 @@
             stack=stack,
             colors=colors,
             colormap=colormap,
+            topn=topn,
         )
 
     def fit(self, X, y=None, **kwargs):
@@ -218,12 +224,33 @@
         else:
             self.features_ = np.array(self.labels)
 
+        if self.topn and self.topn > self.features_.shape[0]:
+            raise YellowbrickValueError(
+                "topn '{}' cannot be greater than the number of "
+                "features '{}'".format(self.topn, self.features_.shape[0])
+            )
+
         # Sort the features and their importances
         if self.stack:
-            sort_idx = np.argsort(np.mean(self.feature_importances_, 0))
+            if self.topn:
+                abs_sort_idx = np.argsort(
+                    np.sum(np.absolute(self.feature_importances_), 0)
+                )
+                sort_idx = self._reduce_topn(abs_sort_idx)
+            else:
+                sort_idx = np.argsort(np.mean(self.feature_importances_, 0))
+
             self.features_ = self.features_[sort_idx]
             self.feature_importances_ = self.feature_importances_[:, sort_idx]
         else:
+            if self.topn:
+                abs_sort_idx = np.argsort(np.absolute(self.feature_importances_))
+                abs_sort_idx = self._reduce_topn(abs_sort_idx)
+
+                self.features_ = self.features_[abs_sort_idx]
+                self.feature_importances_ = self.feature_importances_[abs_sort_idx]
+
+            # Sort features by value (sorting a second time if topn was applied)
             sort_idx = np.argsort(self.feature_importances_)
             self.features_ = self.features_[sort_idx]
             self.feature_importances_ = self.feature_importances_[sort_idx]
@@ -276,7 +303,7 @@ def finalize(self, **kwargs):
         # Set the title
         self.set_title(
             "Feature Importances of {} Features using {}".format(
-                len(self.features_), self.name
+                self._get_topn_title(), self.name
             )
         )
@@ -346,6 +373,30 @@ def _is_fitted(self):
         """
         return hasattr(self, "feature_importances_") and hasattr(self, "features_")
 
+    def _reduce_topn(self, arr):
+        """
+        Return only the top or bottom N items within a sliceable array/list.
+
+        Assumes that arr is sorted in ascending order of importance.
+        """
+        if self.topn > 0:
+            arr = arr[-self.topn:]
+        elif self.topn < 0:
+            arr = arr[:-self.topn]
+        return arr
+
+    def _get_topn_title(self):
+        """
+        Return an appropriate title fragment for the plot: "Top N", "Bottom N", or "N"
+        """
+        if self.topn:
+            if self.topn > 0:
+                return "Top {}".format(len(self.features_))
+            else:
+                return "Bottom {}".format(len(self.features_))
+        else:
+            return str(len(self.features_))
+
 
 ##########################################################################
 ## Quick Method
@@ -365,6 +416,7 @@ def feature_importances(
     colors=None,
     colormap=None,
     is_fitted="auto",
+    topn=None,
     show=True,
     **kwargs
 ):
@@ -431,6 +483,10 @@
         call ``plt.savefig`` from this signature, nor ``clear_figure``.
         If False, simply calls ``finalize()``
 
+    topn : int, default=None
+        Display only the top N results with a positive integer, or the bottom N
+        results with a negative integer. If None or 0, all results are shown.
+
     kwargs : dict
         Keyword arguments that are passed to the base class and may influence
         the visualization as defined in other Visualizers.
@@ -452,6 +508,7 @@
         colors=colors,
         colormap=colormap,
         is_fitted=is_fitted,
+        topn=topn,
         **kwargs
     )
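As a usage note (an illustrative sketch, not part of the patch): with the guard added in ``fit`` above, requesting more features than the dataset supplies should raise a ``YellowbrickValueError``.

    # Sketch assuming this patch is applied: the concrete dataset has 8 features,
    # so topn=100 should trip the new validation in FeatureImportances.fit().
    from sklearn.linear_model import Lasso
    from yellowbrick.datasets import load_concrete
    from yellowbrick.model_selection import FeatureImportances
    from yellowbrick.exceptions import YellowbrickValueError

    X, y = load_concrete()
    viz = FeatureImportances(Lasso(), topn=100)
    try:
        viz.fit(X, y)
    except YellowbrickValueError as e:
        print(e)  # topn '100' cannot be greater than the number of features '8'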