Commit

Merge branch 'develop' into develop
tktran committed Oct 31, 2020
2 parents 3ccfd93 + 4650967 commit c53a835
Showing 9 changed files with 366 additions and 76 deletions.
44 changes: 44 additions & 0 deletions docs/api/model_selection/importances.rst
@@ -111,6 +111,50 @@ Taking the mean of the importances may be undesirable for several reasons. For e
viz.fit(X, y)
viz.show()

Top and Bottom Feature Importances
----------------------------------

It may be more illuminating to the feature engineering process to identify the most or least informative features. To view only the N most informative features, specify the ``topn`` argument to the visualizer. Similar to slicing a ranked list by importance, if ``topn`` is a positive integer, then the most highly ranked features are displayed. If ``topn`` is a negative integer, then the lowest ranked features are displayed instead.

.. plot::
:context: close-figs
:alt: Coefficient importances for LASSO regression

from sklearn.linear_model import Lasso
from yellowbrick.datasets import load_concrete
from yellowbrick.model_selection import FeatureImportances

# Load the regression dataset
dataset = load_concrete(return_dataset=True)
X, y = dataset.to_data()

# Title case the features for better display and create the visualizer
labels = list(map(lambda s: s.title(), dataset.meta['features']))
viz = FeatureImportances(Lasso(), labels=labels, relative=False, topn=3)

# Fit and show the feature importances
viz.fit(X, y)
viz.show()

Using ``topn=3``, we can identify the three most informative features in the concrete dataset as ``splast``, ``cement``, and ``water``. This approach to visualization may assist with *factor analysis* - the study of how variables contribute to an overall model. Note that although ``water`` has a negative coefficient, it is the magnitude (absolute value) of the coefficient that determines its rank; the sign simply indicates that ``water`` is negatively correlated with the strength of the concrete. Alternatively, ``topn=-3`` would reveal the three least informative features in the model (a sketch follows below). This approach is useful for model tuning, similar to :doc:`rfecv`, but instead of automatically removing features it allows you to track how the lowest-ranked features change across different model instantiations. In either case, if you have many features, using ``topn`` can significantly increase the visual and analytical clarity of your analysis.
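
A minimal sketch of the negative case, reusing the ``labels`` and concrete data loaded above:

.. plot::
    :context: close-figs
    :alt: Least informative coefficient importances for LASSO regression

    # A negative topn selects the N lowest-ranked features instead
    viz = FeatureImportances(Lasso(), labels=labels, relative=False, topn=-3)
    viz.fit(X, y)
    viz.show()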

The ``topn`` parameter can also be used when ``stack=True``. In the context of stacked feature importance graphs, the importance of a feature is the width of the entire bar, i.e. the sum of the absolute values of all per-class coefficients contained therein.

.. plot::
:context: close-figs
:alt: Stacked per-class importances with Logistic Regression

from yellowbrick.model_selection import FeatureImportances
from sklearn.linear_model import LogisticRegression
from sklearn.datasets import load_iris

data = load_iris()
X, y = data.data, data.target

model = LogisticRegression(multi_class="auto", solver="liblinear")
viz = FeatureImportances(model, stack=True, relative=False, topn=-3)
viz.fit(X, y)
viz.show()
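
Under the hood, the stacked ranking sums the absolute values of the per-class coefficients before slicing, mirroring the ``fit`` implementation later in this diff. A minimal numpy sketch with a hypothetical coefficient matrix (not the fitted iris values):

.. code-block:: python

    import numpy as np

    # Hypothetical (3 classes x 4 features) coefficient matrix
    coefs = np.array([
        [ 0.4, -1.2,  2.1, -0.3],
        [-0.6,  0.5, -1.8,  0.2],
        [ 0.1, -0.9,  1.0, -0.1],
    ])

    # A feature's total importance is the full width of its stacked bar:
    # the sum of the absolute values of its per-class coefficients
    totals = np.sum(np.absolute(coefs), axis=0)  # [1.1, 2.6, 4.9, 0.6]

    # argsort is ascending, so the last N indices correspond to topn=N
    # and the first N indices to topn=-N
    order = np.argsort(totals)
    print(order[-3:])  # three most informative features: [0 1 2]
    print(order[:3])   # three least informative features: [3 0 1]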

Discussion
----------
(4 binary or invalid files in this commit could not be displayed.)
66 changes: 66 additions & 0 deletions tests/test_model_selection/test_importances.py
@@ -434,6 +434,72 @@ def test_with_fitted(self):
oz.fit(X, y)
mockfit.assert_called_once_with(X, y)

def test_topn_stacked(self):
"""
Test stack plot with only the three most important features by sum of
each feature's importance across all classes
"""
X, y = load_iris(True)

viz = FeatureImportances(
LogisticRegression(solver="liblinear", random_state=222),
stack=True, topn=3
)
viz.fit(X, y)
viz.finalize()

npt.assert_equal(viz.feature_importances_.shape, (3, 3))
# Appveyor and Linux conda non-text-based differences
self.assert_images_similar(viz, tol=17.5)

def test_topn_negative_stacked(self):
"""
Test stack plot with only the three least important features by sum of
each feature's importance across all classes
"""
X, y = load_iris(True)

viz = FeatureImportances(
LogisticRegression(solver="liblinear", random_state=222),
stack=True, topn=-3
)
viz.fit(X, y)
viz.finalize()

npt.assert_equal(viz.feature_importances_.shape, (3, 3))
# Appveyor and Linux conda non-text-based differences
self.assert_images_similar(viz, tol=17.5)

def test_topn(self):
"""
Test plot with only top three important features by absolute value
"""
X, y = load_iris(True)

viz = FeatureImportances(
GradientBoostingClassifier(random_state=42), topn=3
)
viz.fit(X, y)
viz.finalize()

# Appveyor and Linux conda non-text-based differences
self.assert_images_similar(viz, tol=17.5)

def test_topn_negative(self):
"""
Test plot with only the three least important features by absolute value
"""
X, y = load_iris(True)

viz = FeatureImportances(
GradientBoostingClassifier(random_state=42), topn=-3
)
viz.fit(X, y)
viz.finalize()

# Appveyor and Linux conda non-text-based differences
self.assert_images_similar(viz, tol=17.5)


##########################################################################
## Mock Estimator
76 changes: 76 additions & 0 deletions tests/test_utils/test_kneed.py
@@ -40,6 +40,8 @@
with permission by the Yellowbrick contributors.
"""

import pytest
import matplotlib.pyplot as plt
import numpy as np
from yellowbrick.utils.kneed import KneeLocator

@@ -132,3 +134,77 @@ def test_convex_decreasing_truncated():
curve_direction="decreasing",
)
assert kn.knee == 0.2


def test_x_equals_y():
"""Test that a runtime warning is raised when no maxima are found"""
x = range(10)
y = [1] * len(x)
with pytest.warns(RuntimeWarning):
KneeLocator(x, y)


@pytest.mark.parametrize("online, expected", [(True, 482), (False, 22)])
def test_gamma_online_offline(online, expected):
"""Tests online and offline knee detection.
Notable that a large number of samples are highly sensitive to S parameter
"""
np.random.seed(23)
n = 1000
x = range(1, n + 1)
y = sorted(np.random.gamma(0.5, 1.0, n), reverse=True)
kl = KneeLocator(x, y, curve_nature="convex", curve_direction="decreasing", online=online)
assert kl.knee == expected


def test_properties():
"""Tests that elbow and knee can be used interchangeably."""
kn = KneeLocator(
x, y_concave_inc, curve_nature="concave", curve_direction="increasing"
)
assert kn.knee == kn.elbow
assert kn.norm_knee == kn.norm_elbow
# pytest compares all elements in each list.
assert kn.all_knees == kn.all_elbows
assert kn.all_norm_knees == kn.all_norm_elbows


def test_plot_knee_normalized():
"""Test that plotting is functional"""
with np.errstate(divide="ignore"):
x = np.linspace(0.0, 1, 10)
y = np.true_divide(-1, x + 0.1) + 5
kl = KneeLocator(x, y, S=1.0, curve_nature="concave")
num_figures_before = plt.gcf().number
kl.plot_knee_normalized()
num_figures_after = plt.gcf().number
assert num_figures_before < num_figures_after


def test_plot_knee():
"""Test that plotting is functional"""
with np.errstate(divide="ignore"):
x = np.linspace(0.0, 1, 10)
y = np.true_divide(-1, x + 0.1) + 5
kl = KneeLocator(x, y, S=1.0, curve_nature="concave")
num_figures_before = plt.gcf().number
kl.plot_knee()
num_figures_after = plt.gcf().number
assert num_figures_before < num_figures_after


def test_y():
"""Test the y value"""
with np.errstate(divide="ignore"):
x = np.linspace(0.0, 1, 10)
y = np.true_divide(-1, x + 0.1) + 5
kl = KneeLocator(x, y, S=1.0, curve_nature="concave")
assert kl.knee_y == pytest.approx(1.897, 0.03)
assert kl.all_knees_y[0] == pytest.approx(1.897, 0.03)
assert kl.norm_knee_y == pytest.approx(0.758, 0.03)
assert kl.all_norm_knees_y[0] == pytest.approx(0.758, 0.03)

assert kl.elbow_y == pytest.approx(1.897, 0.03)
assert kl.all_elbows_y[0] == pytest.approx(1.897, 0.03)
assert kl.norm_elbow_y == pytest.approx(0.758, 0.03)
assert kl.all_norm_elbows_y[0] == pytest.approx(0.758, 0.03)
63 changes: 60 additions & 3 deletions yellowbrick/model_selection/importances.py
@@ -27,7 +27,7 @@
from yellowbrick.base import ModelVisualizer
from yellowbrick.style.colors import resolve_colors
from yellowbrick.utils import is_dataframe, is_classifier
from yellowbrick.exceptions import YellowbrickTypeError, NotFitted, YellowbrickWarning
from yellowbrick.exceptions import YellowbrickTypeError, NotFitted, YellowbrickWarning, YellowbrickValueError

##########################################################################
## Feature Visualizer
@@ -92,6 +92,10 @@ class FeatureImportances(ModelVisualizer):
modified. If 'auto' (default), a helper method will check if the estimator
is fitted before fitting it again.
topn : int, default=None
Display only the top N results with a positive integer, or the bottom N
results with a negative integer. If None or 0, all results are shown.
kwargs : dict
Keyword arguments that are passed to the base class and may influence
the visualization as defined in other Visualizers.
@@ -128,6 +132,7 @@ def __init__(
colors=None,
colormap=None,
is_fitted="auto",
topn=None,
**kwargs
):
# Initialize the visualizer bases
Expand All @@ -144,6 +149,7 @@ def __init__(
stack=stack,
colors=colors,
colormap=colormap,
topn=topn
)

def fit(self, X, y=None, **kwargs):
@@ -218,12 +224,33 @@ def fit(self, X, y=None, **kwargs):
else:
self.features_ = np.array(self.labels)

if self.topn and self.topn > self.features_.shape[0]:
raise YellowbrickValueError(
"topn '{}' cannot be greater than the number of "
"features '{}'".format(self.topn, self.features_.shape[0])
)

# Sort the features and their importances
if self.stack:
sort_idx = np.argsort(np.mean(self.feature_importances_, 0))
if self.topn:
abs_sort_idx = np.argsort(
np.sum(np.absolute(self.feature_importances_), 0)
)
sort_idx = self._reduce_topn(abs_sort_idx)
else:
sort_idx = np.argsort(np.mean(self.feature_importances_, 0))

self.features_ = self.features_[sort_idx]
self.feature_importances_ = self.feature_importances_[:, sort_idx]
else:
if self.topn:
abs_sort_idx = np.argsort(np.absolute(self.feature_importances_))
abs_sort_idx = self._reduce_topn(abs_sort_idx)

self.features_ = self.features_[abs_sort_idx]
self.feature_importances_ = self.feature_importances_[abs_sort_idx]

# Sort features by value (sorting a second time if topn)
sort_idx = np.argsort(self.feature_importances_)
self.features_ = self.features_[sort_idx]
self.feature_importances_ = self.feature_importances_[sort_idx]
@@ -276,7 +303,7 @@ def finalize(self, **kwargs):
# Set the title
self.set_title(
"Feature Importances of {} Features using {}".format(
len(self.features_), self.name
self._get_topn_title(), self.name
)
)

@@ -346,6 +373,30 @@ def _is_fitted(self):
"""
return hasattr(self, "feature_importances_") and hasattr(self, "features_")

def _reduce_topn(self, arr):
"""
Return only the top or bottom N items within a sliceable array/list.
Assumes that arr is in ascending order.
"""
if self.topn > 0:
arr = arr[-self.topn:]
elif self.topn < 0:
arr = arr[:-self.topn]
return arr

def _get_topn_title(self):
"""
Return an appropriate title for the plot: Top N, Bottom N, or N
"""
if self.topn:
if self.topn > 0:
return "Top {}".format(len(self.features_))
else:
return "Bottom {}".format(len(self.features_))
else:
return str(len(self.features_))


##########################################################################
## Quick Method
@@ -365,6 +416,7 @@ def feature_importances(
colors=None,
colormap=None,
is_fitted="auto",
topn=None,
show=True,
**kwargs
):
@@ -431,6 +483,10 @@
call ``plt.savefig`` from this signature, nor ``clear_figure``. If False, simply
calls ``finalize()``
topn : int, default=None
Display only the top N results with a positive integer, or the bottom N
results with a negative integer. If None or 0, all results are shown.
kwargs : dict
Keyword arguments that are passed to the base class and may influence
the visualization as defined in other Visualizers.
@@ -452,6 +508,7 @@
colors=colors,
colormap=colormap,
is_fitted=is_fitted,
topn=topn,
**kwargs
)

