From a46e70c94e0ed96cc3160d370df35d3bc0bc7888 Mon Sep 17 00:00:00 2001
From: Michael Garod
Date: Mon, 26 Oct 2020 10:30:04 -0400
Subject: [PATCH] Add param to limit Feature Importances to top_n (#1102)

Feature importances can now visualize only the top or bottom N features
(the bottom N via a negative argument), ensuring that the absolute
magnitude of each feature is respected during selection. The filter works
for the relative, absolute, and stacked modes of the importance chart.

Thank you to @mgarod for this excellent contribution!

Co-authored-by: Michael Garod
---
 docs/api/model_selection/importances.rst      | 44 ++++++++++++
 .../test_importances/test_topn.png            | Bin 0 -> 3493 bytes
 .../test_importances/test_topn_negative.png   | Bin 0 -> 3449 bytes
 .../test_topn_negative_stacked.png            | Bin 0 -> 3581 bytes
 .../test_importances/test_topn_stacked.png    | Bin 0 -> 3597 bytes
 .../test_model_selection/test_importances.py  | 66 ++++++++++++++++++
 yellowbrick/model_selection/importances.py    | 63 ++++++++++++++++-
 7 files changed, 170 insertions(+), 3 deletions(-)
 create mode 100644 tests/baseline_images/test_model_selection/test_importances/test_topn.png
 create mode 100644 tests/baseline_images/test_model_selection/test_importances/test_topn_negative.png
 create mode 100644 tests/baseline_images/test_model_selection/test_importances/test_topn_negative_stacked.png
 create mode 100644 tests/baseline_images/test_model_selection/test_importances/test_topn_stacked.png

diff --git a/docs/api/model_selection/importances.rst b/docs/api/model_selection/importances.rst
index 325bf337c..d2f58441f 100644
--- a/docs/api/model_selection/importances.rst
+++ b/docs/api/model_selection/importances.rst
@@ -111,6 +111,50 @@ Taking the mean of the importances may be undesirable for several reasons. For e
     viz.fit(X, y)
     viz.show()

+Top and Bottom Feature Importances
+----------------------------------
+
+It is often illuminating during feature engineering to identify the most or
+least informative features. To view only the N most informative features,
+specify the ``topn`` argument to the visualizer. Similar to slicing a list
+ranked by importance, if ``topn`` is a positive integer, the N most highly
+ranked features are displayed. If ``topn`` is a negative integer, the N
+lowest ranked features are displayed instead.
+
+.. plot::
+    :context: close-figs
+    :alt: Coefficient importances for LASSO regression
+
+    from sklearn.linear_model import Lasso
+    from yellowbrick.datasets import load_concrete
+    from yellowbrick.model_selection import FeatureImportances
+
+    # Load the regression dataset
+    dataset = load_concrete(return_dataset=True)
+    X, y = dataset.to_data()
+
+    # Title case the features for better display and create the visualizer
+    labels = list(map(lambda s: s.title(), dataset.meta['features']))
+    viz = FeatureImportances(Lasso(), labels=labels, relative=False, topn=3)
+
+    # Fit and show the feature importances
+    viz.fit(X, y)
+    viz.show()
+
+Using ``topn=3``, we can identify the three most informative features in the
+concrete dataset as ``splast``, ``cement``, and ``water``. This approach to
+visualization may assist with *factor analysis* - the study of how variables
+contribute to an overall model. Note that although ``water`` has a negative
+coefficient, it is the magnitude (absolute value) of the coefficient that
+determines the ranking; the sign merely reflects the negative correlation of
+``water`` with the strength of concrete. Alternatively, ``topn=-3`` would
+reveal the three least informative features in the model.
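The slicing analogy can be made concrete with a short, purely illustrative
sketch (the feature names echo the concrete dataset above, but the ordering
here is invented):

.. code-block:: python

    # Features ranked from least to most informative (ascending order),
    # mirroring how the visualizer ranks them internally
    ranked = ["ash", "age", "fine", "water", "cement", "splast"]

    ranked[-3:]  # topn=3  -> the three most informative features
    ranked[:3]   # topn=-3 -> the three least informative features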
+This approach is useful for model tuning, similar to :doc:`rfecv`, but
+instead of automatically removing features it allows you to observe how the
+lowest-ranked features change across different model instantiations. In
+either case, if you have many features, using ``topn`` can significantly
+improve the readability and analytical value of the chart.
+
+The ``topn`` parameter can also be used when ``stack=True``. In the context
+of stacked feature importance graphs, the information of a feature is the
+width of the entire bar, or the sum of the absolute value of all
+coefficients contained therein.
+
+.. plot::
+    :context: close-figs
+    :alt: Stacked per-class importances with Logistic Regression
+
+    from yellowbrick.model_selection import FeatureImportances
+    from sklearn.linear_model import LogisticRegression
+    from sklearn.datasets import load_iris
+
+    data = load_iris()
+    X, y = data.data, data.target
+
+    model = LogisticRegression(multi_class="auto", solver="liblinear")
+    viz = FeatureImportances(model, stack=True, relative=False, topn=-3)
+    viz.fit(X, y)
+    viz.show()
+
 Discussion
 ----------
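The stacked ranking described above can be traced with a few lines of numpy
that mirror the selection logic added to ``fit()`` further down in this
patch (the coefficient values are invented for illustration):

.. code-block:: python

    import numpy as np

    # Per-class coefficients: 3 classes x 4 features (illustrative values)
    coefs = np.array([
        [ 0.4, -2.1,  0.3,  1.8],
        [-0.2,  1.5, -0.1, -2.2],
        [ 0.1,  0.6,  0.2,  0.4],
    ])

    # The width of each stacked bar is the summed absolute contribution
    widths = np.sum(np.absolute(coefs), 0)  # -> [0.7, 4.2, 0.6, 4.4]

    # topn=-3 keeps the three narrowest bars (least informative features)
    bottom3 = np.argsort(widths)[:3]        # -> indices [2, 0, 1]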
diff --git a/tests/baseline_images/test_model_selection/test_importances/test_topn.png b/tests/baseline_images/test_model_selection/test_importances/test_topn.png
new file mode 100644
index 0000000000000000000000000000000000000000..75fe59b7ee62f5b38e5f010cb4edc349fb19b317
GIT binary patch
literal 3493
[base85 binary image data omitted]

diff --git a/tests/baseline_images/test_model_selection/test_importances/test_topn_negative.png b/tests/baseline_images/test_model_selection/test_importances/test_topn_negative.png
new file mode 100644
index 0000000000000000000000000000000000000000..4983b418eb70e278dc53e36bd2d07e89f1d15b2b
GIT binary patch
literal 3449
[base85 binary image data omitted]

diff --git a/tests/baseline_images/test_model_selection/test_importances/test_topn_negative_stacked.png b/tests/baseline_images/test_model_selection/test_importances/test_topn_negative_stacked.png
new file mode 100644
index 0000000000000000000000000000000000000000..4d81f699783ad9faeedb2f57d7ffc8b7fc0b8eee
GIT binary patch
literal 3581
[base85 binary image data omitted]

diff --git a/tests/baseline_images/test_model_selection/test_importances/test_topn_stacked.png b/tests/baseline_images/test_model_selection/test_importances/test_topn_stacked.png
new file mode 100644
index 0000000000000000000000000000000000000000..99151181176957a38290bd253e70e986175c0f3d
GIT binary patch
literal 3597
[base85 binary image data omitted]

[diff for .../test_model_selection/test_importances.py (66 insertions) is garbled beyond recovery in this copy of the patch]

diff --git a/yellowbrick/model_selection/importances.py b/yellowbrick/model_selection/importances.py
[earlier hunks adding the ``topn`` parameter are garbled in this copy; the patch resumes in ``fit()`` below]
+        if self.topn and abs(self.topn) > self.features_.shape[0]:
+            raise YellowbrickValueError(
+                "topn '{}' cannot be greater than the number of "
+                "features '{}'".format(self.topn, self.features_.shape[0])
+            )
+
         # Sort the features and their importances
         if self.stack:
-            sort_idx = np.argsort(np.mean(self.feature_importances_, 0))
+            if self.topn:
+                abs_sort_idx = np.argsort(
+                    np.sum(np.absolute(self.feature_importances_), 0)
+                )
+                sort_idx = self._reduce_topn(abs_sort_idx)
+            else:
+                sort_idx = np.argsort(np.mean(self.feature_importances_, 0))
+
             self.features_ = self.features_[sort_idx]
             self.feature_importances_ = self.feature_importances_[:, sort_idx]
         else:
+            if self.topn:
+                abs_sort_idx = np.argsort(np.absolute(self.feature_importances_))
+                abs_sort_idx = self._reduce_topn(abs_sort_idx)
+
+                self.features_ = self.features_[abs_sort_idx]
+                self.feature_importances_ = self.feature_importances_[abs_sort_idx]
+
+            # Sort features by value (sorting a second time if topn)
             sort_idx = np.argsort(self.feature_importances_)
             self.features_ = self.features_[sort_idx]
             self.feature_importances_ = self.feature_importances_[sort_idx]
@@ -276,7 +303,7 @@ def finalize(self, **kwargs):
         # Set the title
         self.set_title(
             "Feature Importances of {} Features using {}".format(
-                len(self.features_), self.name
+                self._get_topn_title(), self.name
             )
         )
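The interaction between the magnitude filter and the final display sort in
``fit()`` can be traced with a small sketch (the importance values are
invented for illustration):

.. code-block:: python

    import numpy as np

    # Signed importances for five hypothetical features
    imp = np.array([0.2, -3.0, 1.5, -0.1, 2.4])

    # Rank feature indices by absolute magnitude, ascending
    abs_sort_idx = np.argsort(np.absolute(imp))  # -> [3, 0, 2, 4, 1]

    # topn=3 keeps the three largest magnitudes (see _reduce_topn below)
    keep = abs_sort_idx[-3:]                     # -> [2, 4, 1]

    # The survivors are then re-sorted by signed value for display
    order = np.argsort(imp[keep])
    print(imp[keep][order])                      # [-3.   1.5  2.4]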
+ """ + if self.topn > 0: + arr = arr[-self.topn:] + elif self.topn < 0: + arr = arr[:-self.topn] + return arr + + def _get_topn_title(self): + """ + Return an appropriate title for the plot: Top N, Bottom N, or N + """ + if self.topn: + if self.topn > 0: + return "Top {}".format(len(self.features_)) + else: + return "Bottom {}".format(len(self.features_)) + else: + return str(len(self.features_)) + ########################################################################## ## Quick Method @@ -365,6 +416,7 @@ def feature_importances( colors=None, colormap=None, is_fitted="auto", + topn=None, show=True, **kwargs ): @@ -431,6 +483,10 @@ def feature_importances( call ``plt.savefig`` from this signature, nor ``clear_figure``. If False, simply calls ``finalize()`` + topn : int, default=None + Display only the top N results with a positive integer, or the bottom N + results with a negative integer. If None or 0, all results are shown. + kwargs : dict Keyword arguments that are passed to the base class and may influence the visualization as defined in other Visualizers. @@ -452,6 +508,7 @@ def feature_importances( colors=colors, colormap=colormap, is_fitted=is_fitted, + topn=topn, **kwargs )