diff --git a/docs/api/cluster/elbow.rst b/docs/api/cluster/elbow.rst
index 7284bdcc4..06c9afb4b 100644
--- a/docs/api/cluster/elbow.rst
+++ b/docs/api/cluster/elbow.rst
@@ -3,9 +3,9 @@ Elbow Method
 ============
 
-The ``KElbowVisualizer`` implements the "elbow" method to help data scientists select the optimal number of clusters by fitting the model with a range of values for :math:`K`. If the line chart resembles an arm, then the "elbow" (the point of inflection on the curve) is a good indication that the underlying model fits best at that point.
+The ``KElbowVisualizer`` implements the "elbow" method to help data scientists select the optimal number of clusters by fitting the model with a range of values for :math:`K`. If the line chart resembles an arm, then the "elbow" (the point of inflection on the curve) is a good indication that the underlying model fits best at that point. In the visualizer, the "elbow" will be annotated with a dashed line.
 
-To demonstrate, in the following example the ``KElbowVisualizer`` fits the ``KMeans`` model for a range of :math:`K` values from 4 to 11 on a sample two-dimensional dataset with 8 random clusters of points. When the model is fit with 8 clusters, we can see an "elbow" in the graph, which in this case we know to be the optimal number.
+To demonstrate, in the following example the ``KElbowVisualizer`` fits the ``KMeans`` model for a range of :math:`K` values from 4 to 11 on a sample two-dimensional dataset with 8 random clusters of points. When the model is fit with 8 clusters, we can see a line annotating the "elbow" in the graph, which in this case we know to be the optimal number.
 
 .. plot::
     :context: close-figs
@@ -53,6 +53,31 @@ The ``KElbowVisualizer`` also displays the amount of time to train the clusterin
     visualizer.fit(X)        # Fit the data to the visualizer
     visualizer.poof()        # Draw/show/poof the data
 
+By default, the parameter ``locate_elbow`` is set to ``True``, which automatically finds the "elbow" that likely corresponds to the optimal value of k using the "knee point detection algorithm". However, users can turn off this feature by setting ``locate_elbow=False``. You can read about the implementation of this algorithm at `Knee point detection in Python `_ by Kevin Arvai.
+
+In the following example, we'll use the ``calinski_harabaz`` score and turn off the ``locate_elbow`` feature.
+
+.. plot::
+    :context: close-figs
+    :alt: KElbowVisualizer on synthetic dataset with 8 random clusters
+
+    from sklearn.cluster import KMeans
+    from sklearn.datasets import make_blobs
+
+    from yellowbrick.cluster import KElbowVisualizer
+
+    # Generate synthetic dataset with 8 random clusters
+    X, y = make_blobs(n_samples=1000, n_features=12, centers=8, random_state=42)
+
+    # Instantiate the clustering model and visualizer
+    model = KMeans()
+    visualizer = KElbowVisualizer(
+        model, k=(4,12), metric='calinski_harabaz', timings=False, locate_elbow=False
+    )
+
+    visualizer.fit(X)        # Fit the data to the visualizer
+    visualizer.poof()        # Draw/show/poof the data
+
 It is important to remember that the "elbow" method does not work well if the data is not very clustered. In this case, you might see a smooth curve and the optimal value of :math:`K` will be unclear.
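For context, a minimal sketch of how the default ``locate_elbow=True`` behavior and the new fitted attributes documented in the ``elbow.py`` changes below might be exercised; this is an illustration under the assumption that the patch is installed, and the value hinted in the comment depends on the data rather than being guaranteed:

    from sklearn.cluster import KMeans
    from sklearn.datasets import make_blobs

    from yellowbrick.cluster import KElbowVisualizer

    # Synthetic dataset with a known number of clusters
    X, y = make_blobs(n_samples=1000, n_features=12, centers=8, random_state=42)

    # locate_elbow defaults to True, so the knee point detection algorithm runs during fit
    visualizer = KElbowVisualizer(KMeans(), k=(4, 12))
    visualizer.fit(X)
    visualizer.poof()

    # New attributes populated when an elbow is found (None otherwise, with a warning)
    print(visualizer.elbow_value_)   # e.g. 8 for this dataset
    print(visualizer.elbow_score_)   # score of the chosen metric at that k
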
diff --git a/tests/test_cluster/test_elbow.py b/tests/test_cluster/test_elbow.py
index 1a1a8f2fb..2f9ce13cd 100644
--- a/tests/test_cluster/test_elbow.py
+++ b/tests/test_cluster/test_elbow.py
@@ -217,7 +217,7 @@ def test_distortion_metric(self):
         """
         Test the distortion metric of the k-elbow visualizer
         """
         visualizer = KElbowVisualizer(
-            KMeans(random_state=0), k=5, metric="distortion", timings=False
+            KMeans(random_state=0), k=5, metric="distortion", timings=False, locate_elbow=False
         )
 
         visualizer.fit(X)
@@ -236,7 +236,7 @@ def test_silhouette_metric(self):
         """
         Test the silhouette metric of the k-elbow visualizer
         """
         visualizer = KElbowVisualizer(
-            KMeans(random_state=0), k=5, metric="silhouette", timings=False
+            KMeans(random_state=0), k=5, metric="silhouette", timings=False, locate_elbow=False
         )
 
         visualizer.fit(X)
@@ -256,7 +256,7 @@ def test_calinski_harabaz_metric(self):
         """
         visualizer = KElbowVisualizer(
             KMeans(random_state=0), k=5,
-            metric="calinski_harabaz", timings=False
+            metric="calinski_harabaz", timings=False, locate_elbow=False
         )
         visualizer.fit(X)
         assert len(visualizer.k_scores_) == 4
@@ -286,7 +286,7 @@ def test_timings(self):
         """
         Test the twinx double axes with k-elbow timings
         """
         visualizer = KElbowVisualizer(
-            KMeans(random_state=0), k=5, timings=True
+            KMeans(random_state=0), k=5, timings=True, locate_elbow=False
         )
 
         visualizer.fit(X)
diff --git a/yellowbrick/cluster/elbow.py b/yellowbrick/cluster/elbow.py
index 65aa79aa1..39d0f934d 100644
--- a/yellowbrick/cluster/elbow.py
+++ b/yellowbrick/cluster/elbow.py
@@ -22,9 +22,12 @@
 import time
 import numpy as np
 import scipy.sparse as sp
+import warnings
 
 from .base import ClusteringScoreVisualizer
-from ..exceptions import YellowbrickValueError
+from ..style.palettes import LINE_COLOR
+from ..exceptions import YellowbrickValueError, YellowbrickWarning
+from ..utils import KneeLocator
 
 from sklearn.metrics import silhouette_score
 from sklearn.metrics import calinski_harabaz_score
@@ -170,10 +173,31 @@ class KElbowVisualizer(ClusteringScoreVisualizer):
         Display the fitting time per k to evaluate the amount of time required
         to train the clustering model.
 
+    locate_elbow : bool, default: True
+        Automatically find the "elbow" or "knee" which likely corresponds to the optimal
+        value of k using the "knee point detection algorithm". The knee point detection
+        algorithm finds the point of maximum curvature, which in a well-behaved clustering
+        problem also represents the pivot of the elbow curve. The point is labeled with a
+        dashed line and annotated with the score and k values.
+
     kwargs : dict
         Keyword arguments that are passed to the base class and may influence
         the visualization as defined in other Visualizers.
 
+    Attributes
+    ----------
+    k_scores_ : array of shape (n,) where n is no. of k values
+        The metric score corresponding to each k value.
+
+    k_timers_ : array of shape (n,) where n is no. of k values
+        The time taken to fit n KMeans models, one for each k value.
+
+    elbow_value_ : integer
+        The optimal value of k.
+
+    elbow_score_ : float
+        The metric score corresponding to the optimal value of k.
+
     Examples
     --------
@@ -194,6 +218,8 @@ class KElbowVisualizer(ClusteringScoreVisualizer):
     For a discussion on the Elbow method, read more at `Robert Gove's Block `_.
+    For more on the 'Knee Point Detection Algorithm', see `Finding a "kneedle" in a Haystack
+    `_.
 
     .. seealso:: The scikit-learn documentation for the `silhouette_score
         `_ and `calinski_harabaz_score
@@ -206,7 +232,7 @@ class KElbowVisualizer(ClusteringScoreVisualizer):
     """
 
     def __init__(self, model, ax=None, k=10,
-                 metric="distortion", timings=True, **kwargs):
+                 metric="distortion", timings=True, locate_elbow=True, **kwargs):
         super(KElbowVisualizer, self).__init__(model, ax=ax, **kwargs)
 
         # Get the scoring method
@@ -218,7 +244,9 @@ def __init__(self, model, ax=None, k=10,
 
         # Store the arguments
         self.scoring_metric = KELBOW_SCOREMAP[metric]
+        self.metric = metric
         self.timings = timings
+        self.locate_elbow = locate_elbow
 
         # Convert K into a tuple argument if an integer
         if isinstance(k, int):
@@ -241,13 +269,19 @@ def __init__(self, model, ax=None, k=10,
     def fit(self, X, y=None, **kwargs):
         """
         Fits n KMeans models where n is the length of ``self.k_values_``,
-        storing the silhoutte scores in the ``self.k_scores_`` attribute.
+        storing the silhouette scores in the ``self.k_scores_`` attribute.
+        The "elbow" and the score corresponding to it are stored in
+        ``self.elbow_value_`` and ``self.elbow_score_`` respectively.
         This method finishes up by calling draw to create the plot.
         """
 
         self.k_scores_ = []
         self.k_timers_ = []
 
+        if self.locate_elbow:
+            self.elbow_value_ = None
+            self.elbow_score_ = None
+
         for k in self.k_values_:
             # Compute the start time for each model
             start = time.time()
@@ -260,7 +294,24 @@ def fit(self, X, y=None, **kwargs):
             self.k_timers_.append(time.time() - start)
             self.k_scores_.append(
                 self.scoring_metric(X, self.estimator.labels_)
-            )
+            )
+
+        if self.locate_elbow:
+            locator_kwargs = {
+                'distortion': {'curve_nature': 'convex', 'curve_direction': 'decreasing'},
+                'silhouette': {'curve_nature': 'concave', 'curve_direction': 'increasing'},
+                'calinski_harabaz': {'curve_nature': 'concave', 'curve_direction': 'increasing'},
+            }.get(self.metric, {})
+            elbow_locator = KneeLocator(self.k_values_, self.k_scores_, **locator_kwargs)
+            self.elbow_value_ = elbow_locator.knee
+            if self.elbow_value_ is None:
+                warning_message = \
+                    "No 'knee' or 'elbow' point detected, " \
+                    "pass `locate_elbow=False` to remove the warning"
+                warnings.warn(warning_message, YellowbrickWarning)
+            else:
+                self.elbow_score_ = self.k_scores_[self.k_values_.index(self.elbow_value_)]
+
         self.draw()
@@ -271,8 +322,11 @@ def draw(self):
         Draw the elbow curve for the specified scores and values of K.
         """
         # Plot the silhouette score against k
-        self.ax.plot(self.k_values_, self.k_scores_, marker="D", label="score")
-
+        self.ax.plot(self.k_values_, self.k_scores_, marker="D")
+        if self.locate_elbow and self.elbow_value_ is not None:
+            elbow_label = r"$elbow\ at\ k={}, score={:0.3f}$".format(self.elbow_value_, self.elbow_score_)
+            self.ax.axvline(self.elbow_value_, c=LINE_COLOR, linestyle="--", label=elbow_label)
+
         # If we're going to plot the timings, create a twinx axis
         if self.timings:
             self.axes = [self.ax, self.ax.twinx()]
@@ -281,12 +335,14 @@ def draw(self):
             c='g', marker="o", linestyle="--", alpha=0.75,
         )
 
+        return self.ax
 
     def finalize(self):
         """
         Prepare the figure for rendering by setting the title as well as the
         X and Y axis labels and adding the legend.
+ """ # Get the metric name metric = self.scoring_metric.__name__.replace("_", " ").title() @@ -299,6 +355,10 @@ def finalize(self): # Set the x and y labels self.ax.set_xlabel('k') self.ax.set_ylabel(metric.lower()) + + #set the legend if locate_elbow=True + if self.locate_elbow and self.elbow_value_!=None: + self.ax.legend(loc='best', fontsize='medium') # Set the second y axis labels if self.timings: diff --git a/yellowbrick/utils/__init__.py b/yellowbrick/utils/__init__.py index 864802bf3..b1cadd903 100644 --- a/yellowbrick/utils/__init__.py +++ b/yellowbrick/utils/__init__.py @@ -22,3 +22,4 @@ from .helpers import * from .types import * +from .kneed import * diff --git a/yellowbrick/utils/kneed.py b/yellowbrick/utils/kneed.py new file mode 100644 index 000000000..1ec4eda5d --- /dev/null +++ b/yellowbrick/utils/kneed.py @@ -0,0 +1,242 @@ +# yellowbrick.utils.kneed +# A port of the knee-point detection package, kneed. +# +# Author: Kevin Arvai +# Author: Pradeep Singh +# Created: Mon Apr 15 09:43:18 2019 -0400 +# +# Copyright (C) 2017 Kevin Arvai +# All rights reserved. +# Redistribution and use in source and binary forms, with or without modification, +# are permitted provided that the following conditions are met: +# +# 1. Redistributions of source code must retain the above copyright notice, this list +# of conditions and the following disclaimer. +# +# 2. Redistributions in binary form must reproduce the above copyright notice, this +# list of conditions and the following disclaimer in the documentation and/or other +# materials provided with the distribution. +# +# 3. Neither the name of the copyright holder nor the names of its contributors may +# be used to endorse or promote products derived from this software without specific +# prior written permission. +# +# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND +# ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED +# WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE +# DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR +# ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES +# (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS +# OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY +# THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING +# NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN +# IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +# +# ID: kneed.py [] pswaldia@no-reply.github.com $ + +""" +This package contains a port of the knee-point detection package, kneed, by +Kevin Arvai and hosted at https://github.com/arvkevi/kneed. This port is maintained +with permission by the Yellowbrick contributors. +""" +import numpy as np +from scipy import interpolate +from scipy.signal import argrelextrema +import warnings + +from yellowbrick.exceptions import YellowbrickWarning + + +class KneeLocator(object): + """ + Finds the "elbow" or "knee" which is a value corresponding to the point of maximum curvature + in an elbow curve, using knee point detection algorithm. This point is accessible via the + `knee` attribute. + + Parameters + ---------- + x : list + A list of k values representing the no. of clusters in KMeans Clustering algorithm. + + y : list + A list of silhouette score corresponding to each value of k. 
+
+    S : float, default: 1.0
+        Sensitivity parameter that allows us to adjust how aggressive we want KneeLocator to
+        be when detecting "knees" or "elbows".
+
+    curve_nature : string, default: 'concave'
+        A string that determines the nature of the elbow curve in which the "knee" or "elbow"
+        is to be found.
+
+    curve_direction : string, default: 'increasing'
+        A string that determines the increasing or decreasing nature of the elbow curve in
+        which the "knee" or "elbow" is to be found.
+
+    Notes
+    -----
+    The KneeLocator is implemented using the "knee point detection algorithm" which can be read at
+    ``
+
+    """
+
+    def __init__(self, x, y, S=1.0, curve_nature='concave', curve_direction='increasing'):
+
+        # Raw Input
+        self.x = x
+        self.y = y
+        self.curve_nature = curve_nature
+        self.curve_direction = curve_direction
+        self.N = len(self.x)
+        self.S = S
+
+        # Step 1: fit a smooth line
+        uspline = interpolate.interp1d(self.x, self.y)
+        self.Ds_x = np.linspace(np.min(self.x), np.max(self.x), self.N)
+        self.Ds_y = uspline(self.Ds_x)
+
+        # Step 2: normalize values
+        self.xsn = self.__normalize(self.Ds_x)
+        self.ysn = self.__normalize(self.Ds_y)
+
+        # Step 3: Calculate difference curve
+        self.xd = self.xsn
+        if self.curve_nature == 'convex' and curve_direction == 'decreasing':
+            self.yd = self.ysn + self.xsn
+            self.yd = 1 - self.yd
+        elif self.curve_nature == 'concave' and curve_direction == 'decreasing':
+            self.yd = self.ysn + self.xsn
+        elif self.curve_nature == 'concave' and curve_direction == 'increasing':
+            self.yd = self.ysn - self.xsn
+        if self.curve_nature == 'convex' and curve_direction == 'increasing':
+            self.yd = abs(self.ysn - self.xsn)
+
+        # Step 4: Identify local maxima/minima
+        # local maxima
+        self.xmx_idx = argrelextrema(self.yd, np.greater)[0]
+        self.xmx = self.xd[self.xmx_idx]
+        self.ymx = self.yd[self.xmx_idx]
+
+        # local minima
+        self.xmn_idx = argrelextrema(self.yd, np.less)[0]
+        self.xmn = self.xd[self.xmn_idx]
+        self.ymn = self.yd[self.xmn_idx]
+
+        # Step 5: Calculate thresholds
+        self.Tmx = self.__threshold(self.ymx)
+
+        # Step 6: find knee
+        self.knee, self.norm_knee, self.knee_x = self.find_knee()
+
+    @staticmethod
+    def __normalize(a):
+        """
+        Normalizes an array.
+
+        Parameters
+        ----------
+        a : list
+            The array to normalize
+        """
+        return (a - min(a)) / (max(a) - min(a))
+
+    def __threshold(self, ymx_i):
+        """
+        Calculates the difference threshold for a
+        given difference local maximum.
+
+        Parameters
+        ----------
+        ymx_i : float
+            The normalized y value of a local maximum.
+        """
+        return ymx_i - (self.S * np.diff(self.xsn).mean())
+
+    def find_knee(self):
+        """
+        Finds and returns the "knee" or "elbow" value, the normalized knee
+        value, and the x value where the knee is located.
+
+        """
+        if not self.xmx_idx.size:
+            warning_message = \
+                'No "knee" or "elbow" point detected. ' \
+                'This could be due to bad clustering, no ' \
+                'actual clusters being formed, etc.'
+            warnings.warn(warning_message, YellowbrickWarning)
+            return None, None, None
+
+        mxmx_iter = np.arange(self.xmx_idx[0], len(self.xsn))
+        xmx_idx_iter = np.append(self.xmx_idx, len(self.xsn))
+
+        knee_, norm_knee_, knee_x = 0.0, 0.0, None
+        for mxmx_i, mxmx in enumerate(xmx_idx_iter):
+            # stopping criteria for exhausting array
+            if mxmx_i == len(xmx_idx_iter) - 1:
+                break
+            # indices between maxima/minima
+            idxs = (mxmx_iter > xmx_idx_iter[mxmx_i]) * \
+                   (mxmx_iter < xmx_idx_iter[mxmx_i + 1])
+            between_local_mx = mxmx_iter[np.where(idxs)]
+
+            for j in between_local_mx:
+                if j in self.xmn_idx:
+                    # reached a minima, x indices are unique
+                    # only need to check if j is a min
+                    if self.yd[j + 1] > self.yd[j]:
+                        self.Tmx[mxmx_i] = 0
+                        knee_x = None  # reset x where yd crossed Tmx
+                    elif self.yd[j + 1] <= self.yd[j]:
+                        warning_message = "If this is a minima, " \
+                                          "how would you ever get here."
+                        warnings.warn(warning_message, YellowbrickWarning)
+                if self.yd[j] < self.Tmx[mxmx_i] or self.Tmx[mxmx_i] < 0:
+                    # declare a knee
+                    if not knee_x:
+                        knee_x = j
+                    knee_ = self.x[self.xmx_idx[mxmx_i]]
+                    norm_knee_ = self.xsn[self.xmx_idx[mxmx_i]]
+        return knee_, norm_knee_, knee_x
+
+    def plot_knee_normalized(self):
+        """
+        Plots the normalized curve, the difference curve (xd, yd) and the
+        knee, if it exists.
+
+        """
+        import matplotlib.pyplot as plt
+
+        plt.figure(figsize=(8, 8))
+        plt.plot(self.xsn, self.ysn)
+        plt.plot(self.xd, self.yd, 'r')
+        plt.xticks(np.arange(min(self.xsn), max(self.xsn) + 0.1, 0.1))
+        plt.yticks(np.arange(min(self.xd), max(self.ysn) + 0.1, 0.1))
+
+        plt.vlines(self.norm_knee, plt.ylim()[0], plt.ylim()[1])
+
+    def plot_knee(self):
+        """
+        Plot the curve and the knee, if it exists
+
+        """
+        import matplotlib.pyplot as plt
+
+        plt.figure(figsize=(8, 8))
+        plt.plot(self.x, self.y)
+        plt.vlines(self.knee, plt.ylim()[0], plt.ylim()[1])
+
+    # Niceties for users working with elbows rather than knees
+
+    @property
+    def elbow(self):
+        return self.knee
+
+    @property
+    def norm_elbow(self):
+        return self.norm_knee
+
+    @property
+    def elbow_x(self):
+        return self.knee_x
+
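To see the ported ``KneeLocator`` in isolation, a small sketch using only the constructor and attributes defined in ``kneed.py`` above; the score values are made up for illustration and the detected knee depends on the curve shape rather than any particular expected number:

    from yellowbrick.utils import KneeLocator

    # Hypothetical distortion-style scores that drop quickly and then level off
    k_values = [1, 2, 3, 4, 5, 6, 7, 8]
    scores = [100.0, 55.0, 30.0, 18.0, 15.0, 13.5, 12.5, 12.0]

    # Distortion decreases as k grows, so the curve is convex and decreasing,
    # matching the locator_kwargs used for the 'distortion' metric in elbow.py
    locator = KneeLocator(
        k_values, scores, curve_nature='convex', curve_direction='decreasing'
    )

    print(locator.knee)    # the k value at the detected point of maximum curvature
    print(locator.elbow)   # alias provided for users thinking in terms of elbows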