
Add elbow detection using the "kneedle" method to Elbow Visualizer #813

Merged: 17 commits, Apr 22, 2019
29 changes: 27 additions & 2 deletions docs/api/cluster/elbow.rst
@@ -3,9 +3,9 @@
Elbow Method
============

The ``KElbowVisualizer`` implements the "elbow" method to help data scientists select the optimal number of clusters by fitting the model with a range of values for :math:`K`. If the line chart resembles an arm, then the "elbow" (the point of inflection on the curve) is a good indication that the underlying model fits best at that point.
The ``KElbowVisualizer`` implements the "elbow" method to help data scientists select the optimal number of clusters by fitting the model with a range of values for :math:`K`. If the line chart resembles an arm, then the "elbow" (the point of inflection on the curve) is a good indication that the underlying model fits best at that point. In the visualizer, the "elbow" will be annotated with a dashed line.

To demonstrate, in the following example the ``KElbowVisualizer`` fits the ``KMeans`` model for a range of :math:`K` values from 4 to 11 on a sample two-dimensional dataset with 8 random clusters of points. When the model is fit with 8 clusters, we can see an "elbow" in the graph, which in this case we know to be the optimal number.
To demonstrate, in the following example the ``KElbowVisualizer`` fits the ``KMeans`` model for a range of :math:`K` values from 4 to 11 on a sample two-dimensional dataset with 8 random clusters of points. When the model is fit with 8 clusters, we can see a line annotating the "elbow" in the graph, which in this case we know to be the optimal number.

.. plot::
:context: close-figs
@@ -53,6 +53,31 @@ The ``KElbowVisualizer`` also displays the amount of time to train the clustering
visualizer.fit(X) # Fit the data to the visualizer
visualizer.poof() # Draw/show/poof the data

By default, the parameter ``locate_elbow`` is set to ``True``, which automatically finds the "elbow" that likely corresponds to the optimal value of k using the "knee point detection algorithm". However, users can turn this feature off by setting ``locate_elbow=False``. You can read about the implementation of this algorithm at `Knee point detection in Python <https://github.com/arvkevi/kneed>`_ by Kevin Arvai.
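The core idea of knee point detection can be sketched in a few lines: treat the score curve as a set of points, draw the chord between its endpoints, and pick the value of k whose point lies farthest from that chord. This is only an illustrative approximation of the algorithm (the kneed implementation also normalizes and smooths the curve); ``find_elbow`` below is a hypothetical helper, not part of yellowbrick or kneed.

```python
import math

def find_elbow(ks, scores):
    """Return the k whose point lies farthest from the endpoint chord."""
    x0, y0 = ks[0], scores[0]
    x1, y1 = ks[-1], scores[-1]

    def dist(x, y):
        # Perpendicular distance from (x, y) to the line through
        # (x0, y0) and (x1, y1)
        num = abs((y1 - y0) * x - (x1 - x0) * y + x1 * y0 - x0 * y1)
        return num / math.hypot(x1 - x0, y1 - y0)

    return max(zip(ks, scores), key=lambda p: dist(*p))[0]

# A distortion-style curve with a clear elbow at k=4
ks = [1, 2, 3, 4, 5, 6, 7]
scores = [100, 60, 30, 12, 10, 9, 8.5]
print(find_elbow(ks, scores))  # 4
```

On a distortion curve the score drops steeply until the optimal k and then flattens, so the farthest-from-chord point coincides with the visual elbow.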

In the following example, we'll use the ``calinski_harabaz`` score and turn off the ``locate_elbow`` feature.

.. plot::
:context: close-figs
:alt: KElbowVisualizer on synthetic dataset with 8 random clusters

from sklearn.cluster import KMeans
from sklearn.datasets import make_blobs

from yellowbrick.cluster import KElbowVisualizer

# Generate synthetic dataset with 8 random clusters
X, y = make_blobs(n_samples=1000, n_features=12, centers=8, random_state=42)

# Instantiate the clustering model and visualizer
model = KMeans()
visualizer = KElbowVisualizer(
model, k=(4,12), metric='calinski_harabaz', timings=False, locate_elbow=False
)

visualizer.fit(X) # Fit the data to the visualizer
visualizer.poof() # Draw/show/poof the data

It is important to remember that the "elbow" method does not work well if the data
is not very clustered. In this case, you might see a smooth curve and the optimal value of :math:`K` will be unclear.
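One way to see why a smooth curve defeats the elbow method is to measure how far the curve bulges away from the chord between its endpoints, relative to the total score range: a sharp elbow produces a large bulge, a gradual decay a small one. The ``sharpness`` helper below is a hypothetical illustration, not yellowbrick code, and assumes a decreasing score curve.

```python
def sharpness(ks, scores):
    """Largest vertical gap between the score curve and its endpoint
    chord, relative to the total score range (assumes a decreasing curve)."""
    x0, y0 = ks[0], scores[0]
    x1, y1 = ks[-1], scores[-1]
    slope = (y1 - y0) / (x1 - x0)
    # Gap between each score and the chord evaluated at the same k
    gaps = [abs(y0 + slope * (x - x0) - y) for x, y in zip(ks, scores)]
    return max(gaps) / (y0 - y1)

ks = [1, 2, 3, 4, 5, 6, 7]
elbow = [100, 60, 30, 12, 10, 9, 8.5]           # clear elbow at k=4
smooth = [100, 80, 64, 51.2, 41.0, 32.8, 26.2]  # gradual decay, no elbow

print(sharpness(ks, elbow) > sharpness(ks, smooth))  # True
```

When the data is weakly clustered, every point sits close to the chord, no single k stands out, and the detected "knee" (if any) should be treated with suspicion.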

8 changes: 4 additions & 4 deletions tests/test_cluster/test_elbow.py
@@ -217,7 +217,7 @@ def test_distortion_metric(self):
Test the distortion metric of the k-elbow visualizer
"""
visualizer = KElbowVisualizer(
KMeans(random_state=0), k=5, metric="distortion", timings=False
KMeans(random_state=0), k=5, metric="distortion", timings=False, locate_elbow=False
)
visualizer.fit(X)

@@ -236,7 +236,7 @@ def test_silhouette_metric(self):
Test the silhouette metric of the k-elbow visualizer
"""
visualizer = KElbowVisualizer(
KMeans(random_state=0), k=5, metric="silhouette", timings=False
KMeans(random_state=0), k=5, metric="silhouette", timings=False, locate_elbow=False
)
visualizer.fit(X)

@@ -256,7 +256,7 @@ def test_calinski_harabaz_metric(self):
"""
visualizer = KElbowVisualizer(
KMeans(random_state=0), k=5,
metric="calinski_harabaz", timings=False
metric="calinski_harabaz", timings=False, locate_elbow=False
)
visualizer.fit(X)
assert len(visualizer.k_scores_) == 4
@@ -286,7 +286,7 @@ def test_timings(self):
Test the twinx double axes with k-elbow timings
"""
visualizer = KElbowVisualizer(
KMeans(random_state=0), k=5, timings=True
KMeans(random_state=0), k=5, timings=True, locate_elbow=False
)
visualizer.fit(X)

72 changes: 66 additions & 6 deletions yellowbrick/cluster/elbow.py
@@ -22,9 +22,12 @@
import time
import numpy as np
import scipy.sparse as sp
import warnings

from .base import ClusteringScoreVisualizer
from ..exceptions import YellowbrickValueError
from ..style.palettes import LINE_COLOR
from ..exceptions import YellowbrickValueError, YellowbrickWarning
from ..utils import KneeLocator

from sklearn.metrics import silhouette_score
from sklearn.metrics import calinski_harabaz_score
@@ -170,10 +173,31 @@ class KElbowVisualizer(ClusteringScoreVisualizer):
Display the fitting time per k to evaluate the amount of time required
to train the clustering model.

locate_elbow : bool, default: True
Automatically find the "elbow" or "knee" which likely corresponds to the optimal
value of k using the "knee point detection algorithm". The knee point detection
algorithm finds the point of maximum curvature, which in a well-behaved clustering
problem also represents the pivot of the elbow curve. The point is labeled with a
dashed line and annotated with the score and k values.

kwargs : dict
Keyword arguments that are passed to the base class and may influence
the visualization as defined in other Visualizers.

Attributes
----------
k_scores_ : array of shape (n,) where n is the number of k values
The metric score corresponding to each k value.

k_timers_ : array of shape (n,) where n is the number of k values
The time taken to fit the model for each k value.

elbow_value_ : integer
The optimal value of k.

elbow_score_ : float
The metric score corresponding to the optimal value of k.

Member comment: Very nice, thank you!

Examples
--------

@@ -194,6 +218,8 @@ class KElbowVisualizer(ClusteringScoreVisualizer):

For a discussion on the Elbow method, read more at
`Robert Gove's Block <https://bl.ocks.org/rpgove/0060ff3b656618e9136b>`_.
To learn about the 'Knee Point Detection Algorithm', read `Finding a "kneedle" in a Haystack
<https://raghavan.usc.edu//papers/kneedle-simplex11.pdf>`_.

.. seealso:: The scikit-learn documentation for the `silhouette_score
<https://bit.ly/2LYWjYb>`_ and `calinski_harabaz_score
@@ -206,7 +232,7 @@ class KElbowVisualizer(ClusteringScoreVisualizer):
"""

def __init__(self, model, ax=None, k=10,
metric="distortion", timings=True, **kwargs):
metric="distortion", timings=True, locate_elbow=True, **kwargs):
super(KElbowVisualizer, self).__init__(model, ax=ax, **kwargs)

# Get the scoring method
@@ -218,7 +244,9 @@ def __init__(self, model, ax=None, k=10,

# Store the arguments
self.scoring_metric = KELBOW_SCOREMAP[metric]
self.metric = metric
self.timings = timings
self.locate_elbow = locate_elbow

# Convert K into a tuple argument if an integer
if isinstance(k, int):
@@ -241,13 +269,19 @@
def fit(self, X, y=None, **kwargs):
"""
Fits n KMeans models where n is the length of ``self.k_values_``,
storing the silhoutte scores in the ``self.k_scores_`` attribute.
storing the silhouette scores in the ``self.k_scores_`` attribute.
The "elbow" and the score corresponding to it are stored in
``self.elbow_value_`` and ``self.elbow_score_`` respectively.
This method finishes up by calling draw to create the plot.
"""

self.k_scores_ = []
self.k_timers_ = []

if self.locate_elbow:
self.elbow_value_ = None
self.elbow_score_ = None

for k in self.k_values_:
# Compute the start time for each model
start = time.time()
@@ -260,7 +294,24 @@
self.k_timers_.append(time.time() - start)
self.k_scores_.append(
self.scoring_metric(X, self.estimator.labels_)
)
)

if self.locate_elbow:
locator_kwargs = {
'distortion': {'curve_nature': 'convex', 'curve_direction': 'decreasing'},
'silhouette': {'curve_nature': 'concave', 'curve_direction': 'increasing'},
'calinski_harabaz': {'curve_nature': 'concave', 'curve_direction': 'increasing'},
}.get(self.metric, {})
elbow_locator = KneeLocator(self.k_values_, self.k_scores_, **locator_kwargs)
self.elbow_value_ = elbow_locator.knee
if self.elbow_value_ is None:
warning_message = (
"No 'knee' or 'elbow' point detected, "
"pass `locate_elbow=False` to remove the warning"
)
warnings.warn(warning_message, YellowbrickWarning)
Member comment: Very nice, thank you for changing this warning into a much more understandable signal to the user!

else:
self.elbow_score_ = self.k_scores_[self.k_values_.index(self.elbow_value_)]


self.draw()
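The per-metric ``locator_kwargs`` above reflect the shape of each score curve: distortion falls as k grows (convex, decreasing), while silhouette and calinski_harabaz rise (concave, increasing). A common trick behind such direction flags, sketched below with a hypothetical ``normalize_direction`` helper (not yellowbrick or kneed code), is to mirror a decreasing curve so that a single concave/increasing detector can handle both orientations.

```python
def normalize_direction(scores, direction):
    # Mirror a decreasing curve about its maximum so that both cases
    # present an increasing curve to a single knee detector
    if direction == "decreasing":
        top = max(scores)
        return [top - s for s in scores]
    return list(scores)

distortion = [100, 60, 30, 12, 10, 9]  # decreasing as k grows
flipped = normalize_direction(distortion, "decreasing")
print(flipped)  # [0, 40, 70, 88, 90, 91]
```

The knee of the flipped curve sits at the same index as the elbow of the original, so the detector only ever has to reason about one curve orientation.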

@@ -271,8 +322,11 @@ def draw(self):
Draw the elbow curve for the specified scores and values of K.
"""
# Plot the silhouette score against k
self.ax.plot(self.k_values_, self.k_scores_, marker="D", label="score")

self.ax.plot(self.k_values_, self.k_scores_, marker="D")
if self.locate_elbow and self.elbow_value_ is not None:
elbow_label = r"$elbow\ at\ k={}, score={:0.3f}$".format(self.elbow_value_, self.elbow_score_)
self.ax.axvline(self.elbow_value_, c=LINE_COLOR, linestyle="--", label=elbow_label)

# If we're going to plot the timings, create a twinx axis
if self.timings:
self.axes = [self.ax, self.ax.twinx()]
@@ -281,12 +335,14 @@
c='g', marker="o", linestyle="--", alpha=0.75,
)


return self.ax

def finalize(self):
"""
Prepare the figure for rendering by setting the title as well as the
X and Y axis labels and adding the legend.

"""
# Get the metric name
metric = self.scoring_metric.__name__.replace("_", " ").title()
@@ -299,6 +355,10 @@ def finalize(self):
# Set the x and y labels
self.ax.set_xlabel('k')
self.ax.set_ylabel(metric.lower())

# Set the legend if locate_elbow=True
if self.locate_elbow and self.elbow_value_ is not None:
self.ax.legend(loc='best', fontsize='medium')
Member comment: Nice!


# Set the second y axis labels
if self.timings:
Expand Down
1 change: 1 addition & 0 deletions yellowbrick/utils/__init__.py
@@ -22,3 +22,4 @@

from .helpers import *
from .types import *
from .kneed import *