Add PCA plot directives (PR #926)
This PR implements the plot directive for the PCA docs and removes files made unnecessary by the plot directive.
nickpowersys authored and rebeccabilbro committed Jul 20, 2019
1 parent 9d7c317 commit ae99abc
Showing 7 changed files with 47 additions and 115 deletions.
Binary file removed docs/api/features/images/pca_biplot_2d.png
Binary file removed docs/api/features/images/pca_biplot_3d.png
Binary file removed docs/api/features/images/pca_projection_2d.png
Binary file removed docs/api/features/images/pca_projection_3d.png
57 changes: 0 additions & 57 deletions docs/api/features/pca.py

This file was deleted.

70 changes: 31 additions & 39 deletions docs/api/features/pca.rst
@@ -5,83 +5,75 @@ PCA Projection

The PCA Decomposition visualizer utilizes principal component analysis to decompose high dimensional data into two or three dimensions so that each instance can be plotted in a scatter plot. The use of PCA means that the projected dataset can be analyzed along axes of principal variation and can be interpreted to determine if spherical distance metrics can be utilized.

.. code:: python
.. plot::
:context: close-figs
:alt: PCA Projection, 2D

# Load the classification data set
data = load_data('credit')
import numpy as np

from yellowbrick.datasets import load_credit
from yellowbrick.features.pca import PCADecomposition

# Specify the features of interest and the target
target = "default"
features = [col for col in data.columns if col != target]
# Extract the instance data and the target
X = data[features]
y = data[target]
X, y = load_credit()

# Create a list of colors to assign to points in the plot
colors = np.array(['r' if yi else 'b' for yi in y])

.. code:: python
from yellowbrick.features.pca import PCADecomposition
visualizer = PCADecomposition(scale=True, color=colors)
visualizer.fit_transform(X, y)
visualizer.poof()


.. image:: images/pca_projection_2d.png
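
To see concretely what "axes of principal variation" means, the decomposition behind this plot can be inspected directly with scikit-learn (a standalone sketch, not Yellowbrick's implementation; the iris dataset stands in for the credit data used above):

```python
from sklearn.datasets import load_iris
from sklearn.decomposition import PCA
from sklearn.preprocessing import StandardScaler

# Scale first, as the visualizer does with scale=True
X, _ = load_iris(return_X_y=True)
X_scaled = StandardScaler().fit_transform(X)

# Project onto the two largest principal components
pca = PCA(n_components=2).fit(X_scaled)
projected = pca.transform(X_scaled)

print(projected.shape)                # (150, 2)
print(pca.explained_variance_ratio_)  # share of variance along each plotted axis
```

The explained variance ratios show how much of the spread each plotted axis actually captures, which helps decide whether a two-dimensional scatter is a faithful summary of the data.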

The PCA projection can also be plotted in three dimensions to attempt to visualize more principal components and get a better sense of the distribution in high dimensions.

.. code:: python
.. plot::
:context: close-figs
:alt: PCA Projection, 3D

import numpy as np

from yellowbrick.datasets import load_credit
from yellowbrick.features.pca import PCADecomposition

X, y = load_credit()

colors = np.array(['r' if yi else 'b' for yi in y])

visualizer = PCADecomposition(scale=True, color=colors, proj_dim=3)
visualizer.fit_transform(X, y)
visualizer.poof()


.. image:: images/pca_projection_3d.png

Biplot
------

The PCA projection can be enhanced to a biplot whose points are the projected instances and whose vectors represent the structure of the data in high dimensional space. By using the ``proj_features=True`` flag, vectors for each feature in the dataset are drawn on the scatter plot in the direction of the maximum variance for that feature. These structures can be used to analyze the importance of a feature to the decomposition or to find features of related variance for further analysis.

.. code:: python
The PCA projection can be enhanced to a biplot whose points are the projected instances and whose vectors represent the structure of the data in high dimensional space. By using ``proj_features=True``, vectors for each feature in the dataset are drawn on the scatter plot in the direction of the maximum variance for that feature. These structures can be used to analyze the importance of a feature to the decomposition or to find features of related variance for further analysis.
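
The feature vectors drawn by ``proj_features=True`` are, in effect, the PCA loadings. A rough standalone illustration with scikit-learn (iris stands in for the concrete dataset; this is not Yellowbrick's internal code):

```python
from sklearn.datasets import load_iris
from sklearn.decomposition import PCA
from sklearn.preprocessing import StandardScaler

data = load_iris()
X_scaled = StandardScaler().fit_transform(data.data)
pca = PCA(n_components=2).fit(X_scaled)

# Each column of components_ holds one feature's loading on (PC1, PC2);
# a biplot draws these pairs as arrows from the origin.
for name, (pc1, pc2) in zip(data.feature_names, pca.components_.T):
    print(f"{name}: PC1={pc1:+.2f}, PC2={pc2:+.2f}")
```

Features whose arrows point in similar directions load on the same components, which is what makes the biplot useful for spotting related variance.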

# Load the classification data set
data = load_data('concrete')
.. plot::
:context: close-figs
:alt: PCA biplot projection, 2D

# Specify the features of interest and the target
target = "strength"
features = [
'cement', 'slag', 'ash', 'water', 'splast', 'coarse', 'fine', 'age'
]
# Extract the instance data and the target
X = data[features]
y = data[target]
from yellowbrick.datasets import load_concrete
from yellowbrick.features.pca import PCADecomposition

.. code:: python
# Load the concrete dataset
X, y = load_concrete()

visualizer = PCADecomposition(scale=True, proj_features=True)
visualizer.fit_transform(X, y)
visualizer.poof()


.. image:: images/pca_biplot_2d.png
.. plot::
:context: close-figs
:alt: PCA biplot projection, 3D

.. code:: python
from yellowbrick.datasets import load_concrete
from yellowbrick.features.pca import PCADecomposition

X, y = load_concrete()

visualizer = PCADecomposition(scale=True, proj_features=True, proj_dim=3)
visualizer.fit_transform(X, y)
visualizer.poof()

.. image:: images/pca_biplot_3d.png


API Reference
-------------

35 changes: 16 additions & 19 deletions yellowbrick/features/pca.py
@@ -36,24 +36,22 @@
# 2D and 3D PCA Visualizer
##########################################################################


class PCADecomposition(MultiFeatureVisualizer):
"""
Produce a two or three dimensional principal component plot of a data array
projected onto it's largest sequential principal components. It is common
projected onto its largest sequential principal components. It is common
practice to scale the data array ``X`` before applying a PC decomposition.
Variable scaling can be controlled using the ``scale`` argument.
Parameters
----------
ax : matplotlib Axes, default: None
The axes to plot the figure on. If None is passed in the current axes.
The axes to plot the figure on. If None is passed in, the current axes
will be used (or generated if required).
features: list, default: None
a list of feature names to use
If a DataFrame is passed to fit and features is None, feature
names are selected as the columns of the DataFrame.
features : list, default: None
A list of feature names to use. If a DataFrame is passed to fit and features
is None, feature names are selected as the columns of the DataFrame.
scale : bool, default: True
Boolean that indicates if user wants to scale data.
@@ -78,10 +76,10 @@ class PCADecomposition(MultiFeatureVisualizer):
transparent. This property makes densely clustered points more visible.
random_state : int, RandomState instance or None, optional (default None)
If input data is larger than 500x500 and the number of components to
extract is lower than 80% of the smallest dimension of the data, then
the more efficient `randomized` solver is enabled, this parameter sets
the random state on this solver.
This parameter sets the random state on this solver. If the input X is
larger than 500x500 and the number of components to extract is lower
than 80% of the smallest dimension of the data, then the more efficient
`randomized` solver is enabled.
colorbar : bool, default: False
Add a colorbar to show the range in magnitude of feature values to the
@@ -101,7 +99,7 @@ class PCADecomposition(MultiFeatureVisualizer):
>>> X = iris.data
>>> y = iris.target
>>> visualizer = PCADecomposition()
>>> visualizer.fit_transform(X)
>>> visualizer.fit_transform(X, y)
>>> visualizer.poof()
"""
@@ -369,7 +367,7 @@ def pca_decomposition(
):
"""
Produce a two or three dimensional principal component plot of the data array ``X``
projected onto it's largest sequential principal components. It is common practice
projected onto its largest sequential principal components. It is common practice
to scale the data array ``X`` before applying a PC decomposition. Variable scaling
can be controlled using the ``scale`` argument.
@@ -382,13 +380,12 @@ def pca_decomposition(
An array or series of target or class values.
ax : matplotlib Axes, default: None
The axes to plot the figure on. If None is passed in the current axes.
The axes to plot the figure on. If None is passed in, the current axes
will be used (or generated if required).
features: list, default: None
a list of feature names to use
If a DataFrame is passed to fit and features is None, feature
names are selected as the columns of the DataFrame.
features : list, default: None
A list of feature names to use. If a DataFrame is passed to fit and
features is None, feature names are selected as the columns of the DataFrame.
scale : bool, default: True
Boolean that indicates if user wants to scale data.
@@ -413,7 +410,7 @@ def pca_decomposition(
transparent. This property makes densely clustered points more visible.
random_state : int, RandomState instance or None, optional (default None)
If input data is larger than 500x500 and the number of components to
If input X is larger than 500x500 and the number of components to
extract is lower than 80% of the smallest dimension of the data, then
the more efficient `randomized` solver is enabled, this parameter sets
the random state on this solver.
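
The ``random_state`` behavior described above mirrors scikit-learn's ``PCA(svd_solver='auto')`` policy; a small sketch (an illustration, not part of this PR) showing that fixing the seed makes the randomized solver reproducible:

```python
import numpy as np
from sklearn.decomposition import PCA

rng = np.random.default_rng(0)
X = rng.normal(size=(600, 600))  # larger than 500x500

# n_components=3 is far below 80% of min(X.shape), so the 'auto' policy
# enables the randomized solver; random_state seeds it so repeated fits
# yield identical projections.
a = PCA(n_components=3, random_state=42).fit_transform(X)
b = PCA(n_components=3, random_state=42).fit_transform(X)

print(np.allclose(a, b))  # True
```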
