Update dispersion plot to allow title and color customization (#1136)

* update dispersion plot to allow title and color customization * small updates to dispersion tests * avoid sklearn 0.24 since it has build issues with some OSes * skip tests if no pandas
DistrictDataLabs · Dec 29, 2020 · babb521 · babb521
1 parent 749278d
commit babb521
Show file tree

Hide file tree

Showing 6 changed files with 147 additions and 62 deletions.
diff --git a/docs/api/text/dispersion.rst b/docs/api/text/dispersion.rst
@@ -3,7 +3,10 @@
 Dispersion Plot
 ===============
 
-A word's importance can be weighed by its dispersion in a corpus.  Lexical dispersion is a measure of a word's homogeneity across the parts of a corpus.  This plot notes the occurrences of a word and how many words from the beginning of the corpus it appears.
+A word's importance can be weighed by its dispersion in a corpus. Lexical dispersion is a measure of a word's homogeneity across the parts of a corpus.
+
+Lexical dispersion illustrates the homogeneity of a word (or set of words) across
+the documents of a corpus. ``DispersionPlot`` allows for visualization of the lexical dispersion of words in a corpus. This plot uses vertical lines to mark the occurrences of one or more search terms throughout the corpus, noting how many words from the beginning of the corpus each occurrence appears.
 
 =================   ==============================
 Visualizer           :class:`~yellowbrick.text.dispersion.DispersionPlot`
@@ -33,6 +36,30 @@ Workflow             Feature Engineering
     visualizer.fit(text)
     visualizer.show()
 
+If the target vector of the corpus documents is provided, the points will be colored according to the category of the document in which they occur, which allows for additional analysis of relationships in search term homogeneity within and across document categories.
+
+.. plot::
+    :context: close-figs
+    :alt: Dispersion Plot with Classes
+
+    from yellowbrick.text import DispersionPlot
+    from yellowbrick.datasets import load_hobbies
+
+    corpus = load_hobbies()
+    text = [doc.split() for doc in corpus.data]
+    y = corpus.target
+
+    target_words = ['points', 'money', 'score', 'win', 'reduce']
+
+    visualizer = DispersionPlot(
+        target_words,
+        colormap="Accent",
+        title="Lexical Dispersion Plot, Broken Down by Class"
+    )
+    visualizer.fit(text, y)
+    visualizer.show()
+
+
 Quick Method
 ------------
 
@@ -55,7 +82,7 @@ The same functionality above can be achieved with the associated quick method `d
     target_words = ['features', 'mobile', 'cooperative', 'competitive', 'combat', 'online']
 
     # Create the visualizer and draw the plot
-    dispersion(target_words, text)
+    dispersion(target_words, text, colors=['olive'])
 
 
 API Reference

diff --git a/tests/requirements.txt b/tests/requirements.txt
@@ -1,7 +1,7 @@
 # Library Dependencies
 matplotlib>=3.3
 scipy>=1.0.0
-scikit-learn>=0.20
+scikit-learn>=0.20,<0.24
 numpy>=1.13.0
 cycler>=0.10.0
 

diff --git a/tests/test_contrib/test_missing/test_bar.py b/tests/test_contrib/test_missing/test_bar.py
@@ -46,7 +46,7 @@ class TestMissingBarVisualizer(VisualTestCase):
     """
     FeatureImportances visualizer
     """
-
+    @pytest.mark.skipif(pd is None, reason="pandas is required")
     def test_missingvaluesbar_pandas(self):
         """
         Integration test of visualizer with pandas

diff --git a/tests/test_contrib/test_missing/test_dispersion.py b/tests/test_contrib/test_missing/test_dispersion.py
@@ -46,7 +46,7 @@ class TestMissingValuesDispersion(VisualTestCase):
     """
     MissingValuesDispersion visualizer
     """
-
+    @pytest.mark.skipif(pd is None, reason="pandas is required")
     def test_missingvaluesdispersion_with_pandas(self):
         """
         Integration test of visualizer with pandas
@@ -72,6 +72,7 @@ def test_missingvaluesdispersion_with_pandas(self):
 
         self.assert_images_similar(viz, tol=self.tol)
 
+    @pytest.mark.skipif(pd is None, reason="pandas is required")
     def test_missingvaluesdispersion_with_pandas_with_y_targets(self):
         """
         Integration test of visualizer with pandas with y targets

diff --git a/tests/test_text/test_dispersion.py b/tests/test_text/test_dispersion.py
@@ -45,9 +45,9 @@ def test_quick_method(self):
         _, ax = plt.subplots()
 
         text = [doc.split() for doc in corpus.data]
-        target_words = ["Game", "player", "score", "oil", "Man"]
+        search_terms = ["Game", "player", "score", "oil", "Man"]
 
-        viz = dispersion(target_words=target_words, corpus=text, ax=ax, show=False)
+        viz = dispersion(search_terms=search_terms, corpus=text, ax=ax, show=False)
         viz.ax.grid(False)
 
         self.assert_images_similar(viz, tol=25)
@@ -57,9 +57,9 @@ def test_integrated_dispersion_plot(self):
         Assert no errors occur during DispersionPlot integration
         """
         text = [doc.split() for doc in corpus.data]
-        target_words = ["Game", "player", "score", "oil", "Man"]
+        search_terms = ["Game", "player", "score", "oil", "Man"]
 
-        visualizer = DispersionPlot(target_words)
+        visualizer = DispersionPlot(search_terms)
         visualizer.fit(text)
         visualizer.ax.grid(False)
 
@@ -71,9 +71,9 @@ def test_dispersion_plot_ignore_case(self):
         with ignore_case parameter turned on
         """
         text = [doc.split() for doc in corpus.data]
-        target_words = ["Game", "player", "score", "oil", "Man"]
+        search_terms = ["Game", "player", "score", "oil", "Man"]
 
-        visualizer = DispersionPlot(target_words, ignore_case=True)
+        visualizer = DispersionPlot(search_terms, ignore_case=True)
         visualizer.fit(text)
         visualizer.ax.grid(False)
 
@@ -85,9 +85,9 @@ def test_dispersion_plot_generator_input(self):
         when the corpus' text type is a generator
         """
         text = [doc.split() for doc in corpus.data]
-        target_words = ["Game", "player", "score", "oil", "Man"]
+        search_terms = ["Game", "player", "score", "oil", "Man"]
 
-        visualizer = DispersionPlot(target_words, ignore_case=True)
+        visualizer = DispersionPlot(search_terms, ignore_case=True)
         visualizer.fit(text)
         visualizer.ax.grid(False)
 
@@ -99,9 +99,9 @@ def test_dispersion_plot_annotate_docs(self):
         with annotate_docs parameter turned on
         """
         text = [doc.split() for doc in corpus.data]
-        target_words = ["girl", "she", "boy", "he", "man"]
+        search_terms = ["girl", "she", "boy", "he", "man"]
 
-        visualizer = DispersionPlot(target_words, annotate_docs=True)
+        visualizer = DispersionPlot(search_terms, annotate_docs=True)
         visualizer.fit(text)
         visualizer.ax.grid(False)
 
@@ -114,9 +114,9 @@ def test_dispersion_plot_color_by_class(self):
         """
         target = corpus.target
         text = [doc.split() for doc in corpus.data]
-        target_words = ["girl", "she", "boy", "he", "man"]
+        search_terms = ["girl", "she", "boy", "he", "man"]
 
-        visualizer = DispersionPlot(target_words)
+        visualizer = DispersionPlot(search_terms)
         visualizer.fit(text, target)
         visualizer.ax.grid(False)
 
@@ -128,9 +128,9 @@ def test_dispersion_plot_mismatched_labels(self):
         """
         target = corpus.target
         text = [doc.split() for doc in corpus.data]
-        target_words = ["girl", "she", "boy", "he", "man"]
+        search_terms = ["girl", "she", "boy", "he", "man"]
 
-        visualizer = DispersionPlot(target_words, annotate_docs=True, labels=["a", "b"])
+        visualizer = DispersionPlot(search_terms, annotate_docs=True, labels=["a", "b"])
 
         msg = (
             r"number of supplied labels \(\d\) "