Merge 82ec05a into 749278d
rebeccabilbro committed Dec 29, 2020
2 parents 749278d + 82ec05a commit 7ec1a8e
Showing 4 changed files with 144 additions and 60 deletions.
31 changes: 29 additions & 2 deletions docs/api/text/dispersion.rst
@@ -3,7 +3,10 @@
Dispersion Plot
===============

-A word's importance can be weighed by its dispersion in a corpus. Lexical dispersion is a measure of a word's homogeneity across the parts of a corpus. This plot notes the occurrences of a word and how many words from the beginning of the corpus it appears.
+A word's importance can be weighed by its dispersion in a corpus. Lexical dispersion is a measure of a word's homogeneity across the parts of a corpus.
+
+Lexical dispersion illustrates the homogeneity of a word (or set of words) across
+the documents of a corpus. ``DispersionPlot`` allows for visualization of the lexical dispersion of words in a corpus. This plot uses vertical lines to mark each occurrence of one or more search terms throughout the corpus, noting its offset, in words, from the beginning of the corpus.

================= ==============================
Visualizer :class:`~yellowbrick.text.dispersion.DispersionPlot`
@@ -33,6 +36,30 @@ Workflow            Feature Engineering
visualizer.fit(text)
visualizer.show()
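
The diff context above shows only the tail of this first example. A minimal, self-contained sketch of the basic workflow, assembled from the quick-method example later on this page (the ``load_hobbies`` corpus, tokenization, and search terms are borrowed from there; ``DispersionPlot`` defaults are assumed):

    from yellowbrick.text import DispersionPlot
    from yellowbrick.datasets import load_hobbies

    # Load the hobbies corpus and tokenize each document into a list of words
    corpus = load_hobbies()
    text = [doc.split() for doc in corpus.data]

    # Terms whose dispersion across the corpus we want to inspect
    target_words = ['features', 'mobile', 'cooperative', 'competitive', 'combat', 'online']

    # Fit the visualizer on the tokenized corpus and render the plot
    visualizer = DispersionPlot(target_words)
    visualizer.fit(text)
    visualizer.show()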

If the target vector of the corpus documents is also provided, the points are colored by document category, enabling analysis of search-term homogeneity within and across document categories.

.. plot::
:context: close-figs
:alt: Dispersion Plot with Classes

from yellowbrick.text import DispersionPlot
from yellowbrick.datasets import load_hobbies

corpus = load_hobbies()
text = [doc.split() for doc in corpus.data]
y = corpus.target

target_words = ['points', 'money', 'score', 'win', 'reduce']

visualizer = DispersionPlot(
target_words,
colormap="Accent",
title="Lexical Dispersion Plot, Broken Down by Class"
)
visualizer.fit(text, y)
visualizer.show()


Quick Method
------------

@@ -55,7 +82,7 @@ The same functionality above can be achieved with the associated quick method ``dispersion``
target_words = ['features', 'mobile', 'cooperative', 'competitive', 'combat', 'online']

# Create the visualizer and draw the plot
-    dispersion(target_words, text)
+    dispersion(target_words, text, colors=['olive'])
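
For reference, a self-contained version of this quick-method call, including the corpus-loading lines the diff context elides; a sketch assuming the same ``load_hobbies`` setup as the examples above and that ``dispersion`` is importable from ``yellowbrick.text``:

    from yellowbrick.text import dispersion
    from yellowbrick.datasets import load_hobbies

    # Load and tokenize the corpus as in the earlier examples
    corpus = load_hobbies()
    text = [doc.split() for doc in corpus.data]

    target_words = ['features', 'mobile', 'cooperative', 'competitive', 'combat', 'online']

    # One call fits the visualizer and renders the plot;
    # colors optionally overrides the default palette
    dispersion(target_words, text, colors=['olive'])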


API Reference
2 changes: 1 addition & 1 deletion tests/requirements.txt
@@ -1,7 +1,7 @@
# Library Dependencies
matplotlib>=3.3
scipy>=1.0.0
-scikit-learn>=0.20
+scikit-learn>=0.20,<0.24
numpy>=1.13.0
cycler>=0.10.0

28 changes: 14 additions & 14 deletions tests/test_text/test_dispersion.py
@@ -45,9 +45,9 @@ def test_quick_method(self):
_, ax = plt.subplots()

text = [doc.split() for doc in corpus.data]
target_words = ["Game", "player", "score", "oil", "Man"]
search_terms = ["Game", "player", "score", "oil", "Man"]

-        viz = dispersion(target_words=target_words, corpus=text, ax=ax, show=False)
+        viz = dispersion(search_terms=search_terms, corpus=text, ax=ax, show=False)
viz.ax.grid(False)

self.assert_images_similar(viz, tol=25)
@@ -57,9 +57,9 @@ def test_integrated_dispersion_plot(self):
Assert no errors occur during DispersionPlot integration
"""
text = [doc.split() for doc in corpus.data]
target_words = ["Game", "player", "score", "oil", "Man"]
search_terms = ["Game", "player", "score", "oil", "Man"]

-        visualizer = DispersionPlot(target_words)
+        visualizer = DispersionPlot(search_terms)
visualizer.fit(text)
visualizer.ax.grid(False)

@@ -71,9 +71,9 @@ def test_dispersion_plot_ignore_case(self):
with ignore_case parameter turned on
"""
text = [doc.split() for doc in corpus.data]
target_words = ["Game", "player", "score", "oil", "Man"]
search_terms = ["Game", "player", "score", "oil", "Man"]

-        visualizer = DispersionPlot(target_words, ignore_case=True)
+        visualizer = DispersionPlot(search_terms, ignore_case=True)
visualizer.fit(text)
visualizer.ax.grid(False)

@@ -85,9 +85,9 @@ def test_dispersion_plot_generator_input(self):
when the corpus' text type is a generator
"""
text = [doc.split() for doc in corpus.data]
target_words = ["Game", "player", "score", "oil", "Man"]
search_terms = ["Game", "player", "score", "oil", "Man"]

-        visualizer = DispersionPlot(target_words, ignore_case=True)
+        visualizer = DispersionPlot(search_terms, ignore_case=True)
visualizer.fit(text)
visualizer.ax.grid(False)

@@ -99,9 +99,9 @@ def test_dispersion_plot_annotate_docs(self):
with annotate_docs parameter turned on
"""
text = [doc.split() for doc in corpus.data]
target_words = ["girl", "she", "boy", "he", "man"]
search_terms = ["girl", "she", "boy", "he", "man"]

-        visualizer = DispersionPlot(target_words, annotate_docs=True)
+        visualizer = DispersionPlot(search_terms, annotate_docs=True)
visualizer.fit(text)
visualizer.ax.grid(False)

@@ -114,9 +114,9 @@ def test_dispersion_plot_color_by_class(self):
"""
target = corpus.target
text = [doc.split() for doc in corpus.data]
target_words = ["girl", "she", "boy", "he", "man"]
search_terms = ["girl", "she", "boy", "he", "man"]

-        visualizer = DispersionPlot(target_words)
+        visualizer = DispersionPlot(search_terms)
visualizer.fit(text, target)
visualizer.ax.grid(False)

@@ -128,9 +128,9 @@ def test_dispersion_plot_mismatched_labels(self):
"""
target = corpus.target
text = [doc.split() for doc in corpus.data]
target_words = ["girl", "she", "boy", "he", "man"]
search_terms = ["girl", "she", "boy", "he", "man"]

-        visualizer = DispersionPlot(target_words, annotate_docs=True, labels=["a", "b"])
+        visualizer = DispersionPlot(search_terms, annotate_docs=True, labels=["a", "b"])

msg = (
r"number of supplied labels \(\d\) "
