Skip to content

Commit

Permalink
Update dispersion plot to allow title and color customization (#1136)
Browse files Browse the repository at this point in the history
* update dispersion plot to allow title and color customization

* small updates to dispersion tests

* avoid sklearn 0.24 since it has build issues with some OSes

* skip tests if no pandas
  • Loading branch information
rebeccabilbro committed Dec 29, 2020
1 parent 749278d commit babb521
Show file tree
Hide file tree
Showing 6 changed files with 147 additions and 62 deletions.
31 changes: 29 additions & 2 deletions docs/api/text/dispersion.rst
Expand Up @@ -3,7 +3,10 @@
Dispersion Plot
===============

A word's importance can be weighed by its dispersion in a corpus. Lexical dispersion is a measure of a word's homogeneity across the parts of a corpus. This plot notes the occurrences of a word and how many words from the beginning of the corpus it appears.
A word's importance can be weighed by its dispersion in a corpus. Lexical dispersion is a measure of a word's homogeneity across the parts of a corpus.

Lexical dispersion illustrates the homogeneity of a word (or set of words) across
the documents of a corpus. ``DispersionPlot`` allows for visualization of the lexical dispersion of words in a corpus. This plot uses vertical lines to mark the occurrences of one or more search terms throughout the corpus, noting how many words from the beginning of the corpus each occurrence appears.

================= ==============================
Visualizer :class:`~yellowbrick.text.dispersion.DispersionPlot`
Expand Down Expand Up @@ -33,6 +36,30 @@ Workflow Feature Engineering
visualizer.fit(text)
visualizer.show()

If the target vector of the corpus documents is provided, the markers will be colored according to the category of the document in which each occurrence appears, which allows for additional analysis of relationships in search term homogeneity within and across document categories.

.. plot::
:context: close-figs
:alt: Dispersion Plot with Classes

from yellowbrick.text import DispersionPlot
from yellowbrick.datasets import load_hobbies

corpus = load_hobbies()
text = [doc.split() for doc in corpus.data]
y = corpus.target

target_words = ['points', 'money', 'score', 'win', 'reduce']

visualizer = DispersionPlot(
target_words,
colormap="Accent",
title="Lexical Dispersion Plot, Broken Down by Class"
)
visualizer.fit(text, y)
visualizer.show()


Quick Method
------------

Expand All @@ -55,7 +82,7 @@ The same functionality above can be achieved with the associated quick method `d
target_words = ['features', 'mobile', 'cooperative', 'competitive', 'combat', 'online']

# Create the visualizer and draw the plot
dispersion(target_words, text)
dispersion(target_words, text, colors=['olive'])


API Reference
Expand Down
2 changes: 1 addition & 1 deletion tests/requirements.txt
@@ -1,7 +1,7 @@
# Library Dependencies
matplotlib>=3.3
scipy>=1.0.0
scikit-learn>=0.20
scikit-learn>=0.20,<0.24
numpy>=1.13.0
cycler>=0.10.0

Expand Down
2 changes: 1 addition & 1 deletion tests/test_contrib/test_missing/test_bar.py
Expand Up @@ -46,7 +46,7 @@ class TestMissingBarVisualizer(VisualTestCase):
"""
FeatureImportances visualizer
"""

@pytest.mark.skipif(pd is None, reason="pandas is required")
def test_missingvaluesbar_pandas(self):
"""
Integration test of visualizer with pandas
Expand Down
3 changes: 2 additions & 1 deletion tests/test_contrib/test_missing/test_dispersion.py
Expand Up @@ -46,7 +46,7 @@ class TestMissingValuesDispersion(VisualTestCase):
"""
MissingValuesDispersion visualizer
"""

@pytest.mark.skipif(pd is None, reason="pandas is required")
def test_missingvaluesdispersion_with_pandas(self):
"""
Integration test of visualizer with pandas
Expand All @@ -72,6 +72,7 @@ def test_missingvaluesdispersion_with_pandas(self):

self.assert_images_similar(viz, tol=self.tol)

@pytest.mark.skipif(pd is None, reason="pandas is required")
def test_missingvaluesdispersion_with_pandas_with_y_targets(self):
"""
Integration test of visualizer with pandas with y targets
Expand Down
28 changes: 14 additions & 14 deletions tests/test_text/test_dispersion.py
Expand Up @@ -45,9 +45,9 @@ def test_quick_method(self):
_, ax = plt.subplots()

text = [doc.split() for doc in corpus.data]
target_words = ["Game", "player", "score", "oil", "Man"]
search_terms = ["Game", "player", "score", "oil", "Man"]

viz = dispersion(target_words=target_words, corpus=text, ax=ax, show=False)
viz = dispersion(search_terms=search_terms, corpus=text, ax=ax, show=False)
viz.ax.grid(False)

self.assert_images_similar(viz, tol=25)
Expand All @@ -57,9 +57,9 @@ def test_integrated_dispersion_plot(self):
Assert no errors occur during DispersionPlot integration
"""
text = [doc.split() for doc in corpus.data]
target_words = ["Game", "player", "score", "oil", "Man"]
search_terms = ["Game", "player", "score", "oil", "Man"]

visualizer = DispersionPlot(target_words)
visualizer = DispersionPlot(search_terms)
visualizer.fit(text)
visualizer.ax.grid(False)

Expand All @@ -71,9 +71,9 @@ def test_dispersion_plot_ignore_case(self):
with ignore_case parameter turned on
"""
text = [doc.split() for doc in corpus.data]
target_words = ["Game", "player", "score", "oil", "Man"]
search_terms = ["Game", "player", "score", "oil", "Man"]

visualizer = DispersionPlot(target_words, ignore_case=True)
visualizer = DispersionPlot(search_terms, ignore_case=True)
visualizer.fit(text)
visualizer.ax.grid(False)

Expand All @@ -85,9 +85,9 @@ def test_dispersion_plot_generator_input(self):
when the corpus' text type is a generator
"""
text = [doc.split() for doc in corpus.data]
target_words = ["Game", "player", "score", "oil", "Man"]
search_terms = ["Game", "player", "score", "oil", "Man"]

visualizer = DispersionPlot(target_words, ignore_case=True)
visualizer = DispersionPlot(search_terms, ignore_case=True)
visualizer.fit(text)
visualizer.ax.grid(False)

Expand All @@ -99,9 +99,9 @@ def test_dispersion_plot_annotate_docs(self):
with annotate_docs parameter turned on
"""
text = [doc.split() for doc in corpus.data]
target_words = ["girl", "she", "boy", "he", "man"]
search_terms = ["girl", "she", "boy", "he", "man"]

visualizer = DispersionPlot(target_words, annotate_docs=True)
visualizer = DispersionPlot(search_terms, annotate_docs=True)
visualizer.fit(text)
visualizer.ax.grid(False)

Expand All @@ -114,9 +114,9 @@ def test_dispersion_plot_color_by_class(self):
"""
target = corpus.target
text = [doc.split() for doc in corpus.data]
target_words = ["girl", "she", "boy", "he", "man"]
search_terms = ["girl", "she", "boy", "he", "man"]

visualizer = DispersionPlot(target_words)
visualizer = DispersionPlot(search_terms)
visualizer.fit(text, target)
visualizer.ax.grid(False)

Expand All @@ -128,9 +128,9 @@ def test_dispersion_plot_mismatched_labels(self):
"""
target = corpus.target
text = [doc.split() for doc in corpus.data]
target_words = ["girl", "she", "boy", "he", "man"]
search_terms = ["girl", "she", "boy", "he", "man"]

visualizer = DispersionPlot(target_words, annotate_docs=True, labels=["a", "b"])
visualizer = DispersionPlot(search_terms, annotate_docs=True, labels=["a", "b"])

msg = (
r"number of supplied labels \(\d\) "
Expand Down

0 comments on commit babb521

Please sign in to comment.