Merge 82ec05a into 749278d
rebeccabilbro committed Dec 29, 2020
2 parents 749278d + 82ec05a commit 7ec1a8e
Showing 4 changed files with 144 additions and 60 deletions.
31 changes: 29 additions & 2 deletions docs/api/text/dispersion.rst
@@ -3,7 +3,10 @@
Dispersion Plot
===============

-A word's importance can be weighed by its dispersion in a corpus. Lexical dispersion is a measure of a word's homogeneity across the parts of a corpus. This plot notes the occurrences of a word and how many words from the beginning of the corpus it appears.
+A word's importance can be weighed by its dispersion in a corpus. Lexical dispersion is a measure of a word's homogeneity across the parts of a corpus.
+
+Lexical dispersion illustrates the homogeneity of a word (or set of words) across
+the documents of a corpus. ``DispersionPlot`` allows for visualization of the lexical dispersion of words in a corpus. This plot uses vertical lines to mark each occurrence of one or more search terms throughout the corpus, noting its offset, in words, from the beginning of the corpus.

================= ==============================
Visualizer :class:`~yellowbrick.text.dispersion.DispersionPlot`
@@ -33,6 +36,30 @@ Workflow            Feature Engineering
visualizer.fit(text)
visualizer.show()
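
The diff context above shows only the tail of this first example. A minimal, self-contained sketch of the basic workflow, assembled from the quick-method example later on this page (the ``load_hobbies`` corpus, tokenization, and search terms are borrowed from there; ``DispersionPlot`` defaults are assumed):

    from yellowbrick.text import DispersionPlot
    from yellowbrick.datasets import load_hobbies

    # Load the hobbies corpus and tokenize each document into a list of words
    corpus = load_hobbies()
    text = [doc.split() for doc in corpus.data]

    # Terms whose dispersion across the corpus we want to inspect
    target_words = ['features', 'mobile', 'cooperative', 'competitive', 'combat', 'online']

    # Fit the visualizer on the tokenized corpus and render the plot
    visualizer = DispersionPlot(target_words)
    visualizer.fit(text)
    visualizer.show()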

If the target vector of the corpus documents is also provided, the points are colored by document category, enabling analysis of search-term homogeneity within and across document categories.

.. plot::
:context: close-figs
:alt: Dispersion Plot with Classes

from yellowbrick.text import DispersionPlot
from yellowbrick.datasets import load_hobbies

corpus = load_hobbies()
text = [doc.split() for doc in corpus.data]
y = corpus.target

target_words = ['points', 'money', 'score', 'win', 'reduce']

visualizer = DispersionPlot(
target_words,
colormap="Accent",
title="Lexical Dispersion Plot, Broken Down by Class"
)
visualizer.fit(text, y)
visualizer.show()


Quick Method
------------

@@ -55,7 +82,7 @@ The same functionality above can be achieved with the associated quick method ``dispersion``
target_words = ['features', 'mobile', 'cooperative', 'competitive', 'combat', 'online']

# Create the visualizer and draw the plot
-    dispersion(target_words, text)
+    dispersion(target_words, text, colors=['olive'])
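
For reference, a self-contained version of this quick-method call, including the corpus-loading lines the diff context elides; a sketch assuming the same ``load_hobbies`` setup as the examples above and that ``dispersion`` is importable from ``yellowbrick.text``:

    from yellowbrick.text import dispersion
    from yellowbrick.datasets import load_hobbies

    # Load and tokenize the corpus as in the earlier examples
    corpus = load_hobbies()
    text = [doc.split() for doc in corpus.data]

    target_words = ['features', 'mobile', 'cooperative', 'competitive', 'combat', 'online']

    # One call fits the visualizer and renders the plot;
    # colors optionally overrides the default palette
    dispersion(target_words, text, colors=['olive'])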


API Reference
2 changes: 1 addition & 1 deletion tests/requirements.txt
@@ -1,7 +1,7 @@
# Library Dependencies
matplotlib>=3.3
scipy>=1.0.0
-scikit-learn>=0.20
+scikit-learn>=0.20,<0.24
numpy>=1.13.0
cycler>=0.10.0

28 changes: 14 additions & 14 deletions tests/test_text/test_dispersion.py
@@ -45,9 +45,9 @@ def test_quick_method(self):
_, ax = plt.subplots()

text = [doc.split() for doc in corpus.data]
target_words = ["Game", "player", "score", "oil", "Man"]
search_terms = ["Game", "player", "score", "oil", "Man"]

-        viz = dispersion(target_words=target_words, corpus=text, ax=ax, show=False)
+        viz = dispersion(search_terms=search_terms, corpus=text, ax=ax, show=False)
viz.ax.grid(False)

self.assert_images_similar(viz, tol=25)
@@ -57,9 +57,9 @@ def test_integrated_dispersion_plot(self):
Assert no errors occur during DispersionPlot integration
"""
text = [doc.split() for doc in corpus.data]
target_words = ["Game", "player", "score", "oil", "Man"]
search_terms = ["Game", "player", "score", "oil", "Man"]

-        visualizer = DispersionPlot(target_words)
+        visualizer = DispersionPlot(search_terms)
visualizer.fit(text)
visualizer.ax.grid(False)

@@ -71,9 +71,9 @@ def test_dispersion_plot_ignore_case(self):
with ignore_case parameter turned on
"""
text = [doc.split() for doc in corpus.data]
target_words = ["Game", "player", "score", "oil", "Man"]
search_terms = ["Game", "player", "score", "oil", "Man"]

-        visualizer = DispersionPlot(target_words, ignore_case=True)
+        visualizer = DispersionPlot(search_terms, ignore_case=True)
visualizer.fit(text)
visualizer.ax.grid(False)

@@ -85,9 +85,9 @@ def test_dispersion_plot_generator_input(self):
when the corpus' text type is a generator
"""
text = [doc.split() for doc in corpus.data]
target_words = ["Game", "player", "score", "oil", "Man"]
search_terms = ["Game", "player", "score", "oil", "Man"]

-        visualizer = DispersionPlot(target_words, ignore_case=True)
+        visualizer = DispersionPlot(search_terms, ignore_case=True)
visualizer.fit(text)
visualizer.ax.grid(False)

@@ -99,9 +99,9 @@ def test_dispersion_plot_annotate_docs(self):
with annotate_docs parameter turned on
"""
text = [doc.split() for doc in corpus.data]
target_words = ["girl", "she", "boy", "he", "man"]
search_terms = ["girl", "she", "boy", "he", "man"]

-        visualizer = DispersionPlot(target_words, annotate_docs=True)
+        visualizer = DispersionPlot(search_terms, annotate_docs=True)
visualizer.fit(text)
visualizer.ax.grid(False)

@@ -114,9 +114,9 @@ def test_dispersion_plot_color_by_class(self):
"""
target = corpus.target
text = [doc.split() for doc in corpus.data]
target_words = ["girl", "she", "boy", "he", "man"]
search_terms = ["girl", "she", "boy", "he", "man"]

-        visualizer = DispersionPlot(target_words)
+        visualizer = DispersionPlot(search_terms)
visualizer.fit(text, target)
visualizer.ax.grid(False)

@@ -128,9 +128,9 @@ def test_dispersion_plot_mismatched_labels(self):
"""
target = corpus.target
text = [doc.split() for doc in corpus.data]
target_words = ["girl", "she", "boy", "he", "man"]
search_terms = ["girl", "she", "boy", "he", "man"]

-        visualizer = DispersionPlot(target_words, annotate_docs=True, labels=["a", "b"])
+        visualizer = DispersionPlot(search_terms, annotate_docs=True, labels=["a", "b"])

msg = (
r"number of supplied labels \(\d\) "
