From babb521fffa0c73d3a99972a651bbc6b887d519c Mon Sep 17 00:00:00 2001 From: Rebecca Bilbro Date: Tue, 29 Dec 2020 11:28:56 -0500 Subject: [PATCH] Update dispersion plot to allow title and color customization (#1136) * update dispersion plot to allow title and color customization * small updates to dispersion tests * avoid sklearn 0.24 since it has build issues with some OSes * skip tests if no pandas --- docs/api/text/dispersion.rst | 31 +++- tests/requirements.txt | 2 +- tests/test_contrib/test_missing/test_bar.py | 2 +- .../test_missing/test_dispersion.py | 3 +- tests/test_text/test_dispersion.py | 28 ++-- yellowbrick/text/dispersion.py | 143 ++++++++++++------ 6 files changed, 147 insertions(+), 62 deletions(-) diff --git a/docs/api/text/dispersion.rst b/docs/api/text/dispersion.rst index cc71940dd..509568a1c 100644 --- a/docs/api/text/dispersion.rst +++ b/docs/api/text/dispersion.rst @@ -3,7 +3,10 @@ Dispersion Plot =============== -A word's importance can be weighed by its dispersion in a corpus. Lexical dispersion is a measure of a word's homogeneity across the parts of a corpus. This plot notes the occurrences of a word and how many words from the beginning of the corpus it appears. +A word's importance can be weighed by its dispersion in a corpus. Lexical dispersion is a measure of a word's homogeneity across the parts of a corpus. + +Lexical dispersion illustrates the homogeneity of a word (or set of words) across +the documents of a corpus. ``DispersionPlot`` allows for visualization of the lexical dispersion of words in a corpus. This plot illustrates with vertical lines the occurrences of one or more search terms throughout the corpus, noting how many words relative to the beginning of the corpus it appears. 
================= ============================== Visualizer :class:`~yellowbrick.text.dispersion.DispersionPlot` @@ -33,6 +36,30 @@ Workflow Feature Engineering visualizer.fit(text) visualizer.show() +If the target vector of the corpus documents is provided, the points will be colored with respect to their document category, which allows for additional analysis of relationships in search term homogeneity within and across document categories. + +.. plot:: + :context: close-figs + :alt: Dispersion Plot with Classes + + from yellowbrick.text import DispersionPlot + from yellowbrick.datasets import load_hobbies + + corpus = load_hobbies() + text = [doc.split() for doc in corpus.data] + y = corpus.target + + target_words = ['points', 'money', 'score', 'win', 'reduce'] + + visualizer = DispersionPlot( + target_words, + colormap="Accent", + title="Lexical Dispersion Plot, Broken Down by Class" + ) + visualizer.fit(text, y) + visualizer.show() + + Quick Method ------------ @@ -55,7 +82,7 @@ The same functionality above can be achieved with the associated quick method `d target_words = ['features', 'mobile', 'cooperative', 'competitive', 'combat', 'online'] # Create the visualizer and draw the plot - dispersion(target_words, text) + dispersion(target_words, text, colors=['olive']) API Reference diff --git a/tests/requirements.txt b/tests/requirements.txt index 8eb66e424..12adc9da2 100644 --- a/tests/requirements.txt +++ b/tests/requirements.txt @@ -1,7 +1,7 @@ # Library Dependencies matplotlib>=3.3 scipy>=1.0.0 -scikit-learn>=0.20 +scikit-learn>=0.20,<0.24 numpy>=1.13.0 cycler>=0.10.0 diff --git a/tests/test_contrib/test_missing/test_bar.py b/tests/test_contrib/test_missing/test_bar.py index e256006f8..21f31477c 100644 --- a/tests/test_contrib/test_missing/test_bar.py +++ b/tests/test_contrib/test_missing/test_bar.py @@ -46,7 +46,7 @@ class TestMissingBarVisualizer(VisualTestCase): """ FeatureImportances visualizer """ - + @pytest.mark.skipif(pd is None, reason="pandas is 
required") def test_missingvaluesbar_pandas(self): """ Integration test of visualizer with pandas diff --git a/tests/test_contrib/test_missing/test_dispersion.py b/tests/test_contrib/test_missing/test_dispersion.py index 7db9a525a..a1364a59f 100644 --- a/tests/test_contrib/test_missing/test_dispersion.py +++ b/tests/test_contrib/test_missing/test_dispersion.py @@ -46,7 +46,7 @@ class TestMissingValuesDispersion(VisualTestCase): """ MissingValuesDispersion visualizer """ - + @pytest.mark.skipif(pd is None, reason="pandas is required") def test_missingvaluesdispersion_with_pandas(self): """ Integration test of visualizer with pandas @@ -72,6 +72,7 @@ def test_missingvaluesdispersion_with_pandas(self): self.assert_images_similar(viz, tol=self.tol) + @pytest.mark.skipif(pd is None, reason="pandas is required") def test_missingvaluesdispersion_with_pandas_with_y_targets(self): """ Integration test of visualizer with pandas with y targets diff --git a/tests/test_text/test_dispersion.py b/tests/test_text/test_dispersion.py index 49321e1cd..29e50db59 100644 --- a/tests/test_text/test_dispersion.py +++ b/tests/test_text/test_dispersion.py @@ -45,9 +45,9 @@ def test_quick_method(self): _, ax = plt.subplots() text = [doc.split() for doc in corpus.data] - target_words = ["Game", "player", "score", "oil", "Man"] + search_terms = ["Game", "player", "score", "oil", "Man"] - viz = dispersion(target_words=target_words, corpus=text, ax=ax, show=False) + viz = dispersion(search_terms=search_terms, corpus=text, ax=ax, show=False) viz.ax.grid(False) self.assert_images_similar(viz, tol=25) @@ -57,9 +57,9 @@ def test_integrated_dispersion_plot(self): Assert no errors occur during DispersionPlot integration """ text = [doc.split() for doc in corpus.data] - target_words = ["Game", "player", "score", "oil", "Man"] + search_terms = ["Game", "player", "score", "oil", "Man"] - visualizer = DispersionPlot(target_words) + visualizer = DispersionPlot(search_terms) visualizer.fit(text) 
visualizer.ax.grid(False) @@ -71,9 +71,9 @@ def test_dispersion_plot_ignore_case(self): with ignore_case parameter turned on """ text = [doc.split() for doc in corpus.data] - target_words = ["Game", "player", "score", "oil", "Man"] + search_terms = ["Game", "player", "score", "oil", "Man"] - visualizer = DispersionPlot(target_words, ignore_case=True) + visualizer = DispersionPlot(search_terms, ignore_case=True) visualizer.fit(text) visualizer.ax.grid(False) @@ -85,9 +85,9 @@ def test_dispersion_plot_generator_input(self): when the corpus' text type is a generator """ text = [doc.split() for doc in corpus.data] - target_words = ["Game", "player", "score", "oil", "Man"] + search_terms = ["Game", "player", "score", "oil", "Man"] - visualizer = DispersionPlot(target_words, ignore_case=True) + visualizer = DispersionPlot(search_terms, ignore_case=True) visualizer.fit(text) visualizer.ax.grid(False) @@ -99,9 +99,9 @@ def test_dispersion_plot_annotate_docs(self): with annotate_docs parameter turned on """ text = [doc.split() for doc in corpus.data] - target_words = ["girl", "she", "boy", "he", "man"] + search_terms = ["girl", "she", "boy", "he", "man"] - visualizer = DispersionPlot(target_words, annotate_docs=True) + visualizer = DispersionPlot(search_terms, annotate_docs=True) visualizer.fit(text) visualizer.ax.grid(False) @@ -114,9 +114,9 @@ def test_dispersion_plot_color_by_class(self): """ target = corpus.target text = [doc.split() for doc in corpus.data] - target_words = ["girl", "she", "boy", "he", "man"] + search_terms = ["girl", "she", "boy", "he", "man"] - visualizer = DispersionPlot(target_words) + visualizer = DispersionPlot(search_terms) visualizer.fit(text, target) visualizer.ax.grid(False) @@ -128,9 +128,9 @@ def test_dispersion_plot_mismatched_labels(self): """ target = corpus.target text = [doc.split() for doc in corpus.data] - target_words = ["girl", "she", "boy", "he", "man"] + search_terms = ["girl", "she", "boy", "he", "man"] - visualizer = 
DispersionPlot(target_words, annotate_docs=True, labels=["a", "b"]) + visualizer = DispersionPlot(search_terms, annotate_docs=True, labels=["a", "b"]) msg = ( r"number of supplied labels \(\d\) " diff --git a/yellowbrick/text/dispersion.py b/yellowbrick/text/dispersion.py index 4f7be3854..9917966c3 100644 --- a/yellowbrick/text/dispersion.py +++ b/yellowbrick/text/dispersion.py @@ -18,14 +18,15 @@ ## Imports ########################################################################## -from collections import defaultdict import itertools +from collections import defaultdict + +import numpy as np from yellowbrick.text.base import TextVisualizer from yellowbrick.style.colors import resolve_colors from yellowbrick.exceptions import YellowbrickValueError -import numpy as np ########################################################################## ## Dispersion Plot Visualizer @@ -34,31 +35,39 @@ class DispersionPlot(TextVisualizer): """ - DispersionPlotVisualizer allows for visualization of the lexical dispersion - of words in a corpus. Lexical dispersion is a measure of a word's - homeogeneity across the parts of a corpus. This plot notes the occurences - of a word and how many words from the beginning it appears. + Lexical dispersion illustrates the homogeneity of a word (or set of words) across + the documents of a corpus. + + DispersionPlot allows for visualization of the lexical dispersion of words in a + corpus. This plot illustrates with vertical lines the occurrences of one or more + search terms throughout the corpus, noting how many words relative to the beginning + of the corpus it appears. If the target vector of the corpus documents is provided, + the points will be colored with respect to their document category, which allows for + additional analysis of relationships in search term homogeneity within and across + document categories. If annotation is requested, document boundaries will be + displayed as vertical lines in the plot. 
Parameters ---------- - target_words : list - A list of target words whose dispersion across a corpus passed at fit - will be visualized. + search_terms : list + A list of search terms whose dispersion across a corpus passed at fit + should be visualized. ax : matplotlib axes, default: None The axes to plot the figure on. colors : list or tuple of colors - Specify the colors for each individual class + Specify the colors for each individual class. Will override colormap if both are + provided. colormap : string or matplotlib cmap Qualitative colormap for discrete target ignore_case : boolean, default: False - Specify whether input will be case-sensitive. + Specify whether input will be case-sensitive. annotate_docs : boolean, default: False - Specify whether document boundaries will be displayed. Vertical lines + Specify whether document boundaries will be displayed. Vertical lines are positioned at the end of each document. labels : list of strings @@ -68,8 +77,24 @@ class DispersionPlot(TextVisualizer): kwargs : dict Pass any additional keyword arguments to the super class. - These parameters can be influenced later on in the visualization - process, but can and should be set as early as possible. + Attributes + ---------- + self.classes_ : list + A list of strings representing the unique classes in the target in sorted order. + If ``y`` is provided, these are extracted from ``y``, unless a list of class + labels is provided by the user on instantiation. + + self.boundaries_ : list + A list of integers indicating the document boundaries with respect to + word offsets. + + self.indexed_words_ : list + A list of integers indicating the y position for each occurrence of each of + the search terms. + + self.word_categories_ : list + A list of strings indicating the corresponding document category of each search + term occurrence.
""" # NOTE: cannot be np.nan @@ -77,7 +102,7 @@ class DispersionPlot(TextVisualizer): def __init__( self, - target_words, + search_terms, ax=None, colors=None, colormap=None, @@ -92,18 +117,28 @@ def __init__( self.colors = colors self.colormap = colormap - self.target_words = target_words self.ignore_case = ignore_case + self.search_terms = search_terms self.annotate_docs = annotate_docs - def _compute_dispersion(self, text, y): + def _compute_dispersion(self, X, y): + """ + Produces a generator containing the offset word count, y_coordinate, and + label/category for each occurrence of the search terms. + + Attributes + ---------- + self.boundaries_ : list + A list of integers indicating the document boundaries with respect to + word offsets. + """ self.boundaries_ = [] offset = 0 if y is None: y = itertools.repeat(None) - for doc, target in zip(text, y): + for doc, category in zip(X, y): for word in doc: if self.ignore_case: word = word.lower() @@ -114,18 +149,24 @@ def _compute_dispersion(self, text, y): offset += 1 for y_coord in (self.indexed_words_ == word).nonzero()[0]: y_coord = int(y_coord) - yield (offset, y_coord, target) + yield (offset, y_coord, category) + if self.annotate_docs: self.boundaries_.append(offset) + self.boundaries_ = np.array(self.boundaries_, dtype=int) def _check_missing_words(self, points): + """ + Helper method to raise an error if any of the requested search + terms do not appear in the corpus. + """ for index in range(len(self.indexed_words_)): if index in points[:, 1]: pass else: raise YellowbrickValueError( - ("The indexed word '{}' is not found in " "this corpus").format( + ("The search term '{}' is not found in " "this corpus").format( self.indexed_words_[index] ) ) @@ -147,47 +188,62 @@ def fit(self, X, y=None, **kwargs): instances. If this is specified, then the points will be colored according to their class. 
- kwargs : dict - Pass generic arguments to the drawing method - Returns ------- self : instance Returns the instance of the transformer/visualizer + + Attributes + ---------- + self.classes_ : list + A list of strings representing the unique classes in the target in sorted order. + If ``y`` is provided, these are extracted from ``y``, unless a list of class + labels is provided by the user on instantiation. + + self.indexed_words_ : list + A list of integers indicating the y position for each occurrence of each of + the search terms. + + self.word_categories_ : list + A list of strings indicating the corresponding document category of each search + term occurrence. """ if y is not None: self.classes_ = np.unique(y) - elif y is None and self.labels is not None: - self.classes_ = np.array([self.labels[0]]) else: self.classes_ = np.array([self.NULL_CLASS]) # Create an index (e.g. the y position) for the target words - self.indexed_words_ = np.flip(self.target_words, axis=0) + self.indexed_words_ = np.flip(self.search_terms, axis=0) if self.ignore_case: self.indexed_words_ = np.array([w.lower() for w in self.indexed_words_]) # Stack is used to create a 2D array from the generator try: - points_target = np.stack(self._compute_dispersion(X, y)) + offsets_positions_categories = np.stack(self._compute_dispersion(X, y)) except ValueError: - raise YellowbrickValueError(("No indexed words were found in the corpus")) - points = np.stack( - zip(points_target[:, 0].astype(int), points_target[:, 1].astype(int)) + raise YellowbrickValueError(("No search terms were found in the corpus")) + + word_positions = np.stack( + zip( + offsets_positions_categories[:, 0].astype(int), + offsets_positions_categories[:, 1].astype(int), + ) ) - self.target = points_target[:, 2] + self.word_categories_ = offsets_positions_categories[:, 2] - self._check_missing_words(points) + self._check_missing_words(word_positions) - self.draw(points, self.target) + self.draw(word_positions, **kwargs) return 
self - def draw(self, points, target=None, **kwargs): + def draw(self, points, **kwargs): """ Called from the fit method, this method creates the canvas and draws the plot on it. + Parameters ---------- kwargs: generic keyword arguments. @@ -205,7 +261,7 @@ def draw(self, points, target=None, **kwargs): # Create the color mapping for the labels. color_values = resolve_colors( - n_colors=len(labels), colormap=self.colormap, colors=self.color + n_colors=len(labels), colormap=self.colormap, colors=self.colors ) colors = dict(zip(labels, color_values)) @@ -219,8 +275,8 @@ def draw(self, points, target=None, **kwargs): series = defaultdict(lambda: {"x": [], "y": []}) - if target is not None: - for point, t in zip(points, target): + if self.word_categories_ is not None: + for point, t in zip(points, self.word_categories_): label = labels[t] series[label]["x"].append(point[0]) series[label]["y"].append(point[1]) @@ -258,9 +314,8 @@ def finalize(self, **kwargs): ----- Generally this method is called from show and not directly by the user. """ - + self.set_title("Lexical Dispersion Plot") self.ax.set_ylim(-1, len(self.indexed_words_)) - self.ax.set_title("Lexical Dispersion Plot") self.ax.set_xlabel("Word Offset") self.ax.grid(False) @@ -275,8 +330,9 @@ def finalize(self, **kwargs): ## Quick Method ########################################################################## + def dispersion( - target_words, + search_terms, corpus, y=None, ax=None, @@ -296,7 +352,7 @@ def dispersion( Parameters ---------- - target_words : list + search_terms : list A list of words whose dispersion will be examined within a corpus corpus : list @@ -312,7 +368,8 @@ def dispersion( The axes to plot the figure on. colors : list or tuple of colors - Specify the colors for each individual class + Specify the colors for each individual class. Will override colormap if both are + provided. 
colormap : string or matplotlib cmap Qualitative colormap for discrete target @@ -344,7 +401,7 @@ def dispersion( # Instantiate the visualizer visualizer = DispersionPlot( - target_words, + search_terms, ax=ax, colors=colors, colormap=colormap,