From d782a2b06a159f40095b3ac06bee4191b6dd44de Mon Sep 17 00:00:00 2001 From: Thorsten Vitt Date: Thu, 25 Aug 2016 10:39:02 +0200 Subject: [PATCH] Refactored dendrogram to new graphics submodule --- delta/__init__.py | 13 +++-- delta/cluster.py | 134 +++++----------------------------------------- delta/graphics.py | 124 ++++++++++++++++++++++++++++++++++++++++++ 3 files changed, 146 insertions(+), 125 deletions(-) create mode 100644 delta/graphics.py diff --git a/delta/__init__.py b/delta/__init__.py index 5428427..8ddcd9a 100644 --- a/delta/__init__.py +++ b/delta/__init__.py @@ -11,12 +11,15 @@ __version__ = '2.0.0' __author__ = 'Fotis Jannidis, Thorsten Vitt' -from .corpus import Corpus, FeatureGenerator, LETTERS_PATTERN, WORD_PATTERN -from .deltas import registry, Normalization, DeltaFunction, \ +from delta.corpus import Corpus, FeatureGenerator, LETTERS_PATTERN, WORD_PATTERN +from delta.deltas import registry, Normalization, DeltaFunction, \ PDistDeltaFunction, CompositeDeltaFunction -from .cluster import Clustering -from .features import get_rfe_features +from delta.cluster import Clustering, KMedoidsClustering, FlatClustering +from delta.features import get_rfe_features +from delta.graphics import Dendrogram __all__ = [ Corpus, FeatureGenerator, LETTERS_PATTERN, WORD_PATTERN, registry, Normalization, - DeltaFunction, PDistDeltaFunction, CompositeDeltaFunction, Clustering, get_rfe_features ] + DeltaFunction, PDistDeltaFunction, CompositeDeltaFunction, + Clustering, FlatClustering, KMedoidsClustering, + get_rfe_features, Dendrogram ] diff --git a/delta/cluster.py b/delta/cluster.py index 0d350da..2573dd5 100644 --- a/delta/cluster.py +++ b/delta/cluster.py @@ -1,5 +1,13 @@ # -*- encoding: utf-8 -*- """ +Clustering of distance matrixes. + +:class:`Clustering` represents a hierarchical clustering which can be flattened +using :meth:`Clustering.fcluster`, the flattened clustering is then represented +by :class:`FlatClustering`. 
+ +If supported by the installed version of scikit-learn, there is also a +KMedoidsClustering. """ import logging @@ -10,21 +18,15 @@ import pandas as pd import scipy.spatial.distance as ssd import scipy.cluster.hierarchy as sch -import matplotlib as mpl -import matplotlib.pyplot as plt -# from scipy import linalg -# from scipy.misc import comb -# from itertools import combinations -# from functools import update_wrapper -from .util import Metadata -from .deltas import DistanceMatrix -from .corpus import Corpus +from delta.util import Metadata +from delta.deltas import DistanceMatrix +from delta.corpus import Corpus from sklearn import metrics class Clustering: """ - Represents a clustering. + Represents a hierarchical clustering. Note: This is subject to refactoring once we implement more clustering @@ -65,115 +67,6 @@ def fclustering(self): return flat -class Dendrogram: - """ - Creates a dendrogram representation from a hierarchical clustering. - - This is a wrapper around, and an improvement to, :func:`sch.dendrogram`, - tailored for the use in pydelta. - - Args: - clustering (Clustering): A hierarchical clustering. - describer (DocumentDescriber): Document describer used for determining - the groups and the labels for the documents used (optional). By - default, the document describer inherited from the clustering is - used. - ax (mpl.axes.Axes): Axes object to draw on. Uses pyplot default axes if - not provided. - orientation (str): Orientation of the dendrogram. Currently, only - "right" is supported (default). - font_size: Font size for the label, in points. If not provided, - :func:`sch.dendrogram` calculates a default. - link_color (str): The color used for the links in the dendrogram, by - default ``k`` (for black). - title (str): a title that will be printed on the plot. The string may - be a template string as supported by :meth:`str.format_map` with - metadata field names in curly braces, it will be evaluated against - the clustering's metadata. 
If you pass ``None`` here, no title will - be added. - - Notes: - The dendrogram will be painted by matplotlib / pyplot using the default - styles, which means you can use, e.g., :module:`seaborn` to influence - the overall design of the image. - - :class:`Dendrogram` handles coloring differently than - :func:`sch.dendrogram`: It will color the document labels according to - the pre-assigned grouping (e.g., by author). To do so, it will build on - matplotlib's default color_cycle, and it will rotate, so if you need - more colors, adjust the color_cycle accordingly. - """ - - def __init__(self, clustering, describer=None, ax=None, - orientation="left", font_size=None, link_color="k", - title="Corpus: {corpus}", - xlabel="Delta: {delta_title}, {words} most frequent {features}"): - - self.clustering = clustering - self.linkage = clustering.linkage - self.metadata = clustering.metadata - self.describer = clustering.describer \ - if describer is None else describer - self.documents = list(clustering.distance_matrix.index) - self.orientation = orientation - self._init_colormap() - - plt.clf() - self.dendro_data = sch.dendrogram(self.linkage, - orientation=orientation, - labels=self.documents, - leaf_rotation = 0 if orientation == 'left' else 90, - ax=ax, - link_color_func=lambda k: link_color) - - # Now redo the author labels. 
To do so, we map a color to each author - # (using the describer) and then - self.ax = plt.gca() if ax is None else ax - self.fig = plt.gcf() - self._relabel_axis() - if title is not None: - plt.title(title.format_map(self.metadata)) - if xlabel is not None: - plt.xlabel(xlabel.format_map(self.metadata)) - plt.tight_layout(2) - - def link_color_func(self, k): - print(k) - return "k" - - def _init_colormap(self): - groups = self.describer.groups(self.documents) - props = mpl.rcParams['axes.prop_cycle'] - self.colormap = {x: y['color'] for x,y in zip(groups, props())} - self.colorlist = [self.colormap[self.describer.group_name(doc)] - for doc in self.documents] - return self.colormap - - def _relabel_axis(self): - if self.orientation == 'left': - labels = self.ax.get_ymajorticklabels() - else: - labels = self.ax.get_xmajorticklabels() - display_labels = [] - for label in labels: - group = self.describer.group_name(label.get_text()) - label.set_color(self.colormap[group]) - display_label = self.describer.label(label.get_text()) - label.set_text(display_label) # doesn't really set the labels - display_labels.append(display_label) - if self.orientation == 'left': - self.ax.set_yticklabels(display_labels) - else: - self.ax.set_xticklabels(display_labels) - - - def show(self): - plt.show() - - def save(self, fname, **kwargs): - self.fig.savefig(fname, **kwargs) - - class FlatClustering: """ A flat clustering represents a non-hierarchical clustering. 
@@ -343,7 +236,8 @@ def describe(self): class KMedoidsClustering(FlatClustering): - def __init__(self, distances, n_clusters=None, metadata=None, **kwargs): + def __init__(self, distances, n_clusters=None, metadata=None, + **kwargs): super().__init__(distances, metadata, **kwargs) if n_clusters is None: n_clusters = self.group_count
diff --git a/delta/graphics.py b/delta/graphics.py
new file mode 100644
index 0000000..c7e3464
--- /dev/null
+++ b/delta/graphics.py
@@ -0,0 +1,124 @@
+# -*- encoding: utf-8 -*-
+"""
+Various visualization tools.
+"""
+
+import logging
+logger = logging.getLogger(__name__)
+
+import scipy.cluster.hierarchy as sch
+import matplotlib as mpl
+import matplotlib.pyplot as plt
+
+
+class Dendrogram:
+    """
+    Creates a dendrogram representation from a hierarchical clustering.
+
+    This is a wrapper around, and an improvement to, :func:`sch.dendrogram`,
+    tailored for the use in pydelta.
+
+    Args:
+        clustering (Clustering): A hierarchical clustering.
+        describer (DocumentDescriber): Document describer used for determining
+            the groups and the labels for the documents used (optional). By
+            default, the document describer inherited from the clustering is
+            used.
+        ax (mpl.axes.Axes): Axes object to draw on. If not provided, the
+            current pyplot figure is cleared and pyplot's default axes are used.
+        orientation (str): Orientation of the dendrogram, passed on to
+            :func:`sch.dendrogram`. Defaults to ``"left"``.
+        font_size: Font size for the label, in points. If not provided,
+            :func:`sch.dendrogram` calculates a default.
+        link_color (str): The color used for the links in the dendrogram, by
+            default ``k`` (for black).
+        title (str): a title that will be printed on the plot. The string may
+            be a template string as supported by :meth:`str.format_map` with
+            metadata field names in curly braces, it will be evaluated against
+            the clustering's metadata. If you pass ``None`` here, no title will
+            be added.
+        xlabel (str): label for the x axis. Handled exactly like ``title``:
+            a template evaluated against the metadata, ``None`` disables it.
+
+    Notes:
+        The dendrogram will be painted by matplotlib / pyplot using the default
+        styles, which means you can use, e.g., :mod:`seaborn` to influence
+        the overall design of the image.
+
+        :class:`Dendrogram` handles coloring differently than
+        :func:`sch.dendrogram`: It will color the document labels according to
+        the pre-assigned grouping (e.g., by author). To do so, it will build on
+        matplotlib's default color_cycle, and it will rotate, so if you need
+        more colors, adjust the color_cycle accordingly.
+    """
+
+    def __init__(self, clustering, describer=None, ax=None,
+                 orientation="left", font_size=None, link_color="k",
+                 title="Corpus: {corpus}",
+                 xlabel="Delta: {delta_title}, {words} most frequent {features}"):
+        self.clustering = clustering
+        self.linkage = clustering.linkage
+        self.metadata = clustering.metadata
+        self.describer = clustering.describer \
+            if describer is None else describer
+        self.documents = list(clustering.distance_matrix.index)
+        self.orientation = orientation
+        self.link_color = link_color
+        self._init_colormap()
+
+        # Reset only the implicit pyplot figure, never a caller's own axes.
+        if ax is None:
+            plt.clf()
+        self.dendro_data = sch.dendrogram(
+            self.linkage,
+            orientation=orientation,
+            labels=self.documents,
+            leaf_rotation=0 if orientation in ('left', 'right') else 90,
+            leaf_font_size=font_size,
+            ax=ax,
+            link_color_func=self.link_color_func)
+
+        # Recolor the leaf labels by group and swap in the display labels.
+        self.ax = plt.gca() if ax is None else ax
+        self.fig = self.ax.figure
+        self._relabel_axis()
+        if title is not None:
+            self.ax.set_title(title.format_map(self.metadata))
+        if xlabel is not None:
+            self.ax.set_xlabel(xlabel.format_map(self.metadata))
+        self.fig.tight_layout(pad=2)    # pad is keyword-only in recent matplotlib
+
+    def link_color_func(self, k):
+        """Return the color for link ``k``: always the configured link color."""
+        return self.link_color
+
+    def _init_colormap(self):
+        """Assign each group a color from the ``axes.prop_cycle`` rcParam."""
+        groups = self.describer.groups(self.documents)
+        props = mpl.rcParams['axes.prop_cycle']
+        self.colormap = {x: y['color'] for x, y in zip(groups, props())}
+        self.colorlist = [self.colormap[self.describer.group_name(doc)]
+                          for doc in self.documents]
+        return self.colormap
+
+    def _relabel_axis(self):
+        """Color the leaf tick labels by group and show the display labels."""
+        horizontal = self.orientation in ('left', 'right')  # leaves on y axis
+        axis = self.ax.yaxis if horizontal else self.ax.xaxis
+        labels = axis.get_majorticklabels()
+        display_labels = []
+        for label in labels:
+            group = self.describer.group_name(label.get_text())
+            label.set_color(self.colormap[group])
+            display_label = self.describer.label(label.get_text())
+            label.set_text(display_label)       # doesn't really set the labels
+            display_labels.append(display_label)
+        axis.set_ticklabels(display_labels)
+
+    def show(self):
+        """Display the plot via :func:`plt.show`."""
+        plt.show()
+
+    def save(self, fname, **kwargs):
+        """Save the figure to ``fname``; ``kwargs`` go to ``Figure.savefig``."""
+        self.fig.savefig(fname, **kwargs)