Skip to content

Commit

Permalink
Refactored dendrogram to new graphics submodule
Browse files Browse the repository at this point in the history
  • Loading branch information
thvitt committed Aug 25, 2016
1 parent 814991f commit d782a2b
Show file tree
Hide file tree
Showing 3 changed files with 146 additions and 125 deletions.
13 changes: 8 additions & 5 deletions delta/__init__.py
Expand Up @@ -11,12 +11,15 @@
__version__ = '2.0.0'
__author__ = 'Fotis Jannidis, Thorsten Vitt'

from .corpus import Corpus, FeatureGenerator, LETTERS_PATTERN, WORD_PATTERN
from .deltas import registry, Normalization, DeltaFunction, \
from delta.corpus import Corpus, FeatureGenerator, LETTERS_PATTERN, WORD_PATTERN
from delta.deltas import registry, Normalization, DeltaFunction, \
PDistDeltaFunction, CompositeDeltaFunction
from .cluster import Clustering
from .features import get_rfe_features
from delta.cluster import Clustering, KMedoidsClustering, FlatClustering
from delta.features import get_rfe_features
from delta.graphics import Dendrogram

__all__ = [ Corpus, FeatureGenerator, LETTERS_PATTERN, WORD_PATTERN,
registry, Normalization,
DeltaFunction, PDistDeltaFunction, CompositeDeltaFunction, Clustering, get_rfe_features ]
DeltaFunction, PDistDeltaFunction, CompositeDeltaFunction,
Clustering, FlatClustering, KMedoidsClustering,
get_rfe_features, Dendrogram ]
134 changes: 14 additions & 120 deletions delta/cluster.py
@@ -1,5 +1,13 @@
# -*- encoding: utf-8 -*-
"""
Clustering of distance matrixes.
:class:`Clustering` represents a hierarchical clustering which can be flattened
using :meth:`Clustering.fcluster`, the flattened clustering is then represented
by :class:`FlatClustering`.
If supported by the installed version of scikit-learn, there is also a
KMedoidsClustering.
"""

import logging
Expand All @@ -10,21 +18,15 @@
import pandas as pd
import scipy.spatial.distance as ssd
import scipy.cluster.hierarchy as sch
import matplotlib as mpl
import matplotlib.pyplot as plt
# from scipy import linalg
# from scipy.misc import comb
# from itertools import combinations
# from functools import update_wrapper
from .util import Metadata
from .deltas import DistanceMatrix
from .corpus import Corpus
from delta.util import Metadata
from delta.deltas import DistanceMatrix
from delta.corpus import Corpus
from sklearn import metrics


class Clustering:
"""
Represents a clustering.
Represents a hierarchical clustering.
Note:
This is subject to refactoring once we implement more clustering
Expand Down Expand Up @@ -65,115 +67,6 @@ def fclustering(self):
return flat


class Dendrogram:
"""
Creates a dendrogram representation from a hierarchical clustering.
This is a wrapper around, and an improvement to, :func:`sch.dendrogram`,
tailored for the use in pydelta.
Args:
clustering (Clustering): A hierarchical clustering.
describer (DocumentDescriber): Document describer used for determining
the groups and the labels for the documents used (optional). By
default, the document describer inherited from the clustering is
used.
ax (mpl.axes.Axes): Axes object to draw on. Uses pyplot default axes if
not provided.
orientation (str): Orientation of the dendrogram. Currently, only
"right" is supported (default).
font_size: Font size for the label, in points. If not provided,
:func:`sch.dendrogram` calculates a default.
link_color (str): The color used for the links in the dendrogram, by
default ``k`` (for black).
title (str): a title that will be printed on the plot. The string may
be a template string as supported by :meth:`str.format_map` with
metadata field names in curly braces, it will be evaluated against
the clustering's metadata. If you pass ``None`` here, no title will
be added.
Notes:
The dendrogram will be painted by matplotlib / pyplot using the default
styles, which means you can use, e.g., :module:`seaborn` to influence
the overall design of the image.
:class:`Dendrogram` handles coloring differently than
:func:`sch.dendrogram`: It will color the document labels according to
the pre-assigned grouping (e.g., by author). To do so, it will build on
matplotlib's default color_cycle, and it will rotate, so if you need
more colors, adjust the color_cycle accordingly.
"""

def __init__(self, clustering, describer=None, ax=None,
orientation="left", font_size=None, link_color="k",
title="Corpus: {corpus}",
xlabel="Delta: {delta_title}, {words} most frequent {features}"):

self.clustering = clustering
self.linkage = clustering.linkage
self.metadata = clustering.metadata
self.describer = clustering.describer \
if describer is None else describer
self.documents = list(clustering.distance_matrix.index)
self.orientation = orientation
self._init_colormap()

plt.clf()
self.dendro_data = sch.dendrogram(self.linkage,
orientation=orientation,
labels=self.documents,
leaf_rotation = 0 if orientation == 'left' else 90,
ax=ax,
link_color_func=lambda k: link_color)

# Now redo the author labels. To do so, we map a color to each author
# (using the describer) and then
self.ax = plt.gca() if ax is None else ax
self.fig = plt.gcf()
self._relabel_axis()
if title is not None:
plt.title(title.format_map(self.metadata))
if xlabel is not None:
plt.xlabel(xlabel.format_map(self.metadata))
plt.tight_layout(2)

def link_color_func(self, k):
print(k)
return "k"

def _init_colormap(self):
groups = self.describer.groups(self.documents)
props = mpl.rcParams['axes.prop_cycle']
self.colormap = {x: y['color'] for x,y in zip(groups, props())}
self.colorlist = [self.colormap[self.describer.group_name(doc)]
for doc in self.documents]
return self.colormap

def _relabel_axis(self):
if self.orientation == 'left':
labels = self.ax.get_ymajorticklabels()
else:
labels = self.ax.get_xmajorticklabels()
display_labels = []
for label in labels:
group = self.describer.group_name(label.get_text())
label.set_color(self.colormap[group])
display_label = self.describer.label(label.get_text())
label.set_text(display_label) # doesn't really set the labels
display_labels.append(display_label)
if self.orientation == 'left':
self.ax.set_yticklabels(display_labels)
else:
self.ax.set_xticklabels(display_labels)


def show(self):
plt.show()

def save(self, fname, **kwargs):
self.fig.savefig(fname, **kwargs)


class FlatClustering:
"""
A flat clustering represents a non-hierarchical clustering.
Expand Down Expand Up @@ -343,7 +236,8 @@ def describe(self):

class KMedoidsClustering(FlatClustering):

def __init__(self, distances, n_clusters=None, metadata=None, **kwargs):
def __init__(self, distances, n_clusters=None, metadata=None,
**kwargs):
super().__init__(distances, metadata, **kwargs)
if n_clusters is None:
n_clusters = self.group_count
Expand Down
124 changes: 124 additions & 0 deletions delta/graphics.py
@@ -0,0 +1,124 @@
# -*- encoding: utf-8 -*-
"""
Various visualization tools.
"""

import logging
logger = logging.getLogger(__name__)

import scipy.cluster.hierarchy as sch
import matplotlib as mpl
import matplotlib.pyplot as plt
# from scipy import linalg
# from scipy.misc import comb
# from itertools import combinations
# from functools import update_wrapper


class Dendrogram:
"""
Creates a dendrogram representation from a hierarchical clustering.
This is a wrapper around, and an improvement to, :func:`sch.dendrogram`,
tailored for the use in pydelta.
Args:
clustering (Clustering): A hierarchical clustering.
describer (DocumentDescriber): Document describer used for determining
the groups and the labels for the documents used (optional). By
default, the document describer inherited from the clustering is
used.
ax (mpl.axes.Axes): Axes object to draw on. Uses pyplot default axes if
not provided.
orientation (str): Orientation of the dendrogram. Currently, only
"right" is supported (default).
font_size: Font size for the label, in points. If not provided,
:func:`sch.dendrogram` calculates a default.
link_color (str): The color used for the links in the dendrogram, by
default ``k`` (for black).
title (str): a title that will be printed on the plot. The string may
be a template string as supported by :meth:`str.format_map` with
metadata field names in curly braces, it will be evaluated against
the clustering's metadata. If you pass ``None`` here, no title will
be added.
Notes:
The dendrogram will be painted by matplotlib / pyplot using the default
styles, which means you can use, e.g., :module:`seaborn` to influence
the overall design of the image.
:class:`Dendrogram` handles coloring differently than
:func:`sch.dendrogram`: It will color the document labels according to
the pre-assigned grouping (e.g., by author). To do so, it will build on
matplotlib's default color_cycle, and it will rotate, so if you need
more colors, adjust the color_cycle accordingly.
"""

def __init__(self, clustering, describer=None, ax=None,
orientation="left", font_size=None, link_color="k",
title="Corpus: {corpus}",
xlabel="Delta: {delta_title}, {words} most frequent {features}"):

self.clustering = clustering
self.linkage = clustering.linkage
self.metadata = clustering.metadata
self.describer = clustering.describer \
if describer is None else describer
self.documents = list(clustering.distance_matrix.index)
self.orientation = orientation
self._init_colormap()

plt.clf()
self.dendro_data = sch.dendrogram(self.linkage,
orientation=orientation,
labels=self.documents,
leaf_rotation = 0 if orientation == 'left' else 90,
ax=ax,
link_color_func=lambda k: link_color)

# Now redo the author labels. To do so, we map a color to each author
# (using the describer) and then
self.ax = plt.gca() if ax is None else ax
self.fig = plt.gcf()
self._relabel_axis()
if title is not None:
plt.title(title.format_map(self.metadata))
if xlabel is not None:
plt.xlabel(xlabel.format_map(self.metadata))
plt.tight_layout(2)

def link_color_func(self, k):
print(k)
return "k"

def _init_colormap(self):
groups = self.describer.groups(self.documents)
props = mpl.rcParams['axes.prop_cycle']
self.colormap = {x: y['color'] for x,y in zip(groups, props())}
self.colorlist = [self.colormap[self.describer.group_name(doc)]
for doc in self.documents]
return self.colormap

def _relabel_axis(self):
if self.orientation == 'left':
labels = self.ax.get_ymajorticklabels()
else:
labels = self.ax.get_xmajorticklabels()
display_labels = []
for label in labels:
group = self.describer.group_name(label.get_text())
label.set_color(self.colormap[group])
display_label = self.describer.label(label.get_text())
label.set_text(display_label) # doesn't really set the labels
display_labels.append(display_label)
if self.orientation == 'left':
self.ax.set_yticklabels(display_labels)
else:
self.ax.set_xticklabels(display_labels)


def show(self):
plt.show()

def save(self, fname, **kwargs):
self.fig.savefig(fname, **kwargs)

0 comments on commit d782a2b

Please sign in to comment.