Skip to content

Commit

Permalink
Use scikit-learn's pairwise distances
Browse files Browse the repository at this point in the history
  • Loading branch information
thvitt committed Apr 19, 2017
1 parent 0081809 commit 324a47e
Show file tree
Hide file tree
Showing 2 changed files with 65 additions and 11 deletions.
6 changes: 4 additions & 2 deletions delta/__init__.py
Expand Up @@ -14,15 +14,17 @@
from warnings import warn
from delta.corpus import Corpus, FeatureGenerator, LETTERS_PATTERN, WORD_PATTERN
from delta.deltas import registry as functions, normalization, Normalization, \
DeltaFunction, PDistDeltaFunction, CompositeDeltaFunction
DeltaFunction, PDistDeltaFunction, MetricDeltaFunction, \
CompositeDeltaFunction
from delta.cluster import Clustering, FlatClustering

from delta.features import get_rfe_features
from delta.graphics import Dendrogram

__all__ = [ Corpus, FeatureGenerator, LETTERS_PATTERN, WORD_PATTERN,
functions, Normalization, normalization,
DeltaFunction, PDistDeltaFunction, CompositeDeltaFunction,
DeltaFunction, PDistDeltaFunction,
MetricDeltaFunction, CompositeDeltaFunction,
Clustering, FlatClustering,
get_rfe_features, Dendrogram ]

Expand Down
70 changes: 61 additions & 9 deletions delta/deltas.py
Expand Up @@ -54,6 +54,7 @@
from .util import Metadata
from .corpus import Corpus
from textwrap import dedent
from sklearn.metrics import pairwise_distances

sep = '-' # separates parts of a descriptor

Expand Down Expand Up @@ -482,6 +483,9 @@ def __call__(self, corpus):
class PDistDeltaFunction(DeltaFunction):
"""
Wraps one of the metrics implemented by :func:`ssd.pdist` as a delta function.
Warning:
You should use MetricDeltaFunction instead.
"""
def __init__(self, metric, name=None, title=None, register=True, scale=False, **kwargs):
"""
Expand All @@ -492,6 +496,8 @@ def __init__(self, metric, name=None, title=None, register=True, scale=False, **
register (bool): If false, don't register this with the registry
**kwargs: passed on to :func:`ssd.pdist`
"""
logger.warning("Prefer MetricsDeltaFunction to PDistDeltaFunction.")

self.metric = metric
self.kwargs = kwargs
if name is None:
Expand All @@ -502,15 +508,61 @@ def __init__(self, metric, name=None, title=None, register=True, scale=False, **

super().__init__(descriptor=name, name=name, title=title, register=register)


def __call__(self, corpus):
df = pd.DataFrame(index=corpus.index, columns=corpus.index,
data=ssd.squareform(ssd.pdist(corpus, self.metric,
self.kwargs)))
**self.kwargs)))
if self.scale:
df = df / corpus.columns.size
return self.create_result(df, corpus)


class MetricDeltaFunction(DeltaFunction):
"""
Distance functions based on scikit-learn's :func:`sklearn.metric.pairwise_distances`.
"""

def __init__(self, metric, name=None, title=None, register=True, scale=False, fix_symmetry=True, **kwargs):
"""
Args:
metric (str): The metric that should be called via sklearn.metric.pairwise_distances
name (str): Name / Descriptor for the delta function, if None, metric is used
title (str): Human-Readable Title
register (bool): If false, don't register this with the registry
scale (bool): Scale by number of features
fix_symmetry: Force the resulting matrix to be symmetric
**kwargs: passed on to :func:`ssd.pdist`
Note:
:func:`sklearn.metric.pairwise_distances` fast, but the result may
not be exactly symmetric. The `fix_symmetry` option enforces
symmetry by mirroring the lower-left triangle after calculating
distances so, e.g., scipy clustering won't complain.
"""
self.metric = metric
self.scale = scale
self.fix_symmetry = fix_symmetry
self.kwargs = kwargs
if name is None:
name = metric
if title is None:
title = name.title() + " Distance"
super().__init__(descriptor=name, name=name, title=title, register=register)

def __call__(self, corpus):
dm = pairwise_distances(corpus, metric=self.metric, n_jobs=-1, **self.kwargs)
if self.fix_symmetry:
dm = np.tril(dm, -1)
dm += dm.T
df = pd.DataFrame(data=dm, index=corpus.index, columns=corpus.index)
if self.scale:
df = df / corpus.columns.size
np.fill_diagonal(df.values, 0) # rounding errors may lead to validation bugs
return self.create_result(df, corpus)



class DistanceMatrix(pd.DataFrame):
"""
A distance matrix is the result of applying a :class:`DeltaFunction` to a
Expand Down Expand Up @@ -770,14 +822,14 @@ def ternarize(corpus, lower_bound=-0.43, upper_bound=0.43):

################ Here come the deltas

PDistDeltaFunction("cityblock", "manhattan", title="Manhattan Distance", scale=True)
PDistDeltaFunction("euclidean")
PDistDeltaFunction("sqeuclidean", title="Squared Euclidean Distance")
PDistDeltaFunction("cosine")
PDistDeltaFunction("canberra")
PDistDeltaFunction("braycurtis", title="Bray-Curtis Distance")
PDistDeltaFunction("correlation")
PDistDeltaFunction("chebyshev")
MetricDeltaFunction("cityblock", "manhattan", title="Manhattan Distance", scale=True)
MetricDeltaFunction("euclidean")
MetricDeltaFunction("sqeuclidean", title="Squared Euclidean Distance")
MetricDeltaFunction("cosine")
MetricDeltaFunction("canberra")
MetricDeltaFunction("braycurtis", title="Bray-Curtis Distance")
MetricDeltaFunction("correlation")
MetricDeltaFunction("chebyshev")

CompositeDeltaFunction("manhattan-z_score", "burrows", "Burrows' Delta")
CompositeDeltaFunction("sqeuclidean-z_score", "quadratic", "Quadratic Delta")
Expand Down

0 comments on commit 324a47e

Please sign in to comment.