From 1bd983bdec611e14fe1ea98c3cec3e3cd3ddfc6b Mon Sep 17 00:00:00 2001 From: Thorsten Vitt Date: Fri, 18 Jun 2021 10:41:19 +0200 Subject: [PATCH] DistanceMatrix: compare_with builds a comparison table with metadata --- delta/deltas.py | 21 +++++++++++++++++++-- 1 file changed, 19 insertions(+), 2 deletions(-) diff --git a/delta/deltas.py b/delta/deltas.py index 830e599..51bbeb5 100644 --- a/delta/deltas.py +++ b/delta/deltas.py @@ -54,7 +54,7 @@ from scipy.special import comb from itertools import combinations from functools import update_wrapper -from .util import Metadata +from .util import Metadata, compare_pairwise from .corpus import Corpus from textwrap import dedent from sklearn.metrics import pairwise_distances @@ -676,7 +676,9 @@ def delta_values(self, transpose=False, check=True): check: if True and if the result does not contain any non-null value, try the other option for transpose. """ - return self._remove_duplicates(transpose, check).unstack().dropna() + result = self._remove_duplicates(transpose, check).unstack().dropna() + result.name = self.metadata.get('delta') + return result def delta_values_df(self): """ @@ -799,6 +801,21 @@ def evaluate(self): result["Simple Score"] = self.simple_score() return result + def compare_with(self, doc_metadata, comparisons=None, join='inner'): + """ + Compare the distance matrix values with values calculated from the given document metadata table. + + Args: + doc_metadata (pd.DataFrame): a dataframe with one row per document and arbitrary columns + comparisons: see `compare_pairwise` + join (str): inner (the default) or outer; if outer, also keep pairs that are missing either the delta value or the metadata comparisons. + + Returns: + a dataframe with a row for each pairwise document combination (as in `DistanceMatrix.delta_values`). + The first column will contain the delta values, subsequent columns the metadata comparisons. 
+ """ + return pd.concat([self.delta_values(), compare_pairwise(doc_metadata, comparisons)], join=join, axis=1) + ################# Now a bunch of normalizations: