# ComparisonHC

ComparisonHC is a non-embedding based clustering algorithm based that was develop in [Foundations of Comparison-Based Hierarchical Clustering](https://arxiv.org/abs/1811.00928?utm_source=feedburner&utm_medium=feed&utm_campaign=Feed%253A+arxiv%252FQSXk+%2528ExcitingAds%2521+cs+updates+on+arXiv.org%2529).

As this algorithm does not have any geometric bias, it might be useful to compare to the performance of the Tangles algorithm.

We use the passive versions of the algorithm. Essentially, there are two different
versions (4-AL and 4K-AL), but they perform mostly similar. 

In [1]:
import sys
sys.path.append("..")

from cblearn.datasets import fetch_car_similarity
from cblearn.preprocessing import triplets_from_mostcentral
from itertools import permutations
from comparisonhc import ComparisonHC
from comparisonhc.oracle import OracleComparisons
from comparisonhc.linkage import OrdinalLinkageAverage
import numpy as np
from typing import Optional
from sklearn.metrics import normalized_mutual_info_score
import comparison_hc as chc_est
from data_generation import generate_gmm_data_fixed_means
from cblearn.datasets import make_random_triplets
from triplets import reduce_triplets
from estimators import OrdinalTangles
from questionnaire import Questionnaire
from plotting import AltairPlotter

In [2]:
cars = fetch_car_similarity()

def unify_triplets_mostcentral(triplets, responses) -> np.ndarray:
    triplets_unified = []
    for t, r in zip(triplets, responses):
        a,b,c = t
        if r == 0:
            t_ = [a,b,c]
        elif r == 1:
            t_ = [b,c,a]
        elif r == 2:
            t_ = [c,a,b]
        else: 
            raise ValueError(f"Response must be 0, 1 or 2, not {r}")
        triplets_unified.append(t_) 
    res = np.array(triplets_unified)
    assert res.shape == triplets.shape
    return res

# reduce triplets
def reduce_triplets_mostcentral(triplets, responses) -> np.ndarray:
    unified_triplets = unify_triplets_mostcentral(triplets, responses)
    reduced_triplets = []
    for t in unified_triplets:
        if any(list(t_perm) in reduced_triplets for t_perm in permutations(t)):
            # we already added this
            continue
        a,b,c = t
        a_first_mask = np.all(triplets == np.array([a,b,c]), axis=1) | np.all(triplets == np.array([a,c,b]), axis=1)
        b_first_mask = np.all(triplets == np.array([b,a,c]), axis=1) | np.all(triplets == np.array([b,c,a]), axis=1)
        c_first_mask = np.all(triplets == np.array([c,a,b]), axis=1) | np.all(triplets == np.array([c,b,a]), axis=1)
        first_pick = np.argmax([a_first_mask.sum(), b_first_mask.sum(), c_first_mask.sum()])
        reduced_triplets.append([t[first_pick], t[(first_pick + 1) % 3], t[(first_pick + 2) % 3]])
        
    return np.array(reduced_triplets)

In [3]:
triplets_reduce_central = reduce_triplets_mostcentral(cars.triplet, cars.response)
t = triplets_from_mostcentral(triplets_reduce_central)
r = np.ones(t.shape[0]).astype(bool)

In [4]:
# possible UB if triplets have contradictions? ask David
# we want to keep consistent behaviour, should we put
# comparisonHC into cblearn
# t = triplets_from_mostcentral(cars.triplet, cars.response)
# r = np.ones(t.shape[0])
# mat = check_query_response(t, r, result_format="tensor-count").todense()
# print(mat[mat < -1])
# print(mat[mat > 1])

In [5]:
def triplets_to_quadruplets(triplets: np.ndarray, responses: Optional[np.ndarray] = None) -> np.ndarray:
    """
    Transforms an array of triplets (with responses) to an array of quadruplets.

    Assumes triplets, responses to be in list-boolean form, e.g. 

    responses[i] is True if triplets[i][0] is closer to triplets[i][1] than to 
    triplets[i][2].

    If responses is 0, we assume that all responses are true (e.g. it is always triplets[i][1] closer).

    We return a quadruplet matrix that is filled according to the following scheme:
    If the triplet array allows for a statement (a,b,c) in triplet form then we
    set quadruplet[a,b,a,c] = 1 quadruplet[]

    Triplets may contain duplicates or conflicting entries.
    In this case, we replace the value with a majority vote.
    """
    # error checking
    if len(triplets.shape) != 2:
        raise ValueError("Triplets must be a 2D array")
    if triplets.shape[1] != 3:
        raise ValueError("Triplets must have 3 columns")
    num_triplets = triplets.shape[0]
    if responses is None:
        responses = np.ones(num_triplets).astype(bool)
    if len(responses.shape) != 1:
        raise ValueError("Responses must be a 1D array or None")
    n = np.max(triplets) + 1
    q = np.zeros((n,n,n,n))

    for i in range(num_triplets):
        t = triplets[i]
        r = responses[i]
        if r:
            a,b,c = t[0], t[1], t[2]
        else:
            a,b,c = t[0], t[2], t[1]

        if q[a,b,a,c] != 0 or q[a,c,a,b] != 0:
            raise ValueError(f"Unreduced triplets found (or responses): {t, r, i}")
        q[a,b,a,c] = 1
        q[a,c,a,b] = -1
    return q

In [6]:
quads = triplets_to_quadruplets(t, r)
oracle = OracleComparisons(quads)
linkage = OrdinalLinkageAverage(oracle)
chc = ComparisonHC(linkage)
chc.fit([[i] for i in range(60)])

<comparisonhc.core.ComparisonHC at 0x1687a1130>

In [7]:
clusters = chc._get_k_clusters(chc.dendrogram, chc.clusters, 4)
# sum can essentially be used as a mapReduce / flatMap ;)
labels_in_order = sum([[i] * len(cluster) for i, cluster in enumerate(clusters)], [])
labels_for_original = [-1] * len(labels_in_order)
for lab, pos in zip(labels_in_order, sum(clusters, [])):
    labels_for_original[pos] = lab
assert -1 not in labels_for_original

In [8]:
normalized_mutual_info_score(cars.class_id, labels_for_original)

0.08906096664821156

We now test comparisonHC on synthetic data (with our provided estimator, to make things easier).

In [22]:
seed = 2
data = generate_gmm_data_fixed_means(10, np.array([[1,0], [-1, 0]]), 0.3, seed)
chc = chc_est.ComparisonHC(2)
t = reduce_triplets(*make_random_triplets(data.xs, result_format="list-boolean", size=5000, random_state=seed))
y_chc = chc.fit_predict(t)

In [23]:
print(f"CHC performance: {normalized_mutual_info_score(y_chc, data.ys)}")
p = AltairPlotter()
p.assignments(data.xs, y_chc)

CHC performance: 1.0


Compare to Tangles:

In [11]:
q = Questionnaire.from_metric(data.xs, density=0.1)
(q.values != -1).size

tangles = OrdinalTangles(7)
print(f"Tangles performance: {tangles.score(q.values, data.ys)}")

Generating questionnaire...
Generating question set...
Filling out questionnaire...


100%|██████████| 40/40 [00:00<00:00, 8422.72it/s]

Tangles performance: 1.0



