# More real world data
As we have seen, Tangles are particularly good at clustering the outputs of SOE, so we now want to explore further real-world datasets.

In [1]:
import sys
sys.path.append("..")
import cblearn.datasets as datasets
import numpy as np
from cblearn.preprocessing import triplets_from_mostcentral
from sklearn.metrics import normalized_mutual_info_score
from questionnaire import Questionnaire
from triplets import LensMetric
from estimators import OrdinalTangles, SoeKmeans
from triplets import triplets_to_majority_neighbour_cuts, unify_triplet_order, subsample_triplets, majority_neighbours_count_matrix
from sklearn.neighbors import DistanceMetric
from data_generation import generate_gmm_data_fixed_means, get_usps

In [2]:
def evaluate_real_world_data(triplets, responses, ys, agreement, embedding_dimension, clusters, maj_agreement=None, maj_radius=None, seed=0):
    """Cluster triplet data with SOE-kMeans and Tangles and print each NMI score.

    Three pipelines are evaluated against the reference labels ``ys``:

    1. ``SoeKmeans`` fitted on ``triplets`` / ``responses``.
    2. ``OrdinalTangles`` fitted on a questionnaire built from the SOE embedding.
    3. Optionally, ``OrdinalTangles`` fitted on majority-neighbour cuts derived
       from the triplets. This runs only if both ``maj_agreement`` and
       ``maj_radius`` are given.

    Parameters
    ----------
    triplets, responses
        Triplet data passed to ``SoeKmeans.fit_predict`` and
        ``unify_triplet_order``.
    ys
        Reference labels the predicted clusterings are compared against.
    agreement
        ``agreement`` argument of the embedding-based ``OrdinalTangles``.
    embedding_dimension, clusters, seed
        Forwarded to ``SoeKmeans`` as ``embedding_dimension``, ``n_clusters``
        and ``seed``.
    maj_agreement, maj_radius
        ``agreement`` of the majority-cut ``OrdinalTangles`` and ``radius`` of
        ``triplets_to_majority_neighbour_cuts``.

    Returns
    -------
    None
        Results are only printed. Tangles lines also show, in parentheses, the
        number of distinct predicted labels.
    """
    # Baseline: SOE embedding followed by k-means.
    soe_kmeans = SoeKmeans(embedding_dimension=embedding_dimension, n_clusters=clusters, seed=seed)
    soe_labels = soe_kmeans.fit_predict(triplets, responses)

    # Tangles on a questionnaire derived from the SOE embedding.
    questionnaire = Questionnaire.from_metric(soe_kmeans.embedding_, verbose=False)
    embedding_tangles = OrdinalTangles(agreement=agreement)
    tangles_labels = embedding_tangles.fit_predict(questionnaire.values)

    # Optional: Tangles directly on majority-neighbour cuts of the triplets.
    majority_labels = None
    if not (maj_agreement is None or maj_radius is None):
        unified = unify_triplet_order(triplets, responses)
        majority_tangles = OrdinalTangles(agreement=maj_agreement)
        majority_cuts = triplets_to_majority_neighbour_cuts(unified, radius=maj_radius)
        majority_labels = majority_tangles.fit_predict(majority_cuts)

    # Report: each score is computed right before it is printed.
    soe_nmi = normalized_mutual_info_score(ys, soe_labels)
    print(f"NMI SOE-kMeans: {soe_nmi}")

    tangles_nmi = normalized_mutual_info_score(ys, tangles_labels)
    tangles_count = np.unique(tangles_labels).shape[0]
    print(f"NMI Tangles: {tangles_nmi} ({tangles_count})")

    if majority_labels is not None:
        majority_nmi = normalized_mutual_info_score(ys, majority_labels)
        majority_count = np.unique(majority_labels).shape[0]
        print(f"NMI Tangles Majority Cuts: {majority_nmi} ({majority_count})")

In [3]:
print("CAR DATASET")
# Load the car-similarity survey and convert its most-central queries to triplets.
cars = datasets.fetch_car_similarity()
# One cluster per annotated car class; len() works for both a list and an ndarray.
n_clusters = len(cars.class_name)
triplets = triplets_from_mostcentral(cars.triplet, cars.response)
# Responses are all set to True (per the original note, all-False would
# yield the same clustering result).
responses = np.ones(triplets.shape[0], dtype=bool)
ys = cars.class_id
# Use the class count derived from the data rather than a hard-coded literal.
# NOTE(review): the original passed clusters=4; confirm the dataset has 4 classes.
evaluate_real_world_data(triplets, responses, ys, agreement=8, embedding_dimension=2, clusters=n_clusters)

CAR DATASET
NMI SOE-kMeans: 0.6367123603818012
NMI Tangles: 0.8701404722453094 (3)


# USPS dataset
In a similar vein to Kleindessner 2017 and Vankadara et al. 2019, we use the USPS dataset with the plain Euclidean metric.

In [4]:
# Load a shuffled USPS subset restricted to the digits 1, 2 and 3.
data = get_usps(
    shuffle=True,
    seed=1,
    subset={1, 2, 3},
    num_samples=30,
)
# Draw 50000 most-central queries from the points and turn them into triplets.
central_triplets, central_responses = subsample_triplets(
    data.xs, 50000, return_mostcentral=True
)
triplets = triplets_from_mostcentral(central_triplets, central_responses)
# Responses are all set to True (per the original note, all-False would
# yield the same clustering result).
responses = np.full(triplets.shape[0], True)

In [5]:
# Baseline on USPS: 2-dimensional SOE embedding followed by k-means with 3 clusters.
soe_kmeans = SoeKmeans(embedding_dimension=2, n_clusters=3, seed=1)
ys_soe_kmeans = soe_kmeans.fit_predict(triplets, responses)
soe_nmi = normalized_mutual_info_score(data.ys, ys_soe_kmeans)
print(f"NMI SOE: {soe_nmi}")

NMI SOE: 0.6672030093941853


In [6]:
# Variant 1: Tangles on a questionnaire built from the most-central queries.
q = Questionnaire.from_most_central_triplets(
    central_triplets, central_responses, density=0.1
)
tangles = OrdinalTangles(agreement=8)
ys_tangles = tangles.fit_predict(q.values)
lens_nmi = normalized_mutual_info_score(data.ys, ys_tangles)
lens_cluster_count = np.unique(ys_tangles).shape[0]
print(f"NMI Tangles Lens: {lens_nmi} ({lens_cluster_count})")

# Variant 2: Tangles on boolean cuts obtained by thresholding the
# (non-symmetric) majority-neighbour count matrix at zero.
neighbour_counts = majority_neighbours_count_matrix(triplets, symmetric=False)
cuts = neighbour_counts > 0
tangles = OrdinalTangles(agreement=8)
ys_tangles = tangles.fit_predict(cuts)
majority_nmi = normalized_mutual_info_score(data.ys, ys_tangles)
majority_cluster_count = np.unique(ys_tangles).shape[0]
print(f"NMI Tangles Majority: {majority_nmi} ({majority_cluster_count})")

Generating questionnaire...
Generating question set...
Filling out questionnaire...


100%|██████████| 90/90 [00:00<00:00, 4916.93it/s]

NMI Tangles Lens: 0.5164267263128485 (2)





NMI Tangles Majority: 0.445749087856816 (3)


In [7]:
# Draw a fresh set of most-central queries and rebuild the triplets.
# NOTE(review): no seed is passed here, so this draw may differ between runs - confirm.
central_triplets, central_responses = subsample_triplets(
    data.xs, 50000, return_mostcentral=True
)
triplets = triplets_from_mostcentral(central_triplets, central_responses)
# This time use the symmetric count matrix, again thresholded at zero.
symmetric_counts = majority_neighbours_count_matrix(triplets, symmetric=True)
cuts = symmetric_counts > 0

tangles = OrdinalTangles(agreement=10)
ys_tangles = tangles.fit_predict(cuts)
symmetric_nmi = normalized_mutual_info_score(data.ys, ys_tangles)
symmetric_cluster_count = np.unique(ys_tangles).shape[0]
print(f"NMI Tangles Majority: {symmetric_nmi} ({symmetric_cluster_count})")

NMI Tangles Majority: 0.44994507711365483 (3)


  return (array - np.min(array)) / np.ptp(array)


## Synthetic data
Other labelled real-world data does not actually exist; we could think about labelling some ourselves.
Otherwise, we turn to synthetic data for now.

In [8]:
# NOTE(review): `seed` is set to 8 but the generator below is called with seed=1,
# and nothing in this cell reads `seed` - confirm which value is intended.
seed = 8
# Four 5-dimensional mixture means, one row per component.
gmm_means = np.array([
    [-6, 3, 6, -2, -3],
    [6, 3, -6, -2, 3],
    [-6, -3, 6, 2, 4],
    [-6, -3, -6, 2, -4],
])
data = generate_gmm_data_fixed_means(n=20, means=gmm_means, std=1.5, seed=1)
# Triplets are answered under a Minkowski metric with p=1.5 rather than the default metric.
minkowski_1_5 = DistanceMetric.get_metric("minkowski", p=1.5)
triplets, responses = subsample_triplets(data.xs, 2000, metric=minkowski_1_5)

In [9]:
# Evaluate SOE-kMeans and Tangles on the synthetic triplets.
# NOTE(review): clusters=6 while the mixture above is built from 4 means - confirm this is intended.
evaluate_real_world_data(
    triplets,
    responses,
    data.ys,
    agreement=8,
    embedding_dimension=8,
    clusters=6,
)

NMI SOE-kMeans: 0.9119974678160138
NMI Tangles: 0.8571428571428571 (3)
