# Boosted Tangles
We had the idea of applying a kind of boosting strategy with Tangles. Because the cuts can, in principle, come from multiple sources, it might be a good idea to use Tangles as a clustering algorithm on top of the embeddings, especially in cases where SOE-kMeans shows high variance in its results.

In [16]:
import sys
sys.path.append("..")
import numpy as np
import pandas as pd
import altair as alt
import cblearn.datasets as datasets
from cblearn.datasets import make_random_triplets
from data_generation import generate_gmm_data_fixed_means
from sklearn.metrics import pairwise_distances
from questionnaire import Questionnaire, unify_triplet_order 
from estimators import OrdinalTangles, SoeKmeans
from cblearn.embedding import SOE
from sklearn.cluster import KMeans
from sklearn.metrics import normalized_mutual_info_score
from plotting import AltairPlotter
from sklearn.neighbors import DistanceMetric

In [17]:
# Seed forwarded to the data generator below.
seed = 8
# Three two-dimensional cluster means; n and std are passed through unchanged.
cluster_means = np.array([[-6, 3], [6, 3], [-6, -3]])
data = generate_gmm_data_fixed_means(
    n=50,
    means=cluster_means,
    std=2,
    seed=seed,
)

In [18]:
# Experiment configuration shared by the cells below.
# soe_embedding_dim is passed to SoeKmeans as `embedding_dimension`;
# n_triplets is the `size` of each random triplet sample.
soe_embedding_dim: int = 2
n_triplets: int = 5_000

In [19]:
# Run SOE-kMeans on several differently seeded triplet samples. For every run
# we record the NMI against the ground truth and the questionnaire values
# derived from the fitted embedding (used as cuts in the next cell).
n_iterations = 10
cut_list, soe_kmeans_nmi = [], []

for i in range(n_iterations):
    # The loop index is used as the random state of the triplet sampler.
    triplets, responses = make_random_triplets(
        data.xs,
        "list-boolean",
        size=n_triplets,
        random_state=i,
    )
    soe_kmeans = SoeKmeans(embedding_dimension=soe_embedding_dim, n_clusters=3)
    ys_soe = soe_kmeans.fit_predict(triplets, responses)
    soe_kmeans_nmi += [normalized_mutual_info_score(data.ys, ys_soe)]

    # Turn this run's embedding into a questionnaire and keep its values.
    q = Questionnaire.from_metric(soe_kmeans.embedding_, density=0.01, verbose=False)
    cut_list += [q.values]

print(f"SOE NMI average: {np.mean(soe_kmeans_nmi)} std: {np.std(soe_kmeans_nmi)}")


SOE NMI average: 0.8598562037168284 std: 0.0191232120716224


In [20]:
# Stack the cuts collected from all SOE runs horizontally into one matrix and
# cluster it with Tangles.
boosted_cuts = np.hstack(cut_list)
# NOTE(review): 20 is the single positional argument of OrdinalTangles; its
# meaning is not visible in this notebook — confirm against the estimator.
tangles_boost = OrdinalTangles(20)
# Reuse the stacked matrix instead of calling np.hstack a second time, so the
# array that is clustered is exactly the one whose shape is shown below.
ys_boost = tangles_boost.fit_predict(boosted_cuts)
# Report the NMI against the ground truth and, in parentheses, the number of
# distinct labels Tangles produced.
print(f"NMI boost: {normalized_mutual_info_score(data.ys, ys_boost)} ({np.unique(ys_boost).size})")
boosted_cuts.shape

NMI boost: 0.6565191143081124 (2)


(150, 1120)

In [21]:
# Baseline: fit SOE-kMeans once per run on a triplet sample of size
# n_triplets * n_iterations, for ten differently seeded samples.
nmis = []

for j in range(10):
    triplets, responses = make_random_triplets(
        data.xs,
        "list-boolean",
        size=n_triplets * n_iterations,
        random_state=j,
    )
    soe_kmeans = SoeKmeans(embedding_dimension=soe_embedding_dim, n_clusters=3)
    ys_soe = soe_kmeans.fit_predict(triplets, responses)

    # NMI of this run against the ground-truth labels.
    nmis += [normalized_mutual_info_score(data.ys, ys_soe)]

print(f"Avg. SOE NMI with {n_triplets * n_iterations} triplets: {np.mean(nmis)}, std: {np.std(nmis)}")


Avg. SOE NMI with 50000 triplets: 0.8787991197048033, std: 0.009566855317196449


We suspect that the Euclidean bias is working against us here.

In [22]:
# Generate the three-cluster data set and plot the points coloured by their
# ground-truth labels.
seed = 8
cluster_means = np.array([[-6, 3], [6, 3], [-6, -3]])
data = generate_gmm_data_fixed_means(
    n=50,
    means=cluster_means,
    std=2.0,
    seed=seed,
)
p = AltairPlotter()
# Last expression of the cell, so the returned plot object is displayed.
p.assignments(data.xs, data.ys)

In [23]:
# Compare SOE-kMeans and Tangles on a single questionnaire built directly from
# the raw data points (data.xs) rather than from an SOE embedding.
q = Questionnaire.from_metric(data.xs, density=0.01, verbose=False)
# Positional arguments; presumably (embedding_dimension, n_clusters), matching
# the keyword calls used elsewhere in this notebook — TODO confirm.
soe_kmeans = SoeKmeans(2, 3)
# NOTE(review): data.ys (the ground-truth labels) is passed as an additional
# positional argument after the unpacked q.to_bool_array() result, unlike the
# other fit_predict calls in this notebook. Confirm that the estimator does not
# use it for fitting; otherwise the NMI below is not a fair comparison.
ys_soe_kmeans = soe_kmeans.fit_predict(*q.to_bool_array(), data.ys)
# Tangles is fitted on the questionnaire values directly.
ys_tangles = OrdinalTangles(20).fit_predict(q.values)
print(f"NMI SOE Kmeans: {normalized_mutual_info_score(data.ys, ys_soe_kmeans)}")
# The number in parentheses is the count of distinct labels Tangles produced.
print(f"NMI Tangles: {normalized_mutual_info_score(data.ys, ys_tangles)} ({np.unique(ys_tangles).size})")

NMI SOE Kmeans: 0.8850620966553381
NMI Tangles: 0.6003101801753672 (2)


In [24]:
# Same boosting loop as before, but the triplets now come from a questionnaire
# built on the raw data points instead of from make_random_triplets.
n_iterations = 10
cut_list, soe_kmeans_nmi = [], []

for i in range(n_iterations):
    # NOTE(review): q is rebuilt with identical arguments on every iteration;
    # whether the runs differ depends on randomness inside from_metric /
    # SoeKmeans, which is not visible here.
    q = Questionnaire.from_metric(data.xs, density=0.01, verbose=False)
    soe_kmeans = SoeKmeans(embedding_dimension=soe_embedding_dim, n_clusters=3)
    ys_soe = soe_kmeans.fit_predict(*q.to_bool_array())
    soe_kmeans_nmi += [normalized_mutual_info_score(data.ys, ys_soe)]

    # Questionnaire on the fitted embedding (density 0.1 here); its values are
    # the cuts collected for the boosted Tangles run.
    q_emb = Questionnaire.from_metric(soe_kmeans.embedding_, density=0.1, verbose=False)
    cut_list += [q_emb.values]

print(f"SOE NMI average: {np.mean(soe_kmeans_nmi)} std: {np.std(soe_kmeans_nmi)}")
# Disabled: would additionally append the values of the last raw-data questionnaire.
# cut_list.append(q.values)
# Plot the embedding and predicted labels of the final iteration only.
p.assignments(soe_kmeans.embedding_, ys_soe)

SOE NMI average: 0.8709467577242173 std: 0.021547831393393734


In [25]:
# Cluster the horizontally stacked cuts from all runs with Tangles.
tangles_boost = OrdinalTangles(20)
boosted_cuts = np.hstack(cut_list)
ys_boost = tangles_boost.fit_predict(boosted_cuts)

# NMI against the ground truth; the number in parentheses is the count of
# distinct predicted labels.
nmi_boost = normalized_mutual_info_score(data.ys, ys_boost)
n_labels_boost = np.unique(ys_boost).size
print(f"NMI boost: {nmi_boost} ({n_labels_boost})")

# Last expression of the cell: display the shape of the stacked cut matrix.
boosted_cuts.shape

NMI boost: 0.6565191143081124 (2)


(150, 11180)