# Stochastic matrix neighbours
Our idea was to include more distance information in our majority neighbour cuts via a
sort-of stochastic matrix.

In [1]:
import sys
sys.path.append("..")
import numpy as np
import cblearn.datasets as datasets
from data_generation import generate_gmm_data_fixed_means
from questionnaire import Questionnaire
from estimators import OrdinalTangles, SoeKmeans
from cblearn.embedding import SOE
from cblearn.datasets import make_random_triplets
from sklearn.metrics import normalized_mutual_info_score
from plotting import AltairPlotter
from copy import deepcopy
from triplets import triplets_to_stochastic_matrix_cuts, unify_triplet_order, triplets_to_majority_neighbour_cuts, majority_neighbours_count_matrix
from sklearn.cluster import KMeans

In [2]:
seed=1
data = generate_gmm_data_fixed_means(n = 3, means = np.array([[-6, 3], [-6, -3], [6, 3]]), std=1, seed=seed)
p = AltairPlotter()
p.assignments(data.xs, data.ys)

In [4]:
triplets, responses = datasets.make_random_triplets(data.xs, result_format="list-boolean", size=50, random_state=10)
print(f"using {responses.size} triplets")
unified_triplets = unify_triplet_order(triplets, responses)

## tangles
# cuts = triplets_to_stochastic_matrix_cuts(unified_triplets, iterations=2, threshhold=1.5)
cuts = triplets_to_majority_neighbour_cuts(unified_triplets, radius=1.0, randomize_tie=False)
tangles = OrdinalTangles(agreement=2)
y_tangles = tangles.fit_predict(cuts)
print(f"NMI Tangles: {normalized_mutual_info_score(y_tangles, data.ys)} ({np.unique(y_tangles).sum()})")
soe_kmeans = SoeKmeans(embedding_dimension=2, n_clusters=3)
y_soe_kmeans = soe_kmeans.fit_predict(triplets, responses)
print(f"NMI SOE-Kmeans: {normalized_mutual_info_score(y_soe_kmeans, data.ys)}")
p.assignments(soe_kmeans.embedding_, data.ys) & p.assignments(data.xs, y_soe_kmeans)

using 50 triplets
NMI Tangles: 0.5895098274473048 (3)
NMI SOE-Kmeans: 1.0


In [42]:
triplets, responses = datasets.make_random_triplets(data.xs, result_format="list-boolean", size=100, random_state=10)
print(f"using {responses.size} triplets")
unified_triplets = unify_triplet_order(triplets, responses)

m = majority_neighbours_count_matrix(unified_triplets)
m = (m + 9)/18
m @ m > 2.9


using 100 triplets


array([[ True, False, False, False, False, False, False, False, False],
       [ True,  True,  True,  True, False, False, False, False, False],
       [ True,  True,  True,  True, False,  True,  True, False,  True],
       [ True, False, False,  True,  True,  True, False, False, False],
       [ True,  True,  True,  True,  True,  True, False, False, False],
       [False, False, False,  True,  True,  True, False, False, False],
       [False, False, False, False, False, False, False, False, False],
       [False, False, False, False, False, False, False, False, False],
       [False, False, False, False, False, False, False, False, False]])

In [None]:
m = majority_neighbours_count_matrix(unified_triplets) 
m += np.eye(m.shape[0]) * 6
m2 = np.zeros_like(m)
max_t = unified_triplets.max() + 1
for i in range(max_t):
    for j in range(max_t):
        similar = unified_triplets[np.logical_and(unified_triplets[:, 0] == i, unified_triplets[:, 1] == j)]
        similar_sum = 0
        dissimilar = unified_triplets[np.logical_and(unified_triplets[:, 0] == i, unified_triplets[:, 2] == j)]
        dissimilar_sum = 0
        for a, _, x in similar:
            similar_sum += m[a, x]
        for a, x, _ in dissimilar:  
            dissimilar_sum += m[a, x]
        m2[i, j] = m[i, j] + similar_sum - dissimilar_sum
m2
cuts = (m2 > 0)
tangles = OrdinalTangles(agreement=2)
y_tangles = tangles.fit_predict(cuts)
print(f"NMI Tangles: {normalized_mutual_info_score(y_tangles, data.ys)} ({np.unique(y_tangles).sum()})")

NMI Tangles: 0.6137465571428701 (3)


Not really working... a problem of majority neighbour cuts also seems that we are pretty reliant on which triplets are drawn and not very robust to that (more triplets can actually make use _worse_).

## SOE-Hybrid clustering

In [None]:
seed=1
data = generate_gmm_data_fixed_means(n = 30, means = np.array([[-6, 3], [-6, -3], [6, 3]]), std=1, seed=seed)
p = AltairPlotter()
p.assignments(data.xs, data.ys)

In [None]:
triplets, responses = datasets.make_random_triplets(data.xs, result_format="list-boolean", size=5000, random_state=10)
print(f"using {responses.size} triplets")
unified_triplets = unify_triplet_order(triplets, responses)

soe_kmeans = SoeKmeans(embedding_dimension=2, n_clusters=3)
y_soe_kmeans = soe_kmeans.fit_predict(triplets, responses)
print(f"NMI SOE-Kmeans: {normalized_mutual_info_score(y_soe_kmeans, data.ys)}")
p.assignments(soe_kmeans.embedding_, data.ys) & p.assignments(data.xs, y_soe_kmeans)

using 5000 triplets
NMI SOE-Kmeans: 1.0


In [None]:
## tangles
cuts = triplets_to_majority_neighbour_cuts(unified_triplets, radius=0.8, randomize_tie=False)
q = Questionnaire.from_metric(soe_kmeans.embedding_, density=0.01)
cuts = np.hstack((q.values, cuts))
tangles = OrdinalTangles(agreement=12)
y_tangles = tangles.fit_predict(q.values)
print(f"NMI Tangles: {normalized_mutual_info_score(y_tangles, data.ys)} ({np.unique(y_tangles).sum()})")

Generating questionnaire...
Generating question set...
Filling out questionnaire...


100%|██████████| 90/90 [00:00<00:00, 37234.89it/s]

NMI Tangles: 1.0 (3)



