# Other cost functions
To make proofs easier (and to maybe find a more efficient cost function),
we might want to think about other feasible cost functions for triplet-tangles.

In [1]:
import sys
sys.path.append("..")
from data_generation import generate_gmm_data_fixed_means
import numpy as np
import seaborn as sns
from sklearn.metrics import pairwise_distances, normalized_mutual_info_score
import matplotlib.pyplot as plt
from estimators import SoeKmeans, OrdinalTangles
from questionnaire import Questionnaire
from triplets import triplets_to_majority_neighbour_cuts, unify_triplet_order
from cblearn.datasets import make_all_triplets, make_random_triplets
from plotting import AltairPlotter

In [2]:
def L(triplets, x):
    return triplets[:, 0] == x
def M(triplets, x):
    return triplets[:, 1] == x
def R(triplets, x):
    return triplets[:, 1] == x

def coherence(triplets, cut):
    in_cut = list(np.argwhere(cut).flatten())
    out_cut = list(np.argwhere(~cut).flatten())
    total = 0
    for v in in_cut:
        for w in out_cut:
            total += (L(triplets, v) & R(triplets, w)).sum() + (L(triplets, w) & R(triplets, w)).sum()
    return total / (len(in_cut) * (cut.size - len(out_cut)))
    

In [3]:
import numpy as np
t = np.array([[1,2,3], [3,4,5]])
list(np.argwhere([True, True, False]).flatten())

[0, 1]

In [4]:
data = generate_gmm_data_fixed_means(n=15, means=np.array([[-1, 0], [1,0]]), std=0.6, seed=1)
p = AltairPlotter()
p.assignments(data.xs, data.ys)

In [127]:
#triplets = unify_triplet_order(*make_random_triplets(data.xs, "list-boolean", size=5000))
triplets = unify_triplet_order(*make_all_triplets(data.xs, "list-boolean"))
cuts = triplets_to_majority_neighbour_cuts(triplets)
tangles = OrdinalTangles(4)
ys = tangles.fit_predict(cuts, data.ys)
#p.assignments(data.xs, cuts[4])
print(normalized_mutual_info_score(ys, data.ys))

0.8214296888424342


In [6]:
#triplets = unify_triplet_order(*make_all_triplets(data.xs, "list-boolean"))
cuts = triplets_to_majority_neighbour_cuts(triplets)
tangles = OrdinalTangles(4)
ys = tangles.predict(cuts, cost_function=lambda x: coherence(triplets, x))
print(normalized_mutual_info_score(ys, data.ys))

0.5373894811943651


As we can see, the new function performs rather poorly. Sad.