In [None]:
import os
import numpy as np
from numpy.random import default_rng

import scipy.spatial.distance as dist
import matplotlib as mpl
import matplotlib.pyplot as plt
import gudhi

import IBloFunMatch_inter as ibfm

# Tolerance on the summed absolute coordinate difference below which two points
# are treated as the same point (used to locate subset points in the dataset).
_tol = 1e-12

output_dir = "output" # Name of directory to communicate with C++ program

Make sure that the folder used to store the plots exists, creating it if necessary.

In [None]:
# Create the plot folder together with any missing parent folders.
# `exist_ok=True` makes the cell safe to re-run and removes the gap between
# checking for the folder and creating it.
os.makedirs("plots/iris", exist_ok=True)

In [None]:
def get_IBloFunMatch_output_range(data, y, S_list, yS_list, NUM_class, NUM_subset):
    """Run the block-function matching for every (class, subset) pair.

    For each class index and each subset index, the points of that class are
    selected from both the subset and the full dataset, their pairwise distance
    matrices are computed, and ``ibfm.get_IBloFunMatch_subset`` is called on them.

    Parameters
    ----------
    data : array of points, one point per row.
    y : class label of each row of ``data``.
    S_list : sequence of subsets of ``data`` (arrays of points).
    yS_list : class labels matching each entry of ``S_list``.
    NUM_class : number of classes; labels ``0 .. NUM_class-1`` are processed.
    NUM_subset : number of leading entries of ``S_list`` to process.

    Returns
    -------
    list of dict
        One dict per pair, ordered class-major: the entry for a given pair is at
        position ``idx_class*NUM_subset + idx_subset``. Each dict holds the keys
        "S", "X" and "idS" plus every key returned by
        ``ibfm.get_IBloFunMatch_subset``.

    Raises
    ------
    ValueError
        If a point of a subset cannot be found in the dataset within ``_tol``.
    """
    IBloFunMatch_output = [] # Store all output here
    for idx_class in range(NUM_class):
        for idx_subset in range(NUM_subset):
            print(f"Class: {idx_class:5d}, Subset:{idx_subset:5d}")
            print( "============================")
            # Subset and dataset points pertaining to class
            subset = S_list[idx_subset]
            y_subset = yS_list[idx_subset]
            S = subset[y_subset==idx_class]
            X = data[y==idx_class]
            output_data = {"S": S, "X": X}
            # Indices of points from S within X
            idS = []
            for pt in S:
                is_same_point = np.sum(abs(X - pt), axis=1) < _tol
                # argmax of an all-False mask is 0, which would silently map a
                # missing point to the first point of X; fail loudly instead.
                if not is_same_point.any():
                    raise ValueError(
                        f"A point of subset {idx_subset} (class {idx_class}) "
                        "was not found in the dataset"
                    )
                idS.append(np.argmax(is_same_point))
            # end for
            output_data["idS"]=idS
            # Compute distance matrices
            Dist_X = dist.squareform(dist.pdist(X))
            Dist_S = dist.squareform(dist.pdist(S))
            print(f"idS: {len(idS)}")
            print(f"Dist_S.shape: {Dist_S.shape}")
            print(f"Dist_X.shape: {Dist_X.shape}")
            output_data_ibfm = ibfm.get_IBloFunMatch_subset(Dist_S, Dist_X, idS, output_dir)
            # Merge the matching results into the record of this pair
            output_data.update(output_data_ibfm)
            IBloFunMatch_output.append(output_data)
            print("DONE--------------------------------")
        # subset range
    # class range
    return IBloFunMatch_output
# def get_IBloFunMatch_output_range

In [None]:
from sklearn import datasets

# Load the iris dataset; its "data" and "target" entries are used below.
iris = datasets.load_iris()

Remove duplicate points; otherwise the code does not work properly, because subset points are located in the dataset by their coordinates.

In [None]:
# Keep one copy of every distinct row. `index_unique` gives, for each kept row,
# the position of its first occurrence in the original array, so the labels
# can be filtered consistently with the points.
data, index_unique = np.unique(iris["data"], axis=0, return_index=True)
y = iris["target"][index_unique]

Compute matchings of dataset with itself across the three classes.

In [None]:
# The full dataset acts as its own (single) subset, processed once per class.
NUM_class, NUM_subset = 3, 1
S_list, yS_list = [data], [y]
IBloFunMatch_total = get_IBloFunMatch_output_range(
    data, y, S_list, yS_list, NUM_class, NUM_subset
)

In [None]:
# Inspect which fields are stored in the result of the first class.
IBloFunMatch_total[0].keys()

In [None]:
fig, ax = plt.subplots(ncols=2, nrows=3, figsize=(8,6))
# One row of axes per class.
for row in range(3):
    ibfm.plot_matching(
        IBloFunMatch_total[row], output_dir, ax[row], fig,
        max_rad=-1, colorbars=["orange", "aquamarine"], frame_on=True,
    )
plt.savefig("plots/iris/matching_X_1.png")

Do the same in dimension 0

In [None]:
fig, ax = plt.subplots(ncols=2, nrows=3, figsize=(8,12))
# One row of axes per class, this time for dimension 0.
for row in range(3):
    ibfm.plot_matching(
        IBloFunMatch_total[row], output_dir, ax[row], fig,
        max_rad=-1, colorbars=["orange", "aquamarine"], frame_on=True, dim=0,
    )
plt.savefig("plots/iris/matching_X_0.png")

Take 50 subsets, each obtained by sampling 50% of the points of every class.

In [None]:
# Seeded generator so that the sampled subsets are the same on every run.
rng = default_rng(5)
PERCENT = 0.5     # fraction of the points of each class kept in a subset
NUM_subset = 50
NUM_class = 3
S_list = []          # points of each subset
yS_list = []         # labels of each subset
S_list_indices = []  # row positions (within `data`) of each subset
for idx_sub in range(NUM_subset):
    indices_subset = []
    for idx_class in range(NUM_class):
        # Row positions of the points of this class
        idx_choice = np.nonzero(y==idx_class)[0]
        # Draw without replacement so that no point appears twice in a subset
        indices_subset += list(rng.choice(idx_choice, replace=False, size=int(len(idx_choice)*PERCENT)))
    # end for
    S_list.append(data[indices_subset])
    yS_list.append(y[indices_subset])
    S_list_indices.append(indices_subset)

See the shape of each subset.

In [None]:
# Shape of the first sampled subset (rows are the sampled points).
S_list[0].shape

Compute block function for each subset.

In [None]:
%%capture
# Run the matching for every (class, subset) pair. The returned list is ordered
# class-major: the entry for a pair sits at position idx_class*NUM_subset + idx_sub.
IBloFunMatch_o = get_IBloFunMatch_output_range(data, y, S_list, yS_list, NUM_class, NUM_subset)

Compute matching scores in both dimensions. The resulting array is indexed by dimension (2), subset (N) and class (3).

In [None]:
# Score of a (dimension, subset, class) triple: the sum of its positive
# matching strengths.
scores_per_dim = []
for dim in range(2):
    strengths_key = f"matching_strengths_{dim}"
    scores_per_dim.append([
        [
            sum(strengths[strengths > 0])
            for strengths in (
                # results are stored class-major, hence the flat index
                IBloFunMatch_o[idx_class*NUM_subset + idx_sub][strengths_key]
                for idx_class in range(NUM_class)
            )
        ]
        for idx_sub in range(NUM_subset)
    ])
# range over dimensions 0 and 1
S_match_scores_dim = np.array(scores_per_dim)
S_match_scores_dim.shape

Store the scores of the large dataset $X$ with itself.

In [None]:
# Reference scores: sum of the positive matching strengths of the full dataset
# matched with itself, per dimension and per class.
X_scores = []
for dim in range(2):
    strengths_key = f"matching_strengths_{dim}"
    class_scores = []
    for idx_class in range(NUM_class):
        strengths = IBloFunMatch_total[idx_class][strengths_key]
        class_scores.append(sum(strengths[strengths > 0]))
    # end for over classes
    X_scores.append(class_scores)
# end dim=0,1
for dim, class_scores in enumerate(X_scores):
    print(f"X_scores dim {dim}: {class_scores}")


We plot now the scores across all samples.

In [None]:
fig, ax = plt.subplots(ncols=2, nrows=1, figsize=(10, 5))
class_colors = ["orange", "blue", "green"]
subset_indices = list(range(NUM_subset))
for dim, axis in enumerate(ax):
    for idx_class in range(NUM_class):
        color = class_colors[idx_class]
        # Score of every subset for this class
        axis.plot(subset_indices, S_match_scores_dim[dim][:, idx_class], c=color, label=f"class {idx_class}")
        # Flat reference line: score of the full dataset for the same class
        axis.plot(subset_indices, [X_scores[dim][idx_class]] * len(subset_indices), c=color)
    # for over classes
    axis.legend(loc="upper left")
    axis.set_title(f"Dimension {dim}")
# for dim=0,1
plt.savefig("plots/iris/matching_sums_subsets.png")

We scale the scores by those of the full dataset, weight each dimension, and take the maximum over dimensions as the score of each class sample.

In [None]:
# Per-dimension weights: dimension 1 counts twice as much as dimension 0.
w = [1, 2]
# Divide every subset score by the full-dataset score of the same class and
# dimension, then apply the weight of that dimension.
S_match_scores_dim_scaled = [
    w[dim] * (S_match_scores_dim[dim] / X_scores[dim]) for dim in range(2)
]
# Disabled alternative: np.mean(np.array(S_match_scores_dim_scaled), axis=0)
S_match_scores_C = np.array(S_match_scores_dim_scaled).max(axis=0)
S_match_scores_C.shape

Now, for each class, order the samples from worst to best class score and combine them into new datasets.

In [None]:
# For every class, the subset indices ordered from lowest to highest class score.
class_rankings = [
    S_match_scores_C[:, idx_class].argsort() for idx_class in range(NUM_class)
]

S_list_s = []
yS_list_s = []
S_list_s_indices = []
# The dataset at position `rank` gathers, for every class, the points of that
# class taken from the subset whose score for that class has this rank.
# The rank (not the subset index) must select the destination, otherwise only
# the first class ends up sorted.
for rank in range(NUM_subset):
    point_parts = []
    label_parts = []
    merged_indices = []
    for idx_class in range(NUM_class):
        idx_sub = class_rankings[idx_class][rank]
        indices_class = yS_list[idx_sub]==idx_class
        point_parts.append(S_list[idx_sub][indices_class])
        label_parts.append(yS_list[idx_sub][indices_class])
        merged_indices += list(np.array(S_list_indices[idx_sub])[indices_class])
    # end for over classes
    S_list_s.append(np.vstack(point_parts))
    yS_list_s.append(np.hstack(label_parts))
    S_list_s_indices.append(merged_indices)

Reorder old matching scores so that these fit the new samples.

In [None]:
S_match_s_scores_dim = []
for dim in range(2):
    unsorted_scores = S_match_scores_dim[dim]
    S_match_s_scores = np.zeros(unsorted_scores.shape)
    for idx_class in range(NUM_class):
        # Subset order from lowest to highest combined score of this class
        class_score_sort = S_match_scores_C[:, idx_class].argsort()
        # Permute the whole class column at once
        S_match_s_scores[:, idx_class] = unsorted_scores[class_score_sort, idx_class]
    # for over classes
    S_match_s_scores_dim.append(S_match_s_scores)
# for dim=0,1

 Plot matching scores again.

In [None]:
fig, ax = plt.subplots(ncols=2, nrows=1, figsize=(10, 5))
class_colors = ["orange", "blue", "green"]
subset_indices = list(range(NUM_subset))
for dim in range(2):
    for idx_class in range(NUM_class):
        # Reordered score of every sample for this class
        ax[dim].plot(subset_indices, S_match_s_scores_dim[dim][:,idx_class], c=class_colors[idx_class], label=f"class {idx_class}")
        # Flat reference line: score of the full dataset for the same class
        ax[dim].plot(subset_indices, [X_scores[dim][idx_class] for i in subset_indices], c=class_colors[idx_class])
    # for over classes
    ax[dim].legend(loc="upper left")
    ax[dim].set_title(f"Dimension {dim}")
# for dim=0,1
# Saved under its own name so that the figure of the unsorted scores is not overwritten.
plt.savefig("plots/iris/matching_sums_subsets_sorted.png")

In [None]:
from sklearn.neural_network import MLPClassifier

In [None]:
%%capture
# For every sorted sample: fit a classifier on the sample and score it on all
# the dataset points that are not part of the sample.
training_scores = []
for idx_sub in range(NUM_subset):
    sample_rows = set(S_list_s_indices[idx_sub])
    test_idx = [row for row in range(data.shape[0]) if row not in sample_rows]
    clf = MLPClassifier(random_state=1, max_iter=300)
    clf.fit(S_list_s[idx_sub], yS_list_s[idx_sub])
    training_scores.append(clf.score(data[test_idx], y[test_idx]))

In [None]:
# NOTE: despite the name, these scores are evaluated on the points left out of each sample.
print(f"Training scores range:{min(training_scores), max(training_scores)}")

In [None]:
fig, ax = plt.subplots(ncols=2, nrows=1, figsize=(10, 5))
class_colors = ["orange", "blue", "green"]
subset_indices = list(range(NUM_subset))
for dim in range(2):
    for idx_class in range(NUM_class):
        ax[dim].plot(subset_indices, S_match_s_scores_dim[dim][:,idx_class], c=class_colors[idx_class], label=f"class {idx_class}")
        ax[dim].plot(subset_indices, [X_scores[dim][idx_class] for i in subset_indices], c=class_colors[idx_class])
    # for over classes
    # Read the upper axis limit once, BEFORE drawing the score curve: adding a
    # line can trigger autoscaling, and re-reading the limit afterwards would
    # put the 1.0 reference line on a different scale than the curve itself.
    y_top = max(ax[dim].get_ylim())
    # plot training scores, rescaled so that a score of 1.0 sits at y_top
    ax[dim].plot(subset_indices, np.array(training_scores)*y_top, c="red", label="train score")
    ax[dim].plot(subset_indices, np.ones(len(subset_indices))*y_top, "--", c="red", label="1.0 train score")
    ax[dim].legend(loc="upper left")
    ax[dim].set_title(f"Dimension {dim}")
# for dim=0,1
plt.savefig("plots/iris/matching_sums_0_1.png")

TO DO: Plot also the matchings concerning all subset data.