# Clustering performance when $k$ is known in advance

In [None]:
from sklearn.metrics import adjusted_rand_score, normalized_mutual_info_score
from sklearn.cluster import KMeans
from tqdm import tqdm
import json

from models.baseline_model import ArticleBaselineModel
from models.tabularncd_model import TabularNCDModel
from models.NCD_Spectral_Clustering import *
from models.NCD_Kmeans import k_means_pp
from models.PBN_model import PBNModel
from src.dataset_utils import *
from src.utils import *

In [None]:
import warnings
# Ignore every warning category for the rest of the session.
# NOTE(review): presumably done to keep cell outputs readable; this also hides
# deprecation and convergence warnings — confirm that is intended.
warnings.filterwarnings('ignore')

In [None]:
# Pick the compute device through the project helper `setup_device`, requesting CUDA.
# NOTE(review): the fallback when no GPU is available depends on `setup_device` — confirm.
device = setup_device(use_cuda=True)

## Table of contents:
* [1. Clustering models](#1)
    * [1.1 $k$-means](#1.1)
    * [1.2 NCD $k$-means](#1.2)
    * [1.3 Spectral Clustering](#1.3)
    * [1.4 NCD Spectral Clustering](#1.4)
* [2. NCD models](#2)
    * [2.1 Baseline](#2.1)
    * [2.2 PBN](#2.2)
    * [2.3 TabularNCD](#2.3)

### Import the dataset

In [None]:
# Available dataset names (presumably the ones `import_dataset_with_name` accepts)
dataset_names = [
    'HumanActivityRecognition',
    'LetterRecognition',
    'Pendigits',
    'USCensus1990',
    'multiple_feature',
    'optdigits',
    'cnae_9',
]

# Dataset used for this run of the notebook
dataset_name = 'LetterRecognition'

# Load the train/test splits, the marker value of the unknown class and the saved labels
(
    x_train, y_train,
    x_test, y_test,
    unknown_class_value,
    y_train_save, y_test_save,
) = import_dataset_with_name(dataset_name)

In [None]:
# Turn both feature matrices into float tensors living on the selected device
x_train, x_test = (
    torch.tensor(features, dtype=torch.float, device=device)
    for features in (x_train, x_test)
)

# For plots: saved labels of the samples flagged with the unknown class value
y_train_unknown_save, y_test_unknown_save = (
    saved_labels[split_labels == unknown_class_value]
    for saved_labels, split_labels in ((y_train_save, y_train), (y_test_save, y_test))
)

# For evaluation: the same labels re-encoded as integer category codes
y_train_unknown_save_codes, y_test_unknown_save_codes = (
    np.array(pd.Series(saved_labels).astype('category').cat.codes)
    for saved_labels in (y_train_unknown_save, y_test_unknown_save)
)

In [None]:
# Plot the class distribution: known training labels, unknown training labels, all test labels
plot_classes_distribution(
    y_train_save[y_train != unknown_class_value],
    y_train_save[y_train == unknown_class_value],
    y_test_save,
    dataset_name,
)

In [None]:
# Split the training data into known-class and unknown-class subsets
x_train_known = x_train[y_train != unknown_class_value]
x_train_unknown = x_train[y_train == unknown_class_value]
y_train_known = y_train[y_train != unknown_class_value]

# Same split for the test data
x_test_known = x_test[y_test != unknown_class_value]
x_test_unknown = x_test[y_test == unknown_class_value]
y_test_known = y_test[y_test != unknown_class_value]

# We need the targets to be in {0, ..., C^l - 1} exactly (C^l = number of known classes)
# `classifier_mapper` holds the sorted distinct known labels and `classifier_ind`
# the index of each training label inside it.
classifier_mapper, classifier_ind = np.unique(y_train_known, return_inverse=True)

# One entry per distinct label (original label -> contiguous index) instead of
# one redundant entry per training sample.
classifier_mapping_dict = dict(zip(classifier_mapper, np.arange(len(classifier_mapper), dtype=np.intp)))

# A known test label that never appears in training cannot be mapped; fail
# explicitly instead of silently inserting `None` into the label array.
unseen_test_labels = np.setdiff1d(y_test_known, classifier_mapper)
if unseen_test_labels.size > 0:
    raise ValueError(
        "Known test labels missing from the known training labels: {}".format(unseen_test_labels.tolist())
    )

# The inverse indices already are the remapped training targets
y_train_known = classifier_ind.copy()
y_test_known = np.array([classifier_mapping_dict[label] for label in y_test_known])

# 1) Clustering models <a class="anchor" id="1"></a>

#### /!\ In this notebook, we are using the ground truth number of clusters /!\

In [None]:
# Ground-truth k: number of distinct label codes among the unknown test samples
n_clusters = np.unique(y_test_unknown_save_codes).size

## 1.1 $k$-means <a class="anchor" id="1.1"></a>

In [None]:
# Per-run scores of plain k-means: accuracy, NMI and ARI
accs_1, nmis_1, aris_1 = [], [], []

# scikit-learn consumes host NumPy arrays: convert once rather than on every run
x_train_unknown_np = x_train_unknown.cpu().numpy()
x_test_unknown_np = x_test_unknown.cpu().numpy()

# 10 runs, each fitting on the unknown training samples and scoring on the unknown test samples
for _run in tqdm(range(10)):
    km = KMeans(n_clusters=n_clusters, init='random', n_init=10).fit(x_train_unknown_np)

    y_test_unknown_pred = km.predict(x_test_unknown_np)
    accs_1.append(hungarian_accuracy(y_test_unknown_save_codes, y_test_unknown_pred))
    nmis_1.append(normalized_mutual_info_score(y_test_unknown_save_codes, y_test_unknown_pred))
    aris_1.append(adjusted_rand_score(y_test_unknown_save_codes, y_test_unknown_pred))

In [None]:
# Mean ± standard deviation (in %) of each metric over the k-means runs
print("TEST: ACC={:.1f}±{:.1f} | NMI={:.1f}±{:.1f} | ARI={:.1f}±{:.1f}".format(
    *(stat(scores) * 100 for scores in (accs_1, nmis_1, aris_1) for stat in (np.mean, np.std))
))

## 1.2 NCD $k$-means <a class="anchor" id="1.2"></a>

In [None]:
# For this method, the initial centroids are the ground-truth means of the known classes
known_classes_centroids = torch.stack(
    [
        x_train_known[y_train_known == known_label].mean(axis=0)
        for known_label in np.unique(y_train_known)
    ]
)

# Position of a centroid -> original (non-remapped) known class label
centroid_to_class_dict = {
    position: original_label
    for position, original_label in enumerate(np.unique(y_train[y_train != unknown_class_value]))
}

In [None]:
# Per-run scores of NCD k-means: accuracy, NMI and ARI
accs_2, nmis_2, aris_2 = [], [], []

# 10 runs; every run starts from the known-class centroids and adds `n_clusters` new ones
for run in tqdm(range(10)):
    kmpp = k_means_pp(
        pre_centroids=known_classes_centroids,
        k_new_centroids=n_clusters,
    )
    kmpp.fit(x_train_unknown, tolerance=1e-4, n_iterations=300)

    y_test_unknown_pred = kmpp.predict_unknown_data(x_test_unknown).cpu().numpy()

    # Score the prediction with each metric, ground truth first
    for scores, metric in (
        (accs_2, hungarian_accuracy),
        (nmis_2, normalized_mutual_info_score),
        (aris_2, adjusted_rand_score),
    ):
        scores.append(metric(y_test_unknown_save_codes, y_test_unknown_pred))

In [None]:
# Mean ± standard deviation (in %) of each metric over the NCD k-means runs
print("TEST: ACC={:.1f}±{:.1f} | NMI={:.1f}±{:.1f} | ARI={:.1f}±{:.1f}".format(
    *(stat(scores) * 100 for scores in (accs_2, nmis_2, aris_2) for stat in (np.mean, np.std))
))

## 1.3 Spectral Clustering <a class="anchor" id="1.3"></a>

In [None]:
# Per-run scores of spectral clustering: accuracy, NMI and ARI
accs_3, nmis_3, aris_3 = [], [], []

# 10 runs; the model is fitted directly on the unknown test samples
for run in tqdm(range(10)):
    sc = ncd_spectral_clustering(
        n_new_clusters=len(np.unique(y_train_unknown_save)),
        min_dist=0.6,
    )
    test_pred = sc.fit_predict_simple(x_test_unknown)

    # Note the argument order in this section: prediction first, ground truth second
    for scores, metric in (
        (accs_3, hungarian_accuracy),
        (nmis_3, normalized_mutual_info_score),
        (aris_3, adjusted_rand_score),
    ):
        scores.append(metric(test_pred, y_test_unknown_save_codes))

In [None]:
# Mean ± standard deviation (in %) of each metric over the spectral clustering runs
print("TEST: ACC={:.1f}±{:.1f} | NMI={:.1f}±{:.1f} | ARI={:.1f}±{:.1f}".format(
    *(stat(scores) * 100 for scores in (accs_3, nmis_3, aris_3) for stat in (np.mean, np.std))
))

## 1.4 NCD Spectral Clustering <a class="anchor" id="1.4"></a>

In [None]:
# We load the hyperparameters that were optimized through grid search.
# The context manager closes the file handle as soon as the JSON is parsed.
with open("hyperparameters.json", encoding="utf-8") as hyperparameters_file:
    d = json.load(hyperparameters_file)

# Entry for NCD Spectral Clustering with the ground-truth k on the current dataset
ncdsc_params = d["NCD SC"]["GT k"][dataset_name]
print("Using hyperparameters:", ncdsc_params)

In [None]:
# Stack the known test samples first, then the unknown ones
x_test_full = torch.cat((x_test_known, x_test_unknown), dim=0)

# Get the spectral embedding for all the data (since the hyperparameters are adapted for all the data, not only the novel data)
full_spectral_embedding = get_spectral_embedding(
    x_test_full,
    ncdsc_params['n_components'],
    ncdsc_params['min_dist'],
)

In [None]:
# Per-run scores of NCD spectral clustering: accuracy, NMI and ARI
accs_4, nmis_4, aris_4 = [], [], []

for run in tqdm(range(10)):
    # Rows after the known test samples are the embeddings of the unknown test samples
    novel_embedding = full_spectral_embedding[len(x_test_known):]

    kmpp = k_means_pp(
        pre_centroids=None,
        k_new_centroids=len(np.unique(y_train_unknown_save)),
    )
    kmpp.fit(novel_embedding, tolerance=1e-10, n_iterations=1000, n_init=10)
    y_test_unknown_pred = kmpp.predict_unknown_data(novel_embedding).cpu().numpy()

    # Score the prediction with each metric, ground truth first
    for scores, metric in (
        (accs_4, hungarian_accuracy),
        (nmis_4, normalized_mutual_info_score),
        (aris_4, adjusted_rand_score),
    ):
        scores.append(metric(y_test_unknown_save_codes, y_test_unknown_pred))

In [None]:
# Mean ± standard deviation (in %) of each metric over the NCD spectral clustering runs
print("TEST: ACC={:.1f}±{:.1f} | NMI={:.1f}±{:.1f} | ARI={:.1f}±{:.1f}".format(
    *(stat(scores) * 100 for scores in (accs_4, nmis_4, aris_4) for stat in (np.mean, np.std))
))

# 2) NCD models <a class="anchor" id="2"></a>

## 2.1 Baseline <a class="anchor" id="2.1"></a>

In [None]:
# Number of input features (second dimension of the training tensor)
n_features = x_train.shape[1]

# Default settings of the baseline model; tuned hyperparameters are merged in afterwards
base_config = {
    'input_size': n_features,
    # Two hidden layers sized 3/4 and 2/4 of the input width, rounded down
    'hidden_layers_dims': [(3 * n_features) // 4, (2 * n_features) // 4],
    'activation_fct': 'relu',  # relu or sigmoid or tanh or None
    'use_batchnorm': True,  # True or False
    'use_norm': 'l2',  # None or 'l1' or 'l2'

    'n_classes': np.unique(y_train_known).size,
    'n_clusters': np.unique(y_train_unknown_save).size,

    'clustering_model': 'kmeans',  # kmeans or ncd_kmeans or spectral_clustering or ncd_spectral_clustering
    'clustering_runs': 1,  # To compute the average accuracy of the clustering

    'batch_size': 512,
    'epochs': 200,
}

In [None]:
# We load the hyperparameters that were optimized through grid search.
# The context manager closes the file handle as soon as the JSON is parsed.
with open("hyperparameters.json", encoding="utf-8") as hyperparameters_file:
    d = json.load(hyperparameters_file)

# Entry for the Baseline model with the ground-truth k on the current dataset
config = d["Baseline"]["GT k"][dataset_name]
print("Using hyperparameters:", config)

# Start from the defaults and let the tuned values override them
b_config = base_config.copy()
b_config.update(config)

In [None]:
# Per-run scores of the Baseline model: accuracy, NMI and ARI
accs_5, nmis_5, aris_5 = [], [], []

# 10 runs, each training a fresh model
for run in tqdm(range(10)):
    model = ArticleBaselineModel(b_config).to(device)
    losses_dict = model.train_on_known_classes(
        x_train_known=x_train_known,
        y_train_known=y_train_known,
        x_test_unknown=x_test_unknown,
        y_test_unknown=y_test_unknown_save_codes,
        x_test_known=x_test_known,
        y_test_known=y_test_known,
        batch_size=b_config['batch_size'],
        lr=b_config['lr'],
        epochs=b_config['epochs'],
        n_clusters=b_config['n_clusters'],
        clustering_runs=b_config['clustering_runs'],
        evaluate=False,
        disable_tqdm=True,
    )
    model.eval()

    # Cluster the unknown test samples 5 times and average each metric over those predictions
    preds = [np.array(model.predict_new_data(b_config['n_clusters'], x_test_unknown)) for _ in range(5)]
    target = np.array(y_test_unknown_save_codes)
    for scores, metric in (
        (accs_5, hungarian_accuracy),
        (nmis_5, normalized_mutual_info_score),
        (aris_5, adjusted_rand_score),
    ):
        scores.append(np.mean([metric(pred, target) for pred in preds]))

In [None]:
# Mean ± standard deviation (in %) of each metric over the Baseline runs
print("TEST: ACC={:.1f}±{:.1f} | NMI={:.1f}±{:.1f} | ARI={:.1f}±{:.1f}".format(
    *(stat(scores) * 100 for scores in (accs_5, nmis_5, aris_5) for stat in (np.mean, np.std))
))

## 2.2 PBN <a class="anchor" id="2.2"></a>

In [None]:
# Number of input features (second dimension of the training tensor)
n_features = x_train.shape[1]

# Default settings of the PBN model; tuned hyperparameters are merged in afterwards
base_config = {
    'input_size': n_features,
    # Two hidden layers sized 3/4 and 2/4 of the input width, rounded down
    'hidden_layers_dims': [(3 * n_features) // 4, (2 * n_features) // 4],
    'activation_fct': 'relu',  # relu or sigmoid or tanh or None
    'use_batchnorm': True,  # True or False
    'use_norm': 'l2',  # None or 'l1' or 'l2'

    'n_classes': np.unique(y_train_known).size,

    'clustering_model': 'kmeans',  # kmeans or ncd_kmeans or spectral_clustering or ncd_spectral_clustering
    'clustering_runs': 1,  # To compute the average accuracy of the clustering

    'batch_size': 512,
    'epochs': 200,
}

In [None]:
# We load the hyperparameters that were optimized through grid search.
# The context manager closes the file handle as soon as the JSON is parsed.
with open("hyperparameters.json", encoding="utf-8") as hyperparameters_file:
    d = json.load(hyperparameters_file)

# Entry for the PBN model with the ground-truth k on the current dataset
config = d["PBN"]["GT k"][dataset_name]
print("Using hyperparameters:", config)

# Start from the defaults and let the tuned values override them
pbn_config = base_config.copy()
pbn_config.update(config)

In [None]:
# Per-run scores of the PBN model: accuracy, NMI and ARI
accs_6, nmis_6, aris_6 = [], [], []

# 10 runs, each training a fresh model
for run in tqdm(range(10)):
    model = PBNModel(pbn_config).to(device)
    losses_dict = model.train_on_known_classes(
        x_train=x_train,
        y_train=y_train,
        unknown_class_value=unknown_class_value,
        x_test_unknown=x_test_unknown,
        y_test_unknown=y_test_unknown_save_codes,
        x_test_known=x_test_known,
        y_test_known=y_test_known,
        batch_size=pbn_config['batch_size'],
        lr=pbn_config['lr'],
        epochs=pbn_config['epochs'],
        clustering_runs=pbn_config['clustering_runs'],
        w=pbn_config['w'],
        evaluate=False,
        disable_tqdm=True,
    )
    model.eval()

    # Cluster the unknown test samples 5 times and average each metric over those predictions
    preds = [np.array(model.predict_new_data(n_clusters=n_clusters, x_unknown=x_test_unknown)) for _ in range(5)]
    target = np.array(y_test_unknown_save_codes)
    for scores, metric in (
        (accs_6, hungarian_accuracy),
        (nmis_6, normalized_mutual_info_score),
        (aris_6, adjusted_rand_score),
    ):
        scores.append(np.mean([metric(pred, target) for pred in preds]))

In [None]:
# Mean ± standard deviation (in %) of each metric over the PBN runs
print("TEST: ACC={:.1f}±{:.1f} | NMI={:.1f}±{:.1f} | ARI={:.1f}±{:.1f}".format(
    *(stat(scores) * 100 for scores in (accs_6, nmis_6, aris_6) for stat in (np.mean, np.std))
))

## 2.3 TabularNCD <a class="anchor" id="2.3"></a>

In [None]:
# Number of input features (second dimension of the training tensor)
n_features = x_train.shape[1]

# Default settings of the TabularNCD model; tuned hyperparameters are merged in afterwards
base_config = {
    # Two hidden layers sized 3/4 and 2/4 of the input width, rounded down
    'hidden_layers_dims': [(3 * n_features) // 4, (2 * n_features) // 4],
    'input_size': n_features,
    'n_known_classes': np.unique(y_train).size,  # Takes into account the unknown class
    'n_unknown_classes': np.unique(y_train_unknown_save).size,
    'activation_fct': 'relu',
    'use_batchnorm': True,
    'batch_size': 512,
    'epochs': 200,

    'M': 2000,
}

In [None]:
# We load the hyperparameters that were optimized through grid search.
# The context manager closes the file handle as soon as the JSON is parsed.
with open("hyperparameters.json", encoding="utf-8") as hyperparameters_file:
    d = json.load(hyperparameters_file)

# Entry for the TabularNCD model with the ground-truth k on the current dataset
config = d["TabularNCD"]["GT k"][dataset_name]
print("Using hyperparameters:", config)

# Start from the defaults and let the tuned values override them
tncd_config = base_config.copy()
tncd_config.update(config)

In [None]:
# Per-run scores of the TabularNCD model: accuracy, NMI and ARI
accs_7, nmis_7, aris_7 = [], [], []

# 10 runs, each training a fresh model
for run in tqdm(range(10)):
    model = TabularNCDModel(tncd_config).to(device)
    losses_dict = model.joint_training(
        config=tncd_config,
        x_train=x_train,
        y_train=y_train,
        x_test_known=x_test_known,
        y_test_known=y_test_known,
        x_test_unknown=x_test_unknown,
        y_test_unknown=y_test_unknown_save_codes,
        y_train_unknown=y_train_unknown_save_codes,
        unknown_class_value=unknown_class_value,
        disable_tqdm=True,
    )

    # Keep the last recorded value of each test clustering metric
    for scores, metric_key in (
        (accs_7, "test cluster ACC"),
        (nmis_7, "test cluster NMI"),
        (aris_7, "test cluster ARI"),
    ):
        scores.append(losses_dict[metric_key][-1])

In [None]:
# Mean ± standard deviation (in %) of each metric over the TabularNCD runs
print("TEST: ACC={:.1f}±{:.1f} | NMI={:.1f}±{:.1f} | ARI={:.1f}±{:.1f}".format(
    *(stat(scores) * 100 for scores in (accs_7, nmis_7, aris_7) for stat in (np.mean, np.std))
))

In [None]:
# Bare expression: evaluates to the name of the dataset selected for this run
dataset_name