In [1]:
import numpy as np
import random
from sklearn.metrics import adjusted_rand_score, normalized_mutual_info_score
import os

In [2]:
# Location of the pre-computed embeddings file. The original absolute path is
# kept as the default so existing runs are unchanged; set the EMBEDDINGS_PATH
# environment variable to load the file from somewhere else.
EMBEDDINGS_PATH = os.environ.get(
    "EMBEDDINGS_PATH",
    r"C:\Users\Florian Moga\Desktop\Code\numpy_embeddings.npy",
)

# The unpacking below requires a 3-D array: (classes, samples, vector_size).
embeddings = np.load(EMBEDDINGS_PATH)
classes, samples, vector_size = embeddings.shape

# Integer label grid of shape (classes, samples) where class_labels[i, j] == i.
class_labels = np.repeat(np.arange(classes, dtype=int)[:, np.newaxis], samples, axis=1)

# Every (class index, sample index) pair, ordered by class and then by sample.
class_sample_combination = [(x, y) for x in range(classes) for y in range(samples)]

In [3]:
def choose_n_classes(n):
    """Randomly draw ``n`` distinct classes and collect their data.

    Reads the module-level ``embeddings``, ``class_labels``, ``classes``,
    ``samples`` and ``vector_size``.

    Args:
        n: Number of distinct classes to draw; must satisfy ``0 <= n <= classes``.

    Returns:
        A tuple ``(n_embeddings, n_class_labels, n_class_sample_combination)``:

        * ``n_embeddings`` - float array of shape ``(classes, samples,
          vector_size)``. Rows of the drawn classes hold a copy of the matching
          rows of ``embeddings`` (at their original class index); all other
          rows are zero.
        * ``n_class_labels`` - int array of shape ``(n, samples)``; row ``i``
          is the ``class_labels`` row of the ``i``-th drawn class.
        * ``n_class_sample_combination`` - list of ``(class, sample)`` index
          pairs covering every sample of the drawn classes, in draw order.

    Raises:
        ValueError: If ``n`` is negative or larger than ``classes``.
    """
    if not 0 <= n <= classes:
        raise ValueError(f"n must be between 0 and {classes}, got {n}")

    # Sampling without replacement yields n distinct classes in random order.
    n_classes = random.sample(range(classes), n)
    # Explicit integer index array so that an empty draw (n == 0) indexes cleanly.
    chosen_idx = np.asarray(n_classes, dtype=int)

    # Zero-filled so rows of classes that were not drawn are well defined
    # instead of containing whatever happened to be in memory.
    n_embeddings = np.zeros((classes, samples, vector_size))
    n_embeddings[chosen_idx] = embeddings[chosen_idx]

    n_class_labels = np.empty((n, samples), dtype=int)
    n_class_labels[:] = class_labels[chosen_idx]

    n_class_sample_combination = [(x, y) for x in n_classes for y in range(samples)]

    return n_embeddings, n_class_labels, n_class_sample_combination

In [4]:
def cosine_sim(a, b):
    """Return the cosine similarity between vectors ``a`` and ``b``.

    The product of the two Euclidean norms is floored at 1e-8 before dividing,
    so a zero (or near-zero) vector does not cause a division by zero.
    """
    magnitude_product = np.linalg.norm(a) * np.linalg.norm(b)
    # Clamp tiny denominators to the same floor value of 1e-8.
    denominator = 1e-8 if magnitude_product < 1e-8 else magnitude_product
    return np.dot(a, b) / denominator

In [5]:
def get_mean(arr):
    """Return the mean of ``arr`` taken along its first axis (axis 0)."""
    mean_along_first_axis = np.mean(arr, axis=0)
    return mean_along_first_axis

In [6]:
def cluster_no_finetuning(thresh, choose_n = False, n=0):
    """Greedily cluster embeddings by cosine similarity to cluster means.

    Samples are visited in a uniformly random order. Each sample is compared
    (via ``cosine_sim``) with the mean (``get_mean``) of every cluster built so
    far. It joins the most similar cluster when that similarity is at least
    ``thresh``; otherwise it starts a new cluster.

    Reads the module-level ``embeddings`` and ``class_sample_combination``.

    Args:
        thresh: Minimum cosine similarity to the best cluster mean required
            for a sample to join that cluster.
        choose_n: When truthy, only the samples of ``n`` randomly drawn classes
            (obtained from ``choose_n_classes``) are clustered; otherwise every
            pair in ``class_sample_combination`` is clustered.
        n: Number of classes to draw. Ignored unless ``choose_n`` is truthy.

    Returns:
        A tuple ``(cluster_dict, vector_dict)``. Both are keyed by consecutive
        integers starting at 0, in order of cluster creation.
        ``cluster_dict[k]`` lists the class index of each member of cluster
        ``k``; ``vector_dict[k]`` lists the members' embedding vectors in the
        same order. Both are empty when there are no samples to cluster.
    """
    if choose_n:
        _, _, chosen_combination = choose_n_classes(n)
        sample_order = list(chosen_combination)
    else:
        sample_order = list(class_sample_combination)

    # One shuffle gives the same uniformly random visiting order as repeatedly
    # drawing and removing a random element, without the quadratic removals.
    random.shuffle(sample_order)

    cluster_dict = {}
    vector_dict = {}

    for sample_cls, sample_pos in sample_order:
        sample_vec = embeddings[sample_cls, sample_pos]

        # Reset for every sample and start below any possible similarity, so a
        # sample whose similarities are all <= 0 still gets its true best
        # cluster instead of a stale one left over from the previous sample.
        best_cluster = None
        biggest_sim = float("-inf")

        # compare the sample with the mean of every existing cluster
        for key, members in vector_dict.items():
            cls_sim = cosine_sim(sample_vec, get_mean(members))
            if cls_sim > biggest_sim:
                biggest_sim = cls_sim
                best_cluster = key

        if best_cluster is None or biggest_sim < thresh:
            # no cluster yet, or none is similar enough: open a new cluster
            new_key = len(cluster_dict)
            cluster_dict[new_key] = [sample_cls]
            vector_dict[new_key] = [sample_vec]
        else:
            # similar enough: add the sample to the best cluster
            cluster_dict[best_cluster].append(sample_cls)
            vector_dict[best_cluster].append(sample_vec)

    return cluster_dict, vector_dict

In [7]:
def get_labels_from_clustering(cluster_dict):
    """Score a clustering with the adjusted Rand index.

    Every member of a cluster is paired with the first entry of that cluster;
    the two resulting flat label sequences are passed to
    ``adjusted_rand_score``. Despite the function's name, the return value is
    that score, not the label lists.

    NOTE: clusters whose first entries are equal receive the same reference
    label, so they are not distinguished from each other by this score.

    Args:
        cluster_dict: Mapping from cluster key to a non-empty list of labels.

    Returns:
        The value returned by ``adjusted_rand_score``.
    """
    reference_labels = []
    member_labels = []
    for members in cluster_dict.values():
        # the cluster's first entry stands in for the whole cluster
        reference_labels.extend([members[0]] * len(members))
        member_labels.extend(members)

    return adjusted_rand_score(reference_labels, member_labels)

In [None]:
# Single clustering pass over every (class, sample) pair at threshold 0.96.
# With choose_n=False the function never reads `n`, so the 10 passed here has
# no effect on the result.
cd, vd = cluster_no_finetuning(0.96, choose_n=False, n=10)

In [None]:
def compute_nmi_ari(clusterings):
    """Compute NMI and ARI between true classes and cluster assignments.

    Each clustered sample contributes one (true label, predicted label) pair:
    the true label is the class value stored in the cluster's member list and
    the predicted label is the key of the cluster that contains it. Pairing
    the two directly works for any number of samples per class and any class
    id, and keeps all predicted labels of a single type.

    Args:
        clusterings: Mapping from cluster id to the list of class labels of
            the samples assigned to that cluster (for example the
            ``cluster_dict`` returned by ``cluster_no_finetuning``).

    Returns:
        A tuple ``(nmi, ari)`` holding the results of
        ``normalized_mutual_info_score`` and ``adjusted_rand_score``.
    """
    labels, preds = [], []
    for cluster_id, members in clusterings.items():
        labels.extend(members)
        # every member of this cluster shares the cluster id as its prediction
        preds.extend([cluster_id] * len(members))

    ari = adjusted_rand_score(labels, preds)
    nmi = normalized_mutual_info_score(labels, preds)

    return nmi, ari

In [None]:
# Parameter sweep: similarity thresholds x number of randomly drawn classes.
cluster_thresholds = [0.75, 0.78, 0.79, 0.80, 0.81, 0.82, 0.83, 0.84, 0.85, 0.9]
cluster_classes = [10, 50, 100, 338]

# Thresholds vary slowest, class counts fastest.
threshold_class_grid = [(t, c) for t in cluster_thresholds for c in cluster_classes]

for t, c in threshold_class_grid:

    # Ten independent clustering runs per setting; their scores are summarised
    # below as mean and standard deviation.
    list_nmi, list_ari = [], []
    for _ in range(10):
        cd, _ = cluster_no_finetuning(t, choose_n=True, n=c)
        nmi, ari = compute_nmi_ari(cd)
        list_nmi += [nmi]
        list_ari += [ari]

    print(f"Metrics for threshold {t} and for {c} classes")
    print(f"mean nmi {np.mean(list_nmi)} +- {np.std(list_nmi)}")
    print(f"mean ari {np.mean(list_ari)} +- {np.std(list_ari)}")

In [None]:
# loading

# NOTE(review): `main_folder` is not defined in any cell visible here; it must
# be set before this cell is run.
folder_entries = os.listdir(main_folder)

# Entry names truncated at their first '.', in ascending order.
image_folder_files = [img.split('.')[0] for img in folder_entries]
image_folder_files.sort()

# os.listdir returns entries in arbitrary order, while the saving cell writes
# the arrays in sorted-key order. Sorting the keys here makes the i-th array
# read from the file land under the key it was saved under.
f_dict = {file_class: [] for file_class in sorted(folder_entries)}

# The file holds one array per key, stored back-to-back; read them in sequence.
with open('test.npy', 'rb') as f:
    for key in f_dict:
        f_dict[key] = np.load(f)

In [None]:
# saving

# Rebuild f_dict with its keys in ascending order, then write the arrays
# back-to-back into a single file in that same order.
f_dict = {key: f_dict[key] for key in sorted(f_dict)}

with open('test.npy', 'wb') as f:
    for key, array in f_dict.items():
        np.save(f, array)

In [None]:
# new class

# Start from an empty (0, 2208) float array and stack `class_embedding` onto it
# as a single row.
# NOTE(review): `class_embedding` is not defined in any cell visible here.
new_class = np.concatenate(
    (np.empty((0, 2208)), np.array([class_embedding])),
    axis=0,
)