In [1]:
import torch as th
import os
import numpy as np
from tqdm.notebook import tqdm
from sklearn import metrics
from matplotlib import pyplot as plt

np.random.seed(32635)

In [2]:
def cosine_distance_torch(x1, x2=None, normalize=False, eps=1e-8):
    """Pairwise similarity matrix between the rows of ``x1`` and ``x2``.

    Despite the name, the returned value is a *similarity* (larger means
    closer), not a distance. With the default ``normalize=False`` it is the raw
    dot product ``x1 @ x2.T``, which equals the cosine similarity only when the
    rows are already unit length. Pass ``normalize=True`` to L2-normalise the
    rows first and obtain a true cosine similarity.

    Args:
        x1: 2-D tensor of shape ``(n, d)``.
        x2: 2-D tensor of shape ``(m, d)``; defaults to ``x1``.
        normalize: when True, divide every row by its L2 norm before the
            matrix product.
        eps: lower bound applied to the row norms so that all-zero rows do not
            cause a division by zero (only used when ``normalize`` is True).

    Returns:
        Tensor of shape ``(n, m)`` holding the pairwise scores.
    """
    x2 = x1 if x2 is None else x2
    if normalize:
        w1 = x1.norm(p=2, dim=1, keepdim=True).clamp_min(eps)
        # Reuse the norms when both operands are the same tensor.
        w2 = w1 if x2 is x1 else x2.norm(p=2, dim=1, keepdim=True).clamp_min(eps)
        x1 = x1 / w1
        x2 = x2 / w2
    return th.mm(x1, x2.t())


# Lookup table: integer class id -> (class name, integer count).
# NOTE(review): the second tuple element looks like a per-class sample count;
# its exact meaning is not established in this notebook - confirm.
class_dct = {
    7: ("road", 6111042),
    8: ("sidewalk", 1016635),
    11: ("building", 3822901),
    12: ("wall", 109501),
    13: ("fence", 146414),
    17: ("pole", 203783),
    19: ("traffic light", 34697),
    20: ("traffic sign", 91885),
    21: ("vegetation", 2665158),
    22: ("terrain", 193395),
    23: ("sky", 680522),
    24: ("person", 202630),
    25: ("rider", 22368),
    26: ("car", 1165026),
    27: ("truck", 44584),
    28: ("bus", 38923),
    31: ("train", 38767),
    32: ("motorcycle", 16403),
    33: ("bicycle", 69008),
}


# Torch device that every tensor loaded below is moved to.
device = "cpu"

"""
Author: Josue N Rivera (github.com/JosueCom)
Date: 7/3/2021
Description: Snippet of various clustering implementations only using PyTorch
Full project repository: https://github.com/JosueCom/Lign (A graph deep learning framework that works alongside PyTorch)
"""

def randomize_tensor(tensor):
    """Return ``tensor`` with its entries along the first dimension shuffled.

    The order is drawn from ``th.randperm``, so it follows PyTorch's global
    random state.
    """
    shuffled_order = th.randperm(len(tensor))
    return tensor[shuffled_order]

def distance_matrix(x, y=None, p = 2):
    """Pairwise score matrix between the rows of ``x`` and the rows of ``y``.

    The p-norm implementation this helper originally carried is disabled; it
    now delegates to ``cosine_distance_torch``. As a consequence:

    * ``p`` is accepted for interface compatibility but is ignored;
    * the result is a *similarity* (larger means closer), so callers must pick
      the largest entries, not the smallest, to find neighbours.

    Args:
        x: 2-D tensor whose rows are the query vectors.
        y: 2-D tensor whose rows are the reference vectors; defaults to ``x``.
        p: unused (kept so existing call sites keep working).

    Returns:
        Tensor with one row per row of ``x`` and one column per row of ``y``.
    """
    if y is None:
        y = x
    return cosine_distance_torch(x, y)

In [3]:
'/mnt/lwll/lwll-coral/hrant/cs_patches_256/'

'/mnt/lwll/lwll-coral/hrant/cs_patches_256/'

In [3]:
# Backbone whose cached embeddings are loaded in this cell.
model = 'dino'
path_to_read = '/mnt/lwll/lwll-coral/hrant/cs_patches_256/'

# Training split.
dino_embeds_train = th.from_numpy(
    np.load(path_to_read + f'{model}_embeds_train.npy')
).to(device=device)
dino_labels_train = th.from_numpy(
    np.load(path_to_read + f'{model}_labels_train.npy')
).to(device=device, dtype=th.int64)

# NOTE: the "val" tensors in this cell are read from the `*_test.npy` files.
dino_embeds_val = th.from_numpy(
    np.load(path_to_read + f'{model}_embeds_test.npy')
).to(device=device)
dino_labels_val = th.from_numpy(
    np.load(path_to_read + f'{model}_labels_test.npy')
).to(device=device, dtype=th.int64)

# A previous revision of this cell read `embeds.npy` / `labels.npy` (train) and
# `embeds_val_2000.npy` / `labels_val_2000.npy` (val) from the same directory.

# Directory for the k-NN prediction files.
path = '/mnt/lwll/lwll-coral/hrant/cs_patches_256/predictions_knn/'


In [4]:
# Backbone whose cached embeddings are loaded in this cell.
# NOTE: the variables keep their `dino_` prefix and therefore overwrite the
# tensors loaded by the previous cell, even though `model` is 'sup_vit' here.
model = 'sup_vit'
path_to_read = '/mnt/lwll/lwll-coral/hrant/cs_patches_256/'

# Training split.
dino_embeds_train = th.from_numpy(
    np.load(path_to_read + f'{model}_embeds_train.npy')
).to(device=device)
dino_labels_train = th.from_numpy(
    np.load(path_to_read + f'{model}_labels_train.npy')
).to(device=device, dtype=th.int64)

# Validation split (read from the `*_val.npy` files).
dino_embeds_val = th.from_numpy(
    np.load(path_to_read + f'{model}_embeds_val.npy')
).to(device=device)
dino_labels_val = th.from_numpy(
    np.load(path_to_read + f'{model}_labels_val.npy')
).to(device=device, dtype=th.int64)


In [5]:
# Every line of labels.txt is parsed as fixed-width text:
#   columns 0-21  -> label name (surrounding blanks and single quotes removed)
#   columns 22-24 -> integer used as the key of `labels`
#   columns 25+   -> integer stored next to the name
#                    (presumably the evaluation/train id - TODO confirm)
with open('preprocessing/labels.txt') as f:
    labels = dict(
        (
            int(row[22:25].strip()),
            (row[:22].strip().replace("'", ""), int(row[25:].strip())),
        )
        for row in f
    )

# Keys of the entries whose second field lies in [0, 255).
eval_labels = [key for key, (_, code) in labels.items() if 0 <= code < 255]

# Names of those same entries, in the same order.
eval_label_names = [name for name, code in labels.values() if 0 <= code < 255]

In [6]:
eval_labels

[7, 8, 11, 12, 13, 17, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 31, 32, 33]

In [7]:
eval_label_names

['road',
 'sidewalk',
 'building',
 'wall',
 'fence',
 'pole',
 'traffic light',
 'traffic sign',
 'vegetation',
 'terrain',
 'sky',
 'person',
 'rider',
 'car',
 'truck',
 'bus',
 'train',
 'motorcycle',
 'bicycle']

In [8]:
class NN():
    """1-nearest-neighbour classifier over stored training points.

    Neighbours are ranked with ``distance_matrix``. In this notebook that
    helper returns a *similarity* (dot product: larger means closer), so the
    nearest neighbour is the column with the **largest** score.

    Args:
        X: tensor of training points, one per row (may be None and supplied
            later through ``train``).
        Y: tensor of training labels aligned with the rows of ``X``.
        p: forwarded to ``distance_matrix``.
    """

    def __init__(self, X = None, Y = None, p = 2):
        self.p = p
        self.train(X, Y)

    def train(self, X, Y):
        """Store the reference points and their labels (no fitting is done)."""
        self.train_pts = X
        self.train_label = Y

    def __call__(self, x):
        """Shorthand for ``predict(x)``."""
        return self.predict(x)

    def predict(self, x):
        """Return the label of the closest training point for every row of ``x``.

        Raises:
            RuntimeError: if no training points or labels have been provided.
        """
        if self.train_pts is None or self.train_label is None:
            name = self.__class__.__name__
            raise RuntimeError(f"{name} wasn't trained. Need to execute {name}.train() first")

        scores = distance_matrix(x, self.train_pts, self.p)
        # Scores are similarities, so the best match is the maximum. argmin
        # would select the *least* similar training point.
        nearest = th.argmax(scores, dim=1)
        return self.train_label[nearest]

In [9]:
class KNN(NN):
    """Chunked k-nearest-neighbour candidate search.

    The training set is scanned in slices of ``chunk_size`` rows so that the
    full query-by-train score matrix never has to be materialised at once.
    For every slice the ``k`` highest-scoring training rows per query are kept
    (scores come from ``distance_matrix``, which in this notebook returns a
    similarity, hence ``largest=True``).

    ``predict`` returns the raw per-chunk candidates. The final top-k merge and
    the label voting that an earlier revision sketched are not performed here.

    Args:
        X: tensor of training points, one per row (may be None).
        Y: tensor of training labels aligned with ``X`` (may be None).
        k: number of candidates kept per query and per chunk.
        p: forwarded to ``distance_matrix``.
        chunk_size: number of training rows scored per iteration.
    """

    def __init__(self, X = None, Y = None, k = 3, p = 2, chunk_size = 10000):
        if chunk_size < 1:
            raise ValueError(f"chunk_size must be a positive integer, got {chunk_size}")
        self.k = k
        self.chunk_size = chunk_size
        super().__init__(X, Y, p)

    def train(self, X, Y):
        """Store the training data and cache the set of distinct labels."""
        super().train(X, Y)
        if Y is not None:
            self.unique_labels = self.train_label.unique()

    def predict(self, x):
        """Collect the top-k training candidates of every query, chunk by chunk.

        Args:
            x: tensor of query points, one per row.

        Returns:
            ``(index_topn, value_topn)``: two lists with one tensor per chunk.
            ``index_topn[c]`` holds row indices into the full training tensor
            and ``value_topn[c]`` the matching scores. Each tensor has one row
            per query and ``min(k, rows in chunk)`` columns.

        Raises:
            RuntimeError: if no training points or labels have been provided.
        """
        if self.train_pts is None or self.train_label is None:
            name = self.__class__.__name__
            raise RuntimeError(f"{name} wasn't trained. Need to execute {name}.train() first")

        size = self.chunk_size
        index_topn = []
        value_topn = []

        # Step through the start offsets so that the trailing partial chunk is
        # processed too. Floor division on the chunk count would silently skip
        # it, and would skip everything when there are fewer than `size` rows.
        for start in tqdm(range(0, len(self.train_pts), size)):
            chunk = self.train_pts[start:start + size]  # slicing clamps at the end
            dist = distance_matrix(x, chunk, self.p)

            # The last chunk may contain fewer than k rows.
            knn = dist.topk(min(self.k, len(chunk)), largest=True)

            # Convert chunk-local column positions into global training indices.
            index_topn.append(knn.indices + start)
            value_topn.append(knn.values)

        return index_topn, value_topn

In [10]:
# Display the shapes of the loaded training embeddings, training labels and
# validation embeddings.
dino_embeds_train.shape, dino_labels_train.shape, dino_embeds_val.shape

(torch.Size([18659200, 1024]),
 torch.Size([18659200]),
 torch.Size([3136000, 1024]))

In [11]:
# Index the training embeddings; 10 candidates are kept per query and chunk.
knn = KNN(X=dino_embeds_train, Y=dino_labels_train, k=10, p=2)

In [None]:
# out1: per-chunk candidate indices, out2: the matching scores.
out1, out2 = knn.predict(dino_embeds_val)

  0%|          | 0/1865 [00:00<?, ?it/s]

In [14]:
# Join the per-chunk results column-wise: one row per query, all chunk
# candidates side by side.
good_indices = th.cat(out1, dim=1)
good_values = th.cat(out2, dim=1)

In [15]:
# Display the shapes of the concatenated candidate indices and scores.
# NOTE(review): the recorded output below (38000 x 1860) does not match the
# shapes shown earlier in this notebook (3,136,000 validation rows; a progress
# bar of 1865 chunks with k=10), so it appears to come from a different run -
# re-run this cell to refresh it.
good_indices.shape, good_values.shape 

(torch.Size([38000, 1860]), torch.Size([38000, 1860]))

In [16]:
path = '/mnt/lwll/lwll-coral/hrant/cs_patches_256/predictions_knn/'

# NOTE(review): the payload is a Python dict of tensors rather than an array,
# so NumPy has to pickle it; reading it back presumably needs
# `np.load(..., allow_pickle=True).item()` - confirm with the consumer.
# NOTE(review): the file name says "mae" while the cell above loads the
# 'sup_vit' embeddings - verify the name before reusing this file.
np.save(
    os.path.join(path, 'mae_val38k_10x186_NN.npy'),
    dict(good_indices=good_indices, good_values=good_values),
)

In [1]:
import os
# Directory that receives the k-NN prediction files.
path = '/mnt/lwll/lwll-coral/hrant/embeddings/fair1m/predictions_knn'

# Earlier naming scheme, kept for reference:
#   os.path.join(path, f'{model}train_72_test_all_10x186_NN.h5')
path_to_save = os.path.join(
    path,
    'ADADSF_test_all',
)

In [2]:
path_to_save

'/mnt/lwll/lwll-coral/hrant/embeddings/fair1m/predictions_knn/ADADSF_test_all'