In [1]:
import openslide
import h5py
import pickle
import numpy as np
import scipy.sparse as sp
import cv2
import matplotlib.pyplot as plt
import nmslib

from PIL import Image
from tqdm import tqdm
from collections import Counter
from sklearn.decomposition import PCA

# Raise Pillow's MAX_IMAGE_PIXELS limit (its decompression-bomb guard) to 5e8
# pixels. NOTE(review): no PIL image is opened in the cells visible here.
Image.MAX_IMAGE_PIXELS = 5e8

In [2]:
class Hnsw:
    """Small scikit-learn-style wrapper around an nmslib HNSW index.

    Parameters
    ----------
    space : str
        Space name handed to ``nmslib.init`` (default ``'cosinesimil'``).
    index_params : dict or None
        Passed to ``createIndex``. When None, ``fit`` falls back to
        ``{'M': 16, 'post': 0, 'efConstruction': 400}``.
    query_params : dict or None
        Passed to ``setQueryTimeParams``. When None, ``fit`` falls back to
        ``{'ef': 90}``.
    print_progress : bool
        Forwarded to ``createIndex``.

    After ``fit`` the instance exposes ``index_``, ``index_params_`` and
    ``query_params_`` (the index and the parameters actually used).
    """

    def __init__(self, space='cosinesimil', index_params=None,
                 query_params=None, print_progress=True):
        self.space, self.print_progress = space, print_progress
        self.index_params, self.query_params = index_params, query_params

    def fit(self, X):
        """Build the HNSW index over ``X`` and return ``self``.

        ``X`` is passed unchanged to ``addDataPointBatch``. See the nmslib
        quickstart for the call sequence used here:
        https://nmslib.github.io/nmslib/quickstart.html
        """
        # Resolve defaults here, not in __init__, so the constructor arguments
        # stay exactly as the caller supplied them.
        build_cfg = (
            {'M': 16, 'post': 0, 'efConstruction': 400}
            if self.index_params is None else self.index_params
        )
        search_cfg = {'ef': 90} if self.query_params is None else self.query_params

        hnsw_index = nmslib.init(space=self.space, method='hnsw')
        hnsw_index.addDataPointBatch(X)
        hnsw_index.createIndex(build_cfg, print_progress=self.print_progress)
        hnsw_index.setQueryTimeParams(search_cfg)

        self.index_ = hnsw_index
        self.index_params_ = build_cfg
        self.query_params_ = search_cfg
        return self

    def query(self, vector, topn):
        """Return the indices of the ``topn`` nearest stored points to ``vector``.

        ``knnQuery`` yields ``(indices, distances)``; the distances are
        discarded. Requires a prior call to ``fit``.
        """
        neighbor_ids, _distances = self.index_.knnQuery(vector, k=topn)
        return neighbor_ids

In [3]:
# Open the precomputed latent HDF5 file for a single slide in read-only mode
# (a TCGA-GBM slide, judging by the path).
# NOTE(review): the path is hardcoded relative to the working directory, and the
# handle is never closed in the cells visible here.
file_latent = h5py.File("./tcga_gbm_patch256_20x_latent/h5_files/TCGA-02-0001-01Z-00-DX1.83fce43e-42ac-4dcd-b156-2908e75f2e47.h5", 'r')

In [4]:
# Copy both datasets out of the open HDF5 file into in-memory NumPy arrays:
# 'features' holds the per-patch latent arrays, 'coords' the matching entries
# (presumably patch coordinates -- TODO confirm against the file writer).
latent, coords = (
    np.array(file_latent[dataset_name])
    for dataset_name in ('features', 'coords')
)

In [14]:
# Inspect the shape of the first patch's latent array (recorded output: (64, 64)).
latent[0].shape

(64, 64)

In [15]:
# Number of entries in one 64x64 latent grid, i.e. how many values are counted
# into each patch's histogram once the grid is flattened.
64 * 64

4096

In [5]:
# Bag-of-codes descriptor: features[i, c] is the number of times code index c
# occurs in patch i's latent grid.
num_codes = 128  # histogram width -- presumably the codebook size of the latent model; TODO confirm

# Fail loudly on inputs the histogram cannot represent. Without these checks a
# negative code would silently wrap around into the wrong bin.
if not np.issubdtype(latent.dtype, np.integer):
    raise TypeError(f"latent must hold integer code indices, got dtype {latent.dtype}")
if latent.size and (latent.min() < 0 or latent.max() >= num_codes):
    raise ValueError(
        f"latent codes must lie in [0, {num_codes}); "
        f"found range [{latent.min()}, {latent.max()}]"
    )

features = np.zeros((latent.shape[0], num_codes))
# Wrap the array itself (not enumerate(...)) so tqdm can read its length and
# show a total / ETA instead of a bare iteration counter.
for idx, latent_feat in enumerate(tqdm(latent)):
    # bincount does the per-code counting in C; minlength pads codes that never
    # occur so every row has exactly num_codes entries.
    features[idx] = np.bincount(
        latent_feat.ravel().astype(np.intp, copy=False),
        minlength=num_codes,
    )

11223it [00:08, 1387.03it/s]


In [16]:
# Use nmslib's 'l2' (Euclidean) space instead of the class default 'cosinesimil'.
model  = Hnsw(space="l2")

In [17]:
# Build the HNSW index over the per-patch histograms. fit() returns the same
# Hnsw instance, so the rebinding leaves `model` pointing at the fitted object.
model = model.fit(features)

In [19]:
# Build the edge list of a directed k-nearest-neighbour graph over the patches:
# one (row -> col) edge from every patch to each of its nearest neighbours.
num_neighbors = 5

row_index = []
col_index = []
# Wrap `features` directly so tqdm knows the total and can show progress / ETA.
for row, query_vec in enumerate(tqdm(features)):
    # Ask for one extra hit because the query point is itself in the index.
    hits = model.query(query_vec, topn=num_neighbors + 1)
    # Drop the query point by index rather than by position: HNSW search is
    # approximate and duplicate feature vectors tie at distance 0, so the point
    # is not guaranteed to come back as hit #0. Dropping hit #0 blindly could
    # discard a real neighbour and keep a self-loop.
    neighbors = [int(col) for col in hits if col != row][:num_neighbors]
    row_index.extend([row] * len(neighbors))
    col_index.extend(neighbors)

11223it [00:00, 20606.04it/s]


In [22]:
# Assemble the sparse adjacency matrix of the neighbour graph: a weight of 1.0
# at every (row_index[i], col_index[i]) position, one row/column per patch.
num_nodes = features.shape[0]
edge_weights = np.ones(len(row_index))
adj = sp.coo_matrix(
    (edge_weights, (row_index, col_index)),
    shape=(num_nodes, num_nodes),
)