In [1]:
#Import the required Libraries
import faiss
import pandas as pd
import matplotlib.pyplot as plt

import os
from glob import glob
import json
import numpy as np
import pickle

from PIL import Image
import random

# The documentation for Facebook's Faiss library (PCA, clustering, etc.) can be found at
# https://github.com/facebookresearch/faiss/wiki/Faiss-building-blocks:-clustering,-PCA,-quantization

In [2]:

# Path to the folder with the extracted DenseNet feature vectors (JSON files).
feature_vector_folder = '/mnt/largedrive0/katariap/feature_extraction/data/Dataset/DenseNet_Features'
# glob returns its matches in a filesystem-dependent order; sort them so the patch
# order (and everything derived from it) is the same on every run.
densenet_features_files = sorted(glob(os.path.join(feature_vector_folder, '*.json')))

In [3]:

# Load every feature file into two parallel lists: feature_values[k] holds the
# feature vectors stored in the k-th file and file_names[k] the matching dictionary
# keys (later cells treat these keys as the image patch paths).

feature_values = []
file_names = []

for feature_file in densenet_features_files:
    # Each file is a JSON object mapping a key to its feature vector.
    # JSON text is UTF-8, so do not depend on the locale's default encoding.
    with open(feature_file, "r", encoding="utf-8") as file:
        feature_dictionary = json.load(file)

    # append() grows the lists in place instead of copying them on every iteration.
    feature_values.append(np.array(list(feature_dictionary.values())))
    file_names.append(np.array(list(feature_dictionary.keys())))


#Estimated Run Time is 3 minutes

In [None]:
feature_list = []
file_list = []  # Filled with the patch names by the following cell.

# Create a flat list with one feature vector per image patch.
# extend() appends the rows of each per-file array in place; rebuilding the list
# for every patch (feature_list = feature_list + [row]) copies the whole list each
# time, which is quadratic in the number of patches.
for i, folder in enumerate(feature_values, start=1):
    feature_list.extend(folder)
    print(i)  # Progress: number of files processed so far.

In [None]:
# Final list of all image patch names (each name contains the patch path).
# The list is reset here so that re-running this cell cannot append every name a
# second time, and it is extended in place instead of being copied for each patch.
file_list = []
for i, folder in enumerate(file_names, start=1):
    file_list.extend(folder)
    print(i)  # Progress: number of files processed so far.

In [6]:
# Using Facebook Faiss for PCA: reduce the 1024-dimensional feature vectors to 500
# dimensions.
# Faiss only uses 'float32' arrays. Convert the list once and reuse the result:
# converting separately for train() and apply() repeats the most expensive step of
# this cell and goes through a float64 intermediate each time.
feature_matrix = np.asarray(feature_list, dtype='float32')

mat = faiss.PCAMatrix(1024, 500)
mat.train(feature_matrix)
assert mat.is_trained
feature_values_transformed = mat.apply(feature_matrix)

# Only the reduced matrix is used from here on; release the full-size copy.
del feature_matrix

In [7]:
# Sanity check for the counts: every patch name needs exactly one feature vector.
print(len(file_list))
print(len(feature_list))
# Fail here rather than later if the two lists have drifted apart.
assert len(file_list) == len(feature_list), "patch names and feature vectors differ in count"

408215
408215


In [8]:
# K-means clustering of the PCA-reduced features.
# ncentroids is the number of clusters to form; the author found a value between
# 15 and 20 to be ideal for removing unwanted image patches.
ncentroids = 20
niter = 20        # number of k-means iterations
verbose = True    # let faiss print its training log
n_dims = feature_values_transformed.shape[1]

kmeans = faiss.Kmeans(n_dims, ncentroids, niter=niter, verbose=verbose)
# Kept as the last expression so the value returned by train() is displayed.
kmeans.train(feature_values_transformed)

Sampling a subset of 5120 / 408215 for training
Clustering 5120 points in 500D to 20 clusters, redo 1 times, 20 iterations
  Preprocessing in 0.11 s
  Iteration 19 (2.09 s, search 1.52 s): objective=788919 imbalance=1.070 nsplit=0           


788919.125

In [9]:
# Map every patch to its nearest cluster centroid.
# I receives the centroid (cluster) index of each patch and D the matching
# distance; only one neighbour is requested, so both have a single column.
neighbours_per_patch = 1
D, I = kmeans.index.search(feature_values_transformed, neighbours_per_patch)

In [None]:
# Flat L2 index over all PCA-reduced patches, queried with the k-means centroids
# to find the representative patches closest to each centroid.
representatives_per_centroid = 20
d = feature_values_transformed.shape[1]

index = faiss.IndexFlatL2(d)
index.add(feature_values_transformed)
D_c, I_c = index.search(kmeans.centroids, representatives_per_centroid)

In [10]:
# Save the clustering results to a CSV file, to avoid recalculation in case the
# Jupyter notebook fails.
# I and D hold one column per patch (a single neighbour was searched), so flatten
# them and build each column from a plain 1-D array instead of relying on pandas
# accepting a single-column 2-D array as a column value.
data_frame = pd.DataFrame({
    'filename': file_list,
    'Cluster': np.asarray(I).ravel(),
    'Distance': np.asarray(D).ravel(),
})
data_frame.to_csv('/mnt/largedrive0/katariap/feature_extraction/data/Dataset/Clusters_densenet.csv') #Change Path According to Requirements

In [None]:
# Group the patch files by cluster: clusters[c] lists every file assigned to
# cluster c.
# I has a single column, so it is flattened and its values are used as plain int
# keys (they hash and compare equal to the NumPy integers used before, so lookups
# such as clusters[3] keep working).
# The previous loop tested the 1-element array I[i] against a freshly built list of
# keys on every iteration and grew each cluster by copying its list; setdefault()
# plus append() does the same grouping in a single pass.
clusters = {}
for cluster_id, file_name in zip(np.asarray(I).ravel().tolist(), file_list):
    clusters.setdefault(cluster_id, []).append(file_name)

In [None]:
# Persist the cluster dictionary as a pickle file so it can be reloaded later.
clusters_pickle_path = '/mnt/largedrive0/katariap/feature_extraction/data/Dataset/clusters.pickle'
with open(clusters_pickle_path, mode='wb') as pickle_out:
    pickle.dump(clusters, pickle_out)

In [None]:
# Optional: run this cell to restore `clusters` from the pickle file instead of
# recomputing it.
# NOTE: unpickling can execute arbitrary code; only load files you created yourself.
cluster_file = '/mnt/largedrive0/katariap/feature_extraction/data/Dataset/clusters.pickle'
clusters = {}
with open(cluster_file, mode='rb') as pickled_clusters:
    clusters = pickle.load(pickled_clusters)

In [None]:
#Cluster Visualization. Visualize the Clusters By Running This Cell.
# One figure is drawn per cluster, showing up to 10 of its patches.
for number in range(ncentroids):
    # A centroid that received no patches has no entry in `clusters`; skip it
    # instead of failing with a KeyError.
    files = clusters.get(number, [])
    if not files:
        print(f"Cluster {number} has no patches; skipping")
        continue

    if len(files) > 10:
        files = random.sample(files, 10) # 10 Random Files are selected from each cluster. Each Run display Different Files

    fig = plt.figure(figsize = (30,30))
    # `position` rather than `index`: the latter is the name of the faiss index
    # built in an earlier cell and must not be overwritten here.
    for position, file in enumerate(files, start=1):
        plt.subplot(5, 5, position)
        name = file.split('/')[-1]
        # Read the pixels inside the context manager so the image file is closed
        # as soon as it has been copied into an array.
        with Image.open(file) as image:
            img = np.array(image)
        plt.imshow(img)
        plt.axis('off')
        plt.title(name, fontsize = 7)

    # Render this cluster's figure, then release it so the figures do not pile up
    # in memory across the loop.
    plt.show()
    plt.close(fig)

In [None]:
# Select the clusters for creating the final dataset. Cluster numbers START from 0.
selected_clusters = [1,2,4,6,7,8,9,10,11,12,13,17,18,19]
# Concatenate the file lists of the selected clusters, in the order given above.
final_list = [
    patch
    for cluster_number in selected_clusters
    for patch in clusters[cluster_number]
]

In [None]:
# Write the selected patches to a CSV file with a single 'Patch' column.
selected_csv_path = '/mnt/largedrive0/katariap/feature_extraction/data/Dataset/selected_after_clustering.csv'
selected_patches = pd.DataFrame(data=final_list, columns=['Patch'])
selected_patches.to_csv(selected_csv_path)