# K-means Clustering and visualization
Tutorial from: http://www.cs.uoi.gr/~tsap/teaching/2016-cse012/slides/Intro-to-Clustering.html

In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import concurrent.futures as cf
from sklearn.cluster import KMeans
from scipy.spatial.distance import cdist
import nibabel as nib
import os
%matplotlib inline
%run nifti_tools.ipynb

In [2]:
# Memory-map the ABA principal components read-only instead of loading them
# into RAM: 159326 rows x 271 float32 columns.
# NOTE(review): 159326 == 67*58*41, so rows are presumably voxels - confirm.
X = np.memmap(
    '/data/bioprotean/SVD/vg/rc.mymemmap',
    dtype='float32',
    mode='r',
    shape=(159326, 271),
)

In [None]:
# Cluster X into a fixed number of groups (50 k-means++ restarts, fixed seed)
n_clusters = 8
kmeans = KMeans(n_clusters=n_clusters, init='k-means++', n_init=50, random_state=0)

# fit_predict fits the model and hands back the same array stored in labels_
kmeans_labels = kmeans.fit_predict(X)
centroids = kmeans.cluster_centers_
error = kmeans.inertia_

# Finding the optimum value of K
Tutorial from: https://www.geeksforgeeks.org/elbow-method-for-optimal-value-of-k-in-kmeans/

In [None]:
# Elbow method: fit K-means for each candidate K and record two quality
# measures so the "elbow" of the curve can be read off the plot.
distortions = []   # mean distance of each sample to its nearest centroid, per K
inertias = []      # within-cluster sum of squared distances (KMeans.inertia_), per K
mapping1 = {}      # K -> distortion
mapping2 = {}      # K -> inertia
K = range(1, 30)

for k in K:
    # Fit exactly once: KMeans.fit returns the fitted estimator, so calling
    # fit a second time only repeats the whole clustering. A fixed
    # random_state makes the curve reproducible between runs.
    kmeanModel = KMeans(n_clusters=k, random_state=0).fit(X)

    # Distance from every sample to its closest centroid, computed once and
    # reused for both the list and the mapping.
    nearest_dist = np.min(cdist(X, kmeanModel.cluster_centers_, 'euclidean'), axis=1)
    distortion = nearest_dist.mean()

    distortions.append(distortion)
    inertias.append(kmeanModel.inertia_)

    mapping1[k] = distortion
    mapping2[k] = kmeanModel.inertia_

plt.plot(K, distortions, 'bx-')
plt.xlabel('Values of K')
plt.ylabel('Distortion')
plt.title('The Elbow Method using Distortion')
plt.show()

# Multiprocessing for finding the optimum value of K

In [None]:
# Loading the principal components array
pc_n271_array = np.load('/data/bioprotean/ABA/PCA/PC_n271.npy')

# Candidate cluster counts: 10, 20, ..., 300.
# Named k_* so the builtins min()/max() are not shadowed for the rest of the
# kernel session.
k_min = 10
k_max = 301
k_step = 10
numbers = range(k_min, k_max, k_step)
number_Ks = len(numbers)


def OptimumK_parallel(n):
    """Fit K-means with ``n`` clusters on ``pc_n271_array`` and return the inertia.

    The value is returned rather than written into a module-level array:
    this function runs in a worker process, and assignments to a global made
    there are not visible in the parent process.

    Parameters
    ----------
    n : int
        Number of clusters.

    Returns
    -------
    float
        ``inertia_`` of the fitted model.
    """
    # Fit once; calling fit a second time would only repeat the clustering.
    kmeans = KMeans(init='k-means++', n_clusters=n, n_init=50, random_state=0)
    kmeans.fit(pc_n271_array)
    return kmeans.inertia_


with cf.ProcessPoolExecutor() as executor:
    # map() yields results in the order of `numbers`; list() drains the
    # iterator so any exception raised in a worker is re-raised here instead
    # of being silently dropped.
    results = list(executor.map(OptimumK_parallel, numbers))

# Row 0: cluster counts, row 1: matching inertias (filled by position, not by K)
inertias_arr = np.zeros((2, number_Ks))
inertias_arr[0, :] = np.array(numbers)
inertias_arr[1, :] = results

np.save('/data/bioprotean/ABA/PCA/optimum_K/inertias_arr.npy', inertias_arr)

In [9]:
# Read the previously saved inertia table back from disk
inertias_arr = np.load(
    '/data/bioprotean/ABA/PCA/optimum_K/inertias_arr.npy'
)

# Running K-means in a for loop

In [4]:
# Principal components used as the clustering input
pc_n271_array = np.load('/data/bioprotean/ABA/PCA/PC_n271.npy')

# Reference image whose header is meant to be copied
# NOTE(review): `base` is assigned here but not used anywhere in this cell.
base = '/data/bioprotean/ABA/PCA/clusters/nclusters_fixed10.nii'

for i in range(51, 53):
    # Cluster into i groups (50 k-means++ restarts, fixed seed)
    kmeans = KMeans(n_clusters=i, init='k-means++', n_init=50, random_state=0)
    kmeans.fit(pc_n271_array)

    # Fold the flat label vector back into the 67 x 58 x 41 volume
    labels_array = np.reshape(kmeans.labels_, (67, 58, 41))

    # Write the label volume out as a NIfTI file
    output_file = '/data/bioprotean/ABA/PCA/clusters/nclusters_' + str(i) + '.nii'
    array_to_nifti(labels_array, output_file)

# Running K-means with multiprocessing

In [None]:
# Loading the principal components array
pc_n271_array = np.load('/data/bioprotean/ABA/PCA/PC_n271.npy')

# Base image to copy header from
base = '/data/bioprotean/ABA/PCA/clusters/nclusters_fixed10.nii'


def Kmeans_parallel(n):
    """Cluster ``pc_n271_array`` into ``n`` groups and save the labels as NIfTI.

    Two files are written under ``/data/bioprotean/ABA/PCA/clusters/``:
    ``nclusters_<n>.nii`` (labels reshaped to 67 x 58 x 41) and
    ``nclusters_fixed<n>.nii`` (produced by ``copy_nifti_header`` from
    ``base`` and the first file).

    Parameters
    ----------
    n : int
        Number of clusters.
    """
    # Running K-means clustering
    kmeans = KMeans(init='k-means++', n_clusters=n, n_init=50, random_state=0)
    kmeans.fit_predict(pc_n271_array)
    labels_array = kmeans.labels_.reshape(67,58,41)

    # Saving as nii file
    output_file = '/data/bioprotean/ABA/PCA/clusters/nclusters_'+str(n)+'.nii'
    array_to_nifti(labels_array, output_file)

    # Modifying the NIFTI header
    outfile = '/data/bioprotean/ABA/PCA/clusters/nclusters_fixed'+str(n)+'.nii'
    copy_nifti_header(base, output_file, outfile)


with cf.ProcessPoolExecutor() as executor:
    numbers = range(300,601,25)
    # executor.map only re-raises a worker's exception when its result is
    # retrieved; draining the iterator makes failed runs fail loudly here
    # instead of leaving missing output files unnoticed.
    results = list(executor.map(Kmeans_parallel, numbers))

In [2]:
# Saving the NII files as NPY
# The paths are named nii_path/npy_path rather than input/output so the
# builtin input() is not shadowed for the rest of the kernel session.
for i in range(1, 285):
    nii_path = '/data/bioprotean/ABA/PCA/Kmeans_labels/NII/nclusters_fixed'+str(i)+'.nii'
    npy_path = '/data/bioprotean/ABA/PCA/Kmeans_labels/NPY/nclusters_'+str(i)+'.npy'
    nifti_to_npy(nii_path, npy_path)