In [42]:
import time
from os.path import join, basename
import re
import matplotlib.pyplot as plt
import pylab
import pyale
import nibabel as nib
import numpy as np
import pandas as pd
from sklearn.cluster import k_means, dbscan, SpectralClustering, FeatureAgglomeration
from sklearn.metrics import silhouette_score, normalized_mutual_info_score
from scipy.cluster import hierarchy
import scipy.io as sio
import seaborn as sns
%matplotlib inline

In [7]:
from math import log

def variation_of_information(X, Y):
    """Return the variation of information (in bits) between two partitions.

    Adapted from https://gist.github.com/jwcarr/626cbc80e0006b526688

    Parameters
    ----------
    X, Y : list of lists
        Two partitions, each given as a list of clusters. Every cluster is a
        sized collection of hashable item identifiers. The total item count is
        taken from X.

    Returns
    -------
    float
        Absolute value of the accumulated sum; 0.0 when the clusters of X
        contain no items at all.
    """
    n = float(sum(len(x) for x in X))
    if n == 0:
        # Nothing to compare; avoids dividing by zero below.
        return 0.0

    # Build each Y cluster's set and relative size once instead of once per
    # (x, y) pair.
    y_clusters = [(set(y), len(y) / n) for y in Y]

    sigma = 0.0
    for x in X:
        p = len(x) / n
        x_items = set(x)
        for y_items, q in y_clusters:
            r = len(x_items & y_items) / n
            # Disjoint clusters contribute nothing (and log(0) is undefined).
            if r > 0.0:
                sigma += r * (log(r / p, 2) + log(r / q, 2))
    return abs(sigma)

In [15]:
# Output directory that the cells below format into the paths of the files they save.
# NOTE(review): absolute, machine-specific path; adjust before running elsewhere.
out_dir = '/Users/Katie/Dropbox/Data/Naturalistic/NewSolution/python_output'

In [2]:
# Load the MATLAB .mat file; other cells read its 'corrmat' and 'IDX' entries.
matlab_nat = sio.loadmat('/Users/Katie/Dropbox/Data/Naturalistic/NewSolution/Naturalistic_11.25.17.mat')

In [10]:
# 'corrmat' entry of the MATLAB file (presumably a correlation matrix; confirm against the .mat file).
corrmat = matlab_nat['corrmat']
# Pearson correlation between the rows of corrmat (np.corrcoef treats each row as a variable).
corrmat2 = np.corrcoef(corrmat)
# 1 - r: passed to the clustering calls and used as the precomputed distance matrix in other cells.
minus_r = 1 - corrmat2

In [11]:
# Cluster counts to evaluate (2 through 10) and, per method, a dict mapping
# cluster count -> label array.
k = np.arange(2, 11)
k_labels = {}
kk_labels = {}
w_labels = {}

# NOTE(review): no random_state is fixed for the stochastic methods below, so
# labels may differ between runs; pass a seed if reproducibility is required.
for i in k:
    # Spectral clustering with an RBF affinity (called "kernel k-means" here).
    start = time.time()
    kernelk = SpectralClustering(n_clusters=i, eigen_solver=None, n_init=1000, gamma=1.0,
                                 affinity='rbf', assign_labels='kmeans', degree=3, coef0=1,
                                 n_jobs=1)
    labels = kernelk.fit_predict(minus_r)
    # print(...) with a single pre-formatted string behaves the same on Python 2 and 3.
    print("Kernel k-means clustering with k = {0}: {1:.2f}s".format(i, time.time() - start))
    kk_labels[i] = labels

    # Ward-linkage agglomeration fitted on minus_r; 'nilearn_cache' is passed as
    # the estimator's `memory` argument.
    start = time.time()
    ward = FeatureAgglomeration(n_clusters=i, linkage='ward', memory='nilearn_cache')
    ward.fit(minus_r)
    w_labels[i] = ward.labels_
    print("Ward's clustering with k = {0}: {1:.2f}s".format(i, time.time() - start))

    # K-means (k-means++ initialisation, 1000 restarts); only the labels are kept.
    start = time.time()
    centroid, label, inertia = k_means(minus_r, i, init='k-means++', precompute_distances='auto',
                                       n_init=1000, max_iter=1023, verbose=False, tol=0.0001,
                                       random_state=None, copy_x=True, n_jobs=1, algorithm='auto',
                                       return_n_iter=False)
    k_labels[i] = label
    print("K-means clustering with k = {0}: {1:.2f}s".format(i, time.time() - start))
    

Kernel k-means clustering with k = 2: 1.20s
Ward's clustering with k = 2: 0.01s
K-means clustering with k = 2: 5.59s
Kernel k-means clustering with k = 3: 1.96s
Ward's clustering with k = 3: 0.01s
K-means clustering with k = 3: 7.44s
Kernel k-means clustering with k = 4: 2.74s
Ward's clustering with k = 4: 0.01s
K-means clustering with k = 4: 7.83s
Kernel k-means clustering with k = 5: 2.71s
Ward's clustering with k = 5: 0.01s
K-means clustering with k = 5: 8.55s
Kernel k-means clustering with k = 6: 2.58s
Ward's clustering with k = 6: 0.01s
K-means clustering with k = 6: 8.52s
Kernel k-means clustering with k = 7: 2.93s
Ward's clustering with k = 7: 0.01s
K-means clustering with k = 7: 9.48s
Kernel k-means clustering with k = 8: 3.19s
Ward's clustering with k = 8: 0.01s
K-means clustering with k = 8: 9.67s
Kernel k-means clustering with k = 9: 3.56s
Ward's clustering with k = 9: 0.01s
K-means clustering with k = 9: 10.26s
Kernel k-means clustering with k = 10: 3.58s
Ward's clustering 

In [5]:
from sklearn import cluster

# Candidate DBSCAN neighbourhood radii: start at 0.01 and step by 0.01 towards 2.
eps = np.arange(0.01, 2, 0.01)
# Maps 'eps = <value>' -> DBSCAN label array obtained with that radius.
db_labels = {}

for i in eps:
    # minus_r is passed as a precomputed distance matrix; only the labels are stored.
    [core, labels] = cluster.dbscan(minus_r, eps=i, metric='precomputed')
    db_labels['eps = {0}'.format(i)] = labels

# One Series entry per eps value, written tab-separated to <out_dir>/dbscan.txt.
dbscan_out = pd.Series(db_labels)
dbscan_out.to_csv('{0}/dbscan.txt'.format(out_dir), sep='\t')

In [43]:
# Per cluster count: the silhouette score of every solution (using minus_r as a
# precomputed distance matrix) and the normalized mutual information between
# the MATLAB solution and each Python solution.
k_sils = []
kk_sils = []
w_sils = []
matlab_sils = []
mk_kk_mutual_info = []
mk_w_mutual_info = []
mk_pk_mutual_info = []


k = np.arange(2, 11)

for i in k:
    # matlab_nat['IDX'] is indexed by position, so cluster count 2 sits at column 0.
    j = i - 2
    matlab_soln = matlab_nat['IDX'][0, j].flatten()

    matlab_sils.append(
        silhouette_score(minus_r, matlab_soln, metric='precomputed', random_state=None))
    k_sils.append(
        silhouette_score(minus_r, k_labels[i], metric='precomputed', random_state=None))
    kk_sils.append(
        silhouette_score(minus_r, kk_labels[i], metric='precomputed', random_state=None))
    w_sils.append(
        silhouette_score(minus_r, w_labels[i], metric='precomputed', random_state=None))

    # The label arrays are passed as stored. The former `.reshape(-1, 1)` calls
    # discarded their result and therefore had no effect, so they were removed.
    mk_kk_mutual_info.append(normalized_mutual_info_score(matlab_soln, kk_labels[i]))
    mk_pk_mutual_info.append(normalized_mutual_info_score(k_labels[i], matlab_soln))
    mk_w_mutual_info.append(normalized_mutual_info_score(matlab_soln, w_labels[i]))

# One value per cluster count, in the order of k.
np.savetxt('{0}/wards_silhouettes.txt'.format(out_dir), w_sils, delimiter='\t')
np.savetxt('{0}/kernel_kmean_silhouettes.txt'.format(out_dir), kk_sils, delimiter='\t')
np.savetxt('{0}/python_kmean_silhouettes.txt'.format(out_dir), k_sils, delimiter='\t')
np.savetxt('{0}/matlab_kmean_silhouettes.txt'.format(out_dir), matlab_sils, delimiter='\t')

np.savetxt('{0}/mk_kk_nmi.txt'.format(out_dir), mk_kk_mutual_info, delimiter='\t')
np.savetxt('{0}/mk_w_nmi.txt'.format(out_dir), mk_w_mutual_info, delimiter='\t')
np.savetxt('{0}/mk_pk_nmi.txt'.format(out_dir), mk_pk_mutual_info, delimiter='\t')

In [8]:
# For every method, map 'Solution <cluster count>' -> list of clusters, each
# cluster being the list of indices assigned to it. This is the partition
# format passed to variation_of_information().
k_clusters_idx = {}
kk_clusters_idx = {}
w_clusters_idx = {}
# Initialised empty (the original assignment had no right-hand side, which is a
# syntax error); this cell does not populate it.
mk_clusters_idx = {}

for i in k:
    k_clusters = []
    kk_clusters = []
    w_clusters = []
    for j in range(i):
        # The label dicts are keyed by the cluster count itself (2..10), not by
        # a zero-based position, so they are indexed with i rather than i - 2.
        k_clusters.append(list(np.where(k_labels[i] == j)[0]))
        kk_clusters.append(list(np.where(kk_labels[i] == j)[0]))
        w_clusters.append(list(np.where(w_labels[i] == j)[0]))
    k_clusters_idx['Solution {0}'.format(i)] = k_clusters
    kk_clusters_idx['Solution {0}'.format(i)] = kk_clusters
    w_clusters_idx['Solution {0}'.format(i)] = w_clusters

In [113]:
# Variation of information between consecutive solutions (cluster count n vs.
# n + 1) for each method; every dict is keyed by the larger cluster count.
vi_k, vi_kk, vi_w, vi_mk = {}, {}, {}, {}

for n_clusters in k[:-1]:
    current_key = 'Solution {0}'.format(n_clusters)
    next_key = 'Solution {0}'.format(n_clusters + 1)
    # Same evaluation order as before: Python k-means, kernel k-means, Ward.
    for clusters_by_solution, vi_by_count in ((k_clusters_idx, vi_k),
                                              (kk_clusters_idx, vi_kk),
                                              (w_clusters_idx, vi_w)):
        smaller_solution = clusters_by_solution[current_key]
        larger_solution = clusters_by_solution[next_key]
        vi_by_count[n_clusters + 1] = variation_of_information(smaller_solution,
                                                               larger_solution)

wards_vi = pd.Series(vi_w)
#wards_vi.to_csv('{0}/wards_vi.csv'.format(out_dir), sep=',')

kmeans_vi = pd.Series(vi_k)
#kmeans_vi.to_csv('{0}/pythonkmean_vi.csv'.format(out_dir), sep=',')

kernelk_vi = pd.Series(vi_kk)
#kernelk_vi.to_csv('{0}/kernelkmean_vi.csv'.format(out_dir), sep=',')

# Hard-coded values for the MATLAB k-means solutions, labelled 3 through 10.
vi_mk = [0.831299855, 0.894586133, 1.064756853, 0.575203598,
         0.854956836, 0.809800235, 0.827846322, 1.277838909]
matlabk_vi = pd.Series(vi_mk, index=list(range(3, 11)))
#matlabk_vi.to_csv('{0}/matlabkmean_vi.csv'.format(out_dir), sep=',')

In [132]:
# Stack the four per-method Series as rows, then transpose so each method becomes a
# column (in list order: kernel k-means, MATLAB k-means, Python k-means, Ward).
variation_info = pd.DataFrame([kernelk_vi, matlabk_vi, kmeans_vi, wards_vi]).transpose()

In [133]:
# Write the combined table as comma-separated text to <out_dir>/variation_of_information.csv.
variation_info.to_csv('{0}/variation_of_information.csv'.format(out_dir), sep=',')