In [88]:
import numpy as np
import pandas as pd
import scipy.io as sio
from scipy.signal import argrelmin

In [None]:
# Location of the MATLAB export that holds the clustering metrics analysed below.
# NOTE(review): this is an absolute, machine-specific path; the notebook only
# runs where this exact file exists.
nat_mat_path = '/Users/Katie/Dropbox/Data/Naturalistic/NewSolution/Naturalistic_11.25.17.mat'
# The loaded object is indexed by MATLAB variable name in the following cells.
nat_mat = sio.loadmat(nat_mat_path)

In [147]:
#read in metrics from .mat files exported from matlab
# VI, avg_sil and HI are transposed on the way in -- presumably from MATLAB row
# vectors to one-row-per-solution columns; TODO confirm against the export.
metrics = {}
metrics['vi'] = nat_mat['VI'].T
metrics['sil'] = nat_mat['avg_sil'].T
metrics['hi'] = nat_mat['HI'].T
cc = nat_mat['cluster_consistency']
# Consistency ratio = second column / first column of the exported table.
# NOTE(review): the message printed by the next cell suggests this is
# (minimum consistently assigned experiments) / (mean) -- confirm column order.
# np.true_divide forces real division even if the export is integer-typed,
# where a plain `/` would floor the ratio under Python 2.
consistency = np.true_divide(cc[:, 1], cc[:, 0])
# -1 lets numpy infer the number of solutions instead of assuming exactly 9.
metrics['cc'] = consistency.reshape(-1, 1)

In [184]:
# Keep only the solutions whose cluster-consistency ratio exceeds 50%.
# np.nonzero returns one index array per axis; the first one holds row indices.
consistent_rows = np.nonzero(metrics['cc'] > 0.5)[0]
good = consistent_rows + 2 #cluster consistency starts at k=2
# Single-argument print(...) emits the same text under Python 2 and Python 3.
print("Cluster consistency metrics indicate that solutions {0} are valid.".format(good))
print("This means that the minimum number of consistently assigned experiments was at least half the mean,\na criterion for stability of cluster assignment.")

Cluster consistency metrics indicate that solutions [2 3 4 5 6 7 8] are valid.
This means that the minimum number of consistently assigned experiments was at least half the mean,
 a criterion for stability of cluster assignment.


In [185]:
# Locate the local minima of each stability curve. argrelmin returns a tuple of
# index arrays (one per axis); element 0 holds the row positions of the minima.
# Both curves are scanned before anything is printed.
vi_minima, hi_minima = [argrelmin(metrics[key])[0] for key in ('vi', 'hi')]

vi_good = vi_minima + 3 #VI starts at k=3
print("Variation of information (VI) indicates that solutions {0} are stable.".format(vi_good))

hi_good = hi_minima + 3 #HI starts at k=3
print("Hierarchy index (HI) indicates that solutions {0} are stable.".format(hi_good))

Variation of information indicates that solutions [6 8] are stable.
Hierarchy index indicates that solutions [4 6 8] are stable.


In [186]:
#find change of silhouette line for every K+1
# ediff1d flattens its input; the float cast guarantees the inf marker below
# can be stored whatever dtype the export had.
sil_slope = np.ediff1d(metrics['sil']).astype(float)

#find value of K+1 with smallest slope
#this indicates that K is "optimal"
#negative silhouette changes indicate poor solutions and should not be considered.
#They are marked with +inf (not a finite stand-in such as 999) so that an
#excluded entry can never be selected as the minimum.
sil_slope[sil_slope < 0] = np.inf

if np.isfinite(sil_slope).any():
    # flatnonzero yields a flat 1-D index array, so sil_best has the same shape
    # as good / vi_good / hi_good and prints as [6] rather than [[6]].
    sil_best = np.flatnonzero(sil_slope == np.min(sil_slope)) + 2 #silhouette change starts at K+1=3, K=2
    print("The smallest positive change in silhouette score is from {0} to the next solution,\nindicating that {0} presents optimal separation of data.".format(sil_best))
else:
    # Every change was negative: silhouette favours no solution. An empty
    # array keeps the later np.intersect1d call well-defined.
    sil_best = np.array([], dtype=int)
    print("No non-negative change in silhouette score was found, so silhouette does not single out a solution.")

The smallest positive change in silhouette score is from [[6]] to the next solution,
indicating that [[6]] presents optimal separation of data.


In [190]:
# Narrow the candidate solutions step by step: first those flagged by both VI
# and HI, then those that also pass cluster consistency, and finally those
# that match the silhouette pick. np.intersect1d returns the sorted, unique
# common values, so an empty result means "no agreement" at that step.
agreement = np.intersect1d(vi_good, hi_good)
if agreement.size:
    print("VI and HI agree on solution(s) {0}.".format(agreement))
else:
    print("VI and HI do not agree on any solutions.")

# From here on `agreement` holds the three-way intersection.
agreement = np.intersect1d(agreement, good)
if agreement.size:
    print("VI, HI, and cluster consistency agree on solution(s) {0}.".format(agreement))
else:
    # Message previously read "VVI" (typo).
    print("VI, HI, and cluster consistency do not agree on any solutions.")

best = np.intersect1d(agreement, sil_best)
if best.size:
    print("The metrics agree that {0} is the best solution.".format(best))
else:
    print("All four metrics do not agree on a solution.")

VI and HI agree on solution(s) [6 8].
VI, HI, and cluster consistency agree on solution(s) [6 8].
The metrics agree that [6] is the best solution.
