In [12]:
import Cluster2
import random, pylab
import numpy as np

$$variability(c) = \sum_{e \in c}(distance(mean(c),e))^2$$
where $c$ is a cluster and $e$ ranges over the points (examples) in that cluster. Because variability is a sum rather than an average, it penalizes big, highly diverse clusters more than small, highly diverse clusters.
$$dissimilarity(C) = \sum_{c \in C}variability(c)$$
where $C$ is a set of clusters.

Single linkage: the distance between two clusters is the shortest distance between any pair of points, one taken from each cluster. \
Complete linkage: the distance between two clusters is the greatest distance between any pair of points, one taken from each cluster.

In [13]:
# Sanity check of the distance helper: the two vectors differ by 1 in the last
# coordinate only.
# NOTE(review): the third argument (2) is presumably the Minkowski order p
# (p = 2 would be Euclidean distance) -- confirm against Cluster2.
v1= np.array([1,2,4])
v2= np.array([1,2,3])
Cluster2.minkowskiDist(v1,v2,2)

1.0

In [14]:
class Patient(Cluster2.Example):
    """A Cluster2.Example subclass that adds no attributes or methods of its own."""
    pass

In [15]:
def scaleAttrs(vals):
    """Z-scale a 1-D sequence of numbers: z = (x - mean) / sd.

    vals: list or array of numeric values (one attribute across all examples).
    Returns a new float array of the same length.  For non-constant input the
    result has mean 0 and population standard deviation 1 (np.std default).

    Edge cases:
      * empty input   -> an empty float array is returned;
      * constant input (sd == 0) -> an array of zeros is returned instead of
        dividing by zero and producing NaN values.
    """
    vals = np.array(vals, dtype=float)
    if vals.size == 0:
        return vals
    centered = vals - vals.mean()
    sd = np.std(vals)
    if sd == 0:
        # Every value equals the mean, so the centred values are already the
        # only sensible z-scores (all zero); dividing would give 0/0 = NaN.
        return centered
    return centered/sd #z = (x-mu)/sd

def getData(toScale = False):
    """Read 'cardiacData.txt' and build one Cluster2.Example per record.

    Each non-blank line of the file must hold at least five comma-separated
    integers.  Going by the variable names, the columns are: heart rate,
    ST elevation, age, previous ACS, and the class label (the predicted
    variable).

    toScale: when True, each of the four attribute columns is z-scaled with
             scaleAttrs before the examples are built; labels are never scaled.

    Returns a list of Cluster2.Example objects named 'P0', 'P1', ... whose
    feature vector is ordered [heart rate, previous ACS, ST elevation, age].

    Raises ValueError if a field is not an integer, IndexError if a line has
    fewer than five fields, and OSError if the file cannot be opened.
    """
    #read in data
    hrList, stElevList, ageList, prevACSList, classList = [],[],[],[],[]
    # 'with' guarantees the file is closed even if a line fails to parse
    with open('cardiacData.txt', 'r') as cardiacData:
        for line in cardiacData:
            if not line.strip():
                continue # tolerate blank / trailing empty lines
            fields = line.split(',')
            hrList.append(int(fields[0]))
            stElevList.append(int(fields[1]))
            ageList.append(int(fields[2]))
            prevACSList.append(int(fields[3]))
            classList.append(int(fields[4])) # Predicted variable
    if toScale: # Z-scale Attrs
        hrList = scaleAttrs(hrList)
        stElevList = scaleAttrs(stElevList)
        ageList = scaleAttrs(ageList)
        prevACSList = scaleAttrs(prevACSList)
    #Build points
    # NOTE: the feature order (hr, prevACS, stElev, age) intentionally differs
    # from the column order in the file; it is kept as-is for compatibility.
    points = []
    for i in range(len(hrList)):
        features = pylab.array([hrList[i], prevACSList[i],
                                stElevList[i], ageList[i]])
        points.append(Cluster2.Example('P' + str(i), features, classList[i]))
    return points
    

In [16]:
def kmeans(examples, k, verbose = False):
    """Cluster examples into k groups with the k-means procedure.

    examples: sequence of objects offering distance(other)
    k:        number of clusters to build
    verbose:  when True, print every cluster after each iteration

    Returns the list of k Cluster2.Cluster objects once an iteration leaves
    every centroid unchanged.  Raises ValueError('Empty Cluster') if some
    cluster receives no examples during an iteration.
    """
    # Seed the clustering: each of k randomly drawn examples starts its own
    # single-member cluster (and is therefore that cluster's centroid).
    clusters = [Cluster2.Cluster([seedExample])
                for seedExample in random.sample(examples, k)]

    iteration = 0
    while True:
        iteration += 1

        # One fresh, independent bucket of examples per cluster
        buckets = [[] for _ in range(k)]

        # Drop every example into the bucket of its nearest centroid;
        # on ties the lowest-numbered cluster wins.
        for ex in examples:
            nearest = 0
            nearestDist = ex.distance(clusters[0].getCentroid())
            for idx in range(1, k):
                candidateDist = ex.distance(clusters[idx].getCentroid())
                if candidateDist < nearestDist:
                    nearest, nearestDist = idx, candidateDist
            buckets[nearest].append(ex)

        # Refuse to continue with an empty cluster (checked before any update)
        if any(len(bucket) == 0 for bucket in buckets):
            raise ValueError('Empty Cluster')

        # Update every cluster (a list, not a generator, so none is skipped)
        # and remember whether any centroid moved.
        shifts = [clusters[idx].update(buckets[idx]) for idx in range(k)]
        centroidMoved = any(shift > 0.0 for shift in shifts)

        if verbose:
            print('Iteration #' + str(iteration))
            for cluster in clusters:
                print(cluster)
            print('') #add blank line

        if not centroidMoved:
            return clusters

In [17]:
def trykmeans(examples, numClusters, numTrials, verbose = False,
              maxFailures = 100):
    """Calls kmeans numTrials times and returns the result with the
          lowest dissimilarity

    A kmeans run that raises ValueError (e.g. 'Empty Cluster') does not count
    as a trial and is retried; this applies to the first trial as well as the
    later ones.  At least one successful run is always performed, even when
    numTrials < 1.

    maxFailures: number of consecutive failed runs after which the last
                 ValueError is re-raised instead of retrying forever.
    """
    best = None
    minDissimilarity = None
    trial = 0
    consecutiveFailures = 0
    while best is None or trial < numTrials:
        try:
            clusters = kmeans(examples, numClusters, verbose) # Come up with new set of clusters
        except ValueError:
            consecutiveFailures += 1
            if consecutiveFailures >= maxFailures:
                raise # Give up: kmeans apparently cannot succeed on this input
            continue #If failed, try again
        consecutiveFailures = 0
        trial += 1
        currDissimilarity = Cluster2.dissimilarity(clusters)
        # Keep the clustering with the lowest dissimilarity seen so far
        if best is None or currDissimilarity < minDissimilarity:
            best = clusters
            minDissimilarity = currDissimilarity
    return best

def printClustering(clustering):
    """Assumes: clustering is a sequence of clusters, each offering members()
       whose items offer getLabel()
       Prints the size and fraction of positive (label == 1) cases per cluster
       Returns an array holding that fraction for each cluster, in order"""
    posFracs = []
    for cluster in clustering:
        # Gather the labels once, then derive both counts from them
        labels = [member.getLabel() for member in cluster.members()]
        size = len(labels)
        positives = sum(1 for label in labels if label == 1)
        fracPos = positives/size
        posFracs.append(fracPos)
        print('Cluster of size', size, 'with fraction of positives =',
              round(fracPos, 4))
    return pylab.array(posFracs)

def testClustering(patients, numClusters, seed = 0, numTrials = 5):
    """Seed the random module, pick the best of numTrials k-means runs on
       patients, print a summary of each cluster and return the
       per-cluster fractions of positive cases."""
    random.seed(seed)
    return printClustering(trykmeans(patients, numClusters, numTrials))

In [18]:
# Load the data set with z-scaled attributes (toScale=True)
patients = getData(True)

In [19]:
# Smoke run: a single k-means pass with k = 2. No random seed is set in this
# cell, so the initial centroids depend on the current state of `random`.
kmeans(patients, 2)

[<Cluster2.Cluster at 0x17d255d5c00>, <Cluster2.Cluster at 0x17d255f1120>]

In [20]:
# Label of the first record (re-reads the file, unscaled)
getData()[0].getLabel()

0

In [21]:
# Unscaled feature vector of the first record, ordered as built in getData:
# [heart rate, previous ACS, ST elevation, age]
getData()[0].getFeatures()

array([84,  0,  0, 55])

In [22]:
# Cluster the z-scaled patients with k = 2 only; testClustering uses its
# defaults (seed = 0, numTrials = 5), which seeds `random` before clustering.
patients = getData(True)
for k in (2,):
    print('\n     Test k-means (k = ' + str(k) + ')')
    posFracs = testClustering(patients, k)


     Test k-means (k = 2)
Cluster of size 224 with fraction of positives = 0.2902
Cluster of size 26 with fraction of positives = 0.6923


In [23]:
# Tally the patients whose label is 1 across the whole data set
numPos = sum(1 for patient in patients if patient.getLabel() == 1)
print('Total number of positive patients =', numPos)

Total number of positive patients = 83


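With k = 2, the high-risk cluster (the second one, about 69% positive) holds only 26 patients, roughly 18 of them positive. It therefore misses most of the 83 positive patients: the remaining ~65 positives are mixed into the large, mostly negative cluster.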

In [24]:
# Repeat the experiment for k = 2, 4 and 6 on the z-scaled patients; each
# call re-seeds `random` with testClustering's default seed (0).
patients = getData(True)
for k in (2,4,6):
    print('\n     Test k-means (k = ' + str(k) + ')')
    posFracs = testClustering(patients, k)


     Test k-means (k = 2)
Cluster of size 224 with fraction of positives = 0.2902
Cluster of size 26 with fraction of positives = 0.6923

     Test k-means (k = 4)
Cluster of size 62 with fraction of positives = 0.0645
Cluster of size 86 with fraction of positives = 0.0814
Cluster of size 26 with fraction of positives = 0.6923
Cluster of size 76 with fraction of positives = 0.7105

     Test k-means (k = 6)
Cluster of size 50 with fraction of positives = 0.04
Cluster of size 54 with fraction of positives = 0.0926
Cluster of size 43 with fraction of positives = 0.6512
Cluster of size 26 with fraction of positives = 0.6923
Cluster of size 33 with fraction of positives = 0.7879
Cluster of size 44 with fraction of positives = 0.0909
