In [1]:
import numpy as np
import pandas as pd
import time

In [2]:
class KMeans(object):
    """K-Means clustering algorithm"""
    def __init__(self, k):
        self.k = k

    def cluster(self, X):
        """Perform KMeans clustering to find the centers of clusters"""
        # Initialization
        random_indexes = np.random.choice(len(X), self.k)
        centers = np.array([X[ind] for ind in random_indexes])
        clusters = [0] * (len(X))

        conti = False
        while True:
            # Find optimal partition of the data points given the centers
            for ix, x in enumerate(X):
                best = float("inf")

                for ind, center in enumerate(centers):
                    dist = np.sum((x - center)**2)
                    if dist < best:
                        best = dist
                        clus = ind

                if clus != clusters[ix]:
                    clusters[ix] = clus
                    conti = True

            # Re-compute the optimal centers given the partition
            add = {}
            for k in range(self.k):
                add[k] = np.zeros(len(X[0]))

            num = [0] * self.k
            for ix, ind in enumerate(clusters):
                add[ind] += X[ix]
                num[ind] += 1

            centers = np.array([add[j] / num[j] for j in range(self.k)])
            # stop if the partition hasn't changed
            if conti:
                conti = False
            else:
                break

        self.centers = centers
        self.clusters = clusters
        self.data = X

        return centers, clusters

    def square_error(self, Print=False):
        """Compute average square error to centers"""
        Error = 0
        for i in range(len(self.data)):
            Error += np.sum((self.data[i] - self.centers[self.clusters[i]])**2)

        Error = Error / len(self.data)
        if Print:
            print("Average square error to centers: %.4f" % Error)

        return Error

In [3]:
Data = pd.read_csv('Data/hw4_kmeans_train.dat', header=None, delim_whitespace=True)
Data.head()

Unnamed: 0,0,1,2,3,4,5,6,7,8
0,0.8105,-0.35,0.4769,0.4541,-0.9829,0.5252,0.3838,-0.3408,-0.4824
1,-0.6273,-0.2097,0.9404,0.1143,0.3487,-0.5206,0.0061,0.5024,-0.6687
2,0.1624,-0.1173,0.426,-0.3607,-0.6632,0.4431,-0.8355,0.7206,-0.8977
3,-1.0,0.7758,-0.267,-0.888,-0.1099,-0.9183,-0.4086,0.8962,0.5841
4,0.8464,0.1762,0.2729,0.2724,0.8155,0.6096,-0.2844,0.98,0.3302


In [4]:
print('Start clustering...')
start = time.clock()
Ein = []

for i in range(500):
    K = KMeans(10)
    centers, clusters = K.cluster(Data.values)
    Ein.append(K.square_error())

print('\tAveraged Ein: %.4f, using %.2f seconds.' %
      (np.mean(Ein), time.clock() - start))
print('\nCenters:\n', centers)

Start clustering...
	Averaged Ein: 1.6912, using 21.16 seconds.

Centers:
 [[ 0.08195714  0.23433571  0.19116429  0.54655714 -0.62888571 -0.28687857
  -0.36791429 -0.19740714 -0.34687143]
 [-0.4973913  -0.10111304 -0.37882174 -0.33633913  0.2865087  -0.34223478
  -0.30155217 -0.03834348 -0.09670435]
 [-0.61415556  0.60063333 -0.52266667  0.19634444 -0.3413     -0.14001111
   0.41495556 -0.0533      0.5704    ]
 [ 0.46733158 -0.17684211 -0.23305789  0.02689474  0.17701053  0.40945263
   0.18751579  0.17791579  0.21179474]
 [ 0.33022     0.3371     -0.19694    -0.29791    -0.63827     0.46504
  -0.37553     0.49669    -0.51448   ]
 [        nan         nan         nan         nan         nan         nan
          nan         nan         nan]
 [-0.19435    -0.548925    0.371125    0.683275    0.312675   -0.13255
   0.703025   -0.260975    0.721875  ]
 [        nan         nan         nan         nan         nan         nan
          nan         nan         nan]
 [ 0.04146667  0.11961333  