# SC19B081


K-means is sensitive to the random choice of initial cluster centers. To
improve your odds of getting a good clustering, implement a wrapper function
(restarts.ipynb) that performs R random restarts and returns the clustering
with the lowest SSD error.

Inputs: same as for my_kmeans.ipynb plus
- a scalar R denoting how many random restarts to perform.

Outputs: same as for my_kmeans.ipynb, but
- ssd is the lowest SSD across all random restarts.

In [1]:
import numpy as np
def pdist2(ref,samples):
    """
    Compute the Euclidean distance between every reference point and every
    sample.

    Inputs:
    - ref:  reference matrix of size KxD
    - samples: sample matrix of size NxD

    Returns:
    - dist: float matrix of size K,N where dist[i,j] is the Euclidean
            distance between ref[i,:] and samples[j,:]

    Raises:
    - AssertionError if ref and samples have a different number of columns.
    """
    # Work in floating point: subtracting unsigned-integer rows (e.g. uint8
    # pixel features) would otherwise wrap around and corrupt the distances.
    ref = np.asarray(ref, dtype=float)
    samples = np.asarray(samples, dtype=float)

    K, D1 = ref.shape
    N, D2 = samples.shape
    assert D1==D2, 'Dimensions of reference and samples dont match'

    dist = np.zeros([K, N])
    # One vectorised pass over all N samples per reference point; this keeps
    # the temporary at NxD instead of materialising a KxNxD difference cube.
    for i in range(K):
        dist[i, :] = np.linalg.norm(samples - ref[i, :], axis=1)

    return dist
def k_means(A, K, iters):
    """
    Cluster the rows of A into K groups with k-means, starting from randomly
    drawn centers.

    Inputs:
    - A: matrix A of size NxD, where N is number of samples and D is the dimensionality of Feature representation
    - K: number of clusters
    - iters: number of iterations (each one is an assignment step followed by
             a center update)

    Returns:
    - cluster_id: length-N integer vector of class labels, class labels : 0 to K-1
    - cluster_center: KxD matrix containing center of each cluster
    - ssd: sum of squared distances between every sample and the center of
           the cluster it is assigned to
    """
    N, D = A.shape
    A_max = np.max(A, axis=0)
    A_min = np.min(A, axis=0)
    # Initial centers are drawn uniformly inside the bounding box of the data.
    cluster_center = A_min + (A_max - A_min) * np.random.rand(K, D)

    cluster_id = None
    for _ in range(iters):
        # Assignment step: each sample goes to its nearest center.
        cluster_id = np.argmin(pdist2(cluster_center, A), axis=0)

        # Update step: move each center to the mean of its members.
        for k in range(K):
            members = A[cluster_id == k]
            # A cluster that attracted no samples keeps its previous center;
            # averaging zero rows would yield NaN and corrupt every later
            # assignment step.
            if len(members) > 0:
                cluster_center[k, :] = np.mean(members, axis=0)

    if cluster_id is None:
        # iters == 0: still return a valid labelling for the initial centers.
        cluster_id = np.argmin(pdist2(cluster_center, A), axis=0)

    # Only the distance from a sample to its OWN center contributes to the
    # error; summing over all K centers would penalise well-separated clusters.
    residual = A - cluster_center[cluster_id]
    ssd = np.sum(residual.astype(float) ** 2)

    return cluster_id,cluster_center,ssd

In [2]:
def wrapper_kmeans(A,K,iters,R):
    """
    Run k_means R times and return the run with the lowest SSD.

    Inputs:
    - A: matrix of size NxD, passed unchanged to k_means
    - K: number of clusters, passed unchanged to k_means
    - iters: number of iterations per run, passed unchanged to k_means
    - R: number of random restarts to perform, must be at least 1

    Returns:
    - cluster_id: labels from the best restart
    - cluster_center: cluster centers from the best restart
    - ssd: the lowest SSD across all random restarts

    Raises:
    - ValueError if R is smaller than 1 (there would be no run to return).
    """
    if R < 1:
        raise ValueError('R must be at least 1')

    best = None
    for _ in range(R):
        candidate = k_means(A, K, iters)
        # The first run is always accepted so a result exists even when no
        # later comparison succeeds. A NaN score compares False against
        # everything, so it is replaced explicitly by any later run.
        if best is None or np.isnan(best[2]) or candidate[2] < best[2]:
            best = candidate

    cluster_id_best, cluster_center_best, ssd_best = best
    return cluster_id_best,cluster_center_best,ssd_best