In [1]:
#Import scikit-learn dataset library
from sklearn import datasets
import numpy as np

#Load dataset
digits = datasets.load_digits()
print('features: ' + str(digits.feature_names) + '\n')
print('targets:  ' + str(digits.target_names) + '\n')
X_train = digits.data
y_train = digits.target
train_num_samples, train_num_features = X_train.shape

features: ['pixel_0_0', 'pixel_0_1', 'pixel_0_2', 'pixel_0_3', 'pixel_0_4', 'pixel_0_5', 'pixel_0_6', 'pixel_0_7', 'pixel_1_0', 'pixel_1_1', 'pixel_1_2', 'pixel_1_3', 'pixel_1_4', 'pixel_1_5', 'pixel_1_6', 'pixel_1_7', 'pixel_2_0', 'pixel_2_1', 'pixel_2_2', 'pixel_2_3', 'pixel_2_4', 'pixel_2_5', 'pixel_2_6', 'pixel_2_7', 'pixel_3_0', 'pixel_3_1', 'pixel_3_2', 'pixel_3_3', 'pixel_3_4', 'pixel_3_5', 'pixel_3_6', 'pixel_3_7', 'pixel_4_0', 'pixel_4_1', 'pixel_4_2', 'pixel_4_3', 'pixel_4_4', 'pixel_4_5', 'pixel_4_6', 'pixel_4_7', 'pixel_5_0', 'pixel_5_1', 'pixel_5_2', 'pixel_5_3', 'pixel_5_4', 'pixel_5_5', 'pixel_5_6', 'pixel_5_7', 'pixel_6_0', 'pixel_6_1', 'pixel_6_2', 'pixel_6_3', 'pixel_6_4', 'pixel_6_5', 'pixel_6_6', 'pixel_6_7', 'pixel_7_0', 'pixel_7_1', 'pixel_7_2', 'pixel_7_3', 'pixel_7_4', 'pixel_7_5', 'pixel_7_6', 'pixel_7_7']

targets:  [0 1 2 3 4 5 6 7 8 9]



In [2]:
def fit(k, iters):
    #init
    clusters = []
    loss_arr = []
    for _ in range(k):
        clusters.append([])
    #pick a random k points to be the means
    cluster_means_idx = get_random_means(k)
    cluster_means = []
    for _ in cluster_means_idx:
        cluster_means.append(X_train[_])

    #run optimization n times
    for _ in range(iters):
        #assign points to clusters
        clusters = assign_clusters(k,cluster_means) #good
        #set the new means
        cluster_means = assign_means(clusters, k)
        #print(cluster_means)
        if (_%2 == 0): #every 2 iterations calculate the loss
            loss_arr.append(k_loss(labels(clusters), cluster_means))

    return labels(clusters), cluster_means, loss_arr
    

def assign_means(clusters, k):
    #reset means
    cluster_means = np.zeros((k,train_num_features))
    #for each cluster
    for idx, c in enumerate(clusters):
        distances = []
        for point in c: #get distances between every point in cluster
            distances.append(distance(point,c))
        d_sums = []
        for d in range(len(distances)): #sum all distances per point
            d_sum = sum(distances[d])
            d_sums.append(d_sum)
        cluster_means[idx]=X_train[c[np.argmin(d_sums)]] #smalles distance is the new mean point
    return cluster_means

def get_random_means(k):
    rand_means = []
    rand_means = np.random.choice(train_num_samples, k , False)
    return rand_means

def distance(x, means):
    dist = []
    for m in means:
        dist.append(np.sqrt(np.sum((x-m)**2)))
    return dist

def assign_clusters(k, means):
    #clear clusters
    clusters = []
    for _ in range(k):
        clusters.append([])

    #for each point assign closest mean
    for idx, point in enumerate(X_train):
        distances = [distance(point, means)]
        closest_mean_idx = np.argmin(distances)
        clusters[closest_mean_idx].append(idx)
    return (clusters)

def accuracy(predictions,y,k):
    counter = 0
    count_arr = np.zeros((k,k))
    for _ in range(train_num_samples):
        count_arr[y[_],predictions[_]] +=1

    return count_arr

def labels(clusters):
    labels = [None] * train_num_samples
    for c_idx, c in enumerate(clusters):
        for point_idx in c:
            labels[point_idx] = c_idx
    return labels

def k_loss(y_pred, means):
    loss = 0
    for idx in range(train_num_samples):
        mean = means[y_pred[idx]]
        loss+= np.sum((X_train[idx] - mean)**2)
    return(loss/train_num_samples)

In [3]:
predictions, means, loss_arr = fit(10, 9)

print('Target value number (rows) by prediction number(columns):')
print(accuracy(predictions,y_train,10))
print('A large majority of a single prediction number for the target value shows relative \'accuracy\'')

k_loss(predictions, means)
print('\nLoss is shows for every 2nd iteration, avg loss per iteration: \n' + str(loss_arr))

Target value number (rows) by prediction number(columns):
[[  0.   0.   0.   0.   1.   0.   0.   0.   0. 177.]
 [ 40.   0.  12.  49.  15.   5.   0.  58.   3.   0.]
 [  1.   2.   0.   0.   0.  64.  84.  18.   6.   2.]
 [  0. 100.   4.  12.   0.   1.   2.  60.   4.   0.]
 [  0.   0.   1.   2. 161.   0.   0.   2.   5.  10.]
 [  0.   9. 109.  54.   6.   0.   0.   1.   0.   3.]
 [ 21.   9.   0.   0.  89.   2.   0.   0.   0.  60.]
 [  0.   0.   3.   0.   0.   2.   0.  89.  85.   0.]
 [  9.  19.  31.  45.   5.  17.   2.  27.   4.  15.]
 [  0.  56.   8.  98.   0.   0.   1.   4.  12.   1.]]
A large majority of a single prediction number for the target value shows relative 'accuracy'

Loss is shows for every 2nd iteration, avg loss per iteration: 
[1676.68224819143, 1542.6332776850306, 1349.8258208124653, 1671.5959933222036, 1407.561491374513]
