In [1]:
#Import scikit-learn dataset library
from sklearn import datasets
import numpy as np

# Load the handwritten-digits dataset that ships with scikit-learn and show
# which feature and target names it exposes.
digits = datasets.load_digits()
print(f'features: {digits.feature_names}' + '\n')
print(f'targets:  {digits.target_names}' + '\n')

# The whole dataset is used for clustering; the targets are kept only so the
# clusters can be compared against the true digits afterwards.
X_train, y_train = digits.data, digits.target
train_num_samples, train_num_features = X_train.shape

features: ['pixel_0_0', 'pixel_0_1', 'pixel_0_2', 'pixel_0_3', 'pixel_0_4', 'pixel_0_5', 'pixel_0_6', 'pixel_0_7', 'pixel_1_0', 'pixel_1_1', 'pixel_1_2', 'pixel_1_3', 'pixel_1_4', 'pixel_1_5', 'pixel_1_6', 'pixel_1_7', 'pixel_2_0', 'pixel_2_1', 'pixel_2_2', 'pixel_2_3', 'pixel_2_4', 'pixel_2_5', 'pixel_2_6', 'pixel_2_7', 'pixel_3_0', 'pixel_3_1', 'pixel_3_2', 'pixel_3_3', 'pixel_3_4', 'pixel_3_5', 'pixel_3_6', 'pixel_3_7', 'pixel_4_0', 'pixel_4_1', 'pixel_4_2', 'pixel_4_3', 'pixel_4_4', 'pixel_4_5', 'pixel_4_6', 'pixel_4_7', 'pixel_5_0', 'pixel_5_1', 'pixel_5_2', 'pixel_5_3', 'pixel_5_4', 'pixel_5_5', 'pixel_5_6', 'pixel_5_7', 'pixel_6_0', 'pixel_6_1', 'pixel_6_2', 'pixel_6_3', 'pixel_6_4', 'pixel_6_5', 'pixel_6_6', 'pixel_6_7', 'pixel_7_0', 'pixel_7_1', 'pixel_7_2', 'pixel_7_3', 'pixel_7_4', 'pixel_7_5', 'pixel_7_6', 'pixel_7_7']

targets:  [0 1 2 3 4 5 6 7 8 9]



In [2]:
def fit(k, iters):
    """Cluster the rows of ``X_train`` into ``k`` groups with k-means.

    The starting means are ``k`` training points chosen by
    ``get_random_means``. Each iteration reassigns every point to its
    nearest mean and then recomputes the means from that assignment.

    Args:
        k: number of clusters.
        iters: number of assign/update iterations to run.

    Returns:
        A tuple ``(labels, cluster_means, loss_arr)``: the cluster index of
        every training point, the final cluster means, and the average loss
        sampled on every 5th iteration (iterations 0, 5, 10, ...).
    """
    # start from k empty clusters so labels() still works when iters == 0
    clusters = [[] for _ in range(k)]
    loss_arr = []

    # seed the means with randomly chosen training points
    cluster_means = [X_train[sample_idx] for sample_idx in get_random_means(k)]

    for iteration in range(iters):
        # assignment step followed by update step
        clusters = assign_clusters(k, cluster_means)
        cluster_means = assign_means(clusters, k)
        # only sample the loss every 5th iteration
        if iteration % 5 == 0:
            loss_arr.append(k_loss(labels(clusters), cluster_means))

    return labels(clusters), cluster_means, loss_arr
    
def assign_means(clusters, k):
    """Recompute each cluster mean from the points currently assigned to it.

    Args:
        clusters: list of clusters, each a list of row indices into
            ``X_train``.
        k: number of clusters (number of rows in the returned array).

    Returns:
        A ``(k, train_num_features)`` array whose row ``i`` is the mean of
        the points in ``clusters[i]``. A cluster with no members is re-seeded
        at a randomly chosen training point instead.
    """
    cluster_means = np.zeros((k, train_num_features))
    for idx, member_indices in enumerate(clusters):
        if len(member_indices) == 0:
            # np.mean over an empty selection yields NaN, and np.argmin then
            # treats that NaN as the minimum, pulling every point into this
            # cluster on the next assignment. Re-seed the empty cluster at a
            # random data point so the optimisation can keep going.
            cluster_means[idx] = X_train[np.random.randint(len(X_train))]
        else:
            cluster_means[idx] = np.mean(X_train[member_indices], axis=0)
    return cluster_means

def get_random_means(k):
    """Pick ``k`` row indices of the training set at random.

    Args:
        k: how many indices to draw.

    Returns:
        An array of ``k`` indices drawn without replacement from
        ``range(train_num_samples)``.
    """
    return np.random.choice(train_num_samples, size=k, replace=False)

def distance(x, means):
    """Compute the Euclidean distance from ``x`` to each entry of ``means``.

    Args:
        x: a single point.
        means: iterable of points to measure against.

    Returns:
        A list with one distance per entry of ``means``, in the same order.
    """
    return [np.sqrt(np.sum((x - centre) ** 2)) for centre in means]

def assign_clusters(k, means):
    """Group every training point under its nearest mean.

    Args:
        k: number of clusters.
        means: sequence of cluster means, one per cluster.

    Returns:
        A list of ``k`` lists; entry ``i`` holds the ``X_train`` row indices
        whose closest mean is ``means[i]``.
    """
    # fresh, empty clusters for this assignment pass
    clusters = [[] for _ in range(k)]

    for sample_idx, sample in enumerate(X_train):
        nearest = np.argmin(distance(sample, means))
        clusters[nearest].append(sample_idx)

    return clusters

def accuracy(predictions,y,k):
    """Cross-tabulate true targets against predicted cluster indices.

    Despite its name, this returns a contingency table rather than a single
    score. The table is built from the sequences passed in, so it works for
    any pair of equally long ``predictions`` and ``y``.

    Args:
        predictions: predicted cluster index for each sample; used as the
            column index of the table.
        y: true target for each sample; used as the row index of the table.
        k: number of rows and columns of the table.

    Returns:
        A ``(k, k)`` float array where entry ``[t, p]`` counts the samples
        whose target is ``t`` and whose predicted cluster is ``p``.

    Raises:
        ValueError: if ``predictions`` and ``y`` have different lengths.
    """
    if len(predictions) != len(y):
        raise ValueError('predictions and y must have the same length')

    count_arr = np.zeros((k, k))
    for target, predicted in zip(y, predictions):
        count_arr[target, predicted] += 1

    return count_arr

def labels(clusters):
    """Flatten a list of clusters into one cluster index per training point.

    Args:
        clusters: list of clusters, each a list of training-point indices.

    Returns:
        A list of length ``train_num_samples`` whose entry ``i`` is the index
        of the cluster containing point ``i``, or ``None`` if the point is in
        no cluster.
    """
    assignment = [None] * train_num_samples
    for cluster_idx, members in enumerate(clusters):
        for sample_idx in members:
            assignment[sample_idx] = cluster_idx
    return assignment

def k_loss(y_pred, means):
    """Average squared distance between each training point and its mean.

    Args:
        y_pred: cluster index assigned to each training point.
        means: cluster means, indexable by cluster index.

    Returns:
        The sum over all training points of the squared Euclidean distance
        to the assigned mean, divided by ``train_num_samples``.
    """
    squared_error_total = 0
    for sample_idx in range(train_num_samples):
        residual = X_train[sample_idx] - means[y_pred[sample_idx]]
        squared_error_total += np.sum(residual ** 2)
    return squared_error_total / train_num_samples

In [3]:
# One cluster per digit class, with a fixed number of k-means iterations.
NUM_CLUSTERS = 10
NUM_ITERS = 35
predictions, means, loss_arr = fit(NUM_CLUSTERS, NUM_ITERS)

# Rows are the true digits, columns are the cluster indices; a row dominated
# by a single column means that digit was mostly grouped into one cluster.
print('Target value number (rows) by prediction number(columns):')
contingency_table = accuracy(predictions, y_train, NUM_CLUSTERS)
print(contingency_table)
print('A large majority of a single prediction number for the target value shows relative \'accuracy\'')

print('\nLoss is shows for every 5th iteration, avg loss per iteration:' + str(loss_arr))

Target value number (rows) by prediction number(columns):
[[  3.   0.   1.   0.   0. 172.   0.   0.   0.   2.]
 [  0.  99.   1.   1.   0.   0.   0.  55.  26.   0.]
 [  2.   8.   0.  14.   3.   0.   0.   2. 148.   0.]
 [ 10.   6.   3. 157.   4.   0.   2.   0.   1.   0.]
 [  0.   2.   0.   0.   3.   1.   7.   3.   0. 165.]
 [ 51.   0. 127.   0.   0.   2.   0.   0.   0.   2.]
 [  0.  10.   0.   0.   0. 170.   0.   0.   0.   1.]
 [  0.   0.   0.   0.  85.   0.  94.   0.   0.   0.]
 [ 48. 103.   6.   2.   5.   1.   0.   5.   4.   0.]
 [141.   0.   3.   6.  10.   0.   1.  19.   0.   0.]]
A large majority of a single prediction number for the target value shows relative 'accuracy'

Loss is shows for every 5th iteration, avg loss per iteration:[810.5591829083663, 706.2526153135008, 688.5275174049453, 688.447784573799, 688.447784573799, 688.447784573799, 688.447784573799]
