forked from nlintz/TensorFlow-Tutorials
-
Notifications
You must be signed in to change notification settings - Fork 0
/
functions.py
101 lines (85 loc) · 3.86 KB
/
functions.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
import tensorflow as tf
import numpy as np
#=================
def test(x,y):
print(x)
def plot_clusters(all_samples, centroids, n_samples_per_cluster):
    """Scatter-plot each cluster in its own colour and overlay its centroid.

    all_samples: 2-D array of points, grouped so rows
        [i*n_samples_per_cluster:(i+1)*n_samples_per_cluster] belong to
        cluster i (this is how create_samples lays them out).
    centroids: sequence of (x, y) centroid positions, one per cluster.
    n_samples_per_cluster: number of consecutive rows per cluster.

    Blocks on plt.show(); returns nothing.
    """
    import matplotlib.pyplot as plt
    # One colour per cluster, evenly spaced along the rainbow colormap.
    colour = plt.cm.rainbow(np.linspace(0,1,len(centroids)))
    markers=["x", "o", "+"]
    for i, centroid in enumerate(centroids):
        # Grab just the samples for the given cluster and plot them with a new colour.
        samples = all_samples[i*n_samples_per_cluster:(i+1)*n_samples_per_cluster]
        plt.scatter(samples[:,0], samples[:,1], c=colour[i])
        # Cycle through the marker list so more than three clusters no longer
        # raises IndexError.
        marker = markers[i % len(markers)]
        # Draw the centroid twice -- a large black marker under a slightly
        # smaller magenta one -- so it stands out against the cluster colour.
        plt.plot(centroid[0], centroid[1], markersize=35, marker=marker, color='k', mew=10)
        plt.plot(centroid[0], centroid[1], markersize=30, marker=marker, color='m', mew=5)
    plt.show()
#==================================
def create_samples(n_clusters, n_samples_per_cluster, n_features, embiggen_factor, seed):
    """Build `n_clusters` Gaussian blobs of samples around random centroids.

    Returns a tuple (centroids, samples) of TensorFlow tensors: `centroids`
    has one row per cluster, `samples` has
    n_clusters * n_samples_per_cluster rows, grouped by cluster.
    """
    np.random.seed(seed)
    slices = []
    centroids = []
    for cluster_idx in range(n_clusters):
        # Gaussian cloud for this cluster, centred at the origin for now.
        samples = tf.random_normal((n_samples_per_cluster, n_features),
                       mean=0.0, stddev=5.0, dtype=tf.float32, seed=seed, name="cluster_{}".format(cluster_idx))
        # Random centroid with each feature drawn from
        # [-embiggen_factor/2, embiggen_factor/2); float32 to match the samples.
        offset = np.random.rand(1, n_features).astype(np.float32)
        current_centroid = (offset * embiggen_factor) - (embiggen_factor / 2)
        centroids.append(current_centroid)
        # Shift the cloud onto its centroid before collecting it.
        slices.append(samples + current_centroid)
    # Stack per-cluster pieces into single tensors (old TF API: axis comes first).
    samples = tf.concat(0, slices, name='samples')
    centroids = tf.concat(0, centroids, name='centroids')
    return centroids, samples
#============
def choose_random_centroids(samples, n_clusters, *args):
    """Pick `n_clusters` distinct random rows of `samples` as initial centroids.

    An optional extra positional argument is used as the shuffle seed
    (defaults to 0); the *args form is kept for backward compatibility
    with existing callers.
    Returns a tensor of shape (n_clusters, n_features).
    """
    seed = args[0] if args else 0
    # Step 0: Initialisation: select `n_clusters` random points.
    n_samples = tf.shape(samples)[0]
    # Shuffle all row indices, then keep the first n_clusters of them.
    random_indices = tf.random_shuffle(tf.range(0, n_samples), seed=seed)
    centroid_indices = tf.slice(random_indices, [0], [n_clusters])
    initial_centroids = tf.gather(samples, centroid_indices)
    return initial_centroids
#===================
def assign_to_nearest(samples, centroids):
    """Return, for every sample, the index of its nearest centroid.

    Uses broadcasting: samples become (1, n_samples, d) and centroids
    (k, 1, d), so the squared-Euclidean distance matrix is (k, n_samples);
    argmin over axis 0 then picks the closest centroid per sample.
    (Adapted from
    http://esciencegroup.com/2016/01/05/an-encounter-with-googles-tensorflow/)
    """
    expanded_vectors = tf.expand_dims(samples, 0)
    expanded_centroids = tf.expand_dims(centroids, 1)
    diffs = tf.sub(expanded_vectors, expanded_centroids)
    distances = tf.reduce_sum(tf.square(diffs), 2)
    return tf.argmin(distances, 0)
#===
def update_centroids(samples, nearest_indices, n_clusters):
    """Recompute each centroid as the mean of the samples assigned to it."""
    # Split the samples into one group per assigned centroid index.
    groups = tf.dynamic_partition(samples, tf.to_int32(nearest_indices), n_clusters)
    # Mean of each group, re-expanded to a single row so they can be stacked.
    means = [tf.expand_dims(tf.reduce_mean(group, 0), 0) for group in groups]
    # Old TF API: concat takes the axis as its first argument.
    return tf.concat(0, means)
"""
This code takes the nearest indices for each sample, and grabs those out as separate groups
using tf.dynamic_partition.
From here, we use tf.reduce_mean on a single group to find the average of that group,
forming its new centroid.
From here, we just tf.concat them together to form our new centroids.
"""