# Machine Learning 1 - Exercise Sheet 1

## Viktor Vironski (4330455), Andy Disser (5984875), Ngoc Anh Trung Pham (7176267)


### Exercise 1 - K-means Clustering

### 1.

In [284]:
from sklearn import datasets
import numpy as np

# Always print floating point numbers in fixed-point notation instead of
# scientific (exponential) notation, so printed centroids are easy to read.
np.set_printoptions(suppress=True)

def kmeans(data, ncluster, max_iter=99):
    """Cluster ``data`` into ``ncluster`` groups with K-means (Euclidean distance).

    Algorithm:
      1. pick ``ncluster`` initial centroids at random from the samples
      2. expectation: assign every sample to its closest centroid
      3. maximization: move every centroid to the mean of its samples
      4. repeat 2-3 until the centroids stop moving or ``max_iter`` is reached

    Parameters
    ----------
    data : array-like of shape (n_samples, n_features)
        The samples to cluster.
    ncluster : int
        Number of clusters; must be at least 1.
    max_iter : int, optional
        Upper bound on the number of update steps.  The default of 99 matches
        the fixed iteration count this function used before early stopping
        was added.

    Returns
    -------
    tuple of (np.ndarray, np.ndarray)
        ``(centroids, labels)`` where ``centroids`` has shape
        (ncluster, n_features) and ``labels`` has shape (n_samples,) and holds
        the index of the closest centroid for every sample.

    Raises
    ------
    ValueError
        If ``ncluster`` is smaller than 1.
    """
    if ncluster < 1:
        raise ValueError("ncluster must be at least 1")

    data = np.asarray(data)

    # (n, 1, d) view: subtracting the (k, d) centroids broadcasts to (n, k, d),
    # so no explicitly tiled copy of the centroids is needed.
    points = data[:, np.newaxis, :]

    centroids = choose_random_centroids(data, ncluster)
    distances = np.linalg.norm(points - centroids, axis=2)

    for _ in range(max_iter):
        # The helper may overwrite the distance matrix it is given; that is
        # harmless here because the distances are recomputed right below.
        updated = calculate_new_centroids(points, distances)

        # A cluster without members has no mean (NaN); keep its previous
        # position instead of letting NaN spread through every distance.
        updated = np.where(np.isnan(updated), centroids, updated)

        distances = np.linalg.norm(points - updated, axis=2)

        converged = np.array_equal(updated, centroids)
        centroids = updated
        if converged:
            # Centroids did not move, so further iterations change nothing.
            break

    # argmin yields exactly one label per sample, even when distances tie.
    labels = np.argmin(distances, axis=1)

    return centroids, labels



def choose_random_centroids(data, k):
    """Pick ``k`` rows of ``data`` at random to serve as initial centroids.

    Every sample (including the last one) is eligible.  Rows are drawn with
    NumPy's global random state, so ``np.random.seed`` makes the choice
    reproducible.  Distinct row indices are used whenever ``k`` does not
    exceed the number of samples, which avoids starting with the same row
    twice; rows are only repeated when more centroids than samples are
    requested.

    Parameters
    ----------
    data : array-like of shape (n_samples, n_features)
        The samples to choose from.
    k : int
        Number of centroids to pick.

    Returns
    -------
    np.ndarray
        A new array of shape (k, n_features) holding the chosen rows.

    Raises
    ------
    ValueError
        Raised by ``np.random.choice`` if ``data`` has no samples while
        ``k`` is positive.
    """
    data = np.asarray(data)
    number_of_samples = data.shape[0]

    # Sampling without replacement is impossible when k > n_samples, so only
    # then fall back to allowing repeated rows.
    allow_repeats = k > number_of_samples
    chosen_rows = np.random.choice(number_of_samples, size=k, replace=allow_repeats)

    # Fancy indexing returns a copy, so the centroids never alias ``data``.
    return data[chosen_rows]



def calculate_new_centroids(data, clusters):
    """Return the mean of the samples assigned to each cluster.

    Parameters
    ----------
    data : np.ndarray
        Samples shaped so that they broadcast against an
        (n_samples, n_clusters, 1) membership array; ``kmeans`` passes an
        (n_samples, 1, n_features) array.
    clusters : np.ndarray of shape (n_samples, n_clusters)
        Distance from every sample to every centroid.  It is not modified.

    Returns
    -------
    np.ndarray of shape (n_clusters, n_features)
        The new centroid of every cluster.  A cluster without any members
        has no mean and is returned as a row of NaN.
    """
    # Hand the helper its own copy: it may overwrite its argument, and the
    # caller's distance matrix should stay intact.
    indicator_matrix = calculate_indicator_matrix(np.array(clusters))

    # Zero out every sample that does not belong to a cluster, then sum the
    # remaining samples per cluster.
    membership = indicator_matrix[:, :, np.newaxis]
    feature_sums = np.sum(data * membership, axis=0)

    # Number of members per cluster.  Counting memberships (rather than
    # non-zero feature values) keeps the mean correct for samples that
    # contain features equal to 0.
    member_counts = np.sum(indicator_matrix, axis=0)

    # 0 / 0 for an empty cluster deliberately yields NaN without a warning.
    with np.errstate(divide="ignore", invalid="ignore"):
        new_centroids = feature_sums / member_counts[:, np.newaxis]

    return new_centroids


    
def calculate_indicator_matrix(matrix):
    """Turn a distance matrix into a 0-1 cluster membership matrix.

    Every row gets a single 1 in the column holding its smallest value and 0
    everywhere else.  When several columns tie for the minimum, the first
    one wins, so a sample is never assigned to more than one cluster.  NaN
    entries are never selected unless the whole row is NaN, in which case
    column 0 is used.

    Parameters
    ----------
    matrix : array-like of shape (n_samples, n_clusters)
        Distances from every sample to every centroid.  It is not modified.

    Returns
    -------
    np.ndarray
        A new array with the same shape and dtype as ``matrix`` containing
        exactly one 1 per row.
    """
    matrix = np.asarray(matrix)

    # Treat NaN as "infinitely far" so argmin cannot pick it as the minimum.
    comparable = np.where(np.isnan(matrix), np.inf, matrix)
    closest = np.argmin(comparable, axis=1)

    indicator = np.zeros_like(matrix)
    indicator[np.arange(matrix.shape[0]), closest] = 1

    return indicator
    


# Load the Iris feature matrix from scikit-learn's bundled datasets as a
# NumPy array.
iris = np.array(datasets.load_iris()["data"])

# Cluster the samples into five groups; as the last expression of the cell,
# the (centroids, labels) result is displayed below.
kmeans(iris, 5)
    
    

(array([[6.9125    , 3.1       , 5.846875  , 2.13125   ],
        [5.24285714, 3.66785714, 1.5       , 0.28214286],
        [5.52962963, 2.62222222, 3.94074074, 1.21851852],
        [6.23658537, 2.85853659, 4.80731707, 1.62195122],
        [4.70454545, 3.12272727, 1.41363636, 0.2       ]]),
 array([1, 4, 4, 4, 1, 1, 4, 1, 4, 4, 1, 4, 4, 4, 1, 1, 1, 1, 1, 1, 1, 1,
        4, 1, 4, 4, 1, 1, 1, 4, 4, 1, 1, 1, 4, 4, 1, 1, 4, 1, 1, 4, 4, 1,
        1, 4, 1, 4, 1, 4, 3, 3, 3, 2, 3, 2, 3, 2, 3, 2, 2, 2, 2, 3, 2, 3,
        3, 2, 3, 2, 3, 2, 3, 3, 3, 3, 3, 3, 3, 2, 2, 2, 2, 3, 2, 3, 3, 3,
        2, 2, 2, 3, 2, 2, 2, 2, 2, 3, 2, 2, 0, 3, 0, 0, 0, 0, 2, 0, 0, 0,
        3, 3, 0, 3, 3, 0, 0, 0, 0, 3, 0, 3, 0, 3, 0, 0, 3, 3, 0, 0, 0, 0,
        0, 3, 3, 0, 0, 0, 3, 0, 0, 0, 3, 0, 0, 0, 3, 3, 0, 3], dtype=int64))

### 2.