# Natural Computing - Assignment 2 - Swarm Intelligence
## Exercise 3 - PSO
#### Submission by group 25 (Chihab Amghane, Max Driessen, Jordy Naus)

This file contains our solution to exercise 3 of the "Swarm Intelligence" assignment of the Natural Computing course.

### Imports

In [1]:
from sklearn import datasets
import numpy as np
import random
import pdb
import matplotlib.pyplot as plt 
import itertools
from mpl_toolkits.mplot3d import Axes3D 

### Utility functions

#### Euclidean distance

In [2]:
def euclidean_distance(point_one, point_two):
    # Compute Euclidean distance between two points (dimensions need to match)
    return np.sqrt(np.sum((point_one-point_two)**2))

#### Quantization error

In [3]:
def quantization_error(centroids, datapoints):
    # Retrieve number of clusters
    n_clusters = len(centroids)
    
    # Initialize dictionary for cluster distance arrays
    clusters = {i:[] for i in range(n_clusters)}

    # Iterate through all datapoints
    for datapoint in datapoints:
        # Compute Euclidean distance between this datapoint and all centroids
        distances = [euclidean_distance(centroid,datapoint) for centroid in centroids]
        
        # Store distance to closest centroid in corresponding array (so that we don't need to compute it twice)
        clusters[np.argmin(distances)].append(min(distances))

    # Compute quantization error (using a simplified but equivalent version of the formula in the paper)
    errors_per_cluster = [sum(cluster)/len(cluster) if cluster else 0 for cluster in clusters.values()]
    error = sum(errors_per_cluster)/n_clusters
    
    # Return the computed quantization error
    return error

#### Extracting clusters

In [4]:
def extract_clusters(centroids, datapoints):
    # Retrieve number of clusters
    n_clusters = len(centroids)

    # Initialize dictionary for cluster arrays
    clusters = {i:[] for i in range(n_clusters)}
    
    # Iterate through all datapoints
    for datapoint in datapoints:
        # Compute Euclidean distance between this datapoint and all centroids
        distances = [euclidean_distance(centroid,datapoint) for centroid in centroids]

        # Store datapoint in cluster array corresponding to nearest centroid
        clusters[np.argmin(distances)].append(datapoint)
    
    # Return the computed clusters
    return clusters

### PSO clustering algorithm

In [5]:
def PSO_clustering(n_clusters, n_particles, n_iterations, datapoints, 
                   omega=0.7298, alpha_1=1.49618, alpha_2=1.49618, verbose=False):
    # Compute number of dimensions
    n_dims = datapoints.shape[1]
    
    # Initialize particles, using random datapoints as the centroids (to ensure reasonable initial centroids)
    particles, speeds = [], []
    for n in range(n_particles):
        # Select indices of random datapoints to use as centroids
        seed_idxs = np.random.permutation(range(datapoints.shape[0]))[:n_clusters]
        # Create the particle
        particle = datapoints[seed_idxs]
        # Add the particle to the list of particles
        particles.append(particle)
        # Add a small random speed to the list of speeds (result is similar to using zeros as initial speed)
        speeds.append(np.random.uniform(-1,1,particle.shape))
    
    # Initialize local & global bests
    local_bests = [([],float('inf')) for particle in particles]
    global_best = ([],float('inf'))
    
    # Perform iterations until termination
    for iteration in range(n_iterations):
        # Print iteration number if verbose is true
        if verbose:
            print(f"PSO iteration {iteration}")
        
        # For each particle:
        for i,particle in enumerate(particles):
            # Compute fitness (quantization error)
            fitness = quantization_error(particle, datapoints)
            # Update local best
            if fitness < local_bests[i][1]:
                local_bests[i] = (particle, fitness)
        
        # Update global best
        best_in_iteration = np.argmin([local_best[1] for local_best in local_bests])
        if local_bests[best_in_iteration][1] < global_best[1]:
            global_best = local_bests[best_in_iteration]
        
        # Update particle positions
        for i in range(n_particles):
            r_1 = np.random.uniform(0,1,particle.shape)
            r_2 = np.random.uniform(0,1,particle.shape)
            speeds[i] = omega*speeds[i] \
                      + np.multiply(alpha_1*r_1, local_bests[i][0]-particles[i]) \
                      + np.multiply(alpha_2*r_2, global_best[0]-particles[i])
            particles[i] = particles[i] + speeds[i]
    
    # Return the global best
    return global_best[0] 

### K-means clustering algorithm

In [6]:
def KMeans_clustering(n_clusters, max_iter, n_init, datapoints, verbose=False):
    # Compute number of dimensions
    n_dims = datapoints.shape[1]
    
    # Initialize storage for best solution found
    best_solution = ([], float('inf'))
    
    # For each initialization, perform a single k-means clustering iteration and update the best solution
    for init in range(n_init):
        # Print initialization number if verbose is true
        if verbose:
            print(f"KMeans initialization {init}")
            
        # Initialize cluster centroids, using random datapoints (to ensure reasonable initial centroids)
        seed_idxs = np.random.permutation(range(datapoints.shape[0]))[:n_clusters]
        centroids = datapoints[seed_idxs]
        
        # Initialize dictionary for cluster arrays
        clusters = {i:[] for i in range(n_clusters)}

        # For each iteration, do the following:
        for iteration in range(max_iter):
            # Assign datapoints to cluster corresponding to closest centroid
            for datapoint in datapoints:
                distances = [euclidean_distance(centroid,datapoint) for centroid in centroids]
                clusters[np.argmin(distances)].append(datapoint)
            
            # Recalculate cluster centroids (for non-empty clusters)
            for cluster_index in range(n_clusters):
                if clusters[cluster_index]:
                    centroids[cluster_index] = np.sum(clusters[cluster_index], axis=0) / len(clusters[cluster_index])
                    
        # Update best solution if there is no best solution yet or the new solution is better
        error = quantization_error(centroids, datapoints)
        if error < best_solution[1]:
            best_solution = (centroids, error)
        
    # Return the best solution found
    return best_solution[0]

### Loading datasets

#### Artificial dataset I 
(as described [here](https://scholar.google.nl/scholar?hl=nl&as_sdt=0%2C5&q=Van+der+Merwe%2C+D.+W.%2C+and+Andries+Petrus+Engelbrecht.+%22Data+clustering+using+particle+swarm+optimization%22&btnG=))

In [7]:
artificial_dataset_size = 400

artificial_X = np.array([(random.uniform(-1,1), random.uniform(-1,1)) for i in range(artificial_dataset_size)])
artificial_Y = np.array([int((z_1 >= 0.7) or ((z_1 <= 0.3) and (z_2 >= -0.2 - z_1))) for (z_1,z_2) in artificial_X])
artificial_n_clusters = len(np.unique(artificial_Y))

#### Iris dataset

In [8]:
iris = datasets.load_iris()

iris_X = iris.data
iris_Y = iris.target
iris_n_clusters = len(np.unique(iris_Y))

### Testing

In [9]:
n_trials = 30
n_particles = 5
n_initializations = 5
n_iterations = 100

In [10]:
def test_PSO_KMeans(dataset, n_trials, n_clusters, n_particles, n_iterations, verbose=False):
    # Initialize lists of errors
    PSO_errors = []
    KMeans_errors = []
    
    # For each trial, do:
    for trial in range(n_trials):
        # Print the trial number if verbose is true
        if verbose:
            print(f"Testing trial {trial}")
        
        # Solve using PSO clustering and compute the resulting quantization error
        PSO_solution = PSO_clustering(n_clusters, n_particles, n_iterations, dataset, verbose=verbose)
        PSO_errors.append(quantization_error(PSO_solution, dataset))
        
        # Solve using K-means clustering and compute the resulting quantization error
        KMeans_solution = KMeans_clustering(n_clusters, n_iterations, n_initializations, dataset, verbose=verbose)
        KMeans_errors.append(quantization_error(KMeans_solution, dataset))

    # Print the results
    if verbose:
        print("")
    print(f"Average PSO quantization error over 30 simulations: " \
          f"{np.average(PSO_errors)} ± {np.std(PSO_errors)}")
    print(f"Average KMeans quantization error over 30 simulations: " \
          f"{np.average(KMeans_errors)} ± {np.std(KMeans_errors)}")

#### Artificial dataset 1

In [11]:
test_PSO_KMeans(artificial_X, n_trials, artificial_n_clusters, n_particles, n_iterations, True)

trial 0
iteration 0
iteration 1
iteration 2
iteration 3
iteration 4
iteration 5
iteration 6
iteration 7
iteration 8
iteration 9
iteration 10
iteration 11
iteration 12
iteration 13
iteration 14
iteration 15
iteration 16
iteration 17
iteration 18
iteration 19
iteration 20
iteration 21
iteration 22
iteration 23
iteration 24
iteration 25
iteration 26
iteration 27
iteration 28
iteration 29
iteration 30
iteration 31
iteration 32
iteration 33
iteration 34
iteration 35
iteration 36
iteration 37
iteration 38
iteration 39
iteration 40
iteration 41
iteration 42
iteration 43
iteration 44
iteration 45
iteration 46
iteration 47
iteration 48
iteration 49
iteration 50
iteration 51
iteration 52
iteration 53
iteration 54
iteration 55
iteration 56
iteration 57
iteration 58
iteration 59
iteration 60
iteration 61
iteration 62
iteration 63
iteration 64
iteration 65
iteration 66
iteration 67
iteration 68
iteration 69
iteration 70
iteration 71
iteration 72
iteration 73
iteration 74
iteration 75
iteration 76
i

initialization 1
initialization 2
initialization 3
initialization 4
trial 6
iteration 0
iteration 1
iteration 2
iteration 3
iteration 4
iteration 5
iteration 6
iteration 7
iteration 8
iteration 9
iteration 10
iteration 11
iteration 12
iteration 13
iteration 14
iteration 15
iteration 16
iteration 17
iteration 18
iteration 19
iteration 20
iteration 21
iteration 22
iteration 23
iteration 24
iteration 25
iteration 26
iteration 27
iteration 28
iteration 29
iteration 30
iteration 31
iteration 32
iteration 33
iteration 34
iteration 35
iteration 36
iteration 37
iteration 38
iteration 39
iteration 40
iteration 41
iteration 42
iteration 43
iteration 44
iteration 45
iteration 46
iteration 47
iteration 48
iteration 49
iteration 50
iteration 51
iteration 52
iteration 53
iteration 54
iteration 55
iteration 56
iteration 57
iteration 58
iteration 59
iteration 60
iteration 61
iteration 62
iteration 63
iteration 64
iteration 65
iteration 66
iteration 67
iteration 68
iteration 69
iteration 70
iteration 7

#### Iris dataset

In [12]:
test_PSO_KMeans(iris_X, n_trials, iris_n_clusters, n_particles, n_iterations, True)   

trial 0
iteration 0
iteration 1
iteration 2
iteration 3
iteration 4
iteration 5
iteration 6
iteration 7
iteration 8
iteration 9
iteration 10
iteration 11
iteration 12
iteration 13
iteration 14
iteration 15
iteration 16
iteration 17
iteration 18
iteration 19
iteration 20
iteration 21
iteration 22
iteration 23
iteration 24
iteration 25
iteration 26
iteration 27
iteration 28
iteration 29
iteration 30
iteration 31
iteration 32
iteration 33
iteration 34
iteration 35
iteration 36
iteration 37
iteration 38
iteration 39
iteration 40
iteration 41
iteration 42
iteration 43
iteration 44
iteration 45
iteration 46
iteration 47
iteration 48
iteration 49
iteration 50
iteration 51
iteration 52
iteration 53
iteration 54
iteration 55
iteration 56
iteration 57
iteration 58
iteration 59
iteration 60
iteration 61
iteration 62
iteration 63
iteration 64
iteration 65
iteration 66
iteration 67
iteration 68
iteration 69
iteration 70
iteration 71
iteration 72
iteration 73
iteration 74
iteration 75
iteration 76
i

initialization 1
initialization 2
initialization 3
initialization 4
trial 6
iteration 0
iteration 1
iteration 2
iteration 3
iteration 4
iteration 5
iteration 6
iteration 7
iteration 8
iteration 9
iteration 10
iteration 11
iteration 12
iteration 13
iteration 14
iteration 15
iteration 16
iteration 17
iteration 18
iteration 19
iteration 20
iteration 21
iteration 22
iteration 23
iteration 24
iteration 25
iteration 26
iteration 27
iteration 28
iteration 29
iteration 30
iteration 31
iteration 32
iteration 33
iteration 34
iteration 35
iteration 36
iteration 37
iteration 38
iteration 39
iteration 40
iteration 41
iteration 42
iteration 43
iteration 44
iteration 45
iteration 46
iteration 47
iteration 48
iteration 49
iteration 50
iteration 51
iteration 52
iteration 53
iteration 54
iteration 55
iteration 56
iteration 57
iteration 58
iteration 59
iteration 60
iteration 61
iteration 62
iteration 63
iteration 64
iteration 65
iteration 66
iteration 67
iteration 68
iteration 69
iteration 70
iteration 7

## plot clusters for Iris dataset

In [13]:
## obtain clusters
pso_clusters, pso_cluster_indices = obtain_clusters(iris_solution_PSO, iris_X)

NameError: name 'obtain_clusters' is not defined

In [None]:
def threedimensional_clusters_plotter(feature_names, clusters, clusters_indices, algorithm_name):
    features = range(len(feature_names))
    combinations = list(itertools.combinations(features,3))
    fig = plt.figure(figsize=(25,5))
    
    
    non_empty_clusters = [cluster for cluster in clusters if len(cluster>0)]
    non_empty_clusters_indices = [cluster_indices for cluster_indices in clusters_indices if len(cluster_indices)>0]
    for index,combination in enumerate(combinations):
        x, y, z = combination
        ax = fig.add_subplot(1, 4, index+1 ,projection='3d')
        for cluster, cluster_indices in zip(non_empty_clusters,non_empty_clusters_indices):
            ax.scatter(cluster[:,x],cluster[:,y],cluster[:,z] ,c=iris_Y[cluster_indices])

        ax.set_xlabel(feature_names[x])
        ax.set_ylabel(feature_names[y])
        ax.set_zlabel(feature_names[z])
        
    plt.suptitle(f'Class indices of datapoints in the clusters when using {algorithm_name}')
    plt.show()


In [None]:
threedimensional_clusters_plotter(iris.feature_names, pso_clusters, pso_cluster_indices, 'kmeans')

In [None]:
threedimensional_clusters_plotter(iris.feature_names, clusters, cluster_indices, 'pso')