# Assignment 2

## Exercise 1

In [1]:
# import libraries
import pandas as pd
import numpy as np
from sklearn.cluster import KMeans
from sklearn.metrics import silhouette_score

In [12]:
# test data
X = np.array([[5, 0], [6, 0], [5, 1], [10, 5], [21, 4], [19, 33],[20, 20], [30,30], [3,30],[20, 200], [30,210], [3,190], [2,195]])
kmeans = KMeans(n_clusters=3, random_state=0).fit(X)
print(f"Labels: {kmeans.labels_}\n")

print(f"Centroids:\n{kmeans.cluster_centers_}\n")

print(f"Silhouette Score: {silhouette_score(X,kmeans.labels_)}")


Labels: [1 1 1 1 1 2 2 2 2 0 0 0 0]

Centroids:
[[ 13.75 198.75]
 [  9.4    2.  ]
 [ 18.    28.25]]

Silhouette Score: 0.6710107898371873


In [10]:
def silhouette(X, labels):
  """
  Calculate the silhouette score in a classification task.

  Definition of coefficient: 
            Silhouette Coefficient = (x-y)/ max(x,y)
    where, y is the mean intra cluster distance: mean distance to the other instances in the same cluster. 
    x depicts mean nearest cluster distance i.e. mean distance to the instances of the next closest cluster.

  Parameters
  ----------
  X : array of data points

  labels : array of cluster label
  """

  # create dictionary with cluster labels as key and list of data points as values
  labs = dict()
  for l in set(labels):
    labs[l] = list()

  # arrange in clusters and store them in dictionary
  for obs, clus in zip(X, labels):
    labs[clus].append(np.array(obs))

  # intra-cluster distance
  X_intracluster_distance = []
  for l in labs:
    for dp in labs[l]:
      # mean pairwise distance
      # tmp_dist = np.array(0)
      tmp_dist = []
      filter_flag = True
      for dp_n in labs[l]:
        # compare with all others in cluster
        if np.array_equal(dp,dp_n) and filter_flag: 
          continue
        tmp_dist.append(np.linalg.norm(dp-dp_n))
        if (dp==dp_n).all():
          filter_flag = False
      # append to Array
      X_intracluster_distance.append(np.array(tmp_dist).mean(axis=0))

  
  # calculate centroids for inter-cluster distance 
  # and store them in dictionary with label as key and centroid position as value
  centroids = dict()
  for l in labs:
    centroids[l] = np.array(labs[l]).mean(axis=0)

  # inter-cluster distance
  X_intercluster_distance = []
  # for all data points (dp) in each cluster do...
  for l in labs:
    for dp in labs[l]:
      # find nearest other cluster
      tmp_inter_dist = []
      for c in centroids:
        #for all OTHER clusters
        if (l != c):
          tmp_inter_dist.append(np.linalg.norm(dp-centroids[c]))
      # append to array
      X_intercluster_distance.append(min(tmp_inter_dist))

  # calculate the silhouette score per observation
  X_score_per_dp = []
  for x, y in zip(X_intercluster_distance, X_intracluster_distance):
    X_score_per_dp.append((x-y)/ max(x,y))

  # the silhouette score is now the mean
  return np.array(X_score_per_dp).mean()

  # Debug options:
  # print("intra: " + str(X_intracluster_distance))
  # print("inter: " + str(X_intercluster_distance))

In [13]:
silhouette(X,kmeans.labels_)

0.6605599624135416