<a href="https://colab.research.google.com/github/Ashail33/Masters-work/blob/master/Cluster_eval_code.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
def davies_bouldin_index(data, clusters, distance=None):
  """Return the Davies-Bouldin index of a clustering (lower is better).

  Each cluster's scatter is the mean distance of its points to the cluster
  centroid. Every cluster is scored by its worst-case similarity to any
  other cluster, ``(scatter_i + scatter_j) / d(centroid_i, centroid_j)``,
  and the index is the mean of those worst-case scores.

  Args:
    data: Not read by this function; kept so existing call sites still work.
    clusters: Sequence of clusters, each a non-empty sequence of points.
      Every point is a sequence of numeric coordinates.
    distance: Optional callable ``distance(a, b)`` returning the distance
      between two points. Defaults to the Euclidean distance.

  Returns:
    The index as a float; ``inf`` when two centroids coincide.

  Raises:
    ValueError: If fewer than two clusters are given or a cluster is empty.
  """
  import math

  import numpy as np

  dist = math.dist if distance is None else distance

  if len(clusters) < 2:
    raise ValueError("davies_bouldin_index needs at least two clusters")
  # len() rather than truthiness: a cluster may be a NumPy array.
  if any(len(cluster) == 0 for cluster in clusters):
    raise ValueError("davies_bouldin_index does not accept empty clusters")

  centroids = [np.mean(cluster, axis=0) for cluster in clusters]
  scatters = [
    float(np.mean([dist(point, centroid) for point in cluster]))
    for cluster, centroid in zip(clusters, centroids)
  ]

  total = 0.0
  for i, centroid_i in enumerate(centroids):
    worst = 0.0
    for j, centroid_j in enumerate(centroids):
      if i == j:
        continue
      separation = dist(centroid_i, centroid_j)
      # Coincident centroids mean the two clusters are indistinguishable.
      if separation > 0:
        similarity = (scatters[i] + scatters[j]) / separation
      else:
        similarity = math.inf
      worst = max(worst, similarity)
    total += worst

  return total / len(centroids)

In [None]:
def intra_cluster_separation(clusters, distance=None):
  """Return the mean distance between distinct points that share a cluster.

  Every unordered pair of points inside the same cluster is visited once and
  the distances are averaged over the number of pairs actually visited, so
  clusters of different sizes are handled correctly.

  Args:
    clusters: Sequence of clusters, each a sequence of points. Every point
      is a sequence of numeric coordinates.
    distance: Optional callable ``distance(a, b)`` returning the distance
      between two points. Defaults to the Euclidean distance.

  Returns:
    The average within-cluster pairwise distance as a float, or ``0.0`` when
    no cluster holds at least two points.
  """
  import math
  from itertools import combinations

  dist = math.dist if distance is None else distance

  total = 0.0
  pair_count = 0
  for cluster in clusters:
    for point_a, point_b in combinations(cluster, 2):
      total += dist(point_a, point_b)
      pair_count += 1

  # No pairs (only singleton or empty clusters) means no spread to report.
  return total / pair_count if pair_count else 0.0

In [None]:
def inter_cluster_separation(clusters, distance=None):
  """Return the mean distance between points that lie in different clusters.

  Clusters are paired by position rather than compared by value, so two
  clusters with identical contents are still treated as distinct and array
  clusters do not trigger ambiguous comparisons. The distances are averaged
  over the number of point pairs actually visited.

  Args:
    clusters: Sequence of clusters, each a sequence of points. Every point
      is a sequence of numeric coordinates.
    distance: Optional callable ``distance(a, b)`` returning the distance
      between two points. Defaults to the Euclidean distance.

  Returns:
    The average between-cluster pairwise distance as a float.

  Raises:
    ValueError: If no pair of points from two different clusters exists.
  """
  import math
  from itertools import combinations, product

  dist = math.dist if distance is None else distance

  total = 0.0
  pair_count = 0
  for cluster_a, cluster_b in combinations(clusters, 2):
    for point_a, point_b in product(cluster_a, cluster_b):
      total += dist(point_a, point_b)
      pair_count += 1

  if pair_count == 0:
    raise ValueError(
      "inter_cluster_separation needs at least two non-empty clusters"
    )
  return total / pair_count

In [None]:
def dunn_validity_index(clusters):
  """Return the ratio of between-cluster to within-cluster separation.

  The value is ``inter_cluster_separation(clusters)`` divided by
  ``intra_cluster_separation(clusters)``; larger values indicate clusters
  that are far apart relative to their internal spread.

  NOTE: the textbook Dunn index divides the smallest inter-cluster distance
  by the largest cluster diameter. This function instead divides the two
  helper results, so its values are not comparable with that definition.

  Args:
    clusters: Sequence of clusters, passed unchanged to both helpers.

  Returns:
    The ratio as a float, or ``inf`` when the within-cluster separation is
    zero.
  """
  within = intra_cluster_separation(clusters)
  between = inter_cluster_separation(clusters)

  # A zero within-cluster spread would otherwise raise ZeroDivisionError.
  if within == 0:
    return float("inf")
  return between / within

In [None]:
def cluster_accuracy(data, labels, clusters):
  """Return the percentage of points whose label equals their cluster index.

  Each point in ``clusters[i]`` is located in ``data`` and assigned cluster
  index ``i``; the assignment is then compared directly with ``labels``.
  Cluster indices are not re-mapped, so ``labels`` must already be encoded
  as positions in ``clusters``.

  Args:
    data: Sequence of points, in the same order as ``labels``.
    labels: Sequence of reference labels, one per point in ``data``.
    clusters: Sequence of clusters, each a sequence of points from ``data``.

  Returns:
    Accuracy in the range 0-100 as a float; ``0.0`` for empty ``data``.
    Points that appear in no cluster count as misclassified.

  Raises:
    ValueError: If ``labels`` and ``data`` differ in length, or a clustered
      point does not occur in ``data``.
  """
  from collections import deque

  if len(labels) != len(data):
    raise ValueError("labels and data must have the same length")
  if len(data) == 0:
    return 0.0

  def as_key(point):
    # Lists and arrays are unhashable; their tuple form is usable as a key.
    try:
      return tuple(point)
    except TypeError:
      return point

  # One dict lookup per point instead of a linear data.index() scan.
  positions = {}
  for index, point in enumerate(data):
    positions.setdefault(as_key(point), deque()).append(index)

  unassigned = object()
  assignments = [unassigned] * len(data)
  for cluster_id, cluster in enumerate(clusters):
    for point in cluster:
      queue = positions.get(as_key(point))
      if queue is None:
        raise ValueError(f"clustered point {point!r} does not occur in data")
      assignments[queue[0]] = cluster_id
      # Rotate so duplicate points take successive positions in data.
      queue.rotate(-1)

  correct = sum(
    1
    for label, assigned in zip(labels, assignments)
    if assigned is not unassigned and label == assigned
  )
  return correct / len(data) * 100

In [None]:
def adjusted_rand_index(data, labels, clusters):
  """Return the adjusted Rand index between ``labels`` and a clustering.

  The index counts pairs of points that both partitions group together and
  corrects that count for the agreement expected by chance. It is 1.0 for
  identical partitions (up to renaming) and close to 0.0 for unrelated ones.
  Pair counts come from contingency tables, so the cost is linear in the
  number of points.

  Args:
    data: Sequence of points, in the same order as ``labels``.
    labels: Sequence of hashable reference labels, one per point.
    clusters: Sequence of clusters, each a sequence of points from ``data``.
      Points that appear in no cluster are grouped into one extra
      "unassigned" cluster.

  Returns:
    The adjusted Rand index as a float. ``1.0`` is returned when there are
    fewer than two points or both partitions are trivial.

  Raises:
    ValueError: If ``labels`` and ``data`` differ in length, or a clustered
      point does not occur in ``data``.
  """
  from collections import Counter, deque
  from math import comb

  if len(labels) != len(data):
    raise ValueError("labels and data must have the same length")

  def as_key(point):
    # Lists and arrays are unhashable; their tuple form is usable as a key.
    try:
      return tuple(point)
    except TypeError:
      return point

  positions = {}
  for index, point in enumerate(data):
    positions.setdefault(as_key(point), deque()).append(index)

  # None marks points that no cluster claims; cluster ids are ints.
  assignments = [None] * len(data)
  for cluster_id, cluster in enumerate(clusters):
    for point in cluster:
      queue = positions.get(as_key(point))
      if queue is None:
        raise ValueError(f"clustered point {point!r} does not occur in data")
      assignments[queue[0]] = cluster_id
      # Rotate so duplicate points take successive positions in data.
      queue.rotate(-1)

  point_count = len(data)
  if point_count < 2:
    return 1.0

  def pairs_within(counts):
    return sum(comb(size, 2) for size in counts.values())

  same_both = pairs_within(Counter(zip(assignments, labels)))
  same_cluster = pairs_within(Counter(assignments))
  same_label = pairs_within(Counter(labels))

  expected = same_cluster * same_label / comb(point_count, 2)
  maximum = (same_cluster + same_label) / 2
  # Both partitions trivial (all together or all apart): perfect agreement.
  if maximum == expected:
    return 1.0
  return (same_both - expected) / (maximum - expected)

In [None]:
def normalised_mutual_information(data, labels, clusters):
  """Return the normalised mutual information of ``labels`` and a clustering.

  Mutual information is computed from the joint and marginal frequencies of
  labels and cluster indices (natural logarithm) and divided by the geometric
  mean of the two entropies, giving a value between 0.0 and 1.0.

  Args:
    data: Sequence of points, in the same order as ``labels``.
    labels: Sequence of hashable reference labels, one per point.
    clusters: Sequence of clusters, each a sequence of points from ``data``.
      Points that appear in no cluster are grouped into one extra
      "unassigned" cluster.

  Returns:
    The normalised mutual information as a float. ``1.0`` is returned when
    both partitions consist of a single group (or ``data`` is empty), and
    ``0.0`` when exactly one of them has zero entropy.

  Raises:
    ValueError: If ``labels`` and ``data`` differ in length, or a clustered
      point does not occur in ``data``.
  """
  import math
  from collections import Counter, deque

  if len(labels) != len(data):
    raise ValueError("labels and data must have the same length")

  def as_key(point):
    # Lists and arrays are unhashable; their tuple form is usable as a key.
    try:
      return tuple(point)
    except TypeError:
      return point

  positions = {}
  for index, point in enumerate(data):
    positions.setdefault(as_key(point), deque()).append(index)

  # None marks points that no cluster claims; cluster ids are ints.
  assignments = [None] * len(data)
  for cluster_id, cluster in enumerate(clusters):
    for point in cluster:
      queue = positions.get(as_key(point))
      if queue is None:
        raise ValueError(f"clustered point {point!r} does not occur in data")
      assignments[queue[0]] = cluster_id
      # Rotate so duplicate points take successive positions in data.
      queue.rotate(-1)

  point_count = len(data)
  label_counts = Counter(labels)
  cluster_counts = Counter(assignments)
  if point_count == 0 or (len(label_counts) == 1 and len(cluster_counts) == 1):
    return 1.0

  def entropy(counts):
    return -sum(
      (size / point_count) * math.log(size / point_count)
      for size in counts.values()
    )

  mutual_info = sum(
    (size / point_count)
    * math.log(
      point_count * size / (label_counts[label] * cluster_counts[cluster_id])
    )
    for (label, cluster_id), size in Counter(zip(labels, assignments)).items()
  )

  normaliser = math.sqrt(entropy(label_counts) * entropy(cluster_counts))
  # One partition carries no information, so nothing can be shared.
  if normaliser == 0:
    return 0.0
  return mutual_info / normaliser

In [None]:
def clustering_stability(data, labels, clusters, num_repeats):
  """Return the mean of ``cluster_accuracy`` over ``num_repeats`` evaluations.

  NOTE(review): the same ``data``, ``labels`` and ``clusters`` are scored on
  every repeat and nothing here re-runs the clustering. Unless
  ``cluster_accuracy`` is non-deterministic, the result equals a single call.
  Confirm whether callers are expected to re-cluster between repeats.

  Args:
    data: Sequence of points, forwarded to ``cluster_accuracy``.
    labels: Sequence of reference labels, forwarded to ``cluster_accuracy``.
    clusters: Sequence of clusters, forwarded to ``cluster_accuracy``.
    num_repeats: Number of evaluations to average; must be positive.

  Returns:
    The average accuracy over all repeats.

  Raises:
    ValueError: If ``num_repeats`` is not positive.
  """
  # Zero repeats would otherwise divide by zero.
  if num_repeats <= 0:
    raise ValueError("num_repeats must be a positive integer")

  accuracy_total = sum(
    cluster_accuracy(data, labels, clusters) for _ in range(num_repeats)
  )
  return accuracy_total / num_repeats