<a href="https://colab.research.google.com/github/Fatimahasn/K-means-Clustering/blob/main/k_means_Clustering.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [15]:
import numpy as np
import pandas as pd
from sklearn.metrics import davies_bouldin_score
from sklearn.metrics.pairwise import euclidean_distances

#functions "inter_cluster_distances" and "dunn" are for computing dunn index
def inter_cluster_distances(labels, distances, method='nearest'):
    farthest=False
    n_unique_labels = len(np.unique(labels))
    cluster_distances = np.full((n_unique_labels, n_unique_labels),float('inf') if not farthest else 0)
    np.fill_diagonal(cluster_distances, 0)
    for i in np.arange(0, len(labels) - 1):
        for j in np.arange(i, len(labels)):
            if labels[i] != labels[j] and (
                (not farthest and distances[i, j] < cluster_distances[labels[i], labels[j]]) or
                (farthest and distances[i, j] > cluster_distances[labels[i], labels[j]])):
                cluster_distances[labels[i], labels[j]] = cluster_distances[labels[j], labels[i]] = distances[i,j]
    return cluster_distances


def dunn(labels, distances, diameter_method='farthest',cdist_method='nearest'):

    ic_distances = inter_cluster_distances(labels, distances, cdist_method)
    min_distance = min(ic_distances[ic_distances.nonzero()])

    n_clusters = len(np.unique(labels))
    diameters = np.zeros(n_clusters)
    for i in range(0, len(labels) - 1):
        for j in range(i + 1, len(labels)):
            if labels[i] == labels[j] and distances[i, j] > diameters[labels[i]]:
                diameters[labels[i]] = distances[i, j]
    max_diameter = max(diameters)

    return min_distance / max_diameter
def Euclidean_Distance(num_1, num_2):
    #Computes Euclidean Distance of two points
    ED = sum((num_1 - num_2)**2)**0.5
    return ED

def minimum_index(Distance):
    min_D=Distance.index(min(Distance))
    return min_D

def Kmeans(X,k):
    d=euclidean_distances(X)
    centroids = {}
    
    #Set first three indexes of a dataframe as centroids
    for i in range(k):
        centroids[i] = X[i]
            
    for iteration in range(15):
        old_Centroid = centroids
        clusters = {}  
        for cluster_index in range(k):
            clusters[cluster_index ] = []
            fit=[]
            
        #Compute distance of each data point with centroids and find the minimum distance 
        for data in X:
            Distance = []
            for c_1 in centroids:
                CD = Euclidean_Distance(data, centroids[c_1])
                Distance.append(CD)

            Label=minimum_index(Distance)
                
            #fit contains cluster numbers corresponding to inpt X
            fit.append(Label)
            clusters[Label].append(data)
                
        #For loop implemented below calculates the new clusters by computing mean of all the datapoints in a cluster      
        for cluster_index in clusters:
            cluster_vals = clusters[cluster_index]
            New_Centroid = np.mean(cluster_vals, axis = 0)
            centroids[cluster_index] = New_Centroid
           
        #if condition checks whether the new and old centroids are same or not
        #If they are same, break the loop otherwise continue loop by using newcentroids
        if np.all(old_Centroid == New_Centroid):
                break
        else:
                old_Centroid == New_Centroid
                    
    print("centroids for k= ",k)
    print(centroids)
       
    #code for printing values corresponding to each cluster
    for cluster_index in clusters:
        cluster_vals = clusters[cluster_index]
        #print ("cluster ",cluster_index)
        #print("values for cluster: ",cluster_vals)
           
    DBI=davies_bouldin_score(X,fit)
    print("DBI for k=",k," :",DBI)
    DI = dunn(fit, d, diameter_method="farthest", cdist_method="nearest")
    print("DI for k =",k," :",DI)
       
    print("*****************************************************************************************")

In [16]:
# Function selects the dataset depending upon the user's input
def dataset_selection(choice,status):
    if(choice == 1):
        data = 'http://archive.ics.uci.edu/ml/machine-learning-databases/iris/iris.data'
        df = pd.read_csv(data, sep=',')
        df.columns = ["sepal_length", "sepal_width", "petal_length", "petal_width", "class"]
        Features = df.values[:, 0:4]
        status=True
    elif(choice == 2):
        data='http://archive.ics.uci.edu/ml/machine-learning-databases/glass/glass.data'
        df = pd.read_csv(data, sep=',')
        attributes = ["Id number", "RI", "Na", "Mg", "Al","Si","K","Ca","Ba","Fe","Type of glass"]
        df.columns = attributes
        Features = df.values[:,0:10]
        status=True
    else:
        print("Incorrect Value Entered")

    return Features, status

In [17]:
def ComputeClusters(Features):
    value=int(input("Enter the number of clusters. The number should be greater than 1. "))
    if (value > 1): 
      Kmeans(Features,k)
    else:
      print("Invalid input!")

In [18]:
if __name__ == "__main__":
  print("Choose the dataset: ")
  print("(1): Iris Dataset ")
  print("(2): Glass dataset ")

  # Variable to check whether the data is loaded or not.
  temp=False
  while(temp!=True):
    ch=int(input("Press 1 or 2: "))
    Features,temp = dataset_selection(ch,temp)

  # Compute clusters 
  while(True):
    ComputeClusters(Features)
    inp=input("Do you want to check values for more clusters. If yes press 'Y' else press any other key: ")
    if(inp=='Y'):
      continue
    else:
      break 
  
  print("Code Execution Completed")
  print("**************************************************************************************************")


Choose the dataset: 
(1): Iris Dataset 
(2): Glass dataset 
Press 1 or 2: 2
Enter the number of clusters. The number should be greater than 1. 4
centroids for k=  2
{0: array([5.45000000e+01, 1.51833038e+00, 1.31748113e+01, 3.42169811e+00,
       1.27933962e+00, 7.26416038e+01, 4.90094340e-01, 8.75783019e+00,
       4.04716981e-02, 6.31132075e-02]), 1: array([1.61000000e+02, 1.51837542e+00, 1.36365421e+01, 1.93738318e+00,
       1.61214953e+00, 7.26683178e+01, 5.08037383e-01, 9.15616822e+00,
       3.10000000e-01, 5.14953271e-02])}
DBI for k= 2  : 0.5032109030915135
DI for k = 2  : 0.040775340590132395
*****************************************************************************************
Do you want to check values for more clusters. If yes press 'Y' else press any other key: Y
Enter the number of clusters. The number should be greater than 1. 5
centroids for k=  2
{0: array([5.45000000e+01, 1.51833038e+00, 1.31748113e+01, 3.42169811e+00,
       1.27933962e+00, 7.26416038e+01, 4.900