### K-Means Clustering

In [None]:
#importing the libraries
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from mpl_toolkits.mplot3d import Axes3D

In [None]:
# Read only the three exam columns used for the 3-feature clustering run.
student_data_3 = pd.read_csv(
    'students_DataSet_revised.csv',
    usecols=['avg_exam1', 'avg_exam2', 'exam3_final'],
)
# Last expression of the cell: displays the first rows in the notebook.
student_data_3.head()

In [None]:
# Feature matrix for the 3-feature run: every row and column of the frame as a NumPy array.
features_X_3 = student_data_3.values

In [None]:
def centroidexists(centroid, centroidlist):
    """Return True if ``centroid`` equals any point already in ``centroidlist``.

    Equality is element-wise (``np.array_equal``). Used while choosing the
    initial centroids so that no two of them coincide.
    """
    for candidate in centroidlist:
        if np.array_equal(candidate, centroid):
            return True
    return False

In [None]:
def initial_centroids(k,features_X):
    """Pick ``k`` mutually distinct data points to use as initial centroids.

    Rows of ``features_X`` are drawn at random (with replacement, via
    ``np.random.randint``) and the whole draw is repeated until the ``k``
    selected rows are all different from each other.

    Args:
        k: number of centroids (clusters) to initialise; must be >= 1.
        features_X: 2-D array of data points, one row per point.

    Returns:
        Array of shape ``(k, n_features)`` holding ``k`` distinct rows of
        ``features_X``.

    Raises:
        ValueError: if ``k`` is smaller than 1, or if the data contains fewer
            than ``k`` distinct rows. Without this check the redraw loop
            below could never succeed and would spin forever.
    """
    features_X = np.asarray(features_X)
    n_points = features_X.shape[0]

    if k < 1:
        raise ValueError("k must be at least 1, got " + str(k))

    n_distinct = np.unique(features_X, axis=0).shape[0]
    if k > n_distinct:
        raise ValueError(
            "cannot choose " + str(k) + " distinct centroids from "
            + str(n_distinct) + " distinct data points"
        )

    centroids_rand = features_X[np.random.randint(n_points, size=k), :]
    # Redraw all k rows whenever the current draw contains a repeated point.
    while np.unique(centroids_rand, axis=0).shape[0] < k:
        centroids_rand = features_X[np.random.randint(n_points, size=k), :]

    return centroids_rand

In [None]:
def calc_distance(datapoint,centroid):
    """Return the Euclidean distance between ``datapoint`` and ``centroid``.

    Computed as the square root of the sum of squared coordinate differences.
    """
    offset = datapoint - centroid
    squared_total = np.sum(offset ** 2)
    return np.sqrt(squared_total)

In [None]:
def assign_centroid(features_X,feature_labels):
    """Group the data points by their cluster label.

    ``feature_labels[n]`` is the label of ``features_X[n]``.

    Returns:
        Dictionary mapping each label to the list of data points carrying it.
        Keys appear in the order in which the labels are first encountered.
    """
    centroid_label = {}
    for position, label in enumerate(feature_labels):
        # Create the bucket on first sight of a label, then add the point.
        centroid_label.setdefault(label, []).append(features_X[position])
    return centroid_label

In [None]:
def update_centroids(centroid_label):
    """Recompute each centroid as the mean of the points assigned to it.

    Args:
        centroid_label: dictionary mapping a label to its list of data points.

    Returns:
        List with one mean vector per dictionary entry, in the dictionary's
        iteration order. Labels absent from the dictionary get no centroid.
    """
    new_centroids = []
    for members in centroid_label.values():
        running_total = np.zeros(members[0].shape)
        for point in members:
            running_total = running_total + point
        new_centroids.append(running_total / len(members))
    return new_centroids

In [None]:
def assign_final_labels(final_centroids_list,final_label_list,features_X):
    """Group the data points by their final cluster label.

    ``final_label_list[n]`` is the label of ``features_X[n]``.
    ``final_centroids_list`` is accepted but not used by this function.

    Returns:
        Dictionary mapping each label to the list of data points carrying it.
    """
    data_with_labels = {}
    for row, label in enumerate(final_label_list):
        if label not in data_with_labels:
            data_with_labels[label] = []
        data_with_labels[label].append(features_X[row])
    return data_with_labels

In [None]:
def KMeans(k,features_X,iterations=100):
    """Run K-Means clustering on ``features_X``.

    Starting from ``initial_centroids(k, features_X)``, each pass labels every
    data point with the index of its nearest centroid (first index on ties),
    groups the points by label and recomputes the centroids as group means.
    The loop stops as soon as the centroids are exactly unchanged by a pass,
    or after ``iterations`` passes.

    Args:
        k: number of clusters.
        features_X: 2-D array of data points, one row per point.
        iterations: maximum number of passes.

    Returns:
        Tuple ``(feature_labels, centroids)``: the labels from the last
        assignment pass and the centroids produced by the last update.

    NOTE(review): convergence is tested with exact equality
    (``np.array_equal``). ``update_centroids`` only produces a centroid for
    labels that actually occur, so a cluster that ends up with no points
    disappears and fewer than ``k`` centroids can come back.
    """
    centroids = initial_centroids(k, features_X)
    feature_labels = []
    count = 1

    while count <= iterations:
        # Assignment step: index of the closest centroid for every point.
        feature_labels = []
        for point in features_X:
            distance = [calc_distance(point, centroid) for centroid in centroids]
            feature_labels.append(distance.index(min(distance)))

        # Keep a snapshot of the current centroids to detect convergence.
        prev_centroids = np.array(centroids, copy=True)

        # Update step: mean of the points assigned to each label.
        centroids = update_centroids(assign_centroid(features_X, feature_labels))

        if np.array_equal(prev_centroids, centroids):
            print("convergence codition is reached at "+str(count)+" iterations\n")
            break
        count += 1

    return feature_labels, centroids
      

In [None]:
def testKMeans(k,features_X):
    """Run ``KMeans`` for ``k`` clusters, print the final centroids and return them.

    Returns:
        Tuple ``(centroids, labels)`` -- note the order is the reverse of
        what ``KMeans`` itself returns.
    """
    labels, centers = KMeans(k, features_X)
    print("Final cluster centroids :\n",str(centers))
    return centers, labels

In [None]:
def average_dist(data,centroid,cluster_size):
    """Return the mean distance from the points in ``data`` to ``centroid``.

    The sum of ``calc_distance(point, centroid)`` over ``data`` is divided by
    ``cluster_size``, which the caller supplies.
    """
    total = sum(calc_distance(point, centroid) for point in data)
    return total / cluster_size

In [None]:
def DBIndex(final_centroids_list,data_labels):
    """Compute the Davies-Bouldin (DB) index of a clustering.

    For every cluster ``i`` the scatter ``s_i`` is the average distance of its
    points to its centroid. For each pair ``i != j`` the ratio
    ``(s_i + s_j) / distance(centroid_i, centroid_j)`` is formed; the index is
    the mean over ``i`` of the largest ratio involving ``i``.

    Args:
        final_centroids_list: sequence of centroids; entry ``i`` belongs to
            label ``i``.
        data_labels: dictionary mapping label ``i`` to the list of data
            points in that cluster.

    Returns:
        The DB index.

    Raises:
        KeyError: if a label in ``range(len(final_centroids_list))`` is
            missing from ``data_labels``.
        ValueError: if only one centroid is supplied (no pair to compare).
    """
    n_clusters = len(final_centroids_list)

    # Scatter depends on one cluster only, so compute it once per cluster
    # instead of once per (i, j) pair.
    scatter = [
        average_dist(data_labels[i], final_centroids_list[i], len(data_labels[i]))
        for i in range(n_clusters)
    ]

    Ri = []
    for i in range(n_clusters):
        ratios = [
            (scatter[i] + scatter[j])
            / calc_distance(final_centroids_list[i], final_centroids_list[j])
            for j in range(n_clusters)
            if j != i
        ]
        # Worst-case (largest) similarity of cluster i to any other cluster.
        Ri.append(max(ratios))

    return sum(Ri) / n_clusters
        
    

#### Testing and Plotting

In [None]:
#Run K-Means on the 3-feature data for 2 to 10 clusters.
#For each cluster count: print the DB index and show a 3D scatter of the
#points (coloured by label) together with the centroids (red).
#Finally plot the DB index against the number of clusters.
DBI_list=[]
for i in range(2,11):

    print("number of clusters :",str(i))
    final_centroids_list,final_label_list=testKMeans(i,features_X_3)
    data_labels=assign_final_labels(final_centroids_list,final_label_list,features_X_3)

    Dbi=DBIndex(final_centroids_list,data_labels)
    DBI_list.append(Dbi)

    print("DBIndex : cluster ",str(i)," is ",str(Dbi))


    fig = plt.figure(figsize=plt.figaspect(0.5))
    ax = fig.add_subplot(1,1,1, projection='3d')

    # Plot the columns of the array that was actually clustered, so the
    # points and the centroids are guaranteed to share the same column order.
    feature1 = features_X_3[:,0]
    feature2 = features_X_3[:,1]
    feature3 = features_X_3[:,2]
    centroids_plot=np.array(final_centroids_list)


    ax.scatter(centroids_plot[:,0],centroids_plot[:,1],centroids_plot[:,2], c='Red', s=200, alpha=1)
    ax.scatter(feature1,feature2,feature3, c=final_label_list, s=40, cmap="viridis")

    # features_X_3 was built from student_data_3, so its columns carry these names.
    ax.set_xlabel(student_data_3.columns[0])
    ax.set_ylabel(student_data_3.columns[1])
    ax.set_zlabel(student_data_3.columns[2])

    plt.show()
clusters=np.arange(2,11)
DB_Index=np.array(DBI_list)
plt.plot(clusters,DB_Index)
plt.title('K-Means with 3 features')
plt.xlabel('#clusters')
plt.ylabel('DB_Index')
plt.show()


#### K-Means with 4 features

In [None]:
# Read only the four exam columns used for the 4-feature clustering run.
student_data_4 = pd.read_csv(
    'students_DataSet_revised.csv',
    usecols=['avg_exam1', 'avg_exam2', 'exam3_final', 'avg_exam4'],
)
# Last expression of the cell: displays the first rows in the notebook.
student_data_4.head()

In [None]:
# Feature matrix for the 4-feature run, built from the 4-column frame
# student_data_4 loaded in the preceding cell.
features_X_4=student_data_4.iloc[:,:].values

In [None]:
# Run K-Means on the 4-feature data for 2 to 10 clusters, collect the DB
# index of every run and plot it against the number of clusters.
DBI_list = []
for i in range(2, 11):
    print("number of clusters :", i)

    final_centroids_list, final_label_list = testKMeans(i, features_X_4)
    data_labels = assign_final_labels(
        final_centroids_list, final_label_list, features_X_4
    )

    Dbi = DBIndex(final_centroids_list, data_labels)
    DBI_list.append(Dbi)
    print("DBIndex : cluster ", i, " is ", Dbi)

clusters = np.arange(2, 11)
DB_Index = np.array(DBI_list)

plt.plot(clusters, DB_Index)
plt.title('K-Means with 4 features')
plt.xlabel('#clusters')
plt.ylabel('DB_Index')
plt.show()

#### K-Means with 5 features

In [None]:
# Read only the five exam columns used for the 5-feature clustering run.
# NOTE(review): the 3- and 4-feature cells read 'students_DataSet_revised.csv';
# confirm that a different file name is intended here.
BSOM_data_5 = pd.read_csv(
    'BSOM_DataSet_revised.csv',
    usecols=['avg_exam1', 'avg_exam2', 'exam3_final', 'avg_exam4', 'exam5_final'],
)
# Last expression of the cell: displays the first rows in the notebook.
BSOM_data_5.head()

In [None]:
# Feature matrix for the 5-feature run: every row and column of the frame as a NumPy array.
features_X_5 = BSOM_data_5.values

In [None]:
# Run K-Means on the 5-feature data for 2 to 10 clusters, collect the DB
# index of every run and plot it against the number of clusters.
DBI_list = []
for i in range(2, 11):
    print("number of clusters :", i)

    final_centroids_list, final_label_list = testKMeans(i, features_X_5)
    data_labels = assign_final_labels(
        final_centroids_list, final_label_list, features_X_5
    )

    Dbi = DBIndex(final_centroids_list, data_labels)
    DBI_list.append(Dbi)
    print("DBIndex : cluster ", i, " is ", Dbi)

clusters = np.arange(2, 11)
DB_Index = np.array(DBI_list)

plt.plot(clusters, DB_Index)
plt.title('K-Means with 5 features')
plt.xlabel('#clusters')
plt.ylabel('DB_Index')
plt.show()