### Fuzzy C Means Clustering

In [None]:
#Importing the libraries
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from mpl_toolkits.mplot3d import Axes3D

In [None]:
# Read only the three exam-score columns used for the 3-feature clustering,
# then preview the first rows.
BSOM_data_3 = pd.read_csv(
    'BSOM_DataSet_revised.csv',
    usecols=['avg_exam1', 'avg_exam2', 'exam3_final'],
)
BSOM_data_3.head()

In [None]:
# Feature matrix for clustering: every row and column of the 3-feature frame
# as a NumPy array (columns keep the frame's column order).
features_X_3 = BSOM_data_3.values

In [None]:
#initialize the membership matrix such that for each datapoint, sum of its membership in all the clusters is equal to 1.
def init_membership(num_data,num_clusters):
    """Create a random fuzzy membership matrix.

    Draws a (num_data, num_clusters) array of uniform values from
    np.random.random and rescales every row by its own total, so each data
    point's memberships across all clusters add up to 1.
    """
    memberships = np.random.random((num_data, num_clusters))
    for row in memberships:
        # `row` is a view into `memberships`, so this normalises in place.
        row /= np.sum(row)
    return memberships

In [None]:
#calculate the centroids from the initialised membership matrix
#fuzzifier m=2
def calc_centroid(features,mem_matrix,m=2):
    """Compute the fuzzy cluster centres for the given memberships.

    features   : 2-D array, one row per data point, one column per feature.
    mem_matrix : 2-D array, one row per data point, one column per cluster.
    m          : fuzzifier; each membership is raised to this power before
                 being used as a weight.

    Returns a dict mapping cluster index -> 1-D array holding, for every
    feature column, the weighted mean of that column.
    """
    n_clusters = mem_matrix.shape[1]
    n_features = features.shape[1]

    centroids = {}
    for cluster in range(n_clusters):
        weights = mem_matrix[:, cluster] ** m
        weight_total = np.sum(weights)
        centroids[cluster] = np.array(
            [np.dot(weights, features[:, dim]) / weight_total for dim in range(n_features)]
        )
    return centroids

In [None]:
# calculate euclidean distance from data point to centroid
def calc_distance(datapoint,centroid):
    """Return the Euclidean distance between `datapoint` and `centroid`."""
    offset = datapoint - centroid
    return np.sqrt(np.sum(np.square(offset)))

In [None]:
#update the membership matrix
def update_membership(feature,centroid,m=2):
    """Recompute the fuzzy membership of every data point in every cluster.

    Uses the standard Fuzzy C-Means update
        u[i, j] = 1 / sum_k (d[i, j] / d[i, k]) ** (2 / (m - 1))
    where d[i, j] is the Euclidean distance from point i to centroid j.

    Parameters
    ----------
    feature : 2-D array, one row per data point.
    centroid : dict mapping cluster index 0..c-1 to a 1-D centre array.
    m : fuzzifier; must be greater than 1.

    Returns
    -------
    Array of shape (n_points, n_clusters) whose rows sum to 1.

    Raises
    ------
    ValueError
        If m <= 1, where the exponent 2 / (m - 1) is undefined or negative.
    """
    if m <= 1:
        raise ValueError("fuzzifier m must be greater than 1")
    exp = 2 / (m - 1)

    feature = np.asarray(feature, dtype=float)
    n_clusters = len(centroid)
    if n_clusters == 0:
        return np.zeros((feature.shape[0], 0))

    centers = np.array([centroid[j] for j in range(n_clusters)], dtype=float)

    # Pairwise point-to-centroid distances, shape (n_points, n_clusters).
    offsets = feature[:, np.newaxis, :] - centers[np.newaxis, :, :]
    distances = np.sqrt(np.sum(offsets ** 2, axis=2))

    # A point lying exactly on a centroid has distance 0; the plain formula
    # would divide by zero and yield NaN. Mask those entries for the generic
    # computation and patch the affected rows afterwards.
    on_centroid = distances == 0
    safe_distances = np.where(on_centroid, 1.0, distances)

    inverse = safe_distances ** (-exp)
    updated_mem = inverse / inverse.sum(axis=1, keepdims=True)

    degenerate_rows = on_centroid.any(axis=1)
    if degenerate_rows.any():
        # Full membership goes to the coincident centroid (shared equally if
        # several centroids coincide with the point), zero to the others.
        hits = on_centroid[degenerate_rows].astype(float)
        updated_mem[degenerate_rows] = hits / hits.sum(axis=1, keepdims=True)

    return updated_mem

In [None]:
#returns the hard labels of the fuzzy C Means from the final memberships
def fuzzy_hardlabels(mem_matrix):
    """Convert fuzzy memberships into one hard cluster label per data point.

    For every row of `mem_matrix` the label is the column index holding the
    largest membership; when several columns tie for the largest value, the
    highest of those indices is chosen. Returns a list of ints.
    """
    hard_labels = []
    for row in mem_matrix:
        # Ranking by (membership, index) makes ties resolve to the larger index.
        winner = max(range(len(row)), key=lambda col: (row[col], col))
        hard_labels.append(winner)
    return hard_labels

In [None]:
#Fuzzy C Means Algorithm
#converges when the norm of the difference between the previous and the current membership matrices drops below 0.01
def FuzzyCMeans(k,features_X,iterations=100,m=2,tol=0.01):
    """Cluster `features_X` into `k` fuzzy clusters with Fuzzy C-Means.

    Starts from a random membership matrix, then alternates membership and
    centroid updates until the membership matrix stops changing or the
    iteration budget is exhausted.

    Parameters
    ----------
    k : number of clusters.
    features_X : 2-D array, one row per data point.
    iterations : maximum number of update rounds.
    m : fuzzifier passed to calc_centroid and update_membership.
    tol : convergence threshold; iteration stops once the norm of the
          change in the membership matrix between two rounds is below it.

    Returns
    -------
    (mem_matrix, centroids) : the final membership matrix and the centroid
    dict produced by calc_centroid.
    """
    mem_matrix = init_membership(features_X.shape[0], k)
    centroids = calc_centroid(features_X, mem_matrix, m)

    for count in range(1, iterations + 1):
        # update_membership returns a fresh array, so keeping a reference to
        # the previous matrix is enough; no defensive copy is needed.
        prev_mem_matrix = mem_matrix
        mem_matrix = update_membership(features_X, centroids, m)
        centroids = calc_centroid(features_X, mem_matrix, m)

        if np.linalg.norm(mem_matrix - prev_mem_matrix) < tol:
            print("convergence codition is reached at "+str(count)+" iterations\n")
            break

    return mem_matrix, centroids

In [None]:
#takes the number of clusters and the data points as input and returns the cluster centers and the hard labels found by the FuzzyCMeans algorithm
def testFuzzyMeans(k,features_X):
    """Run FuzzyCMeans for `k` clusters and return its centroids and hard labels.

    Prints the centroid list (once bare, once with a caption) and returns
    (final_centroids_list, final_label_list): the centroids as a list ordered
    by cluster index, and one hard label per data point.
    """
    memberships, centroid_map = FuzzyCMeans(k, features_X)

    # Flatten the {index: centroid} dict into a list ordered by cluster index.
    final_centroids_list = [centroid_map[idx] for idx in range(len(centroid_map))]
    print(final_centroids_list)

    final_label_list = fuzzy_hardlabels(memberships)
    print("Final cluster centroids :\n",str(final_centroids_list))
    return final_centroids_list, final_label_list

In [None]:
#assign final cluster labels to datapoints
def assign_final_labels(final_centroids_list,final_label_list,features_X):
    """Group the data points by their hard cluster label.

    Returns a dict mapping each label that occurs in `final_label_list` to
    the list of rows of `features_X` carrying that label, in input order.
    `final_centroids_list` is accepted but not used.
    """
    grouped = {}
    for position, label in enumerate(final_label_list):
        grouped.setdefault(label, []).append(features_X[position])
    return grouped

In [None]:
#finding the average distance of the points within the cluster
def average_dist(data,centroid,cluster_size):
    """Return the summed distance from each point in `data` to `centroid`,
    divided by `cluster_size`."""
    total_distance = sum(calc_distance(point, centroid) for point in data)
    return total_distance / cluster_size

In [None]:
#Calculate DB Index
def DBIndex(final_centroids_list,data_labels):
    """Compute the Davies-Bouldin index of a hard clustering.

    For every populated cluster i the scatter s_i is the mean distance of
    its points to its centroid. The index is the average over clusters of
    max_{j != i} (s_i + s_j) / distance(centroid_i, centroid_j).

    Parameters
    ----------
    final_centroids_list : sequence of centroids, indexed by cluster label.
    data_labels : mapping (or sequence) from cluster label to the list of
        points assigned to that cluster, as built by assign_final_labels.

    Clusters that received no points (label missing from `data_labels`, or
    an empty list) have no defined scatter and are left out of the index.

    Raises
    ------
    ValueError
        If fewer than two clusters contain points.
    """
    # Scatter of each populated cluster, computed once instead of once per pair.
    scatter = {}
    for idx, centroid in enumerate(final_centroids_list):
        try:
            members = data_labels[idx]
        except (KeyError, IndexError):
            # No point was hard-assigned to this cluster.
            continue
        if len(members) == 0:
            continue
        scatter[idx] = average_dist(members, centroid, len(members))

    if len(scatter) < 2:
        raise ValueError("DBIndex needs at least two non-empty clusters")

    worst_ratios = []
    for i, s_i in scatter.items():
        ratios = [
            (s_i + s_j) / calc_distance(final_centroids_list[i], final_centroids_list[j])
            for j, s_j in scatter.items()
            if j != i
        ]
        worst_ratios.append(max(ratios))

    return sum(worst_ratios) / len(worst_ratios)

#### Testing and plotting

In [None]:
# 3D scatter plot of the Fuzzy C-Means result for 3 clusters on the 3-feature
# data (the cluster and feature counts selected in the k-Means analysis).

final_centroids_list,final_label_list=testFuzzyMeans(3,features_X_3)
fig = plt.figure(figsize=plt.figaspect(0.5))
ax = fig.add_subplot(1,1,1, projection='3d')

axis_columns = ['avg_exam1', 'avg_exam2', 'exam3_final']
feature1 = np.array(BSOM_data_3['avg_exam1'])
feature2 = np.array(BSOM_data_3['avg_exam2'])
feature3 = np.array(BSOM_data_3['exam3_final'])

# read_csv(usecols=...) keeps the CSV's own column order, and the centroids
# follow the column order of the feature matrix. Look up where each plotted
# column sits instead of assuming positions 0, 1, 2, so centroids and points
# always share the same axes.
loaded_columns = list(BSOM_data_3.columns)
x_pos, y_pos, z_pos = (loaded_columns.index(name) for name in axis_columns)
centroids_plot=np.array(final_centroids_list)

ax.scatter(centroids_plot[:,x_pos],centroids_plot[:,y_pos],centroids_plot[:,z_pos], c='Red', s=200, alpha=1)
ax.scatter(feature1,feature2,feature3, c=final_label_list, s=40, cmap="viridis")

ax.set_xlabel(axis_columns[0])
ax.set_ylabel(axis_columns[1])
ax.set_zlabel(axis_columns[2])

plt.show()

In [None]:
# Davies-Bouldin index for 3 clusters on the 3-feature data
# (cluster and feature counts taken from the k-Means analysis).
print("optimum number of clusters :", 3, "best number of features : ", 3)
final_centroids_list, final_label_list = testFuzzyMeans(3, features_X_3)
data_labels = assign_final_labels(
    final_centroids_list, final_label_list, features_X_3
)
Dbi = DBIndex(final_centroids_list, data_labels)
print("DBIndex : clusters ", 3, " is ", Dbi)

In [None]:
# Davies-Bouldin index for 2 clusters on the 3-feature data
# (cluster and feature counts taken from the k-Means analysis).
print("optimum number of clusters :", 2, "best number of features : ", 3)
final_centroids_list, final_label_list = testFuzzyMeans(2, features_X_3)
data_labels = assign_final_labels(
    final_centroids_list, final_label_list, features_X_3
)
Dbi = DBIndex(final_centroids_list, data_labels)
print("DBIndex : clusters ", 2, " is ", Dbi)

In [None]:
# Sweep the cluster count from 2 through 10 on the 3-feature data, record the
# Davies-Bouldin index of each run, and plot the index against cluster count.
DBI_list = []
for i in range(2, 11):

    print("number of clusters :", i)
    final_centroids_list, final_label_list = testFuzzyMeans(i, features_X_3)
    data_labels = assign_final_labels(
        final_centroids_list, final_label_list, features_X_3
    )

    Dbi = DBIndex(final_centroids_list, data_labels)
    DBI_list.append(Dbi)
    print("DBIndex : cluster ", i, " is ", Dbi)

clusters = np.arange(2, 11)
DB_Index = np.array(DBI_list)

plt.plot(clusters, DB_Index)
plt.title('Fuzzy-C-Means with 3 features')
plt.xlabel('#clusters')
plt.ylabel('DB_Index')
plt.show()

#### Fuzzy clustering with 4 features

In [None]:
# Read the four exam-score columns used for the 4-feature clustering,
# then preview the first rows.
BSOM_data_4 = pd.read_csv(
    'BSOM_DataSet_revised.csv',
    usecols=['avg_exam1', 'avg_exam2', 'exam3_final', 'avg_exam4'],
)
BSOM_data_4.head()

In [None]:
# Feature matrix for clustering: every row and column of the 4-feature frame
# as a NumPy array (columns keep the frame's column order).
features_X_4 = BSOM_data_4.values

In [None]:
# Davies-Bouldin index for 3 clusters on the 4-feature data.
print("optimum number of clusters :", 3, " number of features : ", 4)
final_centroids_list, final_label_list = testFuzzyMeans(3, features_X_4)
data_labels = assign_final_labels(
    final_centroids_list, final_label_list, features_X_4
)
Dbi = DBIndex(final_centroids_list, data_labels)
print("DBIndex : clusters ", 3, " is ", Dbi)

In [None]:
# Davies-Bouldin index for 2 clusters on the 4-feature data.
print("optimum number of clusters :", 2, " number of features : ", 4)
final_centroids_list, final_label_list = testFuzzyMeans(2, features_X_4)
data_labels = assign_final_labels(
    final_centroids_list, final_label_list, features_X_4
)
Dbi = DBIndex(final_centroids_list, data_labels)
print("DBIndex : clusters ", 2, " is ", Dbi)

In [None]:
# Sweep the cluster count from 2 through 10 on the 4-feature data, record the
# Davies-Bouldin index of each run, and plot the index against cluster count.
DBI_list = []
for i in range(2, 11):

    print("number of clusters :", i)
    final_centroids_list, final_label_list = testFuzzyMeans(i, features_X_4)
    data_labels = assign_final_labels(
        final_centroids_list, final_label_list, features_X_4
    )

    Dbi = DBIndex(final_centroids_list, data_labels)
    DBI_list.append(Dbi)
    print("DBIndex : cluster ", i, " is ", Dbi)

clusters = np.arange(2, 11)
DB_Index = np.array(DBI_list)

plt.plot(clusters, DB_Index)
plt.title('Fuzzy-C-Means with 4 features')
plt.xlabel('#clusters')
plt.ylabel('DB_Index')
plt.show()