In [1]:
import numpy as np
import pandas as pd
import scipy 
import sklearn
from collections import Counter
from sklearn.metrics import multilabel_confusion_matrix
from scipy import spatial

In [None]:
data = pd.read_csv(r"/content/data.csv",header=None)

In [None]:
labels = pd.read_csv(r"/content/label.csv",names=['label'],header=None)

In [None]:
train_data = data.iloc[:8000]
test_data = data.iloc[8000:]
train_labels = labels.iloc[:8000]
test_labels = labels.iloc[8000:]

In [None]:
class KMeans:
    
    def calculate_SSE(self, centroid_value_dict, centroid_dict,data):
        #centroid_value_dict - dictionary of centroids
        #centroid_dict - dict of centroids keys and data points indexes
        sse_data = 0
        for i in centroid_dict:
            sse_cluster = 0
            # np.sum()
            for j in centroid_dict[i]:
                dp = list(data.iloc[int(j)])
                for a,b in zip(centroid_value_dict[i],dp):
                    sse_cluster += (a-b)**2
            sse_data+=sse_cluster
        return sse_data    
    
    def Initialize_Centroids(self,data,K):
        m = data.shape[0]
        centroid_value_dict={}
        for i in range(K):
            r = np.random.randint(0, m-1)
            centroid_value_dict[i] = data.iloc[r]
        return centroid_value_dict
        #return centroid_list,centroid_dict
    
    def jaccard_similarity(self,centroid, dp):
        intersection = len(list(set(centroid).intersection(dp)))
        union = (len(set(centroid)) + len(set(dp))) - intersection
        return float(intersection) / union

    def train_Kmeans(self,data,K,max_iter=20,mode=1,tol=10):
        #Mode = 1 => Euclidean np.linalg.norm(x-list(data.iloc[i,:]))
        #Mode = 2 => Jaccard
        #Mode = 3 => Cosine
        centroid_value_dict = self.Initialize_Centroids(data,K)
        new_centroid_value_dict = {}
        count = 0
        centroid_dict = {}
        convergence = False
        while((count<max_iter) and not convergence):
            
            for i in list(centroid_value_dict.keys()):
                centroid_dict[i]=[]
            for i in range(data.shape[0]):
                x = data.iloc[i]
                if mode==1 :
                    distance_measure = [np.linalg.norm(x-centroid_value_dict[j])  for j in centroid_value_dict]
                    idx = np.argmin(distance_measure)
                    centroid_dict[idx].append(i)
                elif mode==2 :
                    distance_measure = [self.jaccard_similarity(list(x),centroid_value_dict[j]) for j in centroid_value_dict]
                    idx = np.argmax(distance_measure)
                    centroid_dict[idx].append(i)
                elif mode==3 :
                    distance_measure = [1-scipy.spatial.distance.cosine(x,list(centroid_value_dict[j]))  for j in centroid_value_dict]
                    idx = np.argmax(distance_measure)
                    centroid_dict[idx].append(i)
                
                prev_centroids=dict(centroid_value_dict)
                
            
            for i in centroid_dict:
                if len(centroid_dict[i]):
                    # print(centroid_dict[i])
                    dps_centroid = centroid_dict[i]
                    centroid_value_dict[i] = np.average(data.iloc[dps_centroid],axis=0)
                    #new_centroid = np.zeros(shape = (data.shape[1],))
                    # for j in (temp_dict[i]).astype('int'):
                    #     new_centroid = [new_centroid[i]+list(data.iloc[j,:])[i] for i in range(0,len(list(new_centroid)))]
                    # new_centroid = [int(c/len(temp_dict[i])) for c in new_centroid]
                    
                # print(i)
            
            
            current_tol=-1
            for i in centroid_value_dict:
                prev_centroid_point = prev_centroids[i]
                new_centroid_point = centroid_value_dict[i]
                change = np.sum(np.absolute(new_centroid_point-prev_centroid_point))
                current_tol = max(change, current_tol)
                
            print("Iteration ",count,": ",current_tol)
                
                # lst=[]
                # for j in range(0,len(list(centroid_value_dict[i]))):
                #     if centroid_value_dict[i][j]!=0:
                #         # dummy = (centroid_value_dict[i])
                #         lst.append((int(new_centroid_value_dict[i][j])-centroid_value_dict[i][j])/centroid_value_dict[i][j])*100
                #     else:
                #         lst.append(0)
                # g += np.sum(lst)/len(new_centroid_value_dict[i])
            # change = g/len(new_centroid_value_dict)
            # if change<10:
            #     break
            # centroid_value_dict =  new_centroid_value_dict
            
            count+=1
            if (current_tol<10):
                convergence = True
                break
           # print("KMeans Iteration",count)
        return centroid_value_dict,centroid_dict
    

In [None]:
def predict_cluster_labels(C, S, labels):
    '''
    Input : C -> Centroids
            S -> Set of Indicies corresponding to Centroid C
            data -> Data used to form clusters
    Output : Returns an array of size K having labels based on majority voting in the cluster
    '''
    cluster_labels = np.zeros(10,dtype=int)
    for c in C:
        labels_of_points = []
        for point in S[c]:
            labels_of_points.extend(labels.iloc[point])
        counter = Counter(labels_of_points)
        try:
            cluster_labels[c] = max(counter, key=counter.get)
        except:
            cluster_labels[c] = np.random.randint(0,9)
    return cluster_labels

In [None]:
def jaccard_similarity(centroid, dp):
        intersection = len(list(set(centroid).intersection(dp)))
        union = (len(set(centroid)) + len(set(dp))) - intersection
        return float(intersection) / union

In [None]:
def accuracy(centroids, centroid_Labels, test_data, true_labels, mode=1):
    y_true = list(true_labels['label']);
    y_pred = []
    for index in range(test_data.shape[0]):
        featureset = test_data.iloc[index]
        if mode==1:
            distances = [np.linalg.norm(featureset - centroids[centroid]) for centroid in centroids]
            classification = distances.index(min(distances))
            y_pred.append(centroid_Labels[classification])
        elif mode==3:
            similarity = [1 - spatial.distance.cosine(featureset, centroids[centroid]) for centroid in centroids]
            classification = similarity.index(max(similarity))
            y_pred.append(centroid_Labels[classification])
        elif mode==2:
            similarity = [jaccard_similarity(featureset, centroids[centroid]) for centroid in centroids]
            classification = similarity.index(max(similarity))
            y_pred.append(centroid_Labels[classification])
    denominator = test_data.shape[0]
    correctly_classified = 0
    for i in range(0,len(y_pred)):
        if y_true[i] == y_pred[i]:
            correctly_classified += 1
    accuracy = correctly_classified/denominator
    return accuracy

In [None]:
model1 = KMeans()
centroids1,clusters1 = model1.train_Kmeans(data,10, max_iter=100,mode=1)
Euclidean_SSE = model1.calculate_SSE(centroids1,clusters1,data)
print("Euclidean SSE for the dataset is:",Euclidean_SSE)

Iteration  0 :  24641.337320574166
Iteration  1 :  4766.558924118875
Iteration  2 :  3294.5984037657036
Iteration  3 :  2266.525141185276
Iteration  4 :  1241.3651282942465
Iteration  5 :  936.2173174048173
Iteration  6 :  835.5084561909478
Iteration  7 :  632.2531239838343
Iteration  8 :  719.7078260163994
Iteration  9 :  514.8514715094602
Iteration  10 :  574.8490566037736
Iteration  11 :  540.948343511569
Iteration  12 :  564.4357898531288
Iteration  13 :  650.0534373394698
Iteration  14 :  514.4732588957786
Iteration  15 :  503.75334889695375
Iteration  16 :  399.7028197003806
Iteration  17 :  397.87348490107615
Iteration  18 :  312.9782992125985
Iteration  19 :  213.2183915147771
Iteration  20 :  170.7448421676362
Iteration  21 :  183.62581173223657
Iteration  22 :  126.38525461731297
Iteration  23 :  94.66733870967757
Iteration  24 :  79.85712743291494
Iteration  25 :  83.64729345789766
Iteration  26 :  102.07994129858963
Iteration  27 :  137.85152838427953
Iteration  28 :  120.7

In [None]:
cluster_labels1 = predict_cluster_labels(centroids1,clusters1,labels)
cluster_labels1

array([6, 8, 4, 0, 7, 1, 3, 2, 0, 9])

In [None]:
Accuracy_Euclidean = accuracy(centroids1,cluster_labels1,test_data,test_labels)
Accuracy_Euclidean

0.6395

In [None]:
model2 = KMeans()
centroids2,clusters2 = model2.train_Kmeans(data, 10, max_iter=100, mode=2)
Jaccard_SSE = model2.calculate_SSE(centroids2, clusters2, data)
print("Jacard SSE for the dataset is:",Jaccard_SSE)

Iteration  0 :  39569.61111111111
Iteration  1 :  18777.948168195682
Iteration  2 :  16098.532799999999
Iteration  3 :  2972.331197041358
Iteration  4 :  852.7300989045384
Iteration  5 :  959.2396615664843
Iteration  6 :  772.3924000000002
Iteration  7 :  793.6394351313969
Iteration  8 :  1485.8107099656359
Iteration  9 :  795.7341245657572
Iteration  10 :  960.3338175951601
Iteration  11 :  867.0907951367781
Iteration  12 :  650.1208528046678
Iteration  13 :  0.0
Jacard SSE for the dataset is: 34364573698.62264


In [None]:
cluster_labels2 = predict_cluster_labels(centroids2,clusters2,labels)
cluster_labels2

array([1, 3, 7, 5, 5, 1, 3, 3, 8, 7])

In [None]:
Accuracy_Jaccard = accuracy(centroids2, cluster_labels2,test_data,test_labels)
Accuracy_Jaccard

0.115

In [None]:
model3 = KMeans()
centroids3,clusters3 = model3.train_Kmeans(data,10,max_iter = 100,mode=3)
Cosine_SSE = model3.calculate_SSE(centroids3,clusters3,data)
print("Cosine SSE for the dataset is:",Cosine_SSE)

Iteration  0 :  29641.134517766495
Iteration  1 :  9576.53135764944
Iteration  2 :  6785.850655693832
Iteration  3 :  3276.140481711816
Iteration  4 :  1966.9653830029345
Iteration  5 :  1179.5775232159585
Iteration  6 :  942.0550913568233
Iteration  7 :  1332.5876751139838
Iteration  8 :  1869.6141566642798
Iteration  9 :  2227.5347987021887
Iteration  10 :  1977.702185474166
Iteration  11 :  1235.2881827043166
Iteration  12 :  594.9379652809802
Iteration  13 :  393.3571384142738
Iteration  14 :  410.4846118217473
Iteration  15 :  284.5759157980631
Iteration  16 :  302.4935290948854
Iteration  17 :  284.2165068852952
Iteration  18 :  239.32770170381048
Iteration  19 :  177.62312260509952
Iteration  20 :  149.56044757689762
Iteration  21 :  176.70778996712147
Iteration  22 :  140.69894838817763
Iteration  23 :  132.865204057561
Iteration  24 :  91.23446492356513
Iteration  25 :  59.193840260763835
Iteration  26 :  96.07474170881108
Iteration  27 :  92.10725552050488
Iteration  28 :  35

In [None]:
cluster_labels3 = predict_cluster_labels(centroids3,clusters3,labels)
cluster_labels3

array([3, 6, 5, 4, 8, 0, 1, 2, 1, 7])

In [None]:
Accuracy_Cosine = accuracy(centroids3, cluster_labels3,test_data,test_labels)
Accuracy_Cosine

0.649