# Implementasi K-Means

### Pembacaan Data

In [54]:
import pandas as pd
import random

# Load the Iris dataset from the CSV file into a DataFrame.
iris_csv_path = 'data/iris.csv'
iris = pd.read_csv(iris_csv_path)

### Pemisahan Data Training dan Label

In [55]:
import numpy as np
from sklearn.preprocessing import LabelEncoder

# Features: the first four columns of the frame, as a NumPy array.
feature_frame = iris.iloc[:, :4]
data_train = np.array(feature_frame)

# Labels: the last column, kept raw and also encoded by a fitted LabelEncoder.
iris_label = iris.iloc[:, -1]
species_encoder = LabelEncoder()
species_encoder.fit(iris_label)
iris_label_encoded = species_encoder.transform(iris_label)

### Implementasi Kelas KMeans

In [56]:
class KMeans:
    """Minimal K-Means clustering on a 2-D array of samples.

    Parameters
    ----------
    init_method : str
        ``"random"`` picks ``k`` distinct rows at random as the initial
        centroids; any other value spreads the initial centroids evenly
        over the row indices.
    k : int
        Number of clusters.
    eps : float
        Error tolerance. Iteration stops once the total point-to-centroid
        distance is at or below this threshold.
    max_iter : int
        Maximum number of iterations allowed (at least one pass always runs).

    Attributes set by ``fit``
    -------------------------
    centroids : list of arrays, one per cluster.
    classifications : dict mapping cluster index -> list of row indices.
    labels : list with the cluster index of every row, in row order.
    """

    def __init__(self, init_method="random", k=2, eps=0.1, max_iter=100):
        self.init_method = init_method
        self.k = k
        self.eps = eps
        self.max_iter = max_iter
        self.centroids = []
        self.classifications = {}
        self.data = None
        self.labels = None

    def fit(self, data):
        """Cluster ``data`` (rows are samples) and return ``self``.

        Raises ``ValueError`` for empty data, ``k < 1``, or (with random
        initialisation) ``k`` larger than the number of rows.
        """
        self.data = np.asarray(data)
        if len(self.data) == 0:
            raise ValueError("cannot fit KMeans on empty data")
        if self.k < 1:
            raise ValueError("k must be at least 1")

        self.initCentroid()
        iteration = 0
        while True:
            self.initClassClassification()
            self.assignClassClassification()
            new_centroids = self.calculateCentroid()
            error = self.calculateError(new_centroids)
            # Once the centroids stop moving, every later pass would repeat
            # the same assignment, so stopping here cannot change the result.
            stable = all(
                np.array_equal(old, new)
                for old, new in zip(self.centroids, new_centroids)
            )
            # Always keep the freshly computed means, including on the
            # converging pass.
            self.centroids = new_centroids
            iteration += 1
            if error <= self.eps or stable or iteration >= self.max_iter:
                break

        cluster_of = self._clusterLookup()
        self.labels = [
            cluster_of[idx] for idx in range(len(self.data)) if idx in cluster_of
        ]
        return self

    def initCentroid(self):
        """Choose the initial centroids according to ``init_method``."""
        n_rows = len(self.data)
        # Start from scratch so repeated fit() calls do not accumulate
        # centroids from earlier runs.
        self.centroids = []
        if self.init_method == "random":
            if self.k > n_rows:
                raise ValueError(
                    "k cannot exceed the number of samples for random initialisation"
                )
            # random.sample yields k distinct, in-range row indices.
            chosen = random.sample(range(n_rows), self.k)
        else:
            section = n_rows / self.k
            chosen = [int(i * section) for i in range(self.k)]
        self.centroids = [self.data[idx] for idx in chosen]

    def initClassClassification(self):
        """Reset the cluster -> member-indices mapping to k empty lists."""
        self.classifications = {i: [] for i in range(self.k)}

    def assignClassClassification(self):
        """Assign every row index to the cluster of its nearest centroid.

        Ties go to the lowest cluster index.
        """
        for idx, point in enumerate(self.data):
            distances = [
                self.calculateDistance(point, centroid)
                for centroid in self.centroids
            ]
            nearest = distances.index(min(distances))
            self.classifications[nearest].append(idx)

    def calculateDistance(self, vec1, vec2):
        """Return the Euclidean distance between two vectors."""
        diff = np.asarray(vec1) - np.asarray(vec2)
        return np.sqrt(np.sum(diff ** 2))

    def calculateCentroid(self):
        """Return the per-cluster feature means as a list of arrays.

        A cluster with no members keeps its current centroid instead of
        becoming NaN (the mean of an empty selection).
        """
        n_features = len(self.data[0])
        centroids = []
        for i in range(self.k):
            members = self.classifications[i]
            if members:
                centroid = [
                    np.average(self.data[members, j]) for j in range(n_features)
                ]
                centroids.append(np.array(centroid))
            else:
                centroids.append(np.array(self.centroids[i], dtype=float))
        return centroids

    def calculateError(self, centroids):
        """Return the summed distance of every row to its cluster's centroid.

        ``centroids`` is indexed by cluster; cluster membership is read from
        ``self.classifications``.
        """
        # Build the index -> cluster map once rather than scanning the
        # member lists for every row.
        cluster_of = self._clusterLookup()
        error = 0
        for idx in range(len(self.data)):
            error += self.calculateDistance(
                self.data[idx], centroids[cluster_of.get(idx)]
            )
        return error

    def getClusterIndex(self, idx):
        """Return the first cluster containing row ``idx``, or None if none does."""
        for k in range(self.k):
            if idx in self.classifications[k]:
                return k

    def _clusterLookup(self):
        """Map each assigned row index to its (first) cluster index."""
        lookup = {}
        for cluster in range(self.k):
            for idx in self.classifications[cluster]:
                lookup.setdefault(idx, cluster)
        return lookup

### Percobaan

In [57]:
# Imported here so this cell runs on a fresh, top-to-bottom execution;
# without it `confusion_matrix` is undefined at this point in the notebook.
from sklearn.metrics import confusion_matrix

# Cluster the training features with the custom KMeans implementation,
# using evenly spread initial centroids and three clusters.
clf = KMeans(init_method="distribute", k=3)
km = clf.fit(data_train)

# Cross-tabulate cluster assignments against the encoded labels;
# the cluster labels are passed as the first argument.
mat = confusion_matrix(km.labels, iris_label_encoded)

pd.crosstab(iris_label, np.array(km.labels))

| species \ col_0 | 0  | 1  | 2  |
|-----------------|----|----|----|
| setosa          | 50 | 0  | 0  |
| versicolor      | 0  | 48 | 2  |
| virginica       | 0  | 14 | 36 |


In [58]:
# Purity: sum of each row's largest count divided by the total count.
# Taking the row-wise maximum works for any number of rows rather than
# exactly three.
purity = float(mat.max(axis=1).sum()) / float(mat.sum())

print("Purity: ", purity)

Purity:  0.8933333333333333


### Perbandingan dengan K-Means dari Sklearn

In [61]:
from sklearn.cluster import KMeans as SklearnKmeans
from sklearn.metrics import confusion_matrix

# Fit scikit-learn's KMeans with three clusters on the same training features.
sklearn_model = SklearnKmeans(n_clusters=3)
km = sklearn_model.fit(data_train)

# Cross-tabulate its cluster assignments against the encoded labels.
mat = confusion_matrix(km.labels_, iris_label_encoded)

pd.crosstab(iris_label, km.labels_)

| species \ col_0 | 0  | 1  | 2  |
|-----------------|----|----|----|
| setosa          | 50 | 0  | 0  |
| versicolor      | 0  | 48 | 2  |
| virginica       | 0  | 14 | 36 |


In [60]:
# Purity: sum of each row's largest count divided by the total count.
# Taking the row-wise maximum works for any number of rows rather than
# exactly three.
purity = float(mat.max(axis=1).sum()) / float(mat.sum())

print("Purity: ", purity)

Purity:  0.8933333333333333
