# Implementasi DBSCAN

### Pembacaan Data

In [353]:
import pandas as pd
import heapq
from collections import Counter
from sklearn.metrics import confusion_matrix

iris = pd.read_csv('iris.csv')
#source: https://raw.githubusercontent.com/uiuc-cse/data-fa14/gh-pages/data/iris.csv

### Pemisahan Data Training dan Label

In [354]:
import numpy as np
from sklearn.preprocessing import LabelEncoder

data_train = np.array(iris.iloc[:, 0:4])
iris_label = iris.iloc[:,-1]
species_encoder = LabelEncoder().fit(iris_label)
iris_label_encoded = species_encoder.transform(iris_label)

### Implementasi Kelas DBSCAN

In [355]:
class DBSCAN:

    def __init__(self, eps=2, minPts=2):
        self.data = None
        self.data_len = None
        self.label = None
        self.eps = eps
        self.minPts = minPts
    
    def euclidean(self, Q, P):
        return np.linalg.norm(self.data[Q]-self.data[P])
    
    def n_neighbors(self, i):
        Neighbors = []
        for j in range(self.data_len):
            if self.euclidean(i, j) <= self.eps:
                Neighbors.append(j)
        return Neighbors

    def DFS(self, S, j, C):
        if self.label[S[j]] == -1: self.label[S[j]] = C
        if self.label[S[j]] == None:
            self.label[S[j]] = C
            N = self.n_neighbors(S[j])
            if len(N) >= self.minPts:
                for i in range(len(N)):
                    S.append(N[i])
            if len(S) != j+1:
                self.DFS(S,j+1,C)
    
    def fit(self, data_train):
        self.data = data_train
        self.data_len = len(self.data)
        self.label = [None] * self.data_len
        C = 0
        for i in range(self.data_len):
            if self.label[i] != None: continue
            N = self.n_neighbors(i)
            if len(N) < self.minPts:
                self.label[i] = -1
                continue

            C = C + 1
            self.label[i] = C
            S = [x for x in N if not x == i]
            if len(S) != 0:
                self.DFS(S,0,C)
        return self

### Percobaan

In [360]:
clf = DBSCAN(1,2)
dbscan = clf.fit(data_train)
mat = confusion_matrix(dbscan.label, iris_label_encoded)

pd.crosstab(iris_label, np.array(dbscan.label))

col_0,1,2,3,4,5,6,7,8,9,10,...,28,29,30,31,32,33,34,35,36,37
species,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
setosa,48,1,1,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
versicolor,0,0,0,15,28,1,1,1,1,1,...,0,0,0,0,0,0,0,0,0,0
virginica,0,0,0,6,0,0,0,0,0,0,...,1,1,1,1,1,1,1,1,1,1


In [366]:
purity = float(mat[1].max() + mat[5].max() + mat[13].max()) / float(mat.sum())

print("Purity: ", purity)

('Purity: ', 0.64)


### Perbandingan dengan DBSCAN dari Sklearn

In [371]:
from sklearn.cluster import DBSCAN as SklearnDBSCAN

dbscan = SklearnDBSCAN(eps=1, min_samples=2).fit(data_train)
mat = confusion_matrix(dbscan.labels_, iris_label_encoded)

pd.crosstab(iris_label, dbscan.labels_)

col_0,0,1
species,Unnamed: 1_level_1,Unnamed: 2_level_1
setosa,50,0
versicolor,0,50
virginica,0,50


In [372]:
purity = float(mat[0].max() + mat[1].max() + mat[2].max()) / float(mat.sum())

print("Purity: ", purity)

('Purity: ', 0.6666666666666666)
