In [1]:
import numpy as np
import pandas as pd
from sklearn.metrics.pairwise import euclidean_distances
import matplotlib.pyplot as plt
%matplotlib inline
from collections import Counter

In [14]:
class DBSCAN:
    
    def __init__(self, X, eps, min_pts):
        self.eps = eps
        self.min_pts = min_pts
        self.X = X
        self.n_rows = X.shape[0]
        
        #  0 - unclassified
        # -1 - Noise
        # >0 - cluster id the point belngs to
        self.lable = np.zeros(self.n_rows, dtype='int16')
        
        # preconpute distance matrix
        self.distances = euclidean_distances(self.X)
            
    def rangeQuery(self, pid):
        return np.where(self.distances[pid] <= self.eps)[0]
    
    def computeGini(self):
        Mj = self.conf_mat.sum(axis=1, keepdims=True)
        gj = 1 - ((self.conf_mat/Mj)**2).sum(axis=1, keepdims=True)
        return (sum(gj*Mj)/sum(Mj))[0]
    
    def computePurity(self):
        Pj = self.conf_mat.max(axis=1)
        Mj = self.conf_mat.sum(axis=1)
        return sum(Pj)/sum(Mj)
        
    def evaluateConfusion(self, y):
        c = len(np.unique(self.lable))
        d = len(np.unique(y))
        self.conf_mat = np.zeros(shape=(c - 1, d))
        
        # for noise (also change dimention of conf_mat)
        # self.conf_mat[0] = np.bincount(y[np.where(self.lable == -1)[0]].ravel(), minlength=d)
        
        # for clusters
        for cluster_id in range(1, c):
            true_labels = y[np.where(self.lable == cluster_id)[0]].ravel()
            self.conf_mat[cluster_id - 1] = np.bincount(true_labels, minlength=d)
    
    def fit(self):
        c_id = 0
        
        for n in range(self.n_rows):
            if self.lable[n] != 0:
                continue
            
            # get neighbours for point
            neighbours = self.rangeQuery(n)
            
            # density check
            if len(neighbours) < self.min_pts:
                self.lable[neighbours] = -1
                continue
            
            c_id += 1
            self.lable[n] = c_id
            
            # grow cluster
            idx = 0
            while idx < len(neighbours):
                _n = neighbours[idx]
                idx += 1
                
                # check if already processed
                if self.lable[_n] > 0:
                    continue
                    
                # add point to cluster
                self.lable[_n] = c_id
                _neighbours = self.rangeQuery(_n)
                
                # if core point, add to original neighbours
                if len(_neighbours) >= self.min_pts:
                    neighbours = np.concatenate((neighbours, _neighbours))

## FASHION

In [3]:
fashion_df = pd.read_csv('../data/fashion/fashion-mnist_train.csv')
y = fashion_df['label']
del fashion_df['label']
fashion_X = fashion_df.values
del fashion_df

Sample points and shuffle

In [4]:
label_idx = {}
sample_size = 600
for idx in range(len(np.unique(y))):
    label_idx[idx] = np.where(y == idx)[0]

np.random.seed(42)
for idx in range(len(label_idx)):
    label_idx[idx] = np.random.choice(label_idx[idx], sample_size, replace=False)

fashion_sub = np.empty((sample_size * len(label_idx), fashion_X.shape[1]))
y_sub = np.empty((sample_size * len(label_idx), 1), dtype='int16')
for idx in range(len(label_idx)):
    start = idx * sample_size
    end = (idx + 1) * sample_size
    fashion_sub[start:end] = fashion_X[label_idx[idx]]
    y_sub[start:end] = idx

s = np.arange(sample_size * len(label_idx))
np.random.shuffle(s)
fashion_sub = fashion_sub[s]
y_sub = y_sub[s]

DBSCAN on fashion

In [18]:
fashion_db = DBSCAN(X=fashion_sub, eps=800, min_pts=5)
fashion_db.fit()
fashion_db.evaluateConfusion(y_sub)
print(Counter(fashion_db.lable))
print('Gini Index: {0}'.format(fashion_db.computeGini()))
print('Purity: {0}'.format(fashion_db.computePurity()))

Counter({-1: 5323, 2: 332, 1: 139, 3: 98, 7: 25, 5: 25, 8: 19, 4: 17, 6: 14, 9: 8})
Gini Index: 0.145870468948873
Purity: 0.8788774002954209


In [20]:
fashion_db = DBSCAN(X=fashion_sub, eps=700, min_pts=5)
fashion_db.fit()
fashion_db.evaluateConfusion(y_sub)
print(Counter(fashion_db.lable))
print('Gini Index: {0}'.format(fashion_db.computeGini()))
print('Purity: {0}'.format(fashion_db.computePurity()))

Counter({-1: 5744, 1: 109, 4: 70, 3: 50, 2: 9, 5: 8, 6: 5, 7: 5})
Gini Index: 0.10999999999999999
Purity: 0.90625
