# Implementation of Density-Based Spatial Clustering of Applications with Noise (DBSCAN)

In [2]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import OneHotEncoder
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix
from sklearn.model_selection import train_test_split
from sklearn import datasets
import matplotlib.pyplot as plt

In [5]:
class DB_Scan():
    """Density-based clustering (DBSCAN) built on a dense pairwise-distance matrix.

    A sample is treated as a core point when at least ``n_minpts`` samples
    (the sample itself included, since its self-distance is 0) lie within
    ``radius`` of it.  Clusters are grown outwards from core points; samples
    that are never reached keep the noise label ``-1``.

    Attributes set by ``predict``:
        clusters: list of clusters, each a list of row indices.
        visited_sample: list of row indices already assigned to a cluster.
        n_cluster: number of clusters found.
        labels: per-sample cluster id array (``-1`` for noise).
    """

    def __init__(self, radius=0.5, n_minpts=5):
        self.radius = radius
        self.n_minpts = n_minpts

    def euclidean_distance(self, x1, x2):
        """Return the Euclidean (L2) distance between two vectors."""
        return np.linalg.norm(x1 - x2)

    def all_distances(self, x):
        """Return the ``n x n`` matrix of distances between all rows of ``x``."""
        x = np.asarray(x)
        n_samples = x.shape[0]
        distance = np.zeros((n_samples, n_samples))
        for idx in range(n_samples):
            # The distance is symmetric, so evaluate each unordered pair once
            # and mirror it instead of computing both (i, j) and (j, i).
            for idy in range(idx, n_samples):
                pair_distance = self.euclidean_distance(x[idx], x[idy])
                distance[idx, idy] = pair_distance
                distance[idy, idx] = pair_distance
        return distance

    def get_neighbour(self, distance, idx):
        """Return the indices of all samples within ``radius`` of sample ``idx``."""
        return np.argwhere(distance[idx] <= self.radius).flatten()

    def grow_cluster(self, idx, neighbours):
        """Collect every sample density-reachable from the core point ``idx``.

        Args:
            idx: index of the seed sample; the caller is expected to have
                recorded it in ``self.visited_sample`` already.
            neighbours: per-sample sequences of neighbour indices.

        Returns:
            List of member indices, seed first, in depth-first visiting order.
            Every newly reached sample is also appended to
            ``self.visited_sample``.
        """
        # Set mirror of the visited list so membership tests are O(1).
        seen = set(self.visited_sample)
        cluster = [idx]
        # Explicit stack of neighbour iterators: same traversal order as a
        # recursive expansion, but without hitting the interpreter's
        # recursion limit on clusters made of long chains of core points.
        pending = [iter(neighbours[idx])]
        while pending:
            candidate = next(pending[-1], None)
            if candidate is None:
                pending.pop()
                continue
            candidate = int(candidate)
            if candidate in seen:
                continue
            seen.add(candidate)
            self.visited_sample.append(candidate)
            cluster.append(candidate)
            # Only core points propagate the cluster; border points are
            # members but their neighbours are not explored.
            if len(neighbours[candidate]) >= self.n_minpts:
                pending.append(iter(neighbours[candidate]))
        return cluster

    def get_cluster_labels(self, cluster, x):
        """Turn a list of clusters into a per-sample label array.

        Samples that appear in no cluster keep the noise label ``-1``.
        """
        labels = np.full(np.shape(x)[0], -1)
        for cluster_id, members in enumerate(cluster):
            for member in members:
                labels[member] = cluster_id
        return labels

    def predict(self, x):
        """Cluster the rows of ``x`` and return one label per row.

        Args:
            x: array-like of shape (n_samples, n_features).

        Returns:
            Integer array of cluster ids (``0 .. n_cluster - 1``) with ``-1``
            marking noise.  The same array is stored in ``self.labels``.
        """
        x = np.asarray(x)
        n_samples = x.shape[0]

        self.n_cluster = 0
        self.clusters = []  # one list of row indices per cluster
        self.visited_sample = []  # row indices already placed in a cluster
        self.labels = []  # replaced by the label array below

        # Pairwise distances, then the radius-neighbourhood of every sample.
        dist_all_points = self.all_distances(x)
        nb_array = [self.get_neighbour(dist_all_points, idx) for idx in range(n_samples)]

        # Boolean mask of samples already claimed by a cluster; it mirrors
        # visited_sample but gives constant-time lookups.
        assigned = np.zeros(n_samples, dtype=bool)
        for idx in range(n_samples):
            # Only unclaimed core points can seed a new cluster.
            if assigned[idx] or len(nb_array[idx]) < self.n_minpts:
                continue
            self.visited_sample.append(idx)
            new_cluster = self.grow_cluster(idx, nb_array)
            self.clusters.append(new_cluster)
            assigned[new_cluster] = True

        self.n_cluster = len(self.clusters)
        self.labels = self.get_cluster_labels(self.clusters, x)
        return self.labels













In [7]:
# Cluster the iris measurements with the hand-written DB_Scan (defaults:
# radius=0.5, n_minpts=5) and compare the cluster ids with the species codes.
iris = datasets.load_iris()
x, y = iris.data, iris.target
model = DB_Scan()
predictions = model.predict(x)
# accuracy_score takes (y_true, y_pred).
# NOTE: cluster ids are arbitrary, so this score only reflects quality when the
# ids happen to line up with the species codes; a permutation-invariant metric
# (e.g. adjusted Rand index) is the more robust comparison.
print("accuracy", accuracy_score(y, predictions))





accuracy 0.62


In [8]:
from sklearn.cluster import DBSCAN

# Reference run: scikit-learn's DBSCAN with eps=0.5 and min_samples=5 on the
# same iris features, scored the same way for a side-by-side comparison.
sk_model = DBSCAN(eps=0.5, min_samples=5)
sk_predictions = sk_model.fit_predict(x)
# accuracy_score takes (y_true, y_pred).
# NOTE: cluster ids are arbitrary, so this score only reflects quality when the
# ids happen to line up with the species codes.
print("accuracy", accuracy_score(y, sk_predictions))



accuracy 0.62
