# DBSCAN Clustering

In [1]:
import pandas as pd
import numpy as np
from bokeh.io import output_notebook, show, curdoc, push_notebook
from bokeh.models import ColumnDataSource,HoverTool,ColorBar
from bokeh.plotting import figure, show, output_notebook
from sklearn.cluster import DBSCAN
import matplotlib.pyplot as plt
from bokeh.layouts import layout
from bokeh.layouts import row
from ipywidgets import interact
import ipywidgets as widgets
from ipywidgets import interact, interactive, fixed, interact_manual, IntSlider

# Configure Bokeh to display its plots inline in this notebook.
output_notebook()

# Load the dataset; the path is relative to the notebook's working directory.
data = pd.read_csv(filepath_or_buffer='Wholesale customers data.csv')

In [2]:
# Remove the 'Channel' and 'Region' columns in a single call.
# errors='ignore' keeps this cell re-runnable: without it, executing the cell a
# second time raises KeyError because the columns are already gone.
data = data.drop(columns=['Channel', 'Region'], errors='ignore')
data.head(10)

Unnamed: 0,Fresh,Milk,Grocery,Frozen,Detergents_Paper,Delicassen
0,12669,9656,7561,214,2674,1338
1,7057,9810,9568,1762,3293,1776
2,6353,8808,7684,2405,3516,7844
3,13265,1196,4221,6404,507,1788
4,22615,5410,7198,3915,1777,5185
5,9413,8259,5126,666,1795,1451
6,12126,3199,6975,480,3140,545
7,7579,4956,9426,1669,3321,2566
8,5963,3648,6192,425,1716,750
9,6006,11093,18881,1159,7425,2098


### Dataset loaded into dataframe

### Creating the DBSCAN model, bokeh figure and plotting circle glyphs

In [3]:
# Initial clustering on the (Milk, Grocery) plane; the labels are stored on the frame.
db = DBSCAN(eps=5000, min_samples=40)
db.fit(data[['Milk','Grocery']])
data['cluster'] = db.labels_

# Black is reserved for outliers (label -1), so every cluster label maps to a
# non-black colour.
colormap = {-1: 'black', 0: 'red', 1: 'blue', 2: 'green', 3: 'yellow', 4: 'orange', 5: 'purple', 6: 'navy', 7: 'pink'}
# Cluster labels beyond the last entry wrap around instead of raising KeyError.
n_cluster_colors = len(colormap) - 1
colors = [colormap[label if label < 0 else label % n_cluster_colors] for label in data['cluster']]

p = figure(title = "Wholesale Customers Data", tools="hover,lasso_select,pan,wheel_zoom,box_zoom,reset,save")
p.xaxis.axis_label = 'Milk'
p.yaxis.axis_label = 'Grocery'

x = data['Milk'].tolist()
y = data['Grocery'].tolist()

# Keep the renderer: update() rewrites its data source on every widget change.
r = p.circle(x, y, color=colors, fill_alpha=0.2, size=10)


### Defining the function that updates the plot based on user interaction

In [4]:
def update(X_Axis,Y_Axis,Radius,Minimum_Samples):
    """Re-run DBSCAN with the widget values and redraw the existing scatter plot.

    Parameters
    ----------
    X_Axis, Y_Axis : str
        Names of the columns of ``data`` to cluster on and to plot on the
        x and y axes.
    Radius : int or float
        Passed to DBSCAN as ``eps``.
    Minimum_Samples : int
        Passed to DBSCAN as ``min_samples``.

    Side effects
    ------------
    Stores the new labels in ``data['cluster_updated']``, overwrites the
    ``x``, ``y``, ``fill_color`` and ``line_color`` columns of the glyph
    renderer ``r``, relabels the axes of ``p`` and calls ``push_notebook()``.
    """
    db_new = DBSCAN(eps=Radius, min_samples=Minimum_Samples)
    db_new.fit(data[[X_Axis, Y_Axis]])
    labels = db_new.labels_
    data['cluster_updated'] = labels

    # Black is reserved for outliers (negative label). Cluster labels cycle
    # through the palette, so a slider setting that yields more clusters than
    # there are colours no longer raises KeyError.
    cluster_palette = ['red', 'blue', 'green', 'yellow', 'orange', 'purple', 'navy', 'pink']
    colors_new = [
        'black' if label < 0 else cluster_palette[label % len(cluster_palette)]
        for label in labels
    ]

    # Plain lists, matching how the data source was first populated.
    x_new = data[X_Axis].tolist()
    y_new = data[Y_Axis].tolist()

    source = r.data_source
    source.data['x'] = x_new
    source.data['y'] = y_new
    source.data['fill_color'] = colors_new
    source.data['line_color'] = colors_new

    p.xaxis.axis_label = X_Axis
    p.yaxis.axis_label = Y_Axis

    # Send the changes to the plot already rendered in the notebook.
    push_notebook()

In [5]:
# Render the figure and request a notebook handle so that later
# push_notebook() calls can refresh this output in place.
show(obj=p, notebook_handle=True)

### Please use the interactive tools below to modify the axes, the radius and the minimum number of samples

In [6]:
# Offer only the feature columns as axis choices. The label columns ('cluster'
# from the initial fit, 'cluster_updated' written by update) are excluded by
# name, so re-running this cell always yields the same options.
feature_columns = [column for column in data.columns if column not in ('cluster', 'cluster_updated')]

# Start the dropdowns on Milk / Grocery so the first automatic call to update
# keeps the axes the figure was originally drawn with.
interact(
    update,
    X_Axis=widgets.Dropdown(options=feature_columns, value='Milk'),
    Y_Axis=widgets.Dropdown(options=feature_columns, value='Grocery'),
    Radius=IntSlider(min=1, max=50000, step=1, value=5000),
    Minimum_Samples=IntSlider(min=3, max=100, step=1, value=40),
)

<function __main__.update>

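### DBSCAN builds clusters around "core" points: a point is a core point when at least 'minimum number of samples' points (itself included) lie within 'radius' of it. Points that are within 'radius' of a core point join its cluster; points that no core point reaches are labelled outliers.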

### Note: Black circles are outliers

### It can be noticed that increasing the radius tends to reduce the number of outliers, while increasing the minimum number of samples tends to increase it.