## DBSCAN Clustering Algorithm

In this algorithm, clusters are formed by linking nearby points to one another. This is different from KMeans Algorithm which has centroids. DBSCAN makes only a single pass through the data, unlike KMeans.

In [148]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

grades = pd.read_csv("data/grades.csv")
grades.head()

Unnamed: 0,Student,English,Math,Science
0,1,99,96,97
1,2,99,96,97
2,3,98,97,97
3,4,95,100,95
4,5,95,96,96


The 3D plot of the grades of the students looks as shown below:

In [149]:
from mpl_toolkits.mplot3d import Axes3D
%matplotlib notebook
%matplotlib notebook


fig = plt.figure()
ax = fig.add_subplot(111, projection='3d')

# Plot the values
ax.scatter(grades.iloc[:, 1], grades.iloc[:, 2], grades.iloc[:, 3], c=None, edgecolor='k', alpha=0.6)

ax.set_xlabel('English')
ax.set_ylabel('Maths')
ax.set_zlabel('Science')
ax.set_title('3 Clusters')

plt.show()

<IPython.core.display.Javascript object>

DBSCAN requires specifying two parameters, which are distance threshold, epsilon, and minimum number of samples. As we can see, all the points are put in just one cluster. It seems DBSCAN is not a good algorithm to apply on this kind of data.

In [150]:
from sklearn.cluster import DBSCAN
from sklearn import metrics
from collections import Counter

db = DBSCAN(eps=1, min_samples=10)
labels = db.fit(grades)

labels = db.labels_
print(Counter(labels))

labels

Counter({-1: 620})


array([-1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1,
       -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1,
       -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1,
       -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1,
       -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1,
       -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1,
       -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1,
       -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1,
       -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1,
       -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1,
       -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1,
       -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1,
       -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1,
       -1, -1, -1, -1, -1

In [151]:
def set_colors(labels, colors='rgbykcm'):
    colored_labels = []
    for label in labels:
        colored_labels.append(colors[label])
    return colored_labels

from mpl_toolkits.mplot3d import Axes3D
%matplotlib notebook
%matplotlib notebook

colors = set_colors(labels)

fig = plt.figure()
ax = fig.add_subplot(111, projection='3d')

# Plot the values
ax.scatter(grades.iloc[:, 1], grades.iloc[:, 2], grades.iloc[:, 3], c=colors, edgecolor='k', alpha=0.6)

ax.set_xlabel('English')
ax.set_ylabel('Maths')
ax.set_zlabel('Science')
ax.set_title('Clusters')

plt.show()

<IPython.core.display.Javascript object>

## DBSCAN on 'faithful' geyser dataset

The 'faithful' dataset has 2 variables, waiting time between eruptions and the duration of the eruption (of the Faithful geyser in Yellowstone National Park, Wyoming,USA). Load the data, which has 272 observations on 2 variables into a dataframe

In [152]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

faithful = pd.read_csv("data/faithful.csv")
faithful.head()

Unnamed: 0,eruptions,waiting
0,3.6,79
1,1.8,54
2,3.333,74
3,2.283,62
4,4.533,85


### Plot the data

In [153]:
from mpl_toolkits.mplot3d import Axes3D
%matplotlib notebook
%matplotlib notebook

fig = plt.figure()
ax = fig.add_subplot(111)

# Plot the values
ax.scatter(faithful.iloc[:, 0], faithful.iloc[:, 1], c=None, edgecolor='k', alpha=0.6)

ax.set_xlabel('Eruption_Time')
ax.set_ylabel('Waiting_Time')

ax.set_title('Original Data')

plt.show()

<IPython.core.display.Javascript object>

### Applied DBSCAN Algorithm with epsilon 10 and minimum samples 10

In [154]:
from sklearn.cluster import DBSCAN
from sklearn import metrics
from collections import Counter

db = DBSCAN(eps=1, min_samples=10)
labels = db.fit(faithful)

labels = db.labels_
print(Counter(labels))

labels

Counter({-1: 161, 2: 51, 0: 48, 1: 12})


array([ 0,  1, -1, -1,  2, -1, -1, -1, -1, -1,  1,  2,  0, -1,  2, -1, -1,
        2, -1,  0, -1, -1,  0, -1, -1,  2, -1,  0,  0,  0, -1,  0, -1, -1,
       -1, -1, -1, -1, -1, -1, -1, -1,  2, -1, -1,  2, -1, -1,  2, -1,  0,
       -1,  1, -1,  1,  2, -1, -1,  0,  2, -1,  2, -1,  2, -1, -1,  0,  0,
       -1, -1,  2, -1,  0, -1, -1,  0, -1,  0,  0,  2, -1,  2, -1, -1, -1,
       -1,  0, -1, -1, -1, -1, -1, -1,  0, -1, -1,  2, -1, -1,  2, -1, -1,
       -1,  2,  2, -1,  2, -1, -1,  2, -1, -1, -1,  0, -1,  2, -1, -1, -1,
       -1, -1, -1,  0, -1, -1,  2, -1,  2, -1, -1, -1,  2, -1, -1, -1,  2,
       -1, -1, -1,  0,  2, -1,  2,  0,  0, -1,  2, -1, -1,  1,  0,  0, -1,
        2, -1, -1,  2, -1,  1, -1, -1, -1, -1,  0, -1,  0, -1, -1, -1, -1,
       -1, -1,  0, -1,  2,  2, -1, -1, -1, -1,  1,  0,  2,  2, -1,  0,  2,
       -1,  2, -1,  2, -1,  0,  2,  0,  2, -1,  0, -1,  0, -1,  2, -1, -1,
        0, -1,  0,  2, -1,  2, -1, -1, -1, -1, -1,  0, -1, -1, -1,  0, -1,
        2,  1, -1,  0,  0

### Plot of the clusters

In [155]:
def set_colors(labels, colors='rgbykcm'):
    colored_labels = []
    for label in labels:
        colored_labels.append(colors[label])
    return colored_labels

from mpl_toolkits.mplot3d import Axes3D
%matplotlib notebook
%matplotlib notebook

colors = set_colors(labels)

fig = plt.figure()
ax = fig.add_subplot(111)

# Plot the values
ax.scatter(faithful.iloc[:, 0], faithful.iloc[:, 1], c=colors, edgecolor='k', alpha=0.6)

ax.set_xlabel('Eruption_Time')
ax.set_ylabel('Waiting_Time')

ax.set_title('Clusters')

plt.show()

<IPython.core.display.Javascript object>

### As the clusters obtained are not that good, I tried to remove some observations which have 'eruptions' value between 2.2 and 4.0 and see if the density based clustering works better. 

In [156]:
faithful_2 = faithful[(faithful['eruptions'] < 2.2) | (faithful['eruptions'] > 4.0)]
faithful_2.head()

Unnamed: 0,eruptions,waiting
1,1.8,54
4,4.533,85
6,4.7,88
8,1.95,51
9,4.35,85


### Plot after removing the observations

In [157]:
%matplotlib notebook
%matplotlib notebook

fig = plt.figure()
ax = fig.add_subplot(111)

# Plot the values
ax.scatter(faithful_2.iloc[:, 0], faithful_2.iloc[:, 1], c=None, edgecolor='k', alpha=0.6)

ax.set_xlabel('Eruption_Time')
ax.set_ylabel('Waiting_Time')

ax.set_title('Original Data')

plt.show()

<IPython.core.display.Javascript object>

### DBSCAN Algorithm with epsilon 1 and minimum samples 10

In [158]:
from sklearn.cluster import DBSCAN
from sklearn import metrics
from collections import Counter

db = DBSCAN(eps=1, min_samples=10)
labels = db.fit(faithful_2)

labels = db.labels_
print(Counter(labels))

labels

Counter({-1: 139, 1: 41, 0: 23})


array([-1, -1, -1, -1, -1, -1,  0, -1,  1, -1, -1,  1, -1, -1, -1, -1, -1,
       -1, -1,  0, -1,  0, -1, -1, -1, -1, -1, -1, -1, -1,  1, -1, -1, -1,
        1, -1, -1, -1, -1, -1, -1,  1, -1,  0,  1,  1, -1,  1, -1, -1,  0,
        0, -1, -1,  1, -1, -1, -1, -1, -1,  0, -1,  1, -1, -1, -1, -1, -1,
       -1, -1,  0, -1, -1,  1, -1,  1, -1, -1,  1,  1, -1,  1, -1, -1, -1,
       -1, -1, -1,  1, -1, -1, -1, -1,  0, -1, -1, -1,  1, -1, -1,  1, -1,
       -1,  1, -1, -1, -1,  1,  1,  0, -1, -1,  1, -1, -1, -1,  0,  1,  1,
       -1, -1, -1, -1,  0, -1, -1, -1, -1, -1,  0,  1,  1, -1, -1, -1,  0,
        1, -1,  0,  1, -1,  1, -1,  1, -1, -1,  1,  1,  0,  0, -1,  1, -1,
       -1,  0, -1,  0, -1,  1, -1, -1, -1, -1, -1, -1, -1,  1, -1, -1, -1,
        0,  0, -1, -1, -1, -1, -1, -1,  0, -1, -1, -1, -1,  1, -1, -1,  1,
       -1, -1,  1, -1, -1,  0,  1, -1,  1, -1, -1,  1, -1, -1, -1, -1], dtype=int64)

### As observed below, the clustering algorithm didn't work well

In [159]:
def set_colors(labels, colors='rgbykcm'):
    colored_labels = []
    for label in labels:
        colored_labels.append(colors[label])
    return colored_labels

from mpl_toolkits.mplot3d import Axes3D
%matplotlib notebook
%matplotlib notebook

colors = set_colors(labels)

fig = plt.figure()
ax = fig.add_subplot(111)

# Plot the values
ax.scatter(faithful_2.iloc[:, 0], faithful_2.iloc[:, 1], c=colors, edgecolor='k', alpha=0.6)

ax.set_xlabel('Eruption_Time')
ax.set_ylabel('Waiting_Time')

ax.set_title('Clusters')

plt.show()

<IPython.core.display.Javascript object>

### References

https://github.com/mlnjsh/DSBDA-Btech/blob/master/grades_km_input.csv<br>
Data Science and Big Data Analytics by Wiley Publications<br>
http://scikit-learn.org/stable/auto_examples/cluster/plot_cluster_iris.html#sphx-glr-auto-examples-cluster-plot-cluster-iris-py<br>
http://marcharper.codes/2016-07-11/Clustering+with+Scikit-Learn.html<br>
http://www.biostat.jhsph.edu/~rpeng/useRbook/faithful.csv<br
http://mccormickml.com/2016/11/08/dbscan-clustering/