# DBScan and Outlier Detection

In [None]:
import matplotlib.pyplot as plt
import numpy as np
import data_gen

In [None]:
# Build all example data sets in one list (same call order as the panels
# below), then unpack them so each is also available under its own name.
# NOTE(review): the four arguments of the second four_blobs call are
# presumably per-blob sample counts -- confirm against data_gen.
plot_data = [
    data_gen.four_blobs(),
    data_gen.four_blobs(100, 100, 400, 400),
    data_gen.mouse_shape(),
    data_gen.two_moons(),
    data_gen.circle(),
    data_gen.noise(),
]
(blob_data, uneven_blobs, mouse_data,
 moons_data, circle_data, noise_data) = plot_data

# One panel per data set on a 2x3 grid with shared axes, so the panels are
# directly comparable.
fig, axes = plt.subplots(nrows=2, ncols=3, figsize=(16, 9),
                         sharex=True, sharey=True)
for panel, points in zip(axes.flat, plot_data):
    xs, ys = points[:, 0], points[:, 1]
    panel.scatter(xs, ys, marker=".", color="#266662")
fig.savefig("cluster_plots.png", bbox_inches="tight", dpi=200)

## DBScan for Blob Data

In [None]:
from sklearn.cluster import DBSCAN

It can be difficult to get the DBScan parameters right. For medium-sized data sets, it helps to scan the parameter space and evaluate both the number of clusters found and the ratio of outliers in the data.

In [None]:
# Scan eps over a grid while keeping min_samples constant. For every eps,
# record how many clusters DBSCAN finds and which fraction of the points it
# labels as outliers.
number_of_clusters = []
ratio_of_outliers = []
test_epsilon = np.linspace(1, 50, 50) / 100.  # 0.01, 0.02, ..., 0.50
for eps in test_epsilon:
    dbscan = DBSCAN(eps=eps, min_samples=5)
    # fit_predict returns one cluster label per data point (the same array
    # as dbscan.labels_); outliers are given the label -1.
    assignment = dbscan.fit_predict(blob_data)
    is_outlier = assignment < 0
    # Each distinct non-negative label corresponds to one cluster found.
    number_of_clusters.append(len(np.unique(assignment[~is_outlier])))
    # Number of points labelled as outliers divided by the total number of
    # data points.
    ratio_of_outliers.append(len(assignment[is_outlier]) / len(assignment))



In [None]:
# Plot both scan results against eps on a shared x-axis: the number of
# clusters on the left y-axis, the outlier ratio on the right y-axis. The
# axis-label colours match the corresponding curves.
fig, ax1 = plt.subplots()
ax1.plot(test_epsilon, number_of_clusters,
         color="#266662", label="Clusters")
# Raw string: "\e" is not a valid Python escape sequence and triggers a
# warning in a normal string literal. The resulting text is unchanged.
ax1.set_xlabel(r"$\epsilon$-Parameter")
# Only show the lower part of the scanned eps range.
ax1.set_xlim(0.01, 0.3)
ax1.set_ylabel("Number of Clusters found", color="#266662")

# Second y-axis that shares the x-axis with ax1.
ax2 = ax1.twinx()
ax2.plot(test_epsilon, ratio_of_outliers,
         color="#9E5E9B", linestyle="--", label="Outliers")
# Restrict the view to outlier ratios of at most 1%.
ax2.set_ylim(0, 0.01)
ax2.set_ylabel("Ratio of Outliers", color="#9E5E9B")
fig.savefig("eps_adjustment.png", dpi=200, bbox_inches="tight")
plt.show()

The plateau around eps=0.1 looks most promising. To visualize how the clustering changes with eps, the cluster results for two other eps values are plotted as well.

In [None]:
from matplotlib.colors import ListedColormap
# Cluster the blob data with three different eps values and show the results
# side by side, one panel per eps.
eps_list = [0.03, 0.1, 0.2]
fig, axes = plt.subplots(1, 3, sharey=True, figsize=(8,2))
cmap = ListedColormap(["#266662", "#9E5E9B", "#ED5654", "#B68E15", "#00B0F0"])

for eps, ax in zip(eps_list, axes):
    dbscan = DBSCAN(eps=eps, min_samples=5)
    # One label per data point; outliers are labelled -1.
    assignment = dbscan.fit_predict(blob_data)
    # Points with a non-negative label belong to some cluster (core and
    # border points alike); the remaining points are outliers.
    in_cluster = assignment >= 0
    clustered = blob_data[in_cluster]
    outliers = blob_data[~in_cluster]
    # Clustered points are coloured by their cluster label, outliers in grey.
    ax.scatter(clustered[:, 0], clustered[:, 1], c=assignment[in_cluster],
               cmap=cmap, marker=".", alpha=0.5)
    ax.scatter(outliers[:, 0], outliers[:, 1], c="grey",
               marker=".")
    # Label each panel so the figure is readable without the code.
    ax.set_title(r"$\epsilon$ = {}".format(eps))

fig.savefig("DBScan_eps.png", dpi=200, bbox_inches="tight")
plt.show()

## Performance of DBScan on all Data Sets

In [None]:
from matplotlib.colors import ListedColormap
# Run DBSCAN with one shared min_samples value on every example data set and
# draw each result in its own panel of a 2x3 grid.
cmap = ListedColormap(["#266662", "#9E5E9B", "#ED5654", "#B68E15"])
fig, axes = plt.subplots(nrows=2, ncols=3, figsize=(16, 9),
                         sharex=True, sharey=True)
min_samples = 2
eps_list = [1] * 6  # one eps per data set; currently the same for all

for points, radius, panel in zip(plot_data, eps_list, axes.flat):
    dbscan = DBSCAN(min_samples=min_samples, eps=radius)
    # fit_predict returns the per-point labels (identical to dbscan.labels_);
    # a negative label marks an outlier.
    assignment = dbscan.fit_predict(points)
    in_cluster = assignment >= 0

    # Points assigned to a cluster: circles, coloured by cluster label.
    members = points[in_cluster]
    member_labels = assignment[in_cluster]
    panel.scatter(members[:, 0], members[:, 1], c=member_labels,
                  cmap=cmap, marker="o", alpha=0.5)

    # Outliers: grey crosses.
    rejected = points[assignment < 0]
    panel.scatter(rejected[:, 0], rejected[:, 1], c="grey", marker="x")

fig.savefig("DBScan_all_data.png", dpi=200, bbox_inches="tight")
plt.show()