In [2]:
# Load libraries 
from sklearn import datasets 
from sklearn.preprocessing import StandardScaler
from sklearn.cluster import DBSCAN

### Load Iris Flower Dataset

In [6]:
# Load the Iris dataset that ships with scikit-learn
iris = datasets.load_iris()
# Display the container type the loader returned
type(iris)

sklearn.utils.Bunch

In [8]:
# Get the data features (the feature matrix only; no target labels are pulled here)
X = iris.data
# Preview the first five observations
X[:5]

array([[5.1, 3.5, 1.4, 0.2],
       [4.9, 3. , 1.4, 0.2],
       [4.7, 3.2, 1.3, 0.2],
       [4.6, 3.1, 1.5, 0.2],
       [5. , 3.6, 1.4, 0.2]])

### Standardize Features

In [9]:
# Standardize features: fit the scaler on X, then apply it to the same data
scaler = StandardScaler().fit(X)
X_std = scaler.transform(X)

# Preview the first five standardized observations
X_std[:5]

array([[-0.90068117,  1.01900435, -1.34022653, -1.3154443 ],
       [-1.14301691, -0.13197948, -1.34022653, -1.3154443 ],
       [-1.38535265,  0.32841405, -1.39706395, -1.3154443 ],
       [-1.50652052,  0.09821729, -1.2833891 , -1.3154443 ],
       [-1.02184904,  1.24920112, -1.34022653, -1.3154443 ]])

### Conduct DBSCAN Clustering 

`DBSCAN` has three main parameters to set:

* `eps`: The maximum distance from an observation for another observation to be considered its neighbor.

* `min_samples`: The minimum number of observations within `eps` distance of an observation for it to be considered a core observation.
* `metric`: The distance metric used to measure `eps`. For example, `minkowski`, `euclidean`, etc. (note that if Minkowski distance is used, the parameter `p` can be used to set the power of the Minkowski metric)

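After the model is fit below, the cluster assigned to each observation is available in its `labels_` attribute. If we look at the clusters in our training data we can see two clusters have been identified, 0 and 1, while outlier observations are labeled -1.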

In [10]:
# Create DBSCAN object; only n_jobs is passed, so eps, min_samples and metric
# are left at the library defaults
clt = DBSCAN(n_jobs=-1) # n_jobs: the number of parallel jobs to run (-1 requests all processors per the scikit-learn docs)
# Display the configured (not yet fitted) estimator
clt


DBSCAN(n_jobs=-1)

In [11]:
# Fit the DBSCAN model to the standardized features (X_std, not the raw X)
DBSCAN_model = clt.fit(X_std)
# Display the value returned by fit(), i.e. the estimator
DBSCAN_model

DBSCAN(n_jobs=-1)

### References: 

* [DBSCAN Clustering](https://chrisalbon.com/code/machine_learning/clustering/dbscan_clustering/)