# Density Based Spatial Clustering of Applications with Noise (DB-SCAN)

### Goal of this Notebook
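This notebook clusters trajectory origin points with scikit-learn's DBSCAN, a density-based clustering algorithm for nonlinear data. It is deprecated for the Trajectories pipeline and is kept here for reference or later use.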
***
**Outputs:**
- None (for now).

**Inputs:**
- _trajectories.csv_ from raw trajectories data. Available on Dropbox in `/Private Structured data collection/Data processing/Auxiliary files/Demand/Flow_speed/Trajectories`.
- _trajectories_clusterd.csv_ from raw trajectories data. Available on Dropbox in `/Private Structured data collection/Data processing/Auxiliary files/Demand/Flow_speed/Trajectories`.
- _InternalCentroidZones.shp_ Shapefile available on Dropbox in `/Private Structured data collection/Data processing/Raw/Demand/OD demand/TAZ`
- _ExternalCentroidZones.shp_ Shapefile available on Dropbox in `/Private Structured data collection/Data processing/Raw/Demand/OD demand/TAZ`

**Temporary Files Within the Pipeline:** 
- No temporary files.

**Dependent Scripts:**
- No script dependencies.

**Dependent Libraries:**
- numpy
- pandas
- os
- csv
- json
- matplotlib
- shapely
- keplergl
- geopandas
- rtree
- scikit-learn (`sklearn`)
- gmplot (only needed by `DensityCluster.gmapPlot`)
***
**Sections:**
- A. [DB-SCAN Clustering](#section_ID_b)

In [5]:
import os
import sys
import csv
import json
import rtree
import numpy as np
import pandas as pd
import geopandas as gpd
import matplotlib.pyplot as plt

from sklearn import metrics
from keplergl import KeplerGl
from sklearn.cluster import DBSCAN
from shapely.geometry import Point, LineString, MultiPoint
from trajectories_utils import parseTrajectories, clusterByZone, trajectoriesFromZones, showTrajectoriesFromZones


In [2]:
# Put the directory two levels above this notebook on sys.path so that the
# fremontdropbox module can be imported from there.
module_path = os.path.abspath('../..')
if module_path not in sys.path:
    sys.path.append(module_path)

from fremontdropbox import get_dropbox_location

# Local Dropbox folder, as reported by the fremontdropbox helper.
dropbox_dir = get_dropbox_location()

# Location of the raw "Here data" folder, relative to the Dropbox folder.
here_data_subpath = "/Private Structured data collection/Data processing/Raw/Demand/Flow_speed/Here data"
rootdir = dropbox_dir + here_data_subpath
print(rootdir)

C:\Users\jainc\Fremont Dropbox\Theophile Cabannes/Private Structured data collection/Data processing/Raw/Demand/Flow_speed/Here data


<a id="section_ID_b"></a>
## A. DB-SCAN Clustering

In [3]:
class DensityCluster:
    """DBSCAN clustering of trajectory origin points, plus plotting helpers.

    The input CSV must provide the columns 'Origin Y' (latitude),
    'Origin X' (longitude) and 'Time' (a timestamp string containing 'HH:MM').

    Attributes:
        df: DataFrame read from ``data_input``.
        labels: DBSCAN label of every row of ``df`` (-1 means noise); None
            until createClusters() has run.
        clusters: pandas Series holding one (n_points, 2) array of
            (lat, lon) rows per cluster.
        numClusters: number of clusters found, noise excluded.
        maxes: tallest histogram bin of each cluster, filled by
            showTimeClusters() and used as marker weights by
            showCentroidClusters().
        rep_points: DataFrame with columns 'lon' and 'lat' holding the
            centermost point of every cluster.
    """

    # Mean earth radius in km; dividing a distance in km by it yields radians.
    KMS_PER_RADIAN = 6371.0088

    def __init__(self, data_input):
        """Load the trajectories CSV located at ``data_input``."""
        self.df = pd.read_csv(data_input)
        self.maxes = []
        self.rep_points = []
        self.clusters = []
        self.numClusters = 0
        self.labels = None

    def createClusters(self, tuning_parameter=0.1, min_samples=100):
        """Run DBSCAN (haversine metric) on the origin points.

        Args:
            tuning_parameter: neighbourhood radius (DBSCAN ``eps``) in
                kilometres.
            min_samples: minimum number of neighbours for a core point;
                points that end up in no cluster are labelled as noise.
        """
        # represent GPS points as (lat, lon)
        # DataFrame.as_matrix() was removed in pandas 1.0; to_numpy() replaces it.
        coords = self.df[['Origin Y', 'Origin X']].to_numpy()
        # the haversine metric works on radians, so convert the km radius
        epsilon = tuning_parameter / self.KMS_PER_RADIAN

        db = DBSCAN(eps=epsilon, min_samples=min_samples,
                    algorithm='ball_tree', metric='haversine').fit(np.radians(coords))
        self.labels = db.labels_
        # get the number of clusters (ignore noisy samples which are given the label -1)
        self.numClusters = len(set(self.labels) - {-1})

        print('Clustered ' + str(len(self.df)) + ' points to ' + str(self.numClusters) + ' clusters')

        # One coordinate array per cluster. The object array is filled
        # explicitly so pandas never tries to stack equally sized clusters.
        groups = np.empty(self.numClusters, dtype=object)
        for n in range(self.numClusters):
            groups[n] = coords[self.labels == n]
        self.clusters = pd.Series(groups)

        # Results derived from a previous clustering are stale now.
        self.maxes = []
        self.rep_points = []

    def getCentermostPoint(self, cluster):
        """Return the cluster member closest to the cluster centroid.

        Args:
            cluster: array-like of shape (n_points, 2) with (lat, lon) rows
                in degrees.

        Returns:
            A (lat, lon) tuple taken from ``cluster``.

        Raises:
            ValueError: if ``cluster`` is empty or not two-column.
        """
        points = np.asarray(cluster, dtype=float)
        if points.ndim != 2 or points.shape[0] == 0 or points.shape[1] != 2:
            raise ValueError('cluster must be a non-empty sequence of (lat, lon) pairs')

        # The centroid of a set of points is the mean of their coordinates.
        centroid_lat, centroid_lon = np.radians(points.mean(axis=0))
        lat = np.radians(points[:, 0])
        lon = np.radians(points[:, 1])

        # Haversine term; the great-circle distance grows monotonically with
        # it, so its argmin is the closest point and no radius is needed.
        hav = (np.sin((lat - centroid_lat) / 2.0) ** 2
               + np.cos(lat) * np.cos(centroid_lat) * np.sin((lon - centroid_lon) / 2.0) ** 2)
        return tuple(points[int(np.argmin(hav))])

    def _computeRepPoints(self):
        """Store and return the centermost point of every cluster."""
        centers = [self.getCentermostPoint(cluster) for cluster in self.clusters]
        self.rep_points = pd.DataFrame({'lon': [c[1] for c in centers],
                                        'lat': [c[0] for c in centers]})
        return self.rep_points

    def showCentroidClusters(self):
        """Scatter all GPS points together with the weighted cluster centers.

        Raises:
            IndexError: if showTimeClusters() has not filled ``self.maxes``
                for every cluster yet.
        """
        # get the centermost point for each cluster
        rep_points = self._computeRepPoints()
        count = len(rep_points)
        if count == 0:
            print('No clusters to plot.')
            return
        if len(self.maxes) < count:
            raise IndexError('self.maxes holds {0} entries for {1} clusters; '
                             'call showTimeClusters() first'.format(len(self.maxes), count))

        fig, ax = plt.subplots(figsize=[15, 10])

        # NOTE(review): the first center is scaled by /10 and the others by *2,
        # as in the original code; confirm that this difference is intended.
        rs_scatter = ax.scatter(rep_points['lon'][0], rep_points['lat'][0], c='#99cc99',
                                edgecolor='None', alpha=0.7, s=self.maxes[0] / 10)
        for i in range(1, count):
            ax.scatter(rep_points['lon'][i], rep_points['lat'][i], c='#99cc99',
                       edgecolor='None', alpha=0.7, s=self.maxes[i] * 2)

        df_scatter = ax.scatter(self.df['Origin X'], self.df['Origin Y'], c='k', alpha=0.9, s=3)

        ax.set_title('Full GPS trace vs. DBSCAN clusters')
        ax.set_xlabel('Longitude')
        ax.set_ylabel('Latitude')
        ax.legend([df_scatter, rs_scatter], ['GPS Points', 'Cluster Centers'], loc='upper right')

        labels = ['Cluster {0}'.format(i) for i in range(1, count + 1)]
        for label, x, y in zip(labels, rep_points['lon'], rep_points['lat']):
            ax.annotate(
                label,
                xy=(x, y), xytext=(-25, -30),
                textcoords='offset points', ha='right', va='bottom',
                bbox=dict(boxstyle='round,pad=0.5', fc='white', alpha=0.5),
                arrowprops=dict(arrowstyle='->', connectionstyle='arc3,rad=0'))

        plt.show()

    @staticmethod
    def _hourFromTimestamp(timestamp):
        """Return the digits directly before the first ':' of ``timestamp``.

        Works for zero-padded ('... 08:15') and single-digit ('... 8:15')
        hours. Raises ValueError when the text contains no ':'.
        """
        text = str(timestamp)
        colon = text.index(':')
        start = colon
        while start > 0 and text[start - 1].isdigit():
            start -= 1
        return text[start:colon]

    def groupByTime(self, row):
        """Return the hour, as a digit string, of the first record at ``row``.

        Args:
            row: (lat, lon) pair that matches 'Origin Y' / 'Origin X' exactly.

        Raises:
            IndexError: if no record has these coordinates.
        """
        matches = self.df[(self.df['Origin Y'] == row[0]) & (self.df['Origin X'] == row[1])]
        return self._hourFromTimestamp(matches['Time'].iloc[0])

    def _hoursForCluster(self, index):
        """Return the integer hour of every point in cluster ``index``."""
        labels = getattr(self, 'labels', None)
        if labels is not None and len(labels) == len(self.df):
            # Select rows through the DBSCAN labels: a single pass, and points
            # sharing coordinates keep their own timestamp.
            times = self.df['Time'].to_numpy()[np.asarray(labels) == index]
            return [int(self._hourFromTimestamp(t)) for t in times]
        # No labels available (clusters assigned by hand): look every point up
        # by its coordinates instead.
        return [int(self.groupByTime(row)) for row in self.clusters[index]]

    def showTimeClusters(self):
        """Plot one histogram of trajectory start hours per cluster.

        Also rebuilds ``self.maxes`` with the tallest bin of each histogram.
        """
        hours_per_cluster = [self._hoursForCluster(i) for i in range(self.numClusters)]

        # Start from scratch so that calling this twice does not duplicate entries.
        self.maxes = []
        if not hours_per_cluster:
            print('No clusters to plot.')
            return

        # squeeze=False keeps a 2-D axes array even when there is a single cluster.
        f, axarr = plt.subplots(len(hours_per_cluster), figsize=(12.5, 50), squeeze=False)
        for i, hours in enumerate(hours_per_cluster):
            ax = axarr[i, 0]
            counts, _, _ = ax.hist(hours)
            self.maxes.append(counts.max())
            ax.set_title("Cluster {0}".format(i + 1))
            ax.set_xlabel("Hour")
            ax.set_ylabel("Trajectories")
        f.tight_layout(pad=1.0)

    def gmapPlot(self):
        """Write three Google Maps HTML files: points, points + clusters, clusters.

        Requires the optional ``gmplot`` package.
        """
        # Imported here because only this method needs gmplot.
        import gmplot

        if not isinstance(self.rep_points, pd.DataFrame):
            # showCentroidClusters() has not run yet; derive the centers now.
            self._computeRepPoints()

        lats = self.df["Origin Y"]
        lons = self.df["Origin X"]

        def new_map():
            # Every map is centered on the first origin point at zoom level 11.
            return gmplot.GoogleMapPlotter(lats.iloc[0], lons.iloc[0], 11)

        gmap = new_map()
        gmap.plot(lats, lons, "cornflowerblue", edge_width=1)
        gmap.draw("trajectories_map.html")
        print("Plotted trajectories.")

        gmap = new_map()
        gmap.plot(lats, lons, "cornflowerblue", edge_width=1)
        gmap.heatmap(self.rep_points['lat'], self.rep_points['lon'], radius=20)
        gmap.draw("trajectories_map_with_clusters.html")
        print("Plotted trajectories with clusters.")

        gmap = new_map()
        gmap.heatmap(self.rep_points['lat'], self.rep_points['lon'], radius=20)
        gmap.draw("map_with_clusters.html")
        print("Plotted clusters.")

    def runAll(self):
        """Cluster the points, then draw the hourly and the centroid plots."""
        self.createClusters()
        # The hourly plot must come first: it fills self.maxes, which the
        # centroid plot uses as marker sizes.
        self.showTimeClusters()
        self.showCentroidClusters()


In [None]:
# Trajectories file, resolved relative to the notebook's working directory.
trajectories_csv = "trajectories.csv"

# Cluster the trajectory origins and render the hourly and centroid plots.
dc = DensityCluster(trajectories_csv)
dc.runAll()

  if sys.path[0] == '':


Clustered 50201 points to 22 clusters
