In [1]:
from mpl_toolkits.basemap import Basemap

import pandas as pd
import numpy as np
from sklearn.cluster import KMeans
from sklearn.cluster import DBSCAN
import hdbscan
import time

from matplotlib import pyplot as plt
%matplotlib inline

import random

In [2]:
# Set the sample fraction
sample_frac = 0.001  # 0.1% of the data

# Define the skip_rows function
def skip_rows(index):
    return index > 0 and random.random() > sample_frac

# Use the skip_rows function with skiprows parameter
df = pd.read_csv('data/nyc_taxi_data_2014.csv', skiprows=skip_rows)

In [3]:
len(df)

15007

In [4]:
df.columns

Index(['vendor_id', 'pickup_datetime', 'dropoff_datetime', 'passenger_count',
       'trip_distance', 'pickup_longitude', 'pickup_latitude', 'rate_code',
       'store_and_fwd_flag', 'dropoff_longitude', 'dropoff_latitude',
       'payment_type', 'fare_amount', 'surcharge', 'mta_tax', 'tip_amount',
       'tolls_amount', 'total_amount'],
      dtype='object')

In [5]:
# Drop NaN values
o_lng = df[['pickup_longitude']].rename(columns={"pickup_longitude": "lng"}).dropna()
o_lat = df[['pickup_latitude']].rename(columns={"pickup_latitude": "lat"}).dropna()
d_lng = df[['dropoff_longitude']].rename(columns={"dropoff_longitude": "lng"}).dropna()
d_lat = df[['dropoff_latitude']].rename(columns={"dropoff_latitude": "lat"}).dropna()
lng = pd.concat([o_lng, d_lng], axis=0)
lat = pd.concat([o_lat, d_lat], axis=0)

In [6]:
lng_lat = pd.concat([lng, lat], axis=1).reset_index(drop=True)
lng_lat.tail()

Unnamed: 0,lng,lat
30009,-73.982427,40.722432
30010,-73.971565,40.686415
30011,-73.971752,40.762462
30012,-74.008718,40.732675
30013,-73.977755,40.76647


In [7]:
points = lng_lat.values

In [8]:
points.shape

(30014, 2)

In [9]:
coords = np.radians(points)

In [10]:
len(coords)

30014

In [11]:
min_samples=5 # minimum number of samples needed to form a neighbourhood
eps=1.0 # neighbourhood search radius
metric='haversine' # distance measure 

dbscan = DBSCAN(eps=eps, min_samples=min_samples, metric=metric).fit(coords)

In [12]:
lng_lat

Unnamed: 0,lng,lat
0,-73.966515,40.760742
1,-73.985742,40.752582
2,-74.002211,40.726504
3,-73.980141,40.780450
4,-73.975904,40.755294
...,...,...
30009,-73.982427,40.722432
30010,-73.971565,40.686415
30011,-73.971752,40.762462
30012,-74.008718,40.732675


In [13]:
lng_lat['Cluster'] = dbscan.fit_predict(coords)  # Assign the cluster labels

# Display the size of each cluster
lng_lat['Cluster'].value_counts()

Cluster
0    29579
1      435
Name: count, dtype: int64

In [14]:
len(points)

30014

In [15]:
# points = df[['lng', 'lat']].to_numpy()
cluster_labels = lng_lat['Cluster'].to_numpy()
# cluster_labels
np.unique(cluster_labels)

array([0, 1], dtype=int64)

In [42]:
cluster_labels

array([0, 0, 0, ..., 0, 0, 0], dtype=int64)

In [16]:
lng_lat

Unnamed: 0,lng,lat,Cluster
0,-73.966515,40.760742,0
1,-73.985742,40.752582,0
2,-74.002211,40.726504,0
3,-73.980141,40.780450,0
4,-73.975904,40.755294,0
...,...,...,...
30009,-73.982427,40.722432,0
30010,-73.971565,40.686415,0
30011,-73.971752,40.762462,0
30012,-74.008718,40.732675,0


In [17]:
min_samples=None
min_cluster_size=3
hdb = hdbscan.HDBSCAN(min_samples=min_samples, min_cluster_size=min_cluster_size, metric='haversine')  

In [18]:
lng_lat['Cluster_hdb'] = hdb.fit_predict(coords)  # Assign the cluster labels

# Display the size of each cluster
lng_lat['Cluster_hdb'].value_counts()



Cluster_hdb
-1       8592
 0        435
 1433     120
 1815      84
 59        81
         ... 
 1858       3
 93         3
 890        3
 410        3
 2217       3
Name: count, Length: 2358, dtype: int64

In [20]:
cluster_labels2 = lng_lat['Cluster_hdb'].to_numpy()
cluster_labels2

array([1730, 1874,  913, ..., 2175,   -1, 1107], dtype=int64)

In [None]:
# TODO use pyclustering to show each cluster on its own map of nyc

In [None]:
# NYC taxi route data not meant to be clustered the points are too close in proximity