# Use Case: Geographical Cluster Analysis of Taxi Rides
Using the NY Taxi data set (see Use Case Block I) 

In [15]:
!pip install folium



In [16]:
import pandas as pd
import numpy as np
import folium


In [17]:
# we load the data we have saved after wrangling and pre-processing in block I
train=pd.read_csv('../../DATA/train_cleaned.csv')

In [18]:
#quick look at the data
train.head()

Unnamed: 0.2,Unnamed: 0,Unnamed: 0.1,Unnamed: 0.1.1,key,fare_amount,pickup_datetime,pickup_longitude,pickup_latitude,dropoff_longitude,dropoff_latitude,...,is_dropoff_JFK,is_pickup_EWR,is_dropoff_EWR,is_pickup_la_guardia,is_dropoff_la_guardia,trip_distance,pickup_borough,dropoff_borough,is_pickup_lower_manhattan,is_dropoff_lower_manhattan
0,0,0,0,2009-06-15 17:26:21.0000001,4.5,2009-06-15 17:26:21,-73.844311,40.721319,-73.84161,40.712278,...,0,0,0,0,0,0.640487,queens,queens,0,0
1,1,1,1,2010-01-05 16:52:16.0000002,16.9,2010-01-05 16:52:16,-74.016048,40.711303,-73.979268,40.782004,...,0,0,0,0,0,5.25067,manhattan,manhattan,1,0
2,2,2,2,2011-08-18 00:35:00.00000049,5.7,2011-08-18 00:35:00,-73.982738,40.76127,-73.991242,40.750562,...,0,0,0,0,0,0.863411,manhattan,manhattan,0,0
3,3,3,3,2012-04-21 04:30:42.0000001,7.7,2012-04-21 04:30:42,-73.98713,40.733143,-73.991567,40.758092,...,0,0,0,0,0,1.739386,manhattan,manhattan,1,0
4,4,4,4,2010-03-09 07:51:00.000000135,5.3,2010-03-09 07:51:00,-73.968095,40.768008,-73.956655,40.783762,...,0,0,0,0,0,1.242218,manhattan,manhattan,0,0


In [19]:
train.shape

(400000, 32)

In [20]:
#select only the culumns with the ride coordinates
coordinates = train[ ['pickup_latitude','pickup_longitude','dropoff_latitude' , 'dropoff_longitude' ] ]

In [21]:
coordinates.head()

Unnamed: 0,pickup_latitude,pickup_longitude,dropoff_latitude,dropoff_longitude
0,40.721319,-73.844311,40.712278,-73.84161
1,40.711303,-74.016048,40.782004,-73.979268
2,40.76127,-73.982738,40.750562,-73.991242
3,40.733143,-73.98713,40.758092,-73.991567
4,40.768008,-73.968095,40.783762,-73.956655


## Clustering
we will be using simple K-Means:
https://scikit-learn.org/stable/modules/generated/sklearn.cluster.KMeans.html

In [22]:
from sklearn.cluster import KMeans

In [23]:
#define number of clusters and create instance
k=20
myKMeans=KMeans(n_clusters=k, n_jobs=-1)#parallelize to all cores

In [24]:
#train model
myKMeans.fit(coordinates.to_numpy())

KMeans(algorithm='auto', copy_x=True, init='k-means++', max_iter=300,
       n_clusters=20, n_init=10, n_jobs=-1, precompute_distances='auto',
       random_state=None, tol=0.0001, verbose=0)

In [25]:
#get cluster centers
centers=myKMeans.cluster_centers_
labels=myKMeans.labels_    

In [26]:
#draw map: green: start, red: end
cluster_map = folium.Map(location = [40.730610,-73.935242],zoom_start = 12,)
for i in range(k):
    folium.CircleMarker([centers[i,0], centers[i,1]], radius=3,                
                        color="green", 
                        fill_opacity=0.9
                       ).add_to(cluster_map)
    folium.CircleMarker([centers[i,2], centers[i,3]], radius=3,                
                        color="red", 
                        fill_opacity=0.9
                       ).add_to(cluster_map)
    folium.PolyLine([ [centers[i,0],centers[i,1]] , [centers[i,2],centers[i,3]]  ], color="black", weight=2.5, opacity=1).add_to(cluster_map)

In [27]:
cluster_map

In [28]:
labels

array([ 1,  3,  8, ..., 17, 17,  0])

In [29]:
#add cluster labels to DataFrame
train['clusterID']=labels

In [30]:
#GroupBy Clusters
clusters=train.groupby('clusterID')

In [31]:
clusters['fare_amount'].count()

clusterID
0     42411
1      3260
2        98
3     40500
4       721
5      8377
6      4787
7      8132
8     84043
9     37003
10     7324
11    36884
12       53
13     4631
14     8685
15     3112
16    47695
17    60487
18       92
19     1705
Name: fare_amount, dtype: int64

In [32]:
clusters.mean()

Unnamed: 0_level_0,Unnamed: 0,Unnamed: 0.1,Unnamed: 0.1.1,fare_amount,pickup_longitude,pickup_latitude,dropoff_longitude,dropoff_latitude,passenger_count,pickup_day,...,dropoff_longitude_round3,is_pickup_JFK,is_dropoff_JFK,is_pickup_EWR,is_dropoff_EWR,is_pickup_la_guardia,is_dropoff_la_guardia,trip_distance,is_pickup_lower_manhattan,is_dropoff_lower_manhattan
clusterID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
0,199774.782391,204102.919832,204102.919832,10.473257,-73.980712,40.755512,-73.959396,40.778965,1.673057,15.641933,...,-73.959399,0.0,0.0,0.0,0.0,0.0,0.0,2.135803,0.143383,0.0
1,203392.495706,207799.057975,207799.057975,12.689316,-73.884595,40.762215,-73.880406,40.755495,1.711043,15.728221,...,-73.880396,0.0,0.00184,0.0,0.0,0.351227,0.122393,1.893311,0.0,0.0
2,208037.27551,212544.540816,212544.540816,9.795102,-73.149918,41.366595,-73.147641,41.368939,1.387755,16.173469,...,-73.147255,0.0,0.0,0.0,0.0,0.0,0.0,0.242551,0.0,0.0
3,200202.23242,204539.768815,204539.768815,11.78685,-73.997734,40.727137,-73.979767,40.757185,1.700543,15.738346,...,-73.979772,0.0,0.0,0.0,0.0,0.0,0.0,2.387517,0.93316,0.055086
4,199603.178918,203928.009709,203928.009709,62.839459,-74.019105,40.746888,-74.172807,40.69932,1.809986,15.682386,...,-74.172789,0.0,0.0,0.037448,0.751734,0.015257,0.0,8.939659,0.257975,0.0
5,200962.147905,205316.085353,205316.085353,28.343162,-73.98026,40.752659,-73.874397,40.763309,1.686403,15.706578,...,-73.8744,0.0,0.0,0.0,0.0,0.0,0.581473,5.892355,0.247463,0.0
6,201548.479215,205915.184249,205915.184249,48.654126,-73.784774,40.646475,-73.9711,40.739403,1.800292,15.726551,...,-73.971108,0.975977,0.0,0.0,0.001044,0.0,0.011907,12.108439,0.0,0.17652
7,198017.328087,202307.842966,202307.842966,15.025334,-73.97945,40.728392,-73.94553,40.712366,1.702164,15.670561,...,-73.945526,0.0,0.0,0.0,0.0,0.0,0.0,2.745345,0.45573,0.009961
8,199808.575087,204137.652273,204137.652273,7.146033,-73.984377,40.75452,-73.984193,40.754805,1.684352,15.710624,...,-73.984193,0.0,0.0,0.0,0.0,0.0,0.0,0.853939,0.076413,0.086527
9,200432.961976,204775.514391,204775.514391,12.751458,-73.978139,40.758658,-73.998128,40.725883,1.710672,15.589601,...,-73.998125,0.0,0.0,0.0,0.0,0.0,0.0,2.59874,0.037862,0.931951


In [33]:
clusters.var()

Unnamed: 0_level_0,Unnamed: 0,Unnamed: 0.1,Unnamed: 0.1.1,fare_amount,pickup_longitude,pickup_latitude,dropoff_longitude,dropoff_latitude,passenger_count,pickup_day,...,dropoff_longitude_round3,is_pickup_JFK,is_dropoff_JFK,is_pickup_EWR,is_dropoff_EWR,is_pickup_la_guardia,is_dropoff_la_guardia,trip_distance,is_pickup_lower_manhattan,is_dropoff_lower_manhattan
clusterID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
0,13329420000.0,13908660000.0,13908660000.0,23.791135,0.00011,0.000146,0.000141,0.000159,1.693671,74.93842,...,0.000141,0.0,0.0,0.0,0.0,0.0,0.0,1.599613,0.122827,0.0
1,13473460000.0,14059090000.0,14059090000.0,193.236821,0.000826,0.000577,0.001425,0.001005,1.822584,76.690151,...,0.001424,0.0,0.001838,0.0,0.0,0.227937,0.107446,3.613266,0.0,0.0
2,13682800000.0,14276860000.0,14276860000.0,54.881506,0.005296,0.000353,0.004019,0.000394,0.610983,91.217021,...,0.004026,0.0,0.0,0.0,0.0,0.0,0.0,2.412272,0.0,0.0
3,13419330000.0,14002590000.0,14002590000.0,26.462411,0.000102,0.000149,0.000119,0.000117,1.690861,75.889139,...,0.000119,0.0,0.0,0.0,0.0,0.0,0.0,1.434699,0.062374,0.052053
4,12583310000.0,13129780000.0,13129780000.0,611.575722,0.005625,0.001592,0.000649,0.00209,1.845789,74.642037,...,0.00065,0.0,0.0,0.036096,0.186889,0.015045,0.0,19.669532,0.19169,0.0
5,13460920000.0,14046020000.0,14046020000.0,77.686992,0.000256,0.000486,0.000487,0.000356,1.689492,74.627838,...,0.000487,0.0,0.0,0.0,0.0,0.0,0.243391,2.48536,0.186247,0.0
6,13662490000.0,14256470000.0,14256470000.0,131.234187,0.000129,6.4e-05,0.001003,0.002172,1.902858,76.56562,...,0.001003,0.023451,0.0,0.0,0.001044,0.0,0.011768,3.441175,0.0,0.145391
7,13130060000.0,13700620000.0,13700620000.0,55.843559,0.00041,0.000446,0.000287,0.000304,1.701961,75.694802,...,0.000287,0.0,0.0,0.0,0.0,0.0,0.0,2.345211,0.248071,0.009863
8,13355480000.0,13935930000.0,13935930000.0,14.225081,7.3e-05,7.1e-05,8.2e-05,7.7e-05,1.718718,75.195074,...,8.2e-05,0.0,0.0,0.0,0.0,0.0,0.0,0.17549,0.070575,0.079041
9,13282510000.0,13859780000.0,13859780000.0,32.945413,0.000131,0.000125,0.000106,0.000161,1.742188,75.101661,...,0.000106,0.0,0.0,0.0,0.0,0.0,0.0,1.514936,0.036429,0.06342
