# Use Case: Geographical Cluster Analysis of Taxi Rides
Using the NY Taxi data set (see Use Case Block I) 

In [1]:
!pip install folium



In [2]:
import pandas as pd
import numpy as np
import folium


In [3]:
# we load the data we have saved after wrangling and pre-processing in block I
train=pd.read_csv('../../DATA/train_cleaned.csv')

In [4]:
#quick look at the data
train.head()

Unnamed: 0.2,Unnamed: 0,Unnamed: 0.1,Unnamed: 0.1.1,key,fare_amount,pickup_datetime,pickup_longitude,pickup_latitude,dropoff_longitude,dropoff_latitude,...,is_dropoff_JFK,is_pickup_EWR,is_dropoff_EWR,is_pickup_la_guardia,is_dropoff_la_guardia,trip_distance,pickup_borough,dropoff_borough,is_pickup_lower_manhattan,is_dropoff_lower_manhattan
0,0,0,0,2009-06-15 17:26:21.0000001,4.5,2009-06-15 17:26:21,-73.844311,40.721319,-73.84161,40.712278,...,0,0,0,0,0,0.640487,queens,queens,0,0
1,1,1,1,2010-01-05 16:52:16.0000002,16.9,2010-01-05 16:52:16,-74.016048,40.711303,-73.979268,40.782004,...,0,0,0,0,0,5.25067,manhattan,manhattan,1,0
2,2,2,2,2011-08-18 00:35:00.00000049,5.7,2011-08-18 00:35:00,-73.982738,40.76127,-73.991242,40.750562,...,0,0,0,0,0,0.863411,manhattan,manhattan,0,0
3,3,3,3,2012-04-21 04:30:42.0000001,7.7,2012-04-21 04:30:42,-73.98713,40.733143,-73.991567,40.758092,...,0,0,0,0,0,1.739386,manhattan,manhattan,1,0
4,4,4,4,2010-03-09 07:51:00.000000135,5.3,2010-03-09 07:51:00,-73.968095,40.768008,-73.956655,40.783762,...,0,0,0,0,0,1.242218,manhattan,manhattan,0,0


In [5]:
train.shape

(400000, 32)

In [6]:
#select only the culumns with the ride coordinates
coordinates = train[ ['pickup_latitude','pickup_longitude','dropoff_latitude' , 'dropoff_longitude' ] ]

In [7]:
coordinates.head()

Unnamed: 0,pickup_latitude,pickup_longitude,dropoff_latitude,dropoff_longitude
0,40.721319,-73.844311,40.712278,-73.84161
1,40.711303,-74.016048,40.782004,-73.979268
2,40.76127,-73.982738,40.750562,-73.991242
3,40.733143,-73.98713,40.758092,-73.991567
4,40.768008,-73.968095,40.783762,-73.956655


## Clustering
we will be using simple K-Means:
https://scikit-learn.org/stable/modules/generated/sklearn.cluster.KMeans.html

In [8]:
from sklearn.cluster import KMeans

In [9]:
#define number of clusters and create instance
k=20
myKMeans=KMeans(n_clusters=k, n_jobs=-1)#parallelize to all cores

In [10]:
#train model
myKMeans.fit(coordinates.to_numpy())



KMeans(n_clusters=20, n_jobs=-1)

In [11]:
#get cluster centers
centers=myKMeans.cluster_centers_
labels=myKMeans.labels_    

In [12]:
#draw map: green: start, red: end
cluster_map = folium.Map(location = [40.730610,-73.935242],zoom_start = 12,)
for i in range(k):
    folium.CircleMarker([centers[i,0], centers[i,1]], radius=3,                
                        color="green", 
                        fill_opacity=0.9
                       ).add_to(cluster_map)
    folium.CircleMarker([centers[i,2], centers[i,3]], radius=3,                
                        color="red", 
                        fill_opacity=0.9
                       ).add_to(cluster_map)
    folium.PolyLine([ [centers[i,0],centers[i,1]] , [centers[i,2],centers[i,3]]  ], color="black", weight=2.5, opacity=1).add_to(cluster_map)

In [13]:
cluster_map

In [14]:
labels

array([ 3, 11,  0, ...,  7,  7, 16])

In [15]:
#add cluster labels to DataFrame
train['clusterID']=labels

In [16]:
#GroupBy Clusters
clusters=train.groupby('clusterID')

In [17]:
clusters['fare_amount'].count()

clusterID
0     79089
1      4786
2     31804
3      3340
4      3132
5        93
6        98
7     58062
8      4588
9      1706
10       53
11    32778
12     8953
13    31896
14     7282
15      726
16    40823
17    13609
18    43160
19    34022
Name: fare_amount, dtype: int64

In [18]:
clusters.mean()

Unnamed: 0_level_0,Unnamed: 0,Unnamed: 0.1,Unnamed: 0.1.1,fare_amount,pickup_longitude,pickup_latitude,dropoff_longitude,dropoff_latitude,passenger_count,pickup_day,...,dropoff_longitude_round3,is_pickup_JFK,is_dropoff_JFK,is_pickup_EWR,is_dropoff_EWR,is_pickup_la_guardia,is_dropoff_la_guardia,trip_distance,is_pickup_lower_manhattan,is_dropoff_lower_manhattan
clusterID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
0,200552.22086,204897.322468,204897.322468,26.010663,-73.975462,40.751874,-73.882002,40.767314,1.691352,15.700826,...,-73.882003,0.0,0.0,0.0,0.0,0.010952,0.434333,5.504709,0.245285,0.0
1,200458.773555,204801.834897,204801.834897,9.67291,-73.993812,40.737217,-73.981181,40.75598,1.687778,15.72554,...,-73.981184,0.0,0.0,1e-05,0.0,0.0,0.0,1.713453,0.592826,0.135206
2,201515.456836,205881.42689,205881.42689,48.500291,-73.784961,40.646599,-73.970502,40.738716,1.800496,15.722635,...,-73.970511,0.972325,0.0,0.0,0.001033,0.0,0.011979,12.056532,0.0,0.175341
3,197891.955975,202181.006289,202181.006289,19.166352,-73.166309,41.34029,-73.425192,41.147208,1.333333,15.81761,...,-73.424943,0.0,0.006289,0.0,0.0,0.0,0.018868,19.360468,0.0,0.119497
4,199667.202347,203993.140869,203993.140869,9.167035,-73.973294,40.765296,-73.984359,40.751207,1.678865,15.625186,...,-73.984358,0.0,0.0,0.0,0.0,0.0,0.0,1.474181,0.000241,0.231875
5,201051.783862,205407.667147,205407.667147,23.699726,-73.792265,40.673978,-73.784557,40.705389,1.658982,15.633045,...,-73.78454,0.665226,0.273295,0.0,0.0,0.036984,0.01585,3.696589,0.000961,0.0
6,199566.184389,203890.452135,203890.452135,47.711264,-73.971388,40.753379,-73.789272,40.658205,1.759941,16.033579,...,-73.789228,0.0,0.770839,0.0,0.0,0.074227,0.0,11.903882,0.229161,0.0
7,200222.095044,204560.062919,204560.062919,10.79287,-73.992195,40.730875,-73.995089,40.716014,1.695523,15.679004,...,-73.995086,0.0,0.0,0.000333,0.006674,0.000148,0.0,1.843326,0.653663,0.796464
8,199566.444107,203890.245171,203890.245171,8.773758,-73.967559,40.774115,-73.959487,40.784799,1.663186,15.64487,...,-73.959489,0.0,0.0,0.0,0.0,8.6e-05,0.0,1.593378,0.019859,1.2e-05
9,199355.640076,203674.577105,203674.577105,26.491594,-73.875537,40.769293,-73.958234,40.751732,1.718439,15.756866,...,-73.958231,0.0,0.0,0.0,0.000101,0.773363,0.027462,5.012745,0.0,0.145458


In [19]:
clusters.var()

Unnamed: 0_level_0,Unnamed: 0,Unnamed: 0.1,Unnamed: 0.1.1,fare_amount,pickup_longitude,pickup_latitude,dropoff_longitude,dropoff_latitude,passenger_count,pickup_day,...,dropoff_longitude_round3,is_pickup_JFK,is_dropoff_JFK,is_pickup_EWR,is_dropoff_EWR,is_pickup_la_guardia,is_dropoff_la_guardia,trip_distance,is_pickup_lower_manhattan,is_dropoff_lower_manhattan
clusterID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
0,13390400000.0,13972360000.0,13972360000.0,115.910698,0.000586,0.000509,0.000705,0.001114,1.692889,75.106767,...,0.000704,0.0,0.0,0.0,0.0,0.010833,0.245709,4.720963,0.185136,0.0
1,13378990000.0,13960470000.0,13960470000.0,27.089707,0.000105,0.000183,0.000184,0.000164,1.699955,75.408318,...,0.000184,0.0,0.0,1e-05,0.0,0.0,0.0,1.499568,0.241386,0.116926
2,13656780000.0,14250500000.0,14250500000.0,147.037325,0.000246,0.000104,0.001058,0.002277,1.899459,76.518178,...,0.001058,0.026914,0.0,0.0,0.001032,0.0,0.011837,4.406811,0.0,0.144626
3,14044170000.0,14654240000.0,14654240000.0,1362.926427,0.011092,0.010477,0.146276,0.084458,0.527426,81.605764,...,0.146396,0.0,0.006289,0.0,0.0,0.0,0.018629,800.423628,0.0,0.105883
4,13279940000.0,13857080000.0,13857080000.0,25.631822,0.000146,0.000137,0.000133,0.000179,1.717596,75.29867,...,0.000133,0.0,0.0,0.0,0.0,0.0,0.0,1.231272,0.000241,0.178111
5,13452140000.0,14036700000.0,14036700000.0,576.889542,0.00306,0.002421,0.008261,0.006107,1.706813,74.060378,...,0.008265,0.222807,0.1987,0.0,0.0,0.035633,0.015606,37.004081,0.00096,0.0
6,13587510000.0,14178090000.0,14178090000.0,135.83681,0.001238,0.00061,0.000727,0.000838,1.818312,74.69186,...,0.000728,0.0,0.176698,0.0,0.0,0.068737,0.0,4.753935,0.176698,0.0
7,13290730000.0,13868360000.0,13868360000.0,77.964284,0.000231,0.000382,0.00059,0.000467,1.70711,75.245472,...,0.00059,0.0,0.0,0.000333,0.00663,0.000148,0.0,2.943347,0.22639,0.162111
8,13352700000.0,13933020000.0,13933020000.0,37.620251,0.00021,0.000344,0.000192,0.000413,1.699971,75.28594,...,0.000192,0.0,0.0,0.0,0.0,8.6e-05,0.0,2.084998,0.019465,1.2e-05
9,13309330000.0,13887700000.0,13887700000.0,200.148299,0.000379,0.000403,0.001599,0.001241,1.799086,76.245809,...,0.001599,0.0,0.0,0.0,0.000101,0.17529,0.026711,6.950735,0.0,0.124313
