# Use Case: Geographical Cluster Analysis of Taxi Rides
Using the NY Taxi data set (see Use Case Block I) 

In [20]:
!pip install folium



In [21]:
import pandas as pd
import numpy as np
import folium


In [22]:
# we load the data we have saved after wrangling and pre-processing in block I
train=pd.read_csv('../../DATA/train_cleaned.csv')

In [23]:
#quick look at the data
train.head()

Unnamed: 0.2,Unnamed: 0,Unnamed: 0.1,Unnamed: 0.1.1,key,fare_amount,pickup_datetime,pickup_longitude,pickup_latitude,dropoff_longitude,dropoff_latitude,...,is_dropoff_JFK,is_pickup_EWR,is_dropoff_EWR,is_pickup_la_guardia,is_dropoff_la_guardia,trip_distance,pickup_borough,dropoff_borough,is_pickup_lower_manhattan,is_dropoff_lower_manhattan
0,0,0,0,2009-06-15 17:26:21.0000001,4.5,2009-06-15 17:26:21,-73.844311,40.721319,-73.84161,40.712278,...,0,0,0,0,0,0.640487,queens,queens,0,0
1,1,1,1,2010-01-05 16:52:16.0000002,16.9,2010-01-05 16:52:16,-74.016048,40.711303,-73.979268,40.782004,...,0,0,0,0,0,5.25067,manhattan,manhattan,1,0
2,2,2,2,2011-08-18 00:35:00.00000049,5.7,2011-08-18 00:35:00,-73.982738,40.76127,-73.991242,40.750562,...,0,0,0,0,0,0.863411,manhattan,manhattan,0,0
3,3,3,3,2012-04-21 04:30:42.0000001,7.7,2012-04-21 04:30:42,-73.98713,40.733143,-73.991567,40.758092,...,0,0,0,0,0,1.739386,manhattan,manhattan,1,0
4,4,4,4,2010-03-09 07:51:00.000000135,5.3,2010-03-09 07:51:00,-73.968095,40.768008,-73.956655,40.783762,...,0,0,0,0,0,1.242218,manhattan,manhattan,0,0


In [24]:
train.shape

(400000, 32)

In [25]:
#select only the culumns with the ride coordinates
coordinates = train[ ['pickup_latitude','pickup_longitude','dropoff_latitude' , 'dropoff_longitude' ] ]

In [26]:
coordinates.head()

Unnamed: 0,pickup_latitude,pickup_longitude,dropoff_latitude,dropoff_longitude
0,40.721319,-73.844311,40.712278,-73.84161
1,40.711303,-74.016048,40.782004,-73.979268
2,40.76127,-73.982738,40.750562,-73.991242
3,40.733143,-73.98713,40.758092,-73.991567
4,40.768008,-73.968095,40.783762,-73.956655


## Clustering
we will be using simple K-Means:
https://scikit-learn.org/stable/modules/generated/sklearn.cluster.KMeans.html

In [27]:
from sklearn.cluster import KMeans

In [28]:
#define number of clusters and create instance
k=20
myKMeans=KMeans(n_clusters=k, n_jobs=-1)#parallelize to all cores

In [29]:
#train model
myKMeans.fit(coordinates.to_numpy())



KMeans(n_clusters=20, n_jobs=-1)

In [30]:
#get cluster centers
centers=myKMeans.cluster_centers_
labels=myKMeans.labels_    

In [31]:
#draw map: green: start, red: end
cluster_map = folium.Map(location = [40.730610,-73.935242],zoom_start = 12,)
for i in range(k):
    folium.CircleMarker([centers[i,0], centers[i,1]], radius=3,                
                        color="green", 
                        fill_opacity=0.9
                       ).add_to(cluster_map)
    folium.CircleMarker([centers[i,2], centers[i,3]], radius=3,                
                        color="red", 
                        fill_opacity=0.9
                       ).add_to(cluster_map)
    folium.PolyLine([ [centers[i,0],centers[i,1]] , [centers[i,2],centers[i,3]]  ], color="black", weight=2.5, opacity=1).add_to(cluster_map)

In [32]:
cluster_map

In [33]:
labels

array([14, 18, 16, ...,  9,  9,  5])

In [34]:
#add cluster labels to DataFrame
train['clusterID']=labels

In [35]:
#GroupBy Clusters
clusters=train.groupby('clusterID')

In [36]:
clusters['fare_amount'].count()

clusterID
0     34966
1     43257
2      4866
3      3114
4       104
5     39796
6     29943
7      4270
8      8103
9     55999
10     1670
11     7289
12     8319
13     8091
14     3258
15       53
16    78219
17      721
18    33763
19    34199
Name: fare_amount, dtype: int64

In [37]:
clusters.mean()

Unnamed: 0_level_0,Unnamed: 0,Unnamed: 0.1,Unnamed: 0.1.1,fare_amount,pickup_longitude,pickup_latitude,dropoff_longitude,dropoff_latitude,passenger_count,pickup_day,...,dropoff_longitude_round3,is_pickup_JFK,is_dropoff_JFK,is_pickup_EWR,is_dropoff_EWR,is_pickup_la_guardia,is_dropoff_la_guardia,trip_distance,is_pickup_lower_manhattan,is_dropoff_lower_manhattan
clusterID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
0,200681.605617,205029.303409,205029.303409,8.356338,-73.99909,40.722066,-74.000131,40.722123,1.678545,15.680032,...,-74.000127,0.0,0.0,2.9e-05,0.0,0.0,0.0,1.20127,0.923411,0.953498
1,199728.208498,204055.506438,204055.506438,9.467129,-73.96067,40.776014,-73.975967,40.759935,1.650138,15.681231,...,-73.975964,0.0,0.0,0.0,0.0,0.0,0.0,1.591147,0.0,0.050581
2,201497.827374,205863.448418,205863.448418,48.332156,-73.784832,40.646423,-73.969534,40.739151,1.802507,15.741266,...,-73.969542,0.976572,0.0,0.0,0.001028,0.0,0.01829,12.039833,0.0,0.173654
3,199575.478484,203899.763969,203899.763969,49.329165,-73.974322,40.753238,-73.78497,40.654316,1.754656,15.935132,...,-73.784924,0.0,0.838793,0.0,0.0,0.052665,0.0,12.335283,0.234425,0.0
4,201977.5,206354.346154,206354.346154,13.833846,-73.166527,41.3419,-73.157228,41.349861,1.365385,16.211538,...,-73.156865,0.0,0.0,0.0,0.0,0.0,0.0,0.791798,0.0,0.0
5,199914.413836,204245.567846,204245.567846,10.5278,-73.980048,40.756114,-73.958257,40.779634,1.67258,15.644613,...,-73.958261,0.0,0.0,0.0,0.0,0.0,0.0,2.156263,0.129008,0.0
6,200231.702234,204570.031694,204570.031694,14.087176,-73.975266,40.761998,-73.998732,40.724244,1.711485,15.549678,...,-73.998731,0.0,0.0,0.0,0.0,0.0,0.0,2.977981,0.012691,0.921117
7,198318.179391,202614.733021,202614.733021,23.906014,-73.969219,40.774099,-73.919243,40.855333,1.709836,15.605386,...,-73.919244,0.001405,0.0,0.0,0.0,0.029508,0.0,6.432489,0.130445,0.0
8,198843.622856,203152.545724,203152.545724,17.569821,-73.986861,40.715307,-73.980508,40.667373,1.689991,15.644946,...,-73.980506,0.0,0.0,0.0,0.0,0.000864,0.0,3.746389,0.447982,0.004936
9,200819.111502,205169.826961,205169.826961,7.047863,-73.992296,40.740261,-73.992721,40.73923,1.681816,15.719174,...,-73.992722,0.0,0.0,0.0,0.0,0.0,0.0,0.936557,0.605922,0.645887


In [38]:
clusters.var()

Unnamed: 0_level_0,Unnamed: 0,Unnamed: 0.1,Unnamed: 0.1.1,fare_amount,pickup_longitude,pickup_latitude,dropoff_longitude,dropoff_latitude,passenger_count,pickup_day,...,dropoff_longitude_round3,is_pickup_JFK,is_dropoff_JFK,is_pickup_EWR,is_dropoff_EWR,is_pickup_la_guardia,is_dropoff_la_guardia,trip_distance,is_pickup_lower_manhattan,is_dropoff_lower_manhattan
clusterID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
0,13327470000.0,13906690000.0,13906690000.0,25.412438,0.000128,0.000151,0.000107,0.000118,1.672725,75.648969,...,0.000107,0.0,0.0,2.9e-05,0.0,0.0,0.0,0.58338,0.070725,0.044341
1,13289830000.0,13867380000.0,13867380000.0,22.169685,0.000114,0.000142,0.000125,0.000104,1.701387,75.737412,...,0.000125,0.0,0.0,0.0,0.0,0.0,0.0,1.051463,0.0,0.048024
2,13654050000.0,14247650000.0,14247650000.0,135.972167,0.0001,6.1e-05,0.001137,0.002188,1.904873,76.447123,...,0.001137,0.022884,0.0,0.0,0.001027,0.0,0.017959,3.671206,0.0,0.143528
3,13568470000.0,14158160000.0,14158160000.0,105.601045,0.000969,0.000585,0.000755,0.0007,1.802299,74.153195,...,0.000756,0.0,0.135263,0.0,0.0,0.049908,0.0,3.150818,0.179528,0.0
4,13906920000.0,14510820000.0,14510820000.0,677.622614,0.014143,0.012058,0.007837,0.008929,0.583645,89.915982,...,0.007852,0.0,0.0,0.0,0.0,0.0,0.0,33.84698,0.0,0.0
5,13331640000.0,13910970000.0,13910970000.0,23.794778,0.000112,0.000136,0.000138,0.000166,1.698347,74.954913,...,0.000138,0.0,0.0,0.0,0.0,0.0,0.0,1.560467,0.112368,0.0
6,13283770000.0,13861090000.0,13861090000.0,32.322294,0.000139,0.000132,0.000112,0.000178,1.743855,74.9161,...,0.000112,0.0,0.0,0.0,0.0,0.0,0.0,1.376827,0.01253,0.072663
7,13489580000.0,14075940000.0,14075940000.0,257.344817,0.001041,0.001743,0.002681,0.001784,1.825132,76.553309,...,0.002683,0.001404,0.0,0.0,0.0,0.028644,0.0,17.686796,0.113456,0.0
8,13198910000.0,13772620000.0,13772620000.0,140.648329,0.000299,0.001005,0.000434,0.000631,1.683937,74.207049,...,0.000434,0.0,0.0,0.0,0.0,0.000863,0.0,7.01075,0.247325,0.004913
9,13231420000.0,13806490000.0,13806490000.0,12.56535,8e-05,7.4e-05,7.8e-05,7.7e-05,1.705321,75.320078,...,7.8e-05,0.0,0.0,0.0,0.0,0.0,0.0,0.202456,0.238785,0.228721
