In [1]:
import os

import numpy as np
import pandas as pd
from sklearn.cluster import MiniBatchKMeans

# Bokeh
#!pip install gmplot
from bokeh.io import output_file, show
from bokeh.models import ColumnDataSource, GMapOptions
from bokeh.plotting import gmap

In [2]:
# Uber pickup records for April 2014, read directly from the FiveThirtyEight
# GitHub repository (requires network access).
url = (
    "https://raw.githubusercontent.com/fivethirtyeight/uber-tlc-foil-response"
    "/master/uber-trip-data/uber-raw-data-apr14.csv"
)
df = pd.read_csv(filepath_or_buffer=url)

In [3]:
# DataFrame.info() prints its report itself and returns None, so wrapping it
# in print() only appends a stray "None" line to the cell output.
df.info()
# Last expression of the cell: rendered as a table with the first five rows.
df.head()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 564516 entries, 0 to 564515
Data columns (total 4 columns):
 #   Column     Non-Null Count   Dtype  
---  ------     --------------   -----  
 0   Date/Time  564516 non-null  object 
 1   Lat        564516 non-null  float64
 2   Lon        564516 non-null  float64
 3   Base       564516 non-null  object 
dtypes: float64(2), object(2)
memory usage: 17.2+ MB
None


Unnamed: 0,Date/Time,Lat,Lon,Base
0,4/1/2014 0:11:00,40.769,-73.9549,B02512
1,4/1/2014 0:17:00,40.7267,-74.0345,B02512
2,4/1/2014 0:21:00,40.7316,-73.9873,B02512
3,4/1/2014 0:28:00,40.7588,-73.9776,B02512
4,4/1/2014 0:33:00,40.7594,-73.9722,B02512


In [4]:
# Cluster on a 50,000-row random sample of the pickup coordinates.
# A fixed random_state makes the sample (and every cluster/plot derived from
# it) reproducible under Restart Kernel -> Run All.
X = df[['Lat', 'Lon']].sample(50000, random_state=42)

In [5]:
# Partition the sampled pickup coordinates into 6 clusters.
# random_state pins the centroid initialisation and the mini-batch draws so
# the cluster labels are stable between runs.
kmeans = MiniBatchKMeans(n_clusters=6, random_state=42)
kmeans.fit(X)
# Only the final expression of a cell is displayed, so the bare
# `kmeans.cluster_centers_` that used to sit here had no effect and was
# dropped. The cell shows the per-row cluster labels.
kmeans.labels_

array([5, 5, 1, ..., 5, 5, 5])

In [6]:
# Render the clustered pickups and the cluster centroids on a Google map,
# written to gmap.html and opened with show().
output_file("gmap.html")

# The Google Maps API key is a credential and must not be committed with the
# notebook: read it from the environment and fail with a clear message if it
# is missing.
GOOGLE_MAPS_API_KEY = os.environ.get("GOOGLE_MAPS_API_KEY")
if not GOOGLE_MAPS_API_KEY:
    raise RuntimeError(
        "Set the GOOGLE_MAPS_API_KEY environment variable before running this cell."
    )

map_options = GMapOptions(lat=40.7128, lng=-74.0060, map_type="roadmap", zoom=10)
p = gmap(GOOGLE_MAPS_API_KEY, map_options, title="New York")

# Fill colour for each cluster label; list position == label (0..5).
CLUSTER_COLORS = ["red", "green", "yellow", "blue", "magenta", "white"]

# One data source and one glyph layer per cluster. The boolean mask is
# computed once per label instead of once per coordinate column.
cluster_sources = []
for label, color in enumerate(CLUSTER_COLORS):
    members = X[kmeans.labels_ == label]
    source = ColumnDataSource(data=dict(lat=members["Lat"], lon=members["Lon"]))
    cluster_sources.append(source)
    p.circle(x="lon", y="lat", size=8, fill_color=color, fill_alpha=0.8, source=source)

# Centroids are added last so they are drawn on top of the cluster points.
# The model was fitted on the columns ['Lat', 'Lon'], so column 0 of the
# centres is latitude and column 1 is longitude.
centroid = ColumnDataSource(data=dict(lat=kmeans.cluster_centers_[:, 0],
                                      lon=kmeans.cluster_centers_[:, 1]))
p.circle(x="lon", y="lat", size=15, fill_color="black", fill_alpha=0.8, source=centroid)

show(p)

### Date / Time:

In [7]:
# Work on a 10,000-row random sample for the date/time exploration.
# A fixed random_state keeps the sampled rows identical between runs.
uber = df.sample(10000, random_state=42)
uber.head()

Unnamed: 0,Date/Time,Lat,Lon,Base
277515,4/17/2014 11:34:00,40.7408,-73.9944,B02617
207921,4/30/2014 10:02:00,40.7628,-73.9775,B02598
203628,4/29/2014 19:53:00,40.7427,-73.9826,B02598
145978,4/22/2014 8:01:00,40.7648,-73.9754,B02598
296334,4/23/2014 18:22:00,40.7248,-73.9995,B02617


In [8]:
# The raw strings look like "4/17/2014 11:34:00" (month/day/year). Passing the
# format explicitly avoids per-value format inference and removes any
# month/day ambiguity for dates such as 4/1/2014.
uber['Date/Time'] = pd.to_datetime(uber['Date/Time'], format="%m/%d/%Y %H:%M:%S")
# Day of the week as an integer, Monday=0 ... Sunday=6.
uber['weekday'] = uber['Date/Time'].dt.dayofweek

In [9]:
# Check column dtypes and non-null counts of the sampled frame, including the
# parsed 'Date/Time' column and the derived 'weekday' column.
uber.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 10000 entries, 277515 to 488668
Data columns (total 5 columns):
 #   Column     Non-Null Count  Dtype         
---  ------     --------------  -----         
 0   Date/Time  10000 non-null  datetime64[ns]
 1   Lat        10000 non-null  float64       
 2   Lon        10000 non-null  float64       
 3   Base       10000 non-null  object        
 4   weekday    10000 non-null  int64         
dtypes: datetime64[ns](1), float64(2), int64(1), object(1)
memory usage: 468.8+ KB


In [10]:
# Preview the first rows to eyeball the parsed timestamps next to the
# 'weekday' column.
uber.head()

Unnamed: 0,Date/Time,Lat,Lon,Base,weekday
277515,2014-04-17 11:34:00,40.7408,-73.9944,B02617,3
207921,2014-04-30 10:02:00,40.7628,-73.9775,B02598,2
203628,2014-04-29 19:53:00,40.7427,-73.9826,B02598,1
145978,2014-04-22 08:01:00,40.7648,-73.9754,B02598,1
296334,2014-04-23 18:22:00,40.7248,-73.9995,B02617,2


In [11]:
# View the parsed timestamps as int64 values (the displayed numbers are
# nanoseconds since the Unix epoch). The result is only displayed here; it is
# not stored back on `uber`.
pd.to_numeric(uber['Date/Time'])

277515    1397734440000000000
207921    1398852120000000000
203628    1398801180000000000
145978    1398153660000000000
296334    1398277320000000000
                 ...         
290492    1398113760000000000
233800    1396638720000000000
20784     1397822760000000000
439093    1397591040000000000
488668    1398255420000000000
Name: Date/Time, Length: 10000, dtype: int64