The notebook used to perform the clustering

In [32]:
# The interactive map requires gmaps to be installed
# https://jupyter-gmaps.readthedocs.io/en/latest/install.html
import os

import gmaps
import ipywidgets as widgets
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import scipy.stats as stats
from IPython.display import display
from sklearn.cluster import KMeans
# Read the Google Maps API key from the environment so that no credential is
# stored in the notebook. Export GOOGLE_MAPS_API_KEY before starting Jupyter.
# NOTE: a key that was ever committed in plain text should be treated as
# compromised and rotated in the Google Cloud console.
# (A stray `widgets.IntSlider()` call used to sit here; its result was never
# displayed or used, so it has been removed.)
GMAPS_API_KEY = os.environ.get("GOOGLE_MAPS_API_KEY")
if not GMAPS_API_KEY:
    raise RuntimeError(
        "GOOGLE_MAPS_API_KEY is not set; export it before running this notebook "
        "so the interactive map can authenticate."
    )
gmaps.configure(api_key=GMAPS_API_KEY)

In [33]:
# Load the raw trip records, then convert the pickup timestamps from text to
# pandas datetimes so the .dt accessors can be used on them later.
df = pd.read_csv("./data/train.csv")
df = df.assign(pickup_datetime=pd.to_datetime(df['pickup_datetime']))

In [34]:
# Columns that play no part in clustering the pickup locations
unused_columns = [
    "id",
    "vendor_id",
    "dropoff_datetime",
    "passenger_count",
    "dropoff_latitude",
    "dropoff_longitude",
    "store_and_fwd_flag",
    "trip_duration",
]

# drop() returns a new frame, so `df` keeps every original column
train = df.drop(columns=unused_columns)


In [35]:
# Derive the weekday name and the hour of day from each pickup timestamp
pickup_time = train['pickup_datetime']
train = train.assign(day=pickup_time.dt.day_name(), hour=pickup_time.dt.hour)

# Preview the first five rows
train.head()

Unnamed: 0,pickup_datetime,pickup_longitude,pickup_latitude,day,hour
0,2016-03-14 17:24:55,-73.982155,40.767937,Monday,17
1,2016-06-12 00:43:35,-73.980415,40.738564,Sunday,0
2,2016-01-19 11:35:24,-73.979027,40.763939,Tuesday,11
3,2016-04-06 19:32:31,-74.01004,40.719971,Wednesday,19
4,2016-03-26 13:30:55,-73.973053,40.793209,Saturday,13


In [36]:
# Optional subset selection. Leave both as None to cluster every pickup, or set
# DAY_FILTER to a weekday name (e.g. "Monday") and/or HOUR_FILTER to an hour of
# the day (e.g. 15) to run the K-Means algorithm on that slice only.
DAY_FILTER = None
HOUR_FILTER = None

selected = train
if DAY_FILTER is not None:
    selected = selected.loc[selected['day'] == DAY_FILTER]
if HOUR_FILTER is not None:
    selected = selected.loc[selected['hour'] == HOUR_FILTER]

# Keep only the coordinate columns (pickup_longitude, pickup_latitude) for clustering
clus = selected.drop(columns=["pickup_datetime", "day", "hour"])

Filtering out any outliers by using z-scores

In [37]:
# (rows, columns) of the clustering input at this point in the notebook
clus.shape

(1458644, 2)

In [38]:
# Keep only the rows in which every column lies within 4 standard deviations
# of that column's mean (|z-score| < 4); everything else is treated as an outlier.
within_bounds = np.abs(stats.zscore(clus)) < 4
clus = clus[within_bounds.all(axis=1)]

In [39]:
# (rows, columns) of the clustering input at this point in the notebook
clus.shape

(1458041, 2)

Using the K-Means algorithm to cluster the pickup locations <br>
Please note this will take a few minutes to execute because of the large number of clusters and the large dataset

In [9]:
# Fit K-Means with 45 clusters on the filtered pickup coordinates.
# n_init is pinned to 10, the value shown in this cell's recorded output.
# Newer scikit-learn releases default to n_init='auto', which would run fewer
# initialisations and could yield different centroids.
# random_state fixes the seed so repeated runs give the same result.
kmeans = KMeans(n_clusters = 45, max_iter = 300, n_init = 10, random_state = 123 )
kmeans.fit(clus)

KMeans(algorithm='auto', copy_x=True, init='k-means++', max_iter=300,
       n_clusters=45, n_init=10, n_jobs=None, precompute_distances='auto',
       random_state=123, tol=0.0001, verbose=0)

In [10]:
# Cluster centres found by the fitted K-Means model
centroids = kmeans.cluster_centers_

Saving the centroids to a .csv file

In [11]:
# Read the centres straight from the fitted model rather than from `centroids`.
# This cell rebinds `centroids` to a DataFrame, so indexing the old name with
# [:, 1] would fail if the cell were run a second time.
centers = kmeans.cluster_centers_

# The model was fitted on (pickup_longitude, pickup_latitude), so column 0 is the
# longitude and column 1 the latitude; the frame is written latitude-first.
centroids = pd.DataFrame({'Latitude': centers[:, 1], 'Longitude': centers[:, 0]})
centroids.to_csv('./data/centroids.csv', index=False)

Plotting the centroids on an interactive map

In [12]:
# One green marker per centroid, drawn at the (Latitude, Longitude) pairs in `centroids`
marker_colour = 'green'
centroids_layer = gmaps.symbol_layer(
    centroids,
    fill_color=marker_colour,
    stroke_color=marker_colour,
    scale=2,
)

In [13]:
# Create a Google Maps figure, attach the centroid marker layer, and end the
# cell with `fig` so the notebook renders the interactive map.
fig = gmaps.figure()
fig.add_layer(centroids_layer)
fig

Figure(layout=FigureLayout(height='420px'))