In [2]:
# Import des librairies
import pandas as pd
import numpy as np
from datetime import datetime

from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import  OneHotEncoder, StandardScaler, LabelEncoder 
from sklearn.compose import ColumnTransformer
from sklearn.cluster import KMeans, MiniBatchKMeans, DBSCAN
from sklearn.metrics import  silhouette_score
from sklearn.decomposition import PCA

import plotly.express as px
import plotly.graph_objects as go
import plotly.io as pio
import matplotlib.pyplot as plt

import warnings
warnings.simplefilter(action='ignore', category=FutureWarning)

## 1- Download dataset and create a sample

In [3]:
# Load the raw CSV (relative path) into a DataFrame
apr14_csv_path = r'uber-raw-data-apr14.csv'
df_apr14 = pd.read_csv(apr14_csv_path)

# (rows, columns) of the raw frame
df_apr14.shape

(564516, 4)

The dataset is large (564,516 rows), so we proceed with a random sample of it.

In [4]:
# Draw a random 5% sample of the rows; the fixed seed makes the draw reproducible
sample_fraction = 0.05
sample_seed = 42
df_apr14_sample = df_apr14.sample(random_state=sample_seed, frac=sample_fraction)

# (rows, columns) of the sample
df_apr14_sample.shape

(28226, 4)

## 2- Cleaning

In [5]:
# Convert the "Date/Time" column from text to datetime64.
# pd.to_datetime with an explicit format parses the whole column in one
# vectorized call, instead of calling datetime.strptime once per row via apply.
date_format = '%m/%d/%Y %H:%M:%S'

df_apr14_sample['Date/Time'] = pd.to_datetime(df_apr14_sample['Date/Time'], format=date_format)

# Check the resulting dtypes and non-null counts
df_apr14_sample.info()

<class 'pandas.core.frame.DataFrame'>
Index: 28226 entries, 77202 to 473256
Data columns (total 4 columns):
 #   Column     Non-Null Count  Dtype         
---  ------     --------------  -----         
 0   Date/Time  28226 non-null  datetime64[ns]
 1   Lat        28226 non-null  float64       
 2   Lon        28226 non-null  float64       
 3   Base       28226 non-null  object        
dtypes: datetime64[ns](1), float64(2), object(1)
memory usage: 1.1+ MB


In [6]:
# Derive three calendar features from the "Date/Time" column:
#   hour       -> hour of the day
#   dayofweek  -> weekday number (Monday = 0)
#   dayofmonth -> day of the month
timestamp_parts = df_apr14_sample['Date/Time'].dt
df_apr14_sample['hour'] = timestamp_parts.hour
df_apr14_sample['dayofweek'] = timestamp_parts.weekday
df_apr14_sample['dayofmonth'] = timestamp_parts.day

df_apr14_sample.head()

Unnamed: 0,Date/Time,Lat,Lon,Base,hour,dayofweek,dayofmonth
77202,2014-04-09 10:21:00,40.8021,-73.9654,B02598,10,2,9
558915,2014-04-14 04:55:00,40.6462,-73.7769,B02764,4,0,14
152635,2014-04-23 09:52:00,40.7747,-73.9603,B02598,9,2,23
361259,2014-04-04 23:32:00,40.715,-74.0157,B02682,23,4,4
60087,2014-04-05 19:57:00,40.7335,-74.008,B02598,19,5,5


In [7]:
# Remove the columns that are not kept as features for the rest of the notebook
useless_cols = ['Base', 'Date/Time']

print("Dropping useless columns...")
df_apr14_sample = df_apr14_sample.drop(columns=useless_cols)
print("...Done.")

df_apr14_sample.head()

Dropping useless columns...
...Done.


Unnamed: 0,Lat,Lon,hour,dayofweek,dayofmonth
77202,40.8021,-73.9654,10,2,9
558915,40.6462,-73.7769,4,0,14
152635,40.7747,-73.9603,9,2,23
361259,40.715,-74.0157,23,4,4
60087,40.7335,-74.008,19,5,5


In [8]:
# Quick overview of the sample: size, preview, summary statistics, missing values
n_rows = df_apr14_sample.shape[0]
print(f"Number of rows : {n_rows}")
print()

print("Display of dataset: ")
display(df_apr14_sample.head())
print()

print("Basics statistics: ")
data_desc = df_apr14_sample.describe(include='all')
display(data_desc)
print()

# Share of null values per column, expressed as a percentage
print("Percentage of missing values: ")
missing_counts = df_apr14_sample.isnull().sum()
display(100*missing_counts/n_rows)

Number of rows : 28226

Display of dataset: 


Unnamed: 0,Lat,Lon,hour,dayofweek,dayofmonth
77202,40.8021,-73.9654,10,2,9
558915,40.6462,-73.7769,4,0,14
152635,40.7747,-73.9603,9,2,23
361259,40.715,-74.0157,23,4,4
60087,40.7335,-74.008,19,5,5



Basics statistics: 


Unnamed: 0,Lat,Lon,hour,dayofweek,dayofmonth
count,28226.0,28226.0,28226.0,28226.0,28226.0
mean,40.740057,-73.976597,14.399242,2.865514,16.095479
std,0.035326,0.050493,5.867555,1.817263,9.053291
min,40.557,-74.6137,0.0,0.0,1.0
25%,40.7226,-73.9975,10.0,1.0,8.0
50%,40.7425,-73.9846,16.0,3.0,16.0
75%,40.761075,-73.9701,19.0,4.0,24.0
max,41.2339,-72.7006,23.0,6.0,30.0



Percentage of missing values: 


Lat           0.0
Lon           0.0
hour          0.0
dayofweek     0.0
dayofmonth    0.0
dtype: float64

## 3- EDA

In [9]:
# Sort by day of month before plotting (presumably so the animation frames appear in day order)
df_apr14_sample_sorted = df_apr14_sample.sort_values(by='dayofmonth')

# Animated map of the sampled points, one frame per day of the month
fig = px.scatter_mapbox(
    data_frame=df_apr14_sample_sorted,
    lat="Lat",
    lon="Lon",
    animation_frame="dayofmonth",
    mapbox_style="carto-positron",
)
fig.update_layout(title="Uber pickups throughout the days of the month")
fig.show()

## 4- Preprocessing

In [10]:
# Columns used as clustering features; all of them are numeric
numeric_features = ['Lat', 'Lon', 'hour', 'dayofweek', 'dayofmonth']

# Single-step pipeline: standardize each numeric feature
scaling_steps = [('scaler', StandardScaler())]
numeric_transformer = Pipeline(steps=scaling_steps)

print('... Done.')

# Apply the numeric pipeline to the listed columns
preprocessor = ColumnTransformer(
    transformers=[('num', numeric_transformer, numeric_features)]
)

# Fit the preprocessor on the sample and preview the first five transformed rows
print("Preprocessing X...")
print(df_apr14_sample.head())
print()
X = preprocessor.fit_transform(df_apr14_sample)
print("...Done!")
print(X[:5])
print()

... Done.
Preprocessing X...
            Lat      Lon  hour  dayofweek  dayofmonth
77202   40.8021 -73.9654    10          2           9
558915  40.6462 -73.7769     4          0          14
152635  40.7747 -73.9603     9          2          23
361259  40.7150 -74.0157    23          4           4
60087   40.7335 -74.0080    19          5           5

...Done!
[[ 1.75633997  0.22175354 -0.74977054 -0.47628189 -0.78375974]
 [-2.65692327  3.95504191 -1.77236111 -1.57685763 -0.23146461]
 [ 0.98069268  0.32276028 -0.9202023  -0.47628189  0.76266663]
 [-0.70931255 -0.7744502   1.46584237  0.62429385 -1.33605487]
 [-0.18560909 -0.62194983  0.78411532  1.17458172 -1.22559584]]



## 5- KMeans

5.1- Pipeline Kmeans

In [11]:
# Build the feature matrix used by K-means (the preprocessor is re-fitted on the sample)
print("Preprocessing X_kmeans...")
print(df_apr14_sample.head())
print()

X_kmeans = preprocessor.fit_transform(df_apr14_sample)
print("...Done!")

# Preview the first five transformed rows
print(X_kmeans[:5])
print()

Preprocessing X_kmeans...
            Lat      Lon  hour  dayofweek  dayofmonth
77202   40.8021 -73.9654    10          2           9
558915  40.6462 -73.7769     4          0          14
152635  40.7747 -73.9603     9          2          23
361259  40.7150 -74.0157    23          4           4
60087   40.7335 -74.0080    19          5           5

...Done!
[[ 1.75633997  0.22175354 -0.74977054 -0.47628189 -0.78375974]
 [-2.65692327  3.95504191 -1.77236111 -1.57685763 -0.23146461]
 [ 0.98069268  0.32276028 -0.9202023  -0.47628189  0.76266663]
 [-0.70931255 -0.7744502   1.46584237  0.62429385 -1.33605487]
 [-0.18560909 -0.62194983  0.78411532  1.17458172 -1.22559584]]



5.2- Elbow Method

In [12]:
# Elbow method: fit K-means for k = 2..10 and record the inertia of each fit
wcss = []
for k in range(2, 11):
    # fit() returns the fitted estimator, so the model can be built and fitted in one step
    kmeans = KMeans(n_clusters=k, n_init=10, random_state=0).fit(X_kmeans)
    wcss.append(kmeans.inertia_)

print(wcss)

[119273.75864954712, 104425.37553051676, 89108.84833961111, 78316.16350870588, 72068.5473756335, 66575.0862513973, 62253.61079636144, 58673.76727728972, 54826.7908020702]


In [13]:
# Plot inertia against the number of clusters (k = 2..10)
fig = px.line(x=range(2, 11), y=wcss)
fig.update_layout(
    title_text="Elbow Method",
    xaxis_title='Clusters',
    yaxis_title='Inertia',
)
fig.show()

The graph of the elbow method shows a curve that begins to bend at a certain point, suggesting that the optimal number of clusters is probably around this point. In our case, the elbow seems to form at 5. However, it is important to note that the selection of the final number of clusters may also depend on other factors and further analysis.

5.3- Silhouette method

In [14]:
# Silhouette method: fit K-means for k = 2..10 and score the resulting assignment
s_score = []
for k in range(2, 11):
    kmeans = KMeans(n_clusters=k, n_init=10, random_state=0).fit(X_kmeans)
    cluster_labels = kmeans.predict(X_kmeans)
    s_score.append(silhouette_score(X_kmeans, cluster_labels))

print(s_score)

[0.1865376676909686, 0.2056749291338296, 0.2018873359317302, 0.20963424683389004, 0.20991185574909524, 0.20395174777792985, 0.20822052605549388, 0.20906572487695974, 0.2171447742712299]


In [15]:
# Bar chart of the silhouette score for each number of clusters (k = 2..10)
fig = px.bar(x=range(2, 11), y=s_score)
fig.update_layout(
    title_text="Silhouette Method",
    xaxis_title='Clusters',
    yaxis_title='Silhouette score',
)
fig.show()

By examining the graph of the silhouette method, we can observe that the silhouette score reaches a local maximum when the number of clusters is 6 (≈0.210). The differences are small, however: all scores lie between about 0.19 and 0.22, and the highest value in the tested range is obtained for 10 clusters. It is therefore recommended to consider other criteria and check the stability of the clusters before making a final decision on the optimal number of clusters.

Elbow and silhouette methods provide useful guidance on the optimal number of clusters for our dataset. Considering both approaches, the optimal number seems to be 6.

In [16]:
# Final K-means model with the number of clusters retained above
n_clusters_retained = 6
kmeans = KMeans(n_clusters=n_clusters_retained, n_init=10, random_state=0)
kmeans.fit(X_kmeans)

In [17]:
# Store the K-means cluster assigned to each row in a new column
kmeans_labels = kmeans.predict(X_kmeans)
df_apr14_sample.loc[:, 'Cluster_KMeans'] = kmeans_labels

df_apr14_sample.head()

Unnamed: 0,Lat,Lon,hour,dayofweek,dayofmonth,Cluster_KMeans
77202,40.8021,-73.9654,10,2,9,0
558915,40.6462,-73.7769,4,0,14,5
152635,40.7747,-73.9603,9,2,23,0
361259,40.715,-74.0157,23,4,4,3
60087,40.7335,-74.008,19,5,5,1


In [18]:
# Sort by hour before plotting (presumably so the animation frames appear in hour order).
# No noise filter is applied here: K-means assigns every row to one of its
# clusters, so the label -1 (DBSCAN's noise marker) never occurs in Cluster_KMeans.
df_apr14_sample_sorted = df_apr14_sample.sort_values('hour')

# Animated map of the sampled points coloured by K-means cluster, one frame per hour
fig = px.scatter_mapbox(
    df_apr14_sample_sorted,
    lat="Lat",
    lon="Lon",
    color='Cluster_KMeans',
    animation_frame="hour",
    mapbox_style="carto-positron"
)

fig.update_layout(
    title="Spatial distribution of clusters over hours of the days")

fig.show()

K-means has identified six distinct clusters.

- Cluster 0: Corresponds to the geographical area of Manhattan around and north of Central Park. This area has a distinct cluster, suggesting unique features. Exploring the reasons for this difference could provide interesting information.

- Clusters 1, 2, and 3: Correspond to the same geographical area, but with a time slot from 9am to 11pm. This area is dense and very active, reflecting the strong economic activity of Manhattan.

- Cluster 4: Corresponds to the geographical area of Manhattan south of Central Park, but is present only between midnight and 8am, suggesting significant activity at night, associated with the nightlife of the neighborhood.

- Cluster 5: Corresponds to the JFK area. This cluster remains stable regardless of the time of day. Its low activity can be attributed to the less dense nature of the area, which makes it useful for identifying less active areas.

In conclusion, although K-means identified six clusters in the dataset, it did not produce clearly actionable clusters. We will explore another approach with DBSCAN for a better understanding.

## 6- DBSCAN

6.1- Cleaning and preprocessing for DBSCAN

In [19]:
# Remove the K-means labels so they are not carried into the DBSCAN section
useless_cols = ['Cluster_KMeans']

print("Dropping useless columns...")
# errors='ignore' keeps this cell re-runnable: if the column has already been
# dropped, the call is a no-op instead of raising KeyError
df_apr14_sample = df_apr14_sample.drop(columns=useless_cols, errors='ignore')

print("...Done.")

Dropping useless columns...
...Done.


In [20]:
# Build the feature matrix used by DBSCAN (the preprocessor is re-fitted on the sample)
print("Preprocessing X_dbscan...")
print(df_apr14_sample.head())
print()

X_dbscan = preprocessor.fit_transform(df_apr14_sample)
print("...Done!")

# Preview the first five transformed rows
print(X_dbscan[:5])
print()

Preprocessing X_dbscan...
            Lat      Lon  hour  dayofweek  dayofmonth
77202   40.8021 -73.9654    10          2           9
558915  40.6462 -73.7769     4          0          14
152635  40.7747 -73.9603     9          2          23
361259  40.7150 -74.0157    23          4           4
60087   40.7335 -74.0080    19          5           5

...Done!
[[ 1.75633997  0.22175354 -0.74977054 -0.47628189 -0.78375974]
 [-2.65692327  3.95504191 -1.77236111 -1.57685763 -0.23146461]
 [ 0.98069268  0.32276028 -0.9202023  -0.47628189  0.76266663]
 [-0.70931255 -0.7744502   1.46584237  0.62429385 -1.33605487]
 [-0.18560909 -0.62194983  0.78411532  1.17458172 -1.22559584]]



In [21]:
# Instantiate and fit DBSCAN with the Manhattan metric
db = DBSCAN(eps=0.72, min_samples=15, metric='manhattan').fit(X_dbscan)

# Cluster label of each row, then the number of rows per label
labels = db.labels_
np.unique(labels, return_counts=True)

(array([-1,  0,  1,  2,  3,  4,  5,  6,  7,  8,  9, 10, 11, 12, 13, 14, 15,
        16, 17, 18, 19, 20], dtype=int64),
 array([4456, 5649, 4704, 5078, 4675, 3291,   53,   34,    8,   30,   21,
          17,   21,   46,   24,   20,   15,    7,   13,   22,   18,   24],
       dtype=int64))

The results show that ε = 0.72 (with the Manhattan metric) gives a segmentation that seems to represent the different spatiotemporal patterns present in the data.
A value of min_samples = 15 was retained, which seems to produce meaningful clusters while avoiding excessive segmentation. This yields 21 clusters (labels 0 to 20) and 4,456 points labelled as noise (about 16% of the sample).

It is important to note that the choice of parameters in DBSCAN is often a compromise between noise sensitivity, cluster size and the ability to detect significant patterns.

In [22]:
# Store the DBSCAN label of each row in a new column (-1 marks noise points)
dbscan_column = "Cluster_DBSCAN"
df_apr14_sample.loc[:, dbscan_column] = labels

df_apr14_sample.head()

Unnamed: 0,Lat,Lon,hour,dayofweek,dayofmonth,Cluster_DBSCAN
77202,40.8021,-73.9654,10,2,9,-1
558915,40.6462,-73.7769,4,0,14,-1
152635,40.7747,-73.9603,9,2,23,0
361259,40.715,-74.0157,23,4,4,1
60087,40.7335,-74.008,19,5,5,1


In [23]:
# Keep only the rows assigned to a cluster (label -1 is DBSCAN noise), then sort by hour
is_clustered = df_apr14_sample['Cluster_DBSCAN'] != -1
df_apr14_sample_sorted = df_apr14_sample[is_clustered].sort_values(by='hour')

# Animated map of the clustered points coloured by DBSCAN cluster, one frame per hour
fig = px.scatter_mapbox(
    data_frame=df_apr14_sample_sorted,
    lat="Lat",
    lon="Lon",
    color='Cluster_DBSCAN',
    animation_frame="hour",
    mapbox_style="carto-positron",
)
fig.update_layout(title="Spatial distribution of clusters over hours of the day")
fig.show()

Clusters identified by DBSCAN reveal significant spatiotemporal patterns in Uber taxi movements in New York. The geographic area around and south of Manhattan presents constant activity throughout the day, suggesting a strong influence of Manhattan’s nightlife. The presence of several clusters in this region highlights the diversity of reasons for displacement.
In the northern zone of Brooklyn, a single cluster is observed during a restricted time slot (5am to 8am), suggesting a possible correlation with commuting. A more in-depth investigation could confirm this observation. The areas around JFK and LaGuardia airports have clusters corresponding to air traffic schedules, highlighting the direct impact of airport activities on taxi movements.

## 7- Conclusion

Comparatively, although K-means has identified six clusters, DBSCAN with its 21 clusters seems to provide more detailed and accurate segmentation, particularly by highlighting areas that could be overlooked in a more global analysis. These results underline the importance of using clustering methods adapted to the complexity of the data and to the patterns we seek to identify.

In order to further explore the relationships between the different variables and detect any subtle correlations, we plan to apply Principal Component Analysis (PCA). This approach should allow us to gain additional insight into the underlying data structure and enrich our understanding of Uber taxi travel patterns across New York.

## 8- PCA

In [24]:
# Keep only the three temporal features as input to the PCA
temporal_features = ["hour", "dayofweek", "dayofmonth"]
df_pca = df_apr14_sample[temporal_features]

# Standardize the temporal features before projecting them
scaler = StandardScaler()
X_train_pca = scaler.fit_transform(df_pca)

# Fit a 2-component PCA and project the scaled features onto it
pca = PCA(n_components=2)
PC_train_opti = pca.fit_transform(X_train_pca)
print("...Done!")

...Done!


In [25]:
# Combine the coordinates with the two principal components (pca1, pca2) in one frame
df_pca_opti = df_apr14_sample[["Lat", "Lon"]].assign(
    pca1=PC_train_opti[:, 0],
    pca2=PC_train_opti[:, 1],
)

df_pca_opti

Unnamed: 0,Lat,Lon,pca1,pca2
77202,40.8021,-73.9654,0.322618,0.446798
558915,40.6462,-73.7769,-0.660179,1.522717
152635,40.7747,-73.9603,-0.698195,1.079020
361259,40.7150,-74.0157,1.098634,-1.763526
60087,40.7335,-74.0080,1.534225,-1.049541
...,...,...,...,...
403616,40.7533,-73.9785,0.403728,-0.783910
362182,40.7649,-73.9611,2.006865,1.543384
161663,40.7168,-74.0082,-0.558005,0.172428
466881,40.7371,-73.9791,0.572492,-0.093203


In [26]:
# Features for the second DBSCAN run: coordinates plus the two principal components
numeric_features = ['Lat', 'Lon', 'pca1', 'pca2']

# NOTE: this rebinds numeric_features, numeric_transformer and preprocessor,
# which were first defined in the preprocessing section for the original five features
scaling_steps = [('scaler', StandardScaler())]
numeric_transformer = Pipeline(steps=scaling_steps)
preprocessor = ColumnTransformer(
    transformers=[('num', numeric_transformer, numeric_features)]
)

# Fit the new preprocessor and build the feature matrix
print("Preprocessing X_pca...")
X_pca = preprocessor.fit_transform(df_pca_opti)
print("...Done!")

Preprocessing X_pca...
...Done!


In [27]:
# Instantiate and fit DBSCAN on the PCA-based features; no metric is passed, so the library default applies
db = DBSCAN(eps=0.55, min_samples=6).fit(X_pca)

# Cluster label of each row, then the number of rows per label
labels = db.labels_
np.unique(labels, return_counts=True)

(array([-1,  0,  1,  2,  3,  4,  5,  6], dtype=int64),
 array([  810, 26636,   592,   163,     6,     6,     7,     6],
       dtype=int64))

In [28]:
# Write the labels of the PCA-based DBSCAN run into the Cluster_DBSCAN column
pca_dbscan_labels = db.labels_
df_apr14_sample.loc[:, 'Cluster_DBSCAN'] = pca_dbscan_labels

df_apr14_sample.head()

Unnamed: 0,Lat,Lon,hour,dayofweek,dayofmonth,Cluster_DBSCAN
77202,40.8021,-73.9654,10,2,9,0
558915,40.6462,-73.7769,4,0,14,1
152635,40.7747,-73.9603,9,2,23,0
361259,40.715,-74.0157,23,4,4,0
60087,40.7335,-74.008,19,5,5,0


In [29]:
# Keep only the rows assigned to a cluster (label -1 is DBSCAN noise), then sort by hour
is_clustered = df_apr14_sample['Cluster_DBSCAN'] != -1
df_apr14_sample_sorted = df_apr14_sample[is_clustered].sort_values(by='hour')

# Animated map coloured by cluster, one frame per hour; the temporal features are shown on hover
fig = px.scatter_mapbox(
    data_frame=df_apr14_sample_sorted,
    lat="Lat",
    lon="Lon",
    color='Cluster_DBSCAN',
    hover_data=["hour", "dayofweek", "dayofmonth"],
    animation_frame="hour",
    mapbox_style="carto-positron",
)
fig.update_layout(title="Spatial distribution of clusters over hours of the day")
fig.show()

Following the application of DBSCAN on the principal components (PCA), we explored the travel patterns of Uber taxis from a new angle. The results obtained confirm the richness of the DBSCAN approach in terms of segmentation, even when the dimensions of the data are reduced by PCA.

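Clusters identified after PCA provide a more nuanced perspective of spatiotemporal patterns. Distinct geographic areas continue to emerge, but with increased granularity in cluster distribution. Note, however, that a single cluster (label 0) contains 26,636 of the 28,226 sampled pickups, so this additional granularity comes from a few small clusters.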
For example, JFK’s airport areas reveal sub-clusters associated with specific time slots, highlighting significant temporal variations in flight-related travel.

This analysis reinforces the idea that dimension reduction by PCA did not significantly alter DBSCAN’s ability to identify significant clusters. On the contrary, it made it possible to highlight additional nuances in the distribution of clusters, thus offering a finer perspective on the displacement patterns.

These results underline the importance of judicious combination of data analysis techniques, adapted to the complexity of the information sought. In the next stage of our analysis, we plan to explore other methods to further refine our understanding of the factors underlying Uber taxi travel patterns across New York.