# Import and preprocessing

In [None]:
# Import libraries (the data itself is loaded in the next cell)

import pandas as pd
import numpy as np
# Setting this option to None turns off pandas' SettingWithCopyWarning for the
# whole notebook.
# NOTE(review): presumably added for the per-weekday column assignments further
# down -- confirm it is still needed, since it also hides genuine mistakes.
pd.options.mode.chained_assignment = None

In [None]:
# Location of the May 2014 Uber pickups CSV (absolute path on the Colab Drive).
uber_may14_csv = "/content/drive/MyDrive/Jedha_Fullstack/Machine_Learning_Projects/Uber_Project/uber-trip-data/uber-trip-data/uber-raw-data-may14.csv"

# Load the file with pandas' default parsing; 'Date/Time' is converted in a later cell.
data = pd.read_csv(uber_may14_csv)

# Bare last expression so the notebook renders a preview of the frame.
data

Unnamed: 0,Date/Time,Lat,Lon,Base
0,5/1/2014 0:02:00,40.7521,-73.9914,B02512
1,5/1/2014 0:06:00,40.6965,-73.9715,B02512
2,5/1/2014 0:15:00,40.7464,-73.9838,B02512
3,5/1/2014 0:17:00,40.7463,-74.0011,B02512
4,5/1/2014 0:17:00,40.7594,-73.9734,B02512
...,...,...,...,...
652430,5/31/2014 23:45:00,40.7309,-74.0014,B02764
652431,5/31/2014 23:52:00,40.7528,-73.9798,B02764
652432,5/31/2014 23:55:00,40.7158,-73.9519,B02764
652433,5/31/2014 23:56:00,40.6961,-73.8997,B02764


In [None]:
# Print column dtypes, non-null counts and memory usage of the freshly loaded frame
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 652435 entries, 0 to 652434
Data columns (total 4 columns):
 #   Column     Non-Null Count   Dtype  
---  ------     --------------   -----  
 0   Date/Time  652435 non-null  object 
 1   Lat        652435 non-null  float64
 2   Lon        652435 non-null  float64
 3   Base       652435 non-null  object 
dtypes: float64(2), object(2)
memory usage: 19.9+ MB


In [None]:
# Initial map viz
#
# Plot a random subset of the pickups on an OpenStreetMap background to get a
# first impression of where rides start.

from plotly import express as px

# Share of rows to plot, and the seed that makes the draw repeatable so the map
# is identical after every "Restart & Run All".
sample_fraction = 0.3
sample_seed = 42

# Kept as a notebook-level name because later cells may read it.
sample_size = int(len(data) * sample_fraction)
data_sample = data.sample(n=sample_size, random_state=sample_seed)

fig = px.scatter_mapbox(data_sample, lat="Lat", lon="Lon", mapbox_style="open-street-map")

fig.show()

Output hidden; open in https://colab.research.google.com to view.

In [None]:
# Breakdown of date and time information

# Parse the raw text timestamps once, then derive one integer column per component.
timestamps = pd.to_datetime(data['Date/Time'], format='%m/%d/%Y %H:%M:%S')
data['Date/Time'] = timestamps

# New column name -> pandas `.dt` accessor attribute (dayofweek counts Monday as 0).
datetime_parts = {
    'Year': 'year',
    'Month': 'month',
    'Day': 'day',
    'Weekday': 'dayofweek',
    'Hour': 'hour',
}
for column_name, dt_attribute in datetime_parts.items():
    data[column_name] = getattr(timestamps.dt, dt_attribute)

data.head()

Unnamed: 0,Date/Time,Lat,Lon,Base,Year,Month,Day,Weekday,Hour
0,2014-05-01 00:02:00,40.7521,-73.9914,B02512,2014,5,1,3,0
1,2014-05-01 00:06:00,40.6965,-73.9715,B02512,2014,5,1,3,0
2,2014-05-01 00:15:00,40.7464,-73.9838,B02512,2014,5,1,3,0
3,2014-05-01 00:17:00,40.7463,-74.0011,B02512,2014,5,1,3,0
4,2014-05-01 00:17:00,40.7594,-73.9734,B02512,2014,5,1,3,0


# Hot-zones depending on the weekday

## DBSCAN clustering

In [None]:
# Sampling
#
# Draw a random 30 % subset of the trips and keep only the columns needed for
# the per-weekday clustering below.

# Computed here (same formula as the initial map cell) so this cell does not
# depend on a value left behind by a plotting cell.
sample_size = int(len(data) * 0.3)

# Fixed seed: the same rows are drawn on every run, so the clusters and maps
# that follow are reproducible.
data_sample = data.sample(n=sample_size, random_state=42)
data_dow = data_sample.loc[:, ["Lat", "Lon", "Weekday"]]
data_dow.head()

Unnamed: 0,Lat,Lon,Weekday
345016,40.7366,-73.9891,2
402143,40.7333,-73.9998,2
120563,40.7741,-73.8729,5
558707,40.6828,-73.9617,0
589448,40.6799,-73.9815,4


In [None]:
# Dividing the global dataset into datasets by weekday
#
# Builds `data_by_weekday`: one DataFrame per weekday value found in the sample,
# keyed by that value (0-6 as produced by pandas' `dt.dayofweek`).

weekdays = np.sort(data_dow["Weekday"].unique())

# Store an explicit copy per weekday: later cells add cluster-label columns to
# these frames, and writing into a filtered slice of `data_dow` is what triggers
# pandas' SettingWithCopyWarning.
data_by_weekday = {
    weekday: data_dow.loc[data_dow["Weekday"] == weekday].copy()
    for weekday in weekdays
}

In [None]:
# Preprocessing and DBSCAN clustering for each weekday dataset
#
# Each weekday frame gets a new 'Cluster DBSCAN' column holding its cluster label.

from sklearn.cluster import DBSCAN
from sklearn.preprocessing import StandardScaler

scaler = StandardScaler()
dbscan1 = DBSCAN(eps=0.2, min_samples=100)

for df in data_by_weekday.values():
    # The scaler is refit on every weekday, so eps applies to that weekday's
    # standardized Lat/Lon values rather than to raw degrees.
    coords_scaled = scaler.fit_transform(df[['Lat', 'Lon']])

    # DBSCAN marks points it treats as noise with the label -1.
    labels = dbscan1.fit(coords_scaled).labels_

    # Stored as strings -- presumably so the maps color clusters as discrete
    # categories rather than on a numeric scale (confirm).
    df.loc[:, 'Cluster DBSCAN'] = labels.astype(str)

In [None]:
# Maps of clusters by weekday (DBSCAN labels)

# Arguments that are identical for every weekday's figure.
dbscan_map_options = dict(
    lat='Lat',
    lon='Lon',
    color='Cluster DBSCAN',
    mapbox_style='open-street-map',
    zoom=9,
)

for weekday, df in data_by_weekday.items():
    # Weekday keys start at 0, so the title shows them shifted to start at 1.
    map_title = f'Clusters for Weekday : {weekday+1}'
    fig_map = px.scatter_mapbox(df, title=map_title, **dbscan_map_options)
    fig_map.show()

Output hidden; open in https://colab.research.google.com to view.

## KMeans clustering

In [None]:
# Preprocessing and KMeans clustering for each weekday dataset
#
# Each weekday frame gets a new 'Cluster KMeans' column holding its cluster label.

from sklearn.cluster import KMeans
# Imported here as well so this cell does not rely on the DBSCAN cell having run.
from sklearn.preprocessing import StandardScaler

scaler = StandardScaler()
# random_state pins the centroid initialisation: without it the cluster labels
# (and therefore the map colors) can change from one run to the next.
kmeans = KMeans(n_clusters=4, n_init='auto', random_state=42)

for weekday, df in data_by_weekday.items():
    # Refit the scaler per weekday so each day is standardized on its own points.
    X = scaler.fit_transform(df[['Lat', 'Lon']])

    kmeans.fit(X)

    # Stored as strings -- presumably so the maps color clusters as discrete
    # categories rather than on a numeric scale (confirm).
    df.loc[:, 'Cluster KMeans'] = kmeans.labels_.astype(str)

In [None]:
# Maps of clusters by weekday (KMeans labels)

# Arguments that are identical for every weekday's figure.
kmeans_map_options = dict(
    lat='Lat',
    lon='Lon',
    color='Cluster KMeans',
    mapbox_style='open-street-map',
    zoom=9,
)

for weekday, df in data_by_weekday.items():
    # Weekday keys start at 0, so the title shows them shifted to start at 1.
    map_title = f'Clusters for Weekday : {weekday+1}'
    fig_map = px.scatter_mapbox(df, title=map_title, **kmeans_map_options)
    fig_map.show()

Output hidden; open in https://colab.research.google.com to view.