In [1]:
import pandas as pd
import numpy as np 

from sklearn.preprocessing import  OneHotEncoder, StandardScaler
from sklearn.compose import ColumnTransformer

import plotly.express as px
import plotly.io as pio
pio.renderers.default = "iframe_connected"

## Data preparation

The data files are split by month, so we can load whichever month of 2014 we need.

In [2]:
def get_data_for_month(month_name):
    """Read one monthly CSV into a DataFrame.

    Parameters
    ----------
    month_name : str
        Month token inserted into the file name, e.g. ``'jul'`` reads
        ``uber-raw-data-jul14.csv`` from the current working directory.

    Returns
    -------
    pandas.DataFrame
        The parsed contents of the CSV file.
    """
    csv_path = f"uber-raw-data-{month_name}14.csv"
    return pd.read_csv(csv_path)

# Load the 'jul' file and preview its first rows.
month_data = get_data_for_month('jul')
month_data.head()

Unnamed: 0,Date/Time,Lat,Lon,Base
0,7/1/2014 0:03:00,40.7586,-73.9706,B02512
1,7/1/2014 0:05:00,40.7605,-73.9994,B02512
2,7/1/2014 0:06:00,40.732,-73.9999,B02512
3,7/1/2014 0:09:00,40.7635,-73.9793,B02512
4,7/1/2014 0:20:00,40.7204,-74.0047,B02512


Our dataset has a "Base" column, so we can use the number of distinct bases as the number of clusters. This lets us plan where bases should be located in the future to make the service faster.

In [3]:
def count_base(df):
    """Return how many distinct values the "Base" column of ``df`` holds."""
    return df["Base"].nunique()
# Number of distinct bases; reused further down as the KMeans cluster count.
num_base = count_base(month_data)
num_base

5

The dataset has an inconvenient combined "Date/Time" column; to make it easier to use, we split it into separate "Date" and "Time" columns.

In [4]:
def split_date_time(df):
    """Split the "Date/Time" column into separate "Date" and "Time" columns.

    The input frame is left untouched; a new frame is returned, so re-running
    the calling cell on the same raw frame gives the same result.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame with a string column "Date/Time" whose values contain exactly
        one space separating the date part from the time part.

    Returns
    -------
    pandas.DataFrame
        A new frame without "Date/Time" and with string columns "Date" and
        "Time" appended.

    Raises
    ------
    ValueError
        If splitting on a space does not yield exactly two parts.
    """
    parts = df["Date/Time"].str.split(" ", expand=True)
    if parts.shape[1] != 2:
        raise ValueError(
            f'expected "Date/Time" to split into 2 parts, got {parts.shape[1]}'
        )
    # drop() returns a new frame, so the caller's frame is never modified.
    return df.drop(columns="Date/Time").assign(Date=parts[0], Time=parts[1])

# Replace "Date/Time" with separate "Date" and "Time" columns and preview.
month_data = split_date_time(month_data)
month_data.head()

Unnamed: 0,Lat,Lon,Base,Date,Time
0,40.7586,-73.9706,B02512,7/1/2014,0:03:00
1,40.7605,-73.9994,B02512,7/1/2014,0:05:00
2,40.732,-73.9999,B02512,7/1/2014,0:06:00
3,40.7635,-73.9793,B02512,7/1/2014,0:09:00
4,40.7204,-74.0047,B02512,7/1/2014,0:20:00


We can apply the same kind of split to "Time" to break it into hours, minutes and seconds.

In [5]:
def split_hours(df):
    """Split the "Time" column into "Hour", "Minutes" and "Seconds" columns.

    The input frame is left untouched; a new frame is returned. The new
    columns keep the string values produced by the split (no numeric cast).

    Parameters
    ----------
    df : pandas.DataFrame
        Frame with a string column "Time" whose values have three
        colon-separated parts.

    Returns
    -------
    pandas.DataFrame
        A new frame without "Time" and with string columns "Hour",
        "Minutes" and "Seconds" appended.

    Raises
    ------
    ValueError
        If splitting on ":" does not yield exactly three parts.
    """
    parts = df["Time"].str.split(":", expand=True)
    if parts.shape[1] != 3:
        raise ValueError(
            f'expected "Time" to split into 3 parts, got {parts.shape[1]}'
        )
    # drop() returns a new frame, so the caller's frame is never modified.
    return df.drop(columns="Time").assign(
        Hour=parts[0], Minutes=parts[1], Seconds=parts[2]
    )

# Replace "Time" with "Hour", "Minutes" and "Seconds" columns and preview.
month_data = split_hours(month_data)
month_data.head()

Unnamed: 0,Lat,Lon,Base,Date,Hour,Minutes,Seconds
0,40.7586,-73.9706,B02512,7/1/2014,0,3,0
1,40.7605,-73.9994,B02512,7/1/2014,0,5,0
2,40.732,-73.9999,B02512,7/1/2014,0,6,0
3,40.7635,-73.9793,B02512,7/1/2014,0,9,0
4,40.7204,-74.0047,B02512,7/1/2014,0,20,0


Our data contains dates, but raw dates are not very useful for clustering because on their own they tell us little. Instead, we can categorize them by day of the week.

In [6]:
def define_day_week(df):
    """Convert "Date" to datetimes and add the weekday name as "day_week".

    The frame is modified in place and also returned. Values that cannot be
    parsed as dates are coerced to ``NaT`` rather than raising.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame with a "Date" column.

    Returns
    -------
    pandas.DataFrame
        The same frame object, with "Date" parsed and "day_week" added.
    """
    parsed_dates = pd.to_datetime(df['Date'], errors='coerce')
    df['Date'] = parsed_dates
    df['day_week'] = parsed_dates.dt.day_name()
    return df

# Parse "Date" and add the weekday name column, then preview.
month_data = define_day_week(month_data)
month_data.head()

Unnamed: 0,Lat,Lon,Base,Date,Hour,Minutes,Seconds,day_week
0,40.7586,-73.9706,B02512,2014-07-01,0,3,0,Tuesday
1,40.7605,-73.9994,B02512,2014-07-01,0,5,0,Tuesday
2,40.732,-73.9999,B02512,2014-07-01,0,6,0,Tuesday
3,40.7635,-73.9793,B02512,2014-07-01,0,9,0,Tuesday
4,40.7204,-74.0047,B02512,2014-07-01,0,20,0,Tuesday


We can simplify things further by also grouping the hours into parts of the day.

In [7]:
def define_day_time(row):
    """Map the "Hour" of a row to a part-of-day label.

    The hour is converted to an integer first, so unpadded strings ('5'),
    zero-padded strings ('05') and numeric hours (5) are all handled alike.

    Parameters
    ----------
    row : mapping
        Anything indexable by ``"Hour"`` (a DataFrame row or a dict).

    Returns
    -------
    str
        'night' for hours 0-5, 'morning' for 6-11, 'noon' for 12-14,
        'afternoon' for 15-17 and 'evening' for everything else.
    """
    try:
        hour = int(row["Hour"])
    except (TypeError, ValueError):
        # Unparseable hours fall into the catch-all bucket, the same label
        # the string comparison used for any value it did not recognise.
        return 'evening'

    if 0 <= hour <= 5:
        return 'night'
    if 6 <= hour <= 11:
        return 'morning'
    if 12 <= hour <= 14:
        return 'noon'
    if 15 <= hour <= 17:
        return 'afternoon'
    return 'evening'

def create_day_time(df):
    """Add a "day_time" column holding the part-of-day label of each row.

    Every row is passed to ``define_day_time``; the resulting labels are
    written onto ``df`` in place, and the same frame is returned.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame with an "Hour" column.

    Returns
    -------
    pandas.DataFrame
        The same frame object, with "day_time" added.
    """
    day_time_labels = df.apply(define_day_time, axis=1)
    df['day_time'] = day_time_labels
    return df

# The return value is not assigned: this relies on create_day_time adding
# the "day_time" column to month_data in place. As the last expression of
# the cell, the whole frame is rendered as output.
create_day_time(month_data)

Unnamed: 0,Lat,Lon,Base,Date,Hour,Minutes,Seconds,day_week,day_time
0,40.7586,-73.9706,B02512,2014-07-01,0,03,00,Tuesday,night
1,40.7605,-73.9994,B02512,2014-07-01,0,05,00,Tuesday,night
2,40.7320,-73.9999,B02512,2014-07-01,0,06,00,Tuesday,night
3,40.7635,-73.9793,B02512,2014-07-01,0,09,00,Tuesday,night
4,40.7204,-74.0047,B02512,2014-07-01,0,20,00,Tuesday,night
...,...,...,...,...,...,...,...,...,...
796116,40.7285,-73.9846,B02764,2014-07-31,23,22,00,Thursday,evening
796117,40.7615,-73.9868,B02764,2014-07-31,23,23,00,Thursday,evening
796118,40.6770,-73.9515,B02764,2014-07-31,23,29,00,Thursday,evening
796119,40.7225,-74.0038,B02764,2014-07-31,23,30,00,Thursday,evening


Our dataset may well contain a lot of rows, so we need to check whether it would be better to work with a smaller sample of observations.

In [8]:
def get_count(df):
    """Choose how many rows to sample from ``df`` based on its size.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame whose row count decides the sample size.

    Returns
    -------
    int
        100000 if ``df`` has more than 500000 rows, 50000 if more than
        100000, 5000 if more than 10000; otherwise the full row count.
    """
    count = len(df.index)
    # (row-count threshold, sample size), checked from largest to smallest;
    # the first threshold exceeded wins.
    size_tiers = ((500000, 100000), (100000, 50000), (10000, 5000))
    for threshold, sample_size in size_tiers:
        if count > threshold:
            return sample_size
    return count

def get_sample(df, random_state=None):
    """Return a random sample of rows from ``df``.

    The sample size is chosen by ``get_count`` from the size of ``df``
    itself, so the function works for any frame passed in rather than only
    for the notebook-level ``month_data``.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to sample rows from.
    random_state : int or None, optional
        Seed forwarded to ``DataFrame.sample``. The default ``None`` draws
        a different sample on every call; pass an integer for a
        reproducible sample.

    Returns
    -------
    pandas.DataFrame
        The sampled rows, keeping their original index labels.
    """
    sample_size = get_count(df)
    return df.sample(n=sample_size, random_state=random_state)

# NOTE(review): the returned sample is only displayed, never stored; the
# cells below keep working on the full month_data frame.
get_sample(month_data)

Unnamed: 0,Lat,Lon,Base,Date,Hour,Minutes,Seconds,day_week,day_time
764306,40.7684,-73.8626,B02682,2014-07-28,17,05,00,Monday,afternoon
267681,40.7272,-73.9983,B02598,2014-07-30,18,16,00,Wednesday,evening
317928,40.6947,-73.9489,B02617,2014-07-07,9,58,00,Monday,morning
675624,40.7419,-73.9853,B02682,2014-07-15,14,20,00,Tuesday,noon
339616,40.7607,-73.9747,B02617,2014-07-09,20,07,00,Wednesday,evening
...,...,...,...,...,...,...,...,...,...
699349,40.7657,-73.9764,B02682,2014-07-18,16,40,00,Friday,afternoon
203847,40.6582,-73.9912,B02598,2014-07-23,6,32,00,Wednesday,morning
734575,40.7419,-74.0046,B02682,2014-07-23,22,42,00,Wednesday,evening
328489,40.7512,-73.9825,B02617,2014-07-08,17,55,00,Tuesday,afternoon


### Getting day of the week and day time

Now that we have preprocessed the whole dataset, we can start filtering it by day of the week and time of day.

In [9]:
def get_date_week(df, day, time):
    """Select the coordinates of all rows for one weekday and part of day.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame with "day_week", "day_time", "Lon" and "Lat" columns.
    day : str
        Value to match in "day_week", e.g. ``"Tuesday"``.
    time : str
        Value to match in "day_time", e.g. ``"noon"``.

    Returns
    -------
    pandas.DataFrame
        An independent copy holding only the "Lon" and "Lat" columns (in
        that order) of the matching rows, with their original index labels.
    """
    matches = (df['day_week'] == day) & (df['day_time'] == time)
    # Return an explicit copy: later cells add cluster-label columns to the
    # result, which must neither touch ``df`` nor be treated as a write to
    # a slice of it.
    return df.loc[matches, ["Lon", "Lat"]].copy()

# Coordinates of all "Tuesday" / "noon" rows; this is the frame that gets clustered.
data_day = get_date_week(month_data, "Tuesday", "noon")
data_day

Unnamed: 0,Lon,Lat
296,-74.0098,40.7218
297,-73.9563,40.7716
298,-74.0072,40.7280
299,-74.0037,40.7325
300,-73.9882,40.7355
...,...,...
795250,-73.9450,40.7178
795251,-73.9938,40.7515
795252,-73.9719,40.7646
795253,-74.0069,40.7307


### Clustering KMeans

In [10]:
from sklearn.preprocessing import StandardScaler

from sklearn.cluster import KMeans

# One cluster per distinct base (num_base); the fixed random_state seeds the
# estimator so repeated runs start from the same state.
kmeans = KMeans(n_clusters=num_base, random_state=42)

# Module-level scaler, fitted inside transform_data.
sc = StandardScaler()

def transform_data(df):
    """Fit the module-level scaler ``sc`` on ``df`` and return the scaled values.

    The scaler is re-fitted on every call, so any earlier fit of ``sc`` is
    overwritten.

    Parameters
    ----------
    df : pandas.DataFrame
        Numeric columns to scale.

    Returns
    -------
    The array produced by ``sc.fit_transform(df)``.
    """
    return sc.fit_transform(df)

# Scaled version of the data_day columns; both clustering models are fitted on this array.
X = transform_data(data_day)
X

array([[-0.58774824, -0.53400586],
       [ 0.34164182,  0.75141637],
       [-0.54258162, -0.37397338],
       ...,
       [ 0.0706421 ,  0.57073453],
       [-0.53737008, -0.30428181],
       [-0.25768448,  0.07773122]])

In [11]:
def cluster_kmeans(array):
    """Fit the module-level ``kmeans`` estimator on ``array``.

    Parameters
    ----------
    array : array-like
        Feature matrix to cluster.

    Returns
    -------
    Whatever ``kmeans.fit`` returns.
    """
    fitted = kmeans.fit(array)
    return fitted
# Fit KMeans on the scaled coordinates.
cluster_kmeans(X)   

In [12]:
def get_cluster_labels(df):
    """Write the KMeans labels onto ``df`` as column "cluster_km".

    Reads ``labels_`` from the module-level ``kmeans`` estimator, so that
    estimator must already have been fitted. The frame is modified in place
    and also returned.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to label; presumably the one whose scaled values ``kmeans``
        was fitted on, so that labels line up with rows — verify at the
        call site.

    Returns
    -------
    pandas.DataFrame
        The same frame object, with "cluster_km" set.
    """
    km_labels = kmeans.labels_
    df.loc[:, 'cluster_km'] = km_labels
    return df
# Attach the KMeans label of each row to data_day.
data_day = get_cluster_labels(data_day)

In [13]:
def get_cluster_map_km(df):
    """Build a map scatter plot of ``df`` coloured by the "cluster_km" column.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame with "Lat", "Lon" and "cluster_km" columns.

    Returns
    -------
    The figure object returned by ``px.scatter_mapbox``.
    """
    return px.scatter_mapbox(
        df,
        lat="Lat",
        lon="Lon",
        color="cluster_km",
        mapbox_style="carto-positron",
    )
# Render the KMeans clusters on the map.
get_cluster_map_km(data_day)

### Clustering DBSCAN

In [14]:
from sklearn.cluster import DBSCAN
import numpy as np

# The model is fitted on the scaled array X below, so eps=0.3 is a distance
# in scaled units, not in degrees of latitude/longitude.
db = DBSCAN(eps=0.3, min_samples=100, metric="euclidean")

def cluster_dbscan(array):
    """Fit the module-level ``db`` estimator on ``array``.

    Parameters
    ----------
    array : array-like
        Feature matrix to cluster.

    Returns
    -------
    Whatever ``db.fit`` returns.
    """
    return db.fit(array)
# Fit DBSCAN on the same scaled coordinates used for KMeans.
cluster_dbscan(X)

In [15]:
def get_cluster_labels(df):
    """Write the DBSCAN labels onto ``df`` as column "cluster_db".

    Reads ``labels_`` from the module-level ``db`` estimator, so that
    estimator must already have been fitted. The frame is modified in place
    and also returned.

    NOTE(review): this redefines the ``get_cluster_labels`` from the KMeans
    section; once this cell has run, the name refers to this DBSCAN version
    only.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to label; presumably the one whose scaled values ``db`` was
        fitted on, so that labels line up with rows — verify at the call
        site.

    Returns
    -------
    pandas.DataFrame
        The same frame object, with "cluster_db" set.
    """
    df.loc[:, 'cluster_db'] = db.labels_
    return df
# The return value is not assigned: the "cluster_db" column is added to
# data_day in place, and the whole frame is rendered as the cell output.
get_cluster_labels(data_day)

Unnamed: 0,Lon,Lat,cluster_km,cluster_db
296,-74.0098,40.7218,1,0
297,-73.9563,40.7716,0,0
298,-74.0072,40.7280,1,0
299,-74.0037,40.7325,1,0
300,-73.9882,40.7355,1,0
...,...,...,...,...
795250,-73.9450,40.7178,1,0
795251,-73.9938,40.7515,0,0
795252,-73.9719,40.7646,0,0
795253,-74.0069,40.7307,1,0


In [16]:
def get_cluster_map_db(df):
    """Build a map scatter plot of ``df`` coloured by the "cluster_db" column.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame with "Lat", "Lon" and "cluster_db" columns.

    Returns
    -------
    The figure object returned by ``px.scatter_mapbox``.
    """
    return px.scatter_mapbox(
        df,
        lat="Lat",
        lon="Lon",
        color="cluster_db",
        mapbox_style="carto-positron",
    )
# Render the DBSCAN clusters on the map.
get_cluster_map_db(data_day)