In [1]:
import os
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt 
from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import  OneHotEncoder, StandardScaler, LabelEncoder
from sklearn.compose import ColumnTransformer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, f1_score, RocCurveDisplay, ConfusionMatrixDisplay
from sklearn.model_selection import cross_val_score
import warnings

from datetime import datetime, timedelta


warnings.filterwarnings("ignore", category=FutureWarning) # to avoid deprecation warnings

import plotly.express as px
import plotly.graph_objects as go
import plotly.io as pio

from sklearn.datasets import load_sample_image
import matplotlib.pyplot as plt


from sklearn.cluster import KMeans, DBSCAN
import os


In [2]:
# Location of the journey-extract CSV, relative to the notebook's working directory.
path = "./data/01aJourneyDataExtract10Jan16-23Jan16.csv"

# Read the CSV into a DataFrame and preview its first five rows.
dataset = pd.read_csv(filepath_or_buffer=path)
dataset.head(n=5)

Unnamed: 0,Rental Id,Duration,Bike Id,End Date,EndStation Id,EndStation Name,Start Date,StartStation Id,StartStation Name
0,50754225,240,11834,10/01/2016 00:04,383.0,"Frith Street, Soho",10/01/2016 00:00,18,"Drury Lane, Covent Garden"
1,50754226,300,9648,10/01/2016 00:05,719.0,"Victoria Park Road, Hackney Central",10/01/2016 00:00,479,"Pott Street, Bethnal Green"
2,50754227,1200,10689,10/01/2016 00:20,272.0,"Baylis Road, Waterloo",10/01/2016 00:00,425,"Harrington Square 2, Camden Town"
3,50754228,780,8593,10/01/2016 00:14,471.0,"Hewison Street, Old Ford",10/01/2016 00:01,487,"Canton Street, Poplar"
4,50754229,600,8619,10/01/2016 00:11,399.0,"Brick Lane Market, Shoreditch",10/01/2016 00:01,501,"Cephas Street, Bethnal Green"


In [3]:
# Work on a copy so the raw frame in `dataset` is never modified. With a plain
# `df = dataset` both names point at the same object, the raw timestamps are
# overwritten, and re-running this cell fails because the columns no longer
# hold parseable date strings.
df = dataset.copy()

# Parse the day-first timestamp strings (e.g. "10/01/2016 00:04") and keep only
# the time-of-day part; the calendar date is intentionally discarded here.
for time_column in ['End Date', 'Start Date']:
    df[time_column] = pd.to_datetime(dataset[time_column], dayfirst=True).dt.time

df.head()

Unnamed: 0,Rental Id,Duration,Bike Id,End Date,EndStation Id,EndStation Name,Start Date,StartStation Id,StartStation Name
0,50754225,240,11834,00:04:00,383.0,"Frith Street, Soho",00:00:00,18,"Drury Lane, Covent Garden"
1,50754226,300,9648,00:05:00,719.0,"Victoria Park Road, Hackney Central",00:00:00,479,"Pott Street, Bethnal Green"
2,50754227,1200,10689,00:20:00,272.0,"Baylis Road, Waterloo",00:00:00,425,"Harrington Square 2, Camden Town"
3,50754228,780,8593,00:14:00,471.0,"Hewison Street, Old Ford",00:01:00,487,"Canton Street, Poplar"
4,50754229,600,8619,00:11:00,399.0,"Brick Lane Market, Shoreditch",00:01:00,501,"Cephas Street, Bethnal Green"


In [8]:
# Reference times for the morning and the evening, written as "HH:MM:SS".
morning, evening = "09:00:00", "17:00:00"

# Convert both strings to datetime.time objects using the same format.
morning_time, evening_time = (
    datetime.strptime(clock_text, "%H:%M:%S").time()
    for clock_text in (morning, evening)
)
print(morning_time, evening_time)

09:00:00 17:00:00


In [22]:
# Compute morning_start and morning_end: one hour either side of morning_time.
# A datetime.time cannot be shifted by a timedelta directly, so the time is
# attached to today's date for the arithmetic and the date is dropped again.
one_hour = timedelta(hours=1)
morning_anchor = datetime.combine(datetime.today(), morning_time)
morning_start = (morning_anchor - one_hour).time()
morning_end = (morning_anchor + one_hour).time()

# Keep the rows whose 'Start Date' lies inside the window (both bounds
# inclusive) and renumber the index from zero.
starts_after_open = df['Start Date'] >= morning_start
starts_before_close = df['Start Date'] <= morning_end
df_morning = df[starts_after_open & starts_before_close].reset_index(drop=True)
df_morning

Unnamed: 0,Rental Id,Duration,Bike Id,End Date,EndStation Id,EndStation Name,Start Date,StartStation Id,StartStation Name
0,50755147,1140,740,08:19:00,308.0,"Long Lane , Bermondsey",08:00:00,546,"New Fetter Lane, Holborn"
1,50755148,360,9884,08:06:00,114.0,"Park Road (Baker Street), The Regent's Park",08:00:00,540,"Albany Street, The Regent's Park"
2,50755149,300,10063,08:06:00,320.0,"Queen Mother Sports Centre, Victoria",08:01:00,74,"Vauxhall Cross, Vauxhall"
3,50755150,240,10010,08:06:00,638.0,"Falcon Road, Clapham Junction",08:02:00,701,"Vicarage Crescent, Battersea"
4,50755151,480,8468,08:10:00,247.0,"St. John's Wood Church, The Regent's Park",08:02:00,535,"Gloucester Avenue, Camden Town"
...,...,...,...,...,...,...,...,...,...
58535,51037465,300,1552,10:05:00,281.0,"Smith Square, Westminster",10:00:00,270,"Kennington Lane Rail Bridge, Vauxhall"
58536,51037968,120,10058,09:28:00,644.0,"Rainville Road, Hammersmith",09:26:00,682,"Crisp Road, Hammersmith"
58537,51038826,1320,1503,09:01:00,313.0,"Wells Street, Fitzrovia",08:39:00,661,"All Saints Church, Portobello"
58538,51038827,300,9179,09:18:00,225.0,"Notting Hill Gate Station, Notting Hill",09:13:00,661,"All Saints Church, Portobello"


In [23]:
# Remove the columns that are not needed for the rest of the analysis
# (trip duration and everything describing the end of the rental).
# errors="ignore" keeps this cell re-runnable: because the result is assigned
# back to df_morning, a second run would otherwise raise KeyError on the
# columns that were already dropped.
df_morning = df_morning.drop(
    columns=["Duration", "End Date", "EndStation Id", "EndStation Name"],
    errors="ignore",
)
df_morning

Unnamed: 0,Rental Id,Bike Id,Start Date,StartStation Id,StartStation Name
0,50755147,740,08:00:00,546,"New Fetter Lane, Holborn"
1,50755148,9884,08:00:00,540,"Albany Street, The Regent's Park"
2,50755149,10063,08:01:00,74,"Vauxhall Cross, Vauxhall"
3,50755150,10010,08:02:00,701,"Vicarage Crescent, Battersea"
4,50755151,8468,08:02:00,535,"Gloucester Avenue, Camden Town"
...,...,...,...,...,...
58535,51037465,1552,10:00:00,270,"Kennington Lane Rail Bridge, Vauxhall"
58536,51037968,10058,09:26:00,682,"Crisp Road, Hammersmith"
58537,51038826,1503,08:39:00,661,"All Saints Church, Portobello"
58538,51038827,9179,09:13:00,661,"All Saints Church, Portobello"


In [24]:
# Display the column labels currently present in df_morning.
df_morning.columns

Index(['Rental Id', 'Bike Id', 'Start Date', 'StartStation Id',
       'StartStation Name'],
      dtype='object')