# Machine Learning Model for Available Bikes and Bikes Stands

## Contents

* [1. Data Processing](#data_pros) 
* [2. Machine Learning Model Implementation](#mlm) 

In [1]:
#import modules needed
import pandas as pd
import numpy as np
from datetime import datetime, timedelta, date
from sklearn import preprocessing
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split
from sklearn.model_selection import cross_val_score
from sklearn import metrics
from sklearn.model_selection import GridSearchCV
# hide ipykernel warnings 
import warnings
warnings.filterwarnings('ignore')

## 1. Data Processing <a class="anchor" id="data_pros"></a>

#### Read in the tables

In [2]:
#read in weather table
dfweather = pd.read_csv('weather_table.csv', sep=',\s+', delimiter=',', skipinitialspace=True)

In [3]:
#read in availability table
dfavailability = pd.read_csv('av_table.csv', sep=',\s+', delimiter=',', skipinitialspace=True)

#### Check the number of columns and rows in the tables

In [4]:
dfweather.shape

(1916, 7)

In [5]:
dfavailability.shape

(50000, 4)

#### View the tables

In [6]:
dfweather.head()

Unnamed: 0,description,icon,temp,temp_min,temp_max,humidity,dt
0,broken clouds,04d,9,9,10,66,2021-03-22 14:32:26
1,broken clouds,04d,9,8,9,76,2021-03-22 14:14:56
2,broken clouds,04d,9,8,9,76,2021-03-22 14:00:25
3,broken clouds,04d,9,8,9,76,2021-03-22 13:43:53
4,broken clouds,04d,9,8,9,76,2021-03-22 13:27:37


In [7]:
dfavailability.head()

Unnamed: 0,number,available_bikes,available_bikes_stands,last_update
0,47,6,34,2021-03-22 13:59:38
1,21,16,14,2021-03-22 13:59:38
2,54,12,21,2021-03-22 13:59:32
3,109,11,18,2021-03-22 13:59:31
4,84,19,11,2021-03-22 13:59:25


In [8]:
dfavailability.rename(columns = {'number':'StationNumber'}, inplace = True)

#### Check Feature types and change into appropriate types if needed 

In [9]:
dfavailability.dtypes

StationNumber              int64
available_bikes            int64
available_bikes_stands     int64
last_update               object
dtype: object

In [10]:
dfavailability['last_update']=dfavailability['last_update'].apply(pd.to_datetime, format='%Y/%m/%d')

In [11]:
dfavailability.dtypes

StationNumber                      int64
available_bikes                    int64
available_bikes_stands             int64
last_update               datetime64[ns]
dtype: object

In [12]:
dfweather.dtypes

description    object
icon           object
temp            int64
temp_min        int64
temp_max        int64
humidity        int64
dt             object
dtype: object

In [13]:
dfweather['dt']=dfweather['dt'].apply(pd.to_datetime, format='%Y/%m/%d')


In [14]:
dfweather['description'] = dfweather['description'].astype('category')

In [15]:
#do not need icon for model 
dfweather =dfweather.drop(['icon'], axis=1)

In [16]:
dfweather.dtypes

description          category
temp                    int64
temp_min                int64
temp_max                int64
humidity                int64
dt             datetime64[ns]
dtype: object

#### Check for Missing Data 
-Result: None

In [17]:
dfweather.isnull().sum().sort_values(ascending=False)/len(dfweather)*100

description    0.0
temp           0.0
temp_min       0.0
temp_max       0.0
humidity       0.0
dt             0.0
dtype: float64

In [18]:
dfavailability.isnull().sum().sort_values(ascending=False)/len(dfavailability)*100

StationNumber             0.0
available_bikes           0.0
available_bikes_stands    0.0
last_update               0.0
dtype: float64

#### Check for Duplicate Columns/Rows

-Result: None, will further evaluate below

In [19]:
#Print the number of duplicates, without the original rows that were duplicated
print('Number of duplicate (excluding first) rows in the table is: ', dfweather.duplicated().sum())

# Check for duplicate rows. 
# Use "keep=False" to mark all duplicates as true, including the original rows that were duplicated.
print('Number of duplicate rows (including first) in the table is:', dfweather[dfweather.duplicated(keep=False)].shape[0])

Number of duplicate (excluding first) rows in the table is:  0
Number of duplicate rows (including first) in the table is: 0


In [20]:
#Print the number of duplicates, without the original rows that were duplicated
print('Number of duplicate (excluding first) rows in the table is: ', dfavailability.duplicated().sum())

# Check for duplicate rows. 
# Use "keep=False" to mark all duplicates as true, including the original rows that were duplicated.
print('Number of duplicate rows (including first) in the table is:', dfavailability[dfavailability.duplicated(keep=False)].shape[0])

Number of duplicate (excluding first) rows in the table is:  0
Number of duplicate rows (including first) in the table is: 0


### Descriptive Stats for Both tables - checking for constant columns

- Results: std >0 - no constant columns

In [21]:
dfavailability.describe().T

Unnamed: 0,count,mean,std,min,25%,50%,75%,max
StationNumber,50000.0,59.96366,33.745736,2.0,31.0,60.0,90.0,117.0
available_bikes,50000.0,11.2569,7.59684,0.0,5.0,11.0,16.0,40.0
available_bikes_stands,50000.0,20.81686,9.774298,0.0,14.0,20.0,28.0,40.0


In [22]:
dfweather.describe().T

Unnamed: 0,count,mean,std,min,25%,50%,75%,max
temp,1916.0,7.202505,3.01393,-1.0,5.0,7.0,9.0,15.0
temp_min,1916.0,6.412317,3.139096,-3.0,4.0,7.0,9.0,14.0
temp_max,1916.0,8.014092,2.882795,-1.0,6.0,8.0,10.0,16.0
humidity,1916.0,80.850731,10.914654,43.0,75.0,82.0,87.0,100.0


#### Create new intervals columns which converts the datetime colums into 30 min intervals

In [23]:
dfavailability.head()

Unnamed: 0,StationNumber,available_bikes,available_bikes_stands,last_update
0,47,6,34,2021-03-22 13:59:38
1,21,16,14,2021-03-22 13:59:38
2,54,12,21,2021-03-22 13:59:32
3,109,11,18,2021-03-22 13:59:31
4,84,19,11,2021-03-22 13:59:25


In [24]:
dfavailability['intervals']=dfavailability['last_update'].dt.round('15min')

In [25]:
dfweather['intervals']=dfweather['dt'].dt.round('15min')

In [26]:
dfweather.head().sort_values('dt')

Unnamed: 0,description,temp,temp_min,temp_max,humidity,dt,intervals
4,broken clouds,9,8,9,76,2021-03-22 13:27:37,2021-03-22 13:30:00
3,broken clouds,9,8,9,76,2021-03-22 13:43:53,2021-03-22 13:45:00
2,broken clouds,9,8,9,76,2021-03-22 14:00:25,2021-03-22 14:00:00
1,broken clouds,9,8,9,76,2021-03-22 14:14:56,2021-03-22 14:15:00
0,broken clouds,9,9,10,66,2021-03-22 14:32:26,2021-03-22 14:30:00


In [27]:
dfavailability.head().sort_values('last_update')

Unnamed: 0,StationNumber,available_bikes,available_bikes_stands,last_update,intervals
4,84,19,11,2021-03-22 13:59:25,2021-03-22 14:00:00
3,109,11,18,2021-03-22 13:59:31,2021-03-22 14:00:00
2,54,12,21,2021-03-22 13:59:32,2021-03-22 14:00:00
0,47,6,34,2021-03-22 13:59:38,2021-03-22 14:00:00
1,21,16,14,2021-03-22 13:59:38,2021-03-22 14:00:00


#### Drop Initial Datetime columns as New Intervals Feature will be used to combine the Tables

In [28]:
mergeweather= dfweather.drop(['dt'], axis=1)

In [29]:
mergeavail=dfavailability.drop(['last_update'], axis=1)

#### Sort the Intervals Features before merge

In [30]:
mergeavail.dtypes
mergeweather.dtypes

description          category
temp                    int64
temp_min                int64
temp_max                int64
humidity                int64
intervals      datetime64[ns]
dtype: object

In [31]:
mergeavail.dtypes
mergeweather.dtypes

description          category
temp                    int64
temp_min                int64
temp_max                int64
humidity                int64
intervals      datetime64[ns]
dtype: object

In [32]:
mergeavail=mergeavail.sort_values('intervals')
mergeavail.isnull().sum()

StationNumber             0
available_bikes           0
available_bikes_stands    0
intervals                 0
dtype: int64

In [33]:
mergeweather=mergeweather.sort_values('intervals')
mergeweather.isnull().sum()

description    0
temp           0
temp_min       0
temp_max       0
humidity       0
intervals      0
dtype: int64

In [34]:
BothMerge=pd.merge( mergeavail, mergeweather,how='inner', on='intervals')
# FinalMerge=pd.merge_asof(mergeweather, mergeavail, on='intervals')
# FinalMerge = mergeavail.merge(mergeweather, on='intervals')

In [35]:
BothMerge.dtypes

StationNumber                      int64
available_bikes                    int64
available_bikes_stands             int64
intervals                 datetime64[ns]
description                     category
temp                               int64
temp_min                           int64
temp_max                           int64
humidity                           int64
dtype: object

In [36]:
BothMerge.isnull().sum().sort_values(ascending=False)/len(BothMerge)*100

StationNumber             0.0
available_bikes           0.0
available_bikes_stands    0.0
intervals                 0.0
description               0.0
temp                      0.0
temp_min                  0.0
temp_max                  0.0
humidity                  0.0
dtype: float64

In [37]:
#Print the number of duplicates, without the original rows that were duplicated
print('Number of duplicate (excluding first) rows in the table is: ', BothMerge.duplicated().sum())

# Check for duplicate rows. 
# Use "keep=False" to mark all duplicates as true, including the original rows that were duplicated.
print('Number of duplicate rows (including first) in the table is:', BothMerge[BothMerge.duplicated(keep=False)].shape[0])

Number of duplicate (excluding first) rows in the table is:  12831
Number of duplicate rows (including first) in the table is: 25630


In [38]:
FinalMerge=BothMerge[~BothMerge.duplicated()]

In [39]:
FinalMerge.shape

(37177, 9)

#### Create a Day Feature from the new merged table so that it is used for the model

In [40]:
FinalMerge['Day']=FinalMerge['intervals'].dt.day_name()

#### 2 models will be created, one for Available Bikes and one for available bike stands - create 2 copies of merged df

In [41]:
availableBikes=FinalMerge.copy()
availableStands=FinalMerge.copy()

In [42]:
availableStands.shape

(37177, 10)

#### Create a new feature which states whether stands are available

In [43]:
availableStands['Hour']=availableStands['intervals'].dt.hour
availableStands['Stands_available']= np.where(availableStands['available_bikes_stands']>0, availableStands['available_bikes_stands'], 0)

In [44]:
choices=['Morning', 'Afternoon', 'Evening']
conditions=[(availableStands['Hour'] > 6) & (availableStands['Hour'] < 12), (availableStands['Hour'] >= 12) & (availableStands['Hour'] < 16),(availableStands['Hour'] >= 16) & (availableStands['Hour'] < 0)]

In [45]:
availableStands['TimeOfDay']= np.select(conditions, choices, default='closed')

In [46]:
availableStands

Unnamed: 0,StationNumber,available_bikes,available_bikes_stands,intervals,description,temp,temp_min,temp_max,humidity,Day,Hour,Stands_available,TimeOfDay
0,61,7,18,2021-03-19 15:30:00,scattered clouds,12,10,12,76,Friday,15,18,Afternoon
1,31,16,4,2021-03-19 15:30:00,scattered clouds,12,10,12,76,Friday,15,4,Afternoon
2,80,15,25,2021-03-19 15:30:00,scattered clouds,12,10,12,76,Friday,15,25,Afternoon
3,41,14,6,2021-03-19 15:30:00,scattered clouds,12,10,12,76,Friday,15,6,Afternoon
4,50,8,32,2021-03-19 15:30:00,scattered clouds,12,10,12,76,Friday,15,32,Afternoon
...,...,...,...,...,...,...,...,...,...,...,...,...,...
50003,31,6,14,2021-03-22 14:00:00,broken clouds,9,8,9,76,Monday,14,14,Afternoon
50004,33,8,15,2021-03-22 14:00:00,broken clouds,9,8,9,76,Monday,14,15,Afternoon
50005,115,12,18,2021-03-22 14:00:00,broken clouds,9,8,9,76,Monday,14,18,Afternoon
50006,52,11,21,2021-03-22 14:00:00,broken clouds,9,8,9,76,Monday,14,21,Afternoon


#### Create a new feature which states whether bikes are available

In [47]:
availableBikes['Hour']=availableBikes['intervals'].dt.hour
availableBikes['Bikes_available']= np.where(availableBikes['available_bikes']>0, availableBikes['available_bikes'], 0)


In [48]:
choices=['Morning', 'Afternoon', 'Evening']
conditions=[(availableBikes['Hour'] >= 7) & (availableBikes['Hour'] < 12), (availableBikes['Hour'] >= 12) & (availableBikes['Hour'] < 16),(availableBikes['Hour'] >= 16) & (availableBikes['Hour'] <= 23)]

In [49]:
availableBikes['TimeOfDay']= np.select(conditions, choices, default='Night')

In [65]:
availableBikes.head()

Unnamed: 0,StationNumber,available_bikes,available_bikes_stands,intervals,description,temp,temp_min,temp_max,humidity,Day,Hour,Bikes_available,TimeOfDay
0,61,7,18,2021-03-19 15:30:00,scattered clouds,12,10,12,76,Friday,15,7,Afternoon
1,31,16,4,2021-03-19 15:30:00,scattered clouds,12,10,12,76,Friday,15,16,Afternoon
2,80,15,25,2021-03-19 15:30:00,scattered clouds,12,10,12,76,Friday,15,15,Afternoon
3,41,14,6,2021-03-19 15:30:00,scattered clouds,12,10,12,76,Friday,15,14,Afternoon
4,50,8,32,2021-03-19 15:30:00,scattered clouds,12,10,12,76,Friday,15,8,Afternoon


In [52]:
FinalAvailBikes=availableBikes.drop(['intervals','Hour'], axis=1)

In [53]:
FinalAvailStands=availableStands.drop(['intervals', 'Hour'], axis=1)

In [60]:
FinalAvailBikes.head()

Unnamed: 0,StationNumber,available_bikes,available_bikes_stands,description,temp,temp_min,temp_max,humidity,Day,Bikes_available,TimeOfDay
0,61,7,18,scattered clouds,12,10,12,76,Friday,7,Afternoon
1,31,16,4,scattered clouds,12,10,12,76,Friday,16,Afternoon
2,80,15,25,scattered clouds,12,10,12,76,Friday,15,Afternoon
3,41,14,6,scattered clouds,12,10,12,76,Friday,14,Afternoon
4,50,8,32,scattered clouds,12,10,12,76,Friday,8,Afternoon


In [55]:
FinalAvailBikes.TimeOfDay.unique()

array(['Afternoon', 'Evening', 'Night', 'Morning'], dtype=object)

In [56]:
FinalAvailStands.head()

Unnamed: 0,StationNumber,available_bikes,available_bikes_stands,description,temp,temp_min,temp_max,humidity,Day,Stands_available,TimeOfDay
0,61,7,18,scattered clouds,12,10,12,76,Friday,18,Afternoon
1,31,16,4,scattered clouds,12,10,12,76,Friday,4,Afternoon
2,80,15,25,scattered clouds,12,10,12,76,Friday,25,Afternoon
3,41,14,6,scattered clouds,12,10,12,76,Friday,6,Afternoon
4,50,8,32,scattered clouds,12,10,12,76,Friday,32,Afternoon


In [57]:
FinalAvailBikes.to_csv('availbikes.csv', index=False)
FinalAvailStands.to_csv('availstands.csv', index=False)