# Building Prediction Model

## Implort to be used packeges and module

In [1]:
import pandas as pd 
import sklearn
from sqlalchemy import create_engine, exists
from sqlalchemy.orm import sessionmaker
from models.schemas import Base, CurrentWeather, StaticBike
from config.config import MySQL
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestRegressor
from sklearn import metrics
import pickle

## Load data

### Connect to database

In [54]:
try:
    db = create_engine(MySQL.URI)
    conn = db.connect()
except Exception as e:
    # Close connections
    conn.close()
    db.dispose()

### Retrieve data and create dataframe

In [4]:
df_AllStations = pd.read_sql_query("SELECT * FROM dublin_bike.bike_history AS b INNER JOIN dublin_bike.weather_history AS w ON b.scraping_time = w.datetime and b.number = w.stationNum", conn)
df_AllStations.head(5)

Unnamed: 0,scraping_time,number,last_update,address,site_names,latitude,longitude,bike_stand,available_bike_stand,available_bike,...,lon,lat,wind_spd,clouds,sunset,sunrise,pressure,humidity,code,weekday
0,2021-03-01 12:35:29,2,2021-03-01 12:27:15,Blessington Street,BLESSINGTON STREET,53.3568,-6.26814,20,19,0,...,-6.2681,53.3568,3.6,75.0,1614621749,1614621749,1031,87,701,1
1,2021-03-01 12:35:29,3,2021-03-01 12:27:13,Bolton Street,BOLTON STREET,53.3512,-6.26986,20,13,7,...,-6.2699,53.3512,3.6,75.0,1614621750,1614621750,1031,87,701,1
2,2021-03-01 12:35:29,4,,Greek Street,GREEK STREET,53.3469,-6.27298,20,0,0,...,-6.2699,53.3512,3.6,75.0,1614621750,1614621750,1031,87,701,1
3,2021-03-01 12:35:29,5,2021-03-01 12:28:02,Charlemont Street,CHARLEMONT PLACE,53.3307,-6.26018,40,16,24,...,-6.2602,53.3307,3.6,75.0,1614621749,1614621749,1031,87,701,1
4,2021-03-01 12:35:29,6,2021-03-01 12:34:02,Christchurch Place,CHRISTCHURCH PLACE,53.3434,-6.27012,20,15,5,...,-6.2701,53.3434,3.6,75.0,1614621750,1614621750,1031,87,701,1


In [5]:
df_AllStations.shape

(251424, 28)

### Change scrapping_time to datetime and create the hour and minute column

In [7]:
df_AllStations['scraping_time'] = pd.to_datetime(df_AllStations['scraping_time'])
df_AllStations['hour'] = df_AllStations['scraping_time'].dt.hour
df_AllStations['minutes'] = df_AllStations['scraping_time'].dt.minute
df_AllStations

Unnamed: 0,scraping_time,number,last_update,address,site_names,latitude,longitude,bike_stand,available_bike_stand,available_bike,...,wind_spd,clouds,sunset,sunrise,pressure,humidity,code,weekday,hour,minutes
0,2021-03-01 12:35:29,2,2021-03-01 12:27:15,Blessington Street,BLESSINGTON STREET,53.3568,-6.26814,20,19,0,...,3.60,75.0,1614621749,1614621749,1031,87,701,1,12,35
1,2021-03-01 12:35:29,3,2021-03-01 12:27:13,Bolton Street,BOLTON STREET,53.3512,-6.26986,20,13,7,...,3.60,75.0,1614621750,1614621750,1031,87,701,1,12,35
2,2021-03-01 12:35:29,4,,Greek Street,GREEK STREET,53.3469,-6.27298,20,0,0,...,3.60,75.0,1614621750,1614621750,1031,87,701,1,12,35
3,2021-03-01 12:35:29,5,2021-03-01 12:28:02,Charlemont Street,CHARLEMONT PLACE,53.3307,-6.26018,40,16,24,...,3.60,75.0,1614621749,1614621749,1031,87,701,1,12,35
4,2021-03-01 12:35:29,6,2021-03-01 12:34:02,Christchurch Place,CHRISTCHURCH PLACE,53.3434,-6.27012,20,15,5,...,3.60,75.0,1614621750,1614621750,1031,87,701,1,12,35
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
251419,2021-03-09 18:48:17,112,2021-03-09 18:37:59,North Circular Road (O'Connell's),NORTH CIRCULAR ROAD (O'CONNELL'S),53.3578,-6.25156,30,18,12,...,9.26,90.0,1615313846,1615313846,1006,76,500,2,18,48
251420,2021-03-09 18:48:17,113,2021-03-09 18:38:26,Merrion Square South,MERRION SQUARE SOUTH,53.3386,-6.24861,40,28,12,...,9.26,90.0,1615313847,1615313847,1006,76,500,2,18,48
251421,2021-03-09 18:48:17,114,2021-03-09 18:43:54,Wilton Terrace (Park),WILTON TERRACE (PARK),53.3337,-6.24834,40,32,8,...,8.23,90.0,1615313846,1615313846,1007,76,804,2,18,48
251422,2021-03-09 18:48:17,115,2021-03-09 18:47:11,Killarney Street,KILLARNEY STREET,53.3548,-6.24758,30,2,28,...,9.26,90.0,1615313846,1615313846,1006,76,500,2,18,48


### Transform week data 

In [9]:
#use number to replace weekdays
df_AllStations['weekday'] = df_AllStations['weekday'].replace(1, 'Monday')
df_AllStations['weekday'] = df_AllStations['weekday'].replace(2, 'Tuesday')
df_AllStations['weekday'] = df_AllStations['weekday'].replace(3, 'Wednesday')
df_AllStations['weekday'] = df_AllStations['weekday'].replace(4, 'Thursday')
df_AllStations['weekday'] = df_AllStations['weekday'].replace(5, 'Friday')
df_AllStations['weekday'] = df_AllStations['weekday'].replace(6, 'Saturday')
df_AllStations['weekday'] = df_AllStations['weekday'].replace(7, 'Sunday')
df_AllStations

Unnamed: 0,scraping_time,number,last_update,address,site_names,latitude,longitude,bike_stand,available_bike_stand,available_bike,...,wind_spd,clouds,sunset,sunrise,pressure,humidity,code,weekday,hour,minutes
0,2021-03-01 12:35:29,2,2021-03-01 12:27:15,Blessington Street,BLESSINGTON STREET,53.3568,-6.26814,20,19,0,...,3.60,75.0,1614621749,1614621749,1031,87,701,Monday,12,35
1,2021-03-01 12:35:29,3,2021-03-01 12:27:13,Bolton Street,BOLTON STREET,53.3512,-6.26986,20,13,7,...,3.60,75.0,1614621750,1614621750,1031,87,701,Monday,12,35
2,2021-03-01 12:35:29,4,,Greek Street,GREEK STREET,53.3469,-6.27298,20,0,0,...,3.60,75.0,1614621750,1614621750,1031,87,701,Monday,12,35
3,2021-03-01 12:35:29,5,2021-03-01 12:28:02,Charlemont Street,CHARLEMONT PLACE,53.3307,-6.26018,40,16,24,...,3.60,75.0,1614621749,1614621749,1031,87,701,Monday,12,35
4,2021-03-01 12:35:29,6,2021-03-01 12:34:02,Christchurch Place,CHRISTCHURCH PLACE,53.3434,-6.27012,20,15,5,...,3.60,75.0,1614621750,1614621750,1031,87,701,Monday,12,35
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
251419,2021-03-09 18:48:17,112,2021-03-09 18:37:59,North Circular Road (O'Connell's),NORTH CIRCULAR ROAD (O'CONNELL'S),53.3578,-6.25156,30,18,12,...,9.26,90.0,1615313846,1615313846,1006,76,500,Tuesday,18,48
251420,2021-03-09 18:48:17,113,2021-03-09 18:38:26,Merrion Square South,MERRION SQUARE SOUTH,53.3386,-6.24861,40,28,12,...,9.26,90.0,1615313847,1615313847,1006,76,500,Tuesday,18,48
251421,2021-03-09 18:48:17,114,2021-03-09 18:43:54,Wilton Terrace (Park),WILTON TERRACE (PARK),53.3337,-6.24834,40,32,8,...,8.23,90.0,1615313846,1615313846,1007,76,804,Tuesday,18,48
251422,2021-03-09 18:48:17,115,2021-03-09 18:47:11,Killarney Street,KILLARNEY STREET,53.3548,-6.24758,30,2,28,...,9.26,90.0,1615313846,1615313846,1006,76,500,Tuesday,18,48


In [12]:
df_AllStations['weekday'].unique()

array(['Monday', 'Tuesday', 'Wednesday', 'Thursday', 'Friday', 'Saturday',
       'Sunday'], dtype=object)

In [18]:
# Create a separate dataframe with days of the week (categorical)
data_input = pd.DataFrame(df_AllStations['weekday'])

# Concatenate the two dataframes in the main one
dummy = pd.get_dummies(data_input)
df_AllStations = pd.concat([df_AllStations,dummy],axis=1)
df_AllStations

Unnamed: 0,scraping_time,number,last_update,address,site_names,latitude,longitude,bike_stand,available_bike_stand,available_bike,...,weekday_Thursday,weekday_Tuesday,weekday_Wednesday,weekday_Friday,weekday_Monday,weekday_Saturday,weekday_Sunday,weekday_Thursday.1,weekday_Tuesday.1,weekday_Wednesday.1
0,2021-03-01 12:35:29,2,2021-03-01 12:27:15,Blessington Street,BLESSINGTON STREET,53.3568,-6.26814,20,19,0,...,0,0,0,0,1,0,0,0,0,0
1,2021-03-01 12:35:29,3,2021-03-01 12:27:13,Bolton Street,BOLTON STREET,53.3512,-6.26986,20,13,7,...,0,0,0,0,1,0,0,0,0,0
2,2021-03-01 12:35:29,4,,Greek Street,GREEK STREET,53.3469,-6.27298,20,0,0,...,0,0,0,0,1,0,0,0,0,0
3,2021-03-01 12:35:29,5,2021-03-01 12:28:02,Charlemont Street,CHARLEMONT PLACE,53.3307,-6.26018,40,16,24,...,0,0,0,0,1,0,0,0,0,0
4,2021-03-01 12:35:29,6,2021-03-01 12:34:02,Christchurch Place,CHRISTCHURCH PLACE,53.3434,-6.27012,20,15,5,...,0,0,0,0,1,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
251419,2021-03-09 18:48:17,112,2021-03-09 18:37:59,North Circular Road (O'Connell's),NORTH CIRCULAR ROAD (O'CONNELL'S),53.3578,-6.25156,30,18,12,...,0,1,0,0,0,0,0,0,1,0
251420,2021-03-09 18:48:17,113,2021-03-09 18:38:26,Merrion Square South,MERRION SQUARE SOUTH,53.3386,-6.24861,40,28,12,...,0,1,0,0,0,0,0,0,1,0
251421,2021-03-09 18:48:17,114,2021-03-09 18:43:54,Wilton Terrace (Park),WILTON TERRACE (PARK),53.3337,-6.24834,40,32,8,...,0,1,0,0,0,0,0,0,1,0
251422,2021-03-09 18:48:17,115,2021-03-09 18:47:11,Killarney Street,KILLARNEY STREET,53.3548,-6.24758,30,2,28,...,0,1,0,0,0,0,0,0,1,0


In [16]:
df_AllStations.columns

Index(['scraping_time', 'number', 'last_update', 'address', 'site_names',
       'latitude', 'longitude', 'bike_stand', 'available_bike_stand',
       'available_bike', 'status', 'banking', 'bonus', 'localtime',
       'stationNum', 'datetime', 'temperature', 'icon', 'lon', 'lat',
       'wind_spd', 'clouds', 'sunset', 'sunrise', 'pressure', 'humidity',
       'code', 'weekday', 'hour', 'minutes', 'weekday_Friday',
       'weekday_Monday', 'weekday_Saturday', 'weekday_Sunday',
       'weekday_Thursday', 'weekday_Tuesday', 'weekday_Wednesday',
       'weekday_Friday', 'weekday_Monday', 'weekday_Saturday',
       'weekday_Sunday', 'weekday_Thursday', 'weekday_Tuesday',
       'weekday_Wednesday'],
      dtype='object')

In [13]:
df_AllStations.dtypes

scraping_time           datetime64[ns]
number                           int64
last_update                     object
address                         object
site_names                      object
latitude                       float64
longitude                      float64
bike_stand                       int64
available_bike_stand             int64
available_bike                   int64
status                          object
banking                          int64
bonus                            int64
localtime               datetime64[ns]
stationNum                       int64
datetime                datetime64[ns]
temperature                     object
icon                            object
lon                            float64
lat                            float64
wind_spd                       float64
clouds                         float64
sunset                          object
sunrise                         object
pressure                        object
humidity                 

## Create a prediction model for available bikes

### Select input and output features

In [19]:
input_model = pd.DataFrame(df_AllStations[['latitude', 'longitude','temperature','wind_spd', 'pressure', 'humidity', 'hour']])
input_model = pd.concat([input_model,dummy], axis = 1)
output = df_AllStations['available_bike']

In [22]:
input_model

Unnamed: 0,latitude,longitude,temperature,wind_spd,pressure,humidity,hour,weekday_Friday,weekday_Monday,weekday_Saturday,weekday_Sunday,weekday_Thursday,weekday_Tuesday,weekday_Wednesday
0,53.3568,-6.26814,7.37,3.60,1031,87,12,0,1,0,0,0,0,0
1,53.3512,-6.26986,7.37,3.60,1031,87,12,0,1,0,0,0,0,0
2,53.3469,-6.27298,7.37,3.60,1031,87,12,0,1,0,0,0,0,0
3,53.3307,-6.26018,7.38,3.60,1031,87,12,0,1,0,0,0,0,0
4,53.3434,-6.27012,7.37,3.60,1031,87,12,0,1,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
251419,53.3578,-6.25156,9.06,9.26,1006,76,18,0,0,0,0,0,1,0
251420,53.3386,-6.24861,9.08,9.26,1006,76,18,0,0,0,0,0,1,0
251421,53.3337,-6.24834,9.08,8.23,1007,76,18,0,0,0,0,0,1,0
251422,53.3548,-6.24758,9.07,9.26,1006,76,18,0,0,0,0,0,1,0


In [24]:
output

0          0
1          7
2          0
3         24
4          5
          ..
251419    12
251420    12
251421     8
251422    28
251423     9
Name: available_bike, Length: 251424, dtype: int64

### Split dataset to train and test 

In [27]:
X_train,X_test,Y_train,Y_test=train_test_split(input_model,output,test_size=0.2,random_state=40)
print("Will train the model on %s rows and %s columns." % X_train.shape)
print("Will test the model on %s rows and %s columns." % X_test.shape)

Will train the model on 201139 rows and 14 columns.
Will test the model on 50285 rows and 14 columns.


### Train the model

In [28]:
# Instantiate RandomForestRegressor object (calling 10 decision tree models)
model = RandomForestRegressor(n_estimators = 10)

# Train the model
model.fit(X_train, Y_train)

RandomForestRegressor(n_estimators=10)

### Test the model

In [29]:
prediction = model.predict(X_test)

In [32]:
#make a new datafram to show the predicted available bikes
DF_Predicated = pd.DataFrame(prediction, columns=['Predicted'])

#convert all the data for testing to a new datafram
DF_Alltest = df_AllStations.iloc[Y_test]

#reset the index
DF_Bikes = pd.DataFrame(DF_Alltest['available_bike']).reset_index(drop=True)

#to get a clear comparisaon, concatenate two new datafram
actual_vs_predicted= pd.concat([DF_Bikes,DF_Predicated], axis=1)
actual_vs_predicted['difference'] = actual_vs_predicted['Predicted'] - actual_vs_predicted['available_bike']
actual_vs_predicted

Unnamed: 0,available_bike,Predicted,difference
0,0,0.0,0.0
1,15,9.0,-6.0
2,3,10.0,7.0
3,7,1.0,-6.0
4,15,7.8,-7.2
...,...,...,...
50280,12,11.0,-1.0
50281,6,12.0,6.0
50282,5,3.5,-1.5
50283,12,7.7,-4.3


### Evaluation

In [36]:
def printMetrics(testActualVal, predictions):
    #classification evaluation measures
    print("MAE (Mean Absolute Error): ", metrics.mean_absolute_error(testActualVal, predictions))
    print("MSE (Mean Squared Error): ", metrics.mean_squared_error(testActualVal, predictions))
    print("RMSE (Root Mean Squared Error): ", metrics.mean_squared_error(testActualVal, predictions)**0.5)
    print("R2: ", metrics.r2_score(testActualVal, predictions))
printMetrics(Y_test, prediction)

MAE (Mean Absolute Error):  0.32431615791162405
MSE (Mean Squared Error):  0.5973902217288007
RMSE (Root Mean Squared Error):  0.7729102287645059
R2:  0.9888207888278333


In [37]:
pickle.dump(model,open('flask_app/bike_prediction_model.pickle', 'wb'))

## Create prediction model for available stands

### Select input and output features

In [38]:
input_model = pd.DataFrame(df_AllStations[['latitude', 'longitude','temperature','wind_spd', 'pressure', 'humidity', 'hour']])
input_model = pd.concat([input_model,dummy],axis=1)
output = df_AllStations['available_bike_stand']

### Split dataset to train and test 

In [40]:
X_train,X_test,Y_train,Y_test=train_test_split(input_model,output,test_size=0.2,random_state=40)
print("Will train the model on %s rows and %s columns." % X_train.shape)
print("Will test the model on %s rows and %s columns." % X_test.shape)

Will train the model on 201139 rows and 14 columns.
Will test the model on 50285 rows and 14 columns.


### Train the model

In [41]:
# Instantiate RandomForestRegressor object calling 10 decision tree models
model = RandomForestRegressor(n_estimators=10)

# Train the model
model.fit(X_train, Y_train)

RandomForestRegressor(n_estimators=10)

In [42]:
prediction = model.predict(X_test)

In [47]:
#make a new datafram to show the predicted available bikes
DF_Predicated = pd.DataFrame(prediction, columns=['Predicted'])

#convert all the data for testing to a new datafram
DF_Alltest = df_AllStations.iloc[Y_test]

#reset the index
DF_Bikes = pd.DataFrame(DF_Alltest['available_bike_stand']).reset_index(drop=True)

#to get a clear comparisaon, concatenate two new datafram
actual_vs_predicted= pd.concat([DF_Bikes,DF_Predicated], axis=1)
actual_vs_predicted['difference'] = actual_vs_predicted['Predicted'] - actual_vs_predicted['available_bike_stand']
actual_vs_predicted

Unnamed: 0,available_bike_stand,Predicted,difference
0,21,40.0,19.0
1,25,31.0,6.0
2,19,20.0,1.0
3,20,19.0,-1.0
4,25,32.3,7.3
...,...,...,...
50280,20,19.0,-1.0
50281,4,28.0,24.0
50282,9,16.3,7.3
50283,13,14.2,1.2


### Evaluation

In [48]:
printMetrics(Y_test, prediction)

MAE (Mean Absolute Error):  0.3410153120479803
MSE (Mean Squared Error):  0.883674591286257
RMSE (Root Mean Squared Error):  0.940039675378788
R2:  0.9891759975371193


In [49]:
pickle.dump(model,open('flask_app/stand_prediction_model.pickle', 'wb'))

In [50]:
conn.close()
db.dispose()