In [1]:
import pandas as pd 
import sklearn
from sqlalchemy import create_engine, exists
from sqlalchemy.orm import sessionmaker
from models.schemas import Base, CurrentWeather, StaticBike
from config.config import MySQL
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestRegressor
from sklearn import metrics
import pickle

In [2]:
try:
    db = create_engine(MySQL.URI)
    conn = db.connect()
except Exception as e:
    # Close connections
    conn.close()
    db.dispose()
    sys.exit(e)

In [3]:
# Create dataframe and store data running SQL query
df_AllStations = pd.read_sql_query("SELECT * FROM dublin_bike.bike_history AS b INNER JOIN dublin_bike.weather_history AS w ON b.scraping_time = w.datetime and b.number = w.stationNum", conn)

# Examine dataframe object, show first 10 rows
df_AllStations.head(5)

Unnamed: 0,scraping_time,number,last_update,address,site_names,latitude,longitude,bike_stand,available_bike_stand,available_bike,...,lon,lat,wind_spd,clouds,sunset,sunrise,pressure,humidity,code,weekday
0,2021-03-01 12:35:29,2,2021-03-01 12:27:15,Blessington Street,BLESSINGTON STREET,53.3568,-6.26814,20,19,0,...,-6.2681,53.3568,3.6,75.0,1614621749,1614621749,1031,87,701,1
1,2021-03-01 12:35:29,3,2021-03-01 12:27:13,Bolton Street,BOLTON STREET,53.3512,-6.26986,20,13,7,...,-6.2699,53.3512,3.6,75.0,1614621750,1614621750,1031,87,701,1
2,2021-03-01 12:35:29,4,,Greek Street,GREEK STREET,53.3469,-6.27298,20,0,0,...,-6.2699,53.3512,3.6,75.0,1614621750,1614621750,1031,87,701,1
3,2021-03-01 12:35:29,5,2021-03-01 12:28:02,Charlemont Street,CHARLEMONT PLACE,53.3307,-6.26018,40,16,24,...,-6.2602,53.3307,3.6,75.0,1614621749,1614621749,1031,87,701,1
4,2021-03-01 12:35:29,6,2021-03-01 12:34:02,Christchurch Place,CHRISTCHURCH PLACE,53.3434,-6.27012,20,15,5,...,-6.2701,53.3434,3.6,75.0,1614621750,1614621750,1031,87,701,1


In [4]:
df_AllStations.shape

(251424, 28)

In [5]:
#create column for day of the week
df_AllStations['scraping_time'] = pd.to_datetime(df_AllStations['scraping_time'])
df_AllStations['hour'] = df_AllStations['scraping_time'].dt.hour
df_AllStations['minutes'] = df_AllStations['scraping_time'].dt.minute

In [6]:
#use number to replace weekdays
df_AllStations['weekday'] = df_AllStations['weekday'].replace(1, 'Monday')
df_AllStations['weekday'] = df_AllStations['weekday'].replace(2, 'Tuesday')
df_AllStations['weekday'] = df_AllStations['weekday'].replace(3, 'Wednesday')
df_AllStations['weekday'] = df_AllStations['weekday'].replace(4, 'Thursday')
df_AllStations['weekday'] = df_AllStations['weekday'].replace(5, 'Friday')
df_AllStations['weekday'] = df_AllStations['weekday'].replace(6, 'Saturday')
df_AllStations['weekday'] = df_AllStations['weekday'].replace(7, 'Sunday')

In [7]:
df_AllStations['weekday'].unique()

array(['Monday', 'Tuesday', 'Wednesday', 'Thursday', 'Friday', 'Saturday',
       'Sunday'], dtype=object)

In [8]:
# Create a separate dataframe with days of the week (categorical)
data_input = pd.DataFrame(df_AllStations['weekday'])

# Concatenate the two dataframes in the main one
dummy = pd.get_dummies(data_input)
df_AllStations = pd.concat([df_AllStations,dummy],axis=1)

In [9]:
df_AllStations.columns

Index(['scraping_time', 'number', 'last_update', 'address', 'site_names',
       'latitude', 'longitude', 'bike_stand', 'available_bike_stand',
       'available_bike', 'status', 'banking', 'bonus', 'localtime',
       'stationNum', 'datetime', 'temperature', 'icon', 'lon', 'lat',
       'wind_spd', 'clouds', 'sunset', 'sunrise', 'pressure', 'humidity',
       'code', 'weekday', 'hour', 'minutes', 'weekday_Friday',
       'weekday_Monday', 'weekday_Saturday', 'weekday_Sunday',
       'weekday_Thursday', 'weekday_Tuesday', 'weekday_Wednesday'],
      dtype='object')

In [10]:
df_AllStations.head()

Unnamed: 0,scraping_time,number,last_update,address,site_names,latitude,longitude,bike_stand,available_bike_stand,available_bike,...,weekday,hour,minutes,weekday_Friday,weekday_Monday,weekday_Saturday,weekday_Sunday,weekday_Thursday,weekday_Tuesday,weekday_Wednesday
0,2021-03-01 12:35:29,2,2021-03-01 12:27:15,Blessington Street,BLESSINGTON STREET,53.3568,-6.26814,20,19,0,...,Monday,12,35,0,1,0,0,0,0,0
1,2021-03-01 12:35:29,3,2021-03-01 12:27:13,Bolton Street,BOLTON STREET,53.3512,-6.26986,20,13,7,...,Monday,12,35,0,1,0,0,0,0,0
2,2021-03-01 12:35:29,4,,Greek Street,GREEK STREET,53.3469,-6.27298,20,0,0,...,Monday,12,35,0,1,0,0,0,0,0
3,2021-03-01 12:35:29,5,2021-03-01 12:28:02,Charlemont Street,CHARLEMONT PLACE,53.3307,-6.26018,40,16,24,...,Monday,12,35,0,1,0,0,0,0,0
4,2021-03-01 12:35:29,6,2021-03-01 12:34:02,Christchurch Place,CHRISTCHURCH PLACE,53.3434,-6.27012,20,15,5,...,Monday,12,35,0,1,0,0,0,0,0


In [11]:
df_AllStations.dtypes

scraping_time           datetime64[ns]
number                           int64
last_update                     object
address                         object
site_names                      object
latitude                       float64
longitude                      float64
bike_stand                       int64
available_bike_stand             int64
available_bike                   int64
status                          object
banking                          int64
bonus                            int64
localtime               datetime64[ns]
stationNum                       int64
datetime                datetime64[ns]
temperature                     object
icon                            object
lon                            float64
lat                            float64
wind_spd                       float64
clouds                         float64
sunset                          object
sunrise                         object
pressure                        object
humidity                 

# Create Prediction Model for Available Bikes

In [13]:
# Select model features and store them in a new dataframe
input_model = pd.DataFrame(df_AllStations[['latitude', 'longitude','temperature','wind_spd', 'pressure', 'humidity', 'hour']])
input_model = pd.concat([input_model,dummy],axis=1)

# Define target variable
output = df_AllStations['available_bike']

In [14]:
# Split dataset to train and test
X_train,X_test,Y_train,Y_test=train_test_split(input_model,output,test_size=0.2,random_state=40)
print("Training the model on %s rows and %s columns." % X_train.shape)

Training the model on 201139 rows and 14 columns.


In [15]:
# Instantiate RandomForestRegressor object calling 10 decision tree models
model = RandomForestRegressor(n_estimators=10)

# Train the model
model.fit(X_train, Y_train)

print("Testing the model on %s rows." % Y_test.shape[0])

Testing the model on 50285 rows.


In [16]:
prediction = model.predict(X_test)

In [17]:
#make a new datafram to show the predicted available bikes
DF_Predicated = pd.DataFrame(prediction, columns=['Predicted'])

#convert all the data for testing to a new datafram
DF_Alltest = df_AllStations.iloc[Y_test]

#reset the index
DF_Bikes = pd.DataFrame(DF_Alltest['available_bike']).reset_index(drop=True)

#to get a clear comparisaon, concatenate two new datafram
actual_vs_predicted= pd.concat([DF_Bikes,DF_Predicated], axis=1)
actual_vs_predicted['difference'] = actual_vs_predicted['Predicted'] - actual_vs_predicted['available_bike']
actual_vs_predicted

Unnamed: 0,available_bike,Predicted,difference
0,0,0.0,0.0
1,15,8.8,-6.2
2,3,10.0,7.0
3,7,1.0,-6.0
4,15,8.4,-6.6
...,...,...,...
50280,12,11.0,-1.0
50281,6,12.0,6.0
50282,5,3.5,-1.5
50283,12,8.0,-4.0


In [18]:
def printMetrics(testActualVal, predictions):
    #classification evaluation measures
    print('Error Evaluation')
    print('==============================================================================')
    print("MAE (Mean Absolute Error): ", metrics.mean_absolute_error(testActualVal, predictions))
    print("MSE (Mean Squared Error): ", metrics.mean_squared_error(testActualVal, predictions))
    print("RMSE (Root Mean Squared Error): ", metrics.mean_squared_error(testActualVal, predictions)**0.5)
    print("R2: ", metrics.r2_score(testActualVal, predictions))

In [19]:
printMetrics(Y_test, prediction)

Error Evaluation
MAE (Mean Absolute Error):  0.3256346513972107
MSE (Mean Squared Error):  0.5885623286010383
RMSE (Root Mean Squared Error):  0.7671781596220256
R2:  0.9889859888560409


In [20]:
pickle.dump(model,open('flask_app/bike_prediction_model.pickle', 'wb'))

# Create Prediction Model for Available Stands

In [34]:
# Select model features and store them in a new dataframe
input_model = pd.DataFrame(df_AllStations[['latitude', 'longitude','temperature','wind_spd', 'pressure', 'humidity', 'hour']])
input_model = pd.concat([input_model,dummy],axis=1)

# Define target variable
output = df_AllStations['available_bike']

In [35]:
# Split dataset to train and test
X_train,X_test,Y_train,Y_test=train_test_split(input_model,output,test_size=0.2,random_state=40)
print("Training the model on %s rows and %s columns." % X_train.shape)

Training the model on 201139 rows and 14 columns.


In [36]:
# Instantiate RandomForestRegressor object calling 10 decision tree models
model = RandomForestRegressor(n_estimators=10)

# Train the model
model.fit(X_train, Y_train)

print("Testing the model on %s rows." % Y_test.shape[0])

Testing the model on 50285 rows.


In [37]:
prediction = model.predict(X_test)

In [38]:
#make a new datafram to show the predicted available bikes
DF_Predicated = pd.DataFrame(prediction, columns=['Predicted'])

#convert all the data for testing to a new datafram
DF_Alltest = df_AllStations.iloc[Y_test]

#reset the index
DF_Bikes = pd.DataFrame(DF_Alltest['available_bike']).reset_index(drop=True)

#to get a clear comparisaon, concatenate two new datafram
actual_vs_predicted= pd.concat([DF_Bikes,DF_Predicated], axis=1)
actual_vs_predicted['difference'] = actual_vs_predicted['Predicted'] - actual_vs_predicted['available_bike']
actual_vs_predicted

Unnamed: 0,available_bike,Predicted,difference
0,0,0.0,0.0
1,15,9.0,-6.0
2,3,10.0,7.0
3,7,1.0,-6.0
4,15,8.4,-6.6
...,...,...,...
50280,12,11.0,-1.0
50281,6,12.0,6.0
50282,5,4.0,-1.0
50283,12,8.0,-4.0


In [39]:
printMetrics(Y_test, prediction)

Error Evaluation
MAE (Mean Absolute Error):  0.3238532553986137
MSE (Mean Squared Error):  0.5885249579277025
RMSE (Root Mean Squared Error):  0.7671538033065485
R2:  0.9889866881889622


In [40]:
pickle.dump(model,open('flask_app/stand_prediction_model.pickle', 'wb'))

In [41]:
conn.close()
db.dispose()

In [42]:
model_load = pickle.load(open('flask_app/stand_prediction_model.pickle', "rb"))
prediction_load = model.predict(X_test)
prediction_load

array([ 0. ,  9. , 10. , ...,  4. ,  8. ,  2.3])

In [43]:
X_test.dtypes

latitude             float64
longitude            float64
temperature           object
wind_spd             float64
pressure              object
humidity              object
hour                   int64
weekday_Friday         uint8
weekday_Monday         uint8
weekday_Saturday       uint8
weekday_Sunday         uint8
weekday_Thursday       uint8
weekday_Tuesday        uint8
weekday_Wednesday      uint8
dtype: object

In [44]:
X_test

Unnamed: 0,latitude,longitude,temperature,wind_spd,pressure,humidity,hour,weekday_Friday,weekday_Monday,weekday_Saturday,weekday_Sunday,weekday_Thursday,weekday_Tuesday,weekday_Wednesday
26328,53.3466,-6.29692,1.29,1.94,1029,100,9,0,0,0,0,0,1,0
139957,53.3589,-6.28034,0.42,1.03,1034,86,2,0,0,1,0,0,0,0
106827,53.3341,-6.26544,5.35,5.66,1030,81,0,1,0,0,0,0,0,0
172391,53.3413,-6.25812,0.88,2.57,1029,75,4,0,0,0,1,0,0,0
181199,53.3478,-6.29243,7.16,2.57,1028,61,11,0,0,0,1,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
165693,53.3394,-6.24655,0.78,2.06,1031,75,23,0,0,1,0,0,0,0
168111,53.3378,-6.26770,0.51,2.06,1030,75,1,0,0,0,1,0,0,0
97006,53.3398,-6.25199,5.19,5.66,1028,75,17,0,0,0,0,1,0,0
128917,53.3416,-6.29719,2.57,2.06,1034,65,18,1,0,0,0,0,0,0


In [54]:
data = {'latitude':[93],
        'longitude':[93],
        'temperature':[1.29],
        'wind_spd':[1.94],
        'pressure':[9],
        'humidity':[9],
        'hour':[9],
        'weekday_Friday':[1],
        'weekday_Monday':[0],
        'weekday_Saturday':[0],
        'weekday_Sunday':[0],
        'weekday_Thursday':[0],
        'weekday_Tuesday':[0],
        'weekday_Wednesday':[0]        
       } 
data
df = pd.DataFrame(data)
df
prediction_load = model.predict(df)
prediction_load

array([21.3])