In [2]:
import warnings
warnings.filterwarnings("ignore")

In [4]:

import pandas as pd
import numpy as np
import seaborn as sns
from matplotlib import pyplot as plt
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import OneHotEncoder
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error
from sklearn.metrics import mean_absolute_error

In [2]:
def extract_from_date(df):
    
    # extract year
    df['year'] = df['start_date'].dt.year
    # extract month
    df['month'] = df['start_date'].dt.month
    # extract day
    df['day'] = df['start_date'].dt.day
    # extract day_of_week (0 monday - 6 sunday)
    df['day_of_week']=df['start_date'].dt.dayofweek
    # extract hour
    df['hour']= df['start_date'].dt.hour
    
    # extract workday boolean
    df['day_type'] = np.where(df['day_of_week']<5, 'workday', 'weekend')
    
    # extract season
    conditions_weather = [
        df['month'].isin([1,2,12]),
        df['month'].isin([3,4,5]),
        df['month'].isin([6,7,8]),
        df['month'].isin([9,10,11])
    ]
    choices = ['winter', 'spring', 'summer', 'fall']
    df['season'] = np.select(conditions_weather, choices)
    
    # extract day period
    conditions_period = [
        df['hour'].isin([0,1,2,3,4,5]),
        df['hour'].isin([6,7,8,9,10,11]),
        df['hour'].isin([12,13,14,15,16,17]),
        df['hour'].isin([18,19,20,21,22,23])
    ]
    choices = ['night', 'morning', 'afternoon', 'evening']
    df['day_period'] = np.select(conditions_period, choices)
    
    return df[df['year'] != 2015]

In [3]:
london_df = pd.read_csv('weather_bike_usage.csv')

In [4]:
london_df['start_date'] = pd.to_datetime(london_df['start_date'], format ="%d/%m/%Y %H:%M")

In [5]:
london_processed = extract_from_date(london_df)

In [6]:
london_processed

Unnamed: 0,start_date,StartStation Name,n_bike_rented,temp,feels_like,temp_min,temp_max,humidity,wind_speed,weather_main,year,month,day,day_of_week,hour,day_type,season,day_period
0,2013-01-01 00:00:00,Aldgate,2,9.15,5.60,8.10,10.0,87,4.1,Clouds,2013,1,1,1,0,workday,winter,night
1,2013-01-01 00:00:00,Angel,1,9.15,5.60,8.10,10.0,87,4.1,Clouds,2013,1,1,1,0,workday,winter,night
2,2013-01-01 00:00:00,Bangla Town,2,9.15,5.60,8.10,10.0,87,4.1,Clouds,2013,1,1,1,0,workday,winter,night
3,2013-01-01 00:00:00,Bank,10,9.15,5.60,8.10,10.0,87,4.1,Clouds,2013,1,1,1,0,workday,winter,night
4,2013-01-01 00:00:00,Bankside,17,9.15,5.60,8.10,10.0,87,4.1,Clouds,2013,1,1,1,0,workday,winter,night
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2171527,2014-12-31 23:00:00,Weavers,3,5.16,0.47,4.65,6.0,75,4.1,Clear,2014,12,31,2,23,workday,winter,evening
2171528,2014-12-31 23:00:00,West Chelsea,6,5.16,0.47,4.65,6.0,75,4.1,Clear,2014,12,31,2,23,workday,winter,evening
2171529,2014-12-31 23:00:00,West End,10,5.16,0.47,4.65,6.0,75,4.1,Clear,2014,12,31,2,23,workday,winter,evening
2171530,2014-12-31 23:00:00,Westminster,23,5.16,0.47,4.65,6.0,75,4.1,Clear,2014,12,31,2,23,workday,winter,evening


# One hot encoding

In [7]:
l_object = london_processed[["StartStation Name", "weather_main", "season", "day_type", "day_period","day_of_week"]]

In [8]:
l_object

Unnamed: 0,StartStation Name,weather_main,season,day_type,day_period,day_of_week
0,Aldgate,Clouds,winter,workday,night,1
1,Angel,Clouds,winter,workday,night,1
2,Bangla Town,Clouds,winter,workday,night,1
3,Bank,Clouds,winter,workday,night,1
4,Bankside,Clouds,winter,workday,night,1
...,...,...,...,...,...,...
2171527,Weavers,Clear,winter,workday,evening,2
2171528,West Chelsea,Clear,winter,workday,evening,2
2171529,West End,Clear,winter,workday,evening,2
2171530,Westminster,Clear,winter,workday,evening,2


In [10]:
ohe = OneHotEncoder()
ohe.fit(l_object)

OneHotEncoder(categories='auto', drop=None, dtype=<class 'numpy.float64'>,
              handle_unknown='error', sparse=True)

In [11]:
codes = ohe.transform(l_object).toarray()
feature_names = ohe.get_feature_names(["StartStation Name", "weather_main", "season", "day_type", "day_period","day_of_week"])

In [30]:
london_processed = pd.concat([london_processed[['start_date', 'n_bike_rented', 'temp', 'feels_like', 'temp_min', 'temp_max', 'humidity', 'wind_speed', 'year', 'month', 'day', 'hour']].reset_index(drop=True), 
               pd.DataFrame(codes,columns=feature_names).reset_index(drop=True)], axis=1)

In [7]:
london_processed = pd.get_dummies(#london_processed, columns = ["StartStation Name", "weather_main", "season", "day_type", "day_period","day_of_week"])

# Handle cyclic features

In [32]:
london_processed['hour_sin'] = np.sin(2 * np.pi * london_processed['hour'] / 23)
london_processed['hour_cos'] = np.cos(2 * np.pi * london_processed['hour'] / 23)

london_processed['month_sin'] = np.sin(2 * np.pi * london_processed['month'] / 12)
london_processed['month_cos'] = np.cos(2 * np.pi * london_processed['month'] / 12)

# Remove columns

In [33]:
london_processed = london_processed.drop(columns=['start_date', 'year', 'month', 'day', 'hour'])

# Scaling values

In [35]:
scaler = StandardScaler()

In [36]:
scaler.fit(london_processed[["feels_like", "humidity", "temp", "temp_max", "temp_min", "wind_speed"]])

StandardScaler(copy=True, with_mean=True, with_std=True)

In [37]:
london_processed[["feels_like", "humidity", "temp", "temp_max", "temp_min", "wind_speed"]] = scaler.transform(london_processed[["feels_like", "humidity", "temp", "temp_max", "temp_min", "wind_speed"]])

In [38]:
london_processed

Unnamed: 0,n_bike_rented,temp,feels_like,temp_min,temp_max,humidity,wind_speed,StartStation Name_Aldgate,StartStation Name_Angel,StartStation Name_Avondale,...,day_of_week_1,day_of_week_2,day_of_week_3,day_of_week_4,day_of_week_5,day_of_week_6,hour_sin,hour_cos,month_sin,month_cos
0,2,-0.454163,-0.392585,-0.379681,-0.530098,0.952362,-0.03999,1.0,0.0,0.0,...,1.0,0.0,0.0,0.0,0.0,0.0,0.000000e+00,1.0,5.000000e-01,0.866025
1,1,-0.454163,-0.392585,-0.379681,-0.530098,0.952362,-0.03999,0.0,1.0,0.0,...,1.0,0.0,0.0,0.0,0.0,0.0,0.000000e+00,1.0,5.000000e-01,0.866025
2,2,-0.454163,-0.392585,-0.379681,-0.530098,0.952362,-0.03999,0.0,0.0,0.0,...,1.0,0.0,0.0,0.0,0.0,0.0,0.000000e+00,1.0,5.000000e-01,0.866025
3,10,-0.454163,-0.392585,-0.379681,-0.530098,0.952362,-0.03999,0.0,0.0,0.0,...,1.0,0.0,0.0,0.0,0.0,0.0,0.000000e+00,1.0,5.000000e-01,0.866025
4,17,-0.454163,-0.392585,-0.379681,-0.530098,0.952362,-0.03999,0.0,0.0,0.0,...,1.0,0.0,0.0,0.0,0.0,0.0,0.000000e+00,1.0,5.000000e-01,0.866025
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2166121,3,-1.123005,-1.129050,-0.974321,-1.171250,0.152548,-0.03999,0.0,0.0,0.0,...,0.0,1.0,0.0,0.0,0.0,0.0,-2.449294e-16,1.0,-2.449294e-16,1.000000
2166122,6,-1.123005,-1.129050,-0.974321,-1.171250,0.152548,-0.03999,0.0,0.0,0.0,...,0.0,1.0,0.0,0.0,0.0,0.0,-2.449294e-16,1.0,-2.449294e-16,1.000000
2166123,10,-1.123005,-1.129050,-0.974321,-1.171250,0.152548,-0.03999,0.0,0.0,0.0,...,0.0,1.0,0.0,0.0,0.0,0.0,-2.449294e-16,1.0,-2.449294e-16,1.000000
2166124,23,-1.123005,-1.129050,-0.974321,-1.171250,0.152548,-0.03999,0.0,0.0,0.0,...,0.0,1.0,0.0,0.0,0.0,0.0,-2.449294e-16,1.0,-2.449294e-16,1.000000


# Regression

In [68]:
used_portion = london_processed.sample(n=400000)

y = used_portion['n_bike_rented']
X = used_portion.iloc[:, 1:]

In [39]:
y = london_processed['n_bike_rented']
X = london_processed.iloc[:, 1:]

In [40]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.3)

In [41]:
from tensorflow import keras
import tensorflow as tf
from keras.models import Sequential
from keras.layers import Dense
from keras.wrappers.scikit_learn import KerasRegressor

Using TensorFlow backend.


In [42]:
def build_and_compile_model():
    model = keras.Sequential([
        keras.layers.Dense(64, input_dim = 175, activation='relu'),
        keras.layers.Dense(32, activation='relu'),
        keras.layers.Dense(1)
    ])

    model.compile(loss='mean_absolute_error',
                optimizer=tf.keras.optimizers.Adam(0.001))
    return model

In [43]:
dnn_model = build_and_compile_model()

In [44]:
history = dnn_model.fit(
    X_train, y_train,
    validation_split=0.2,
    verbose=True, epochs=20)

Epoch 1/20
Epoch 2/20
Epoch 3/20
Epoch 4/20
Epoch 5/20
Epoch 6/20
Epoch 7/20
Epoch 8/20
Epoch 9/20
Epoch 10/20
Epoch 11/20
Epoch 12/20
Epoch 13/20
Epoch 14/20
Epoch 15/20
Epoch 16/20
Epoch 17/20
Epoch 18/20
Epoch 19/20
Epoch 20/20


In [45]:
test_result = dnn_model.evaluate(X_test, y_test, verbose=1)



In [46]:
test_result

4.2863688468933105

In [47]:
test_predictions = dnn_model.predict(X_test).flatten()

In [48]:
test_predictions = [int(elem) for elem in test_predictions]

In [49]:
y_true = list(y_test)

In [50]:
rmse = mean_squared_error(y_true, test_predictions, squared=False)
print("RMSE", rmse)
mae = mean_absolute_error(y_true, test_predictions)
print("MAE", mae)

RMSE 8.66249744550016
MAE 4.3045928369839865


In [None]:
#for i in range(len(y_true)):
#    print("True value: {0}\tPredicted:{1}".format(y_true[i], test_predictions[i]))

In [52]:
dnn_model.save('dnn_regr')

INFO:tensorflow:Assets written to: dnn_regr/assets


In [53]:
import pickle
with open('encoder.pickle', 'wb') as f:
    pickle.dump(ohe, f)

with open('scaler.pickle', 'wb') as f:
    pickle.dump(scaler, f)