In [1]:
import sys
import sklearn

# Common imports
import numpy as np
import os
import pandas as pd

np.random.seed(42)

# To plot pretty figures
%matplotlib inline
import matplotlib as mpl
import matplotlib.pyplot as plt
mpl.rc('axes', labelsize=14)
mpl.rc('xtick', labelsize=12)
mpl.rc('ytick', labelsize=12)

# Where to save the figures
PROJECT_ROOT_DIR = "."
PROJECT_ID = "BikeSharing"
IMAGES_PATH = os.path.join(PROJECT_ROOT_DIR, "images", PROJECT_ID)
os.makedirs(IMAGES_PATH, exist_ok=True)

def save_fig(fig_id, tight_layout=True, fig_extension="png", resolution=300):
    """Save the current pyplot figure under ``IMAGES_PATH``.

    Parameters
    ----------
    fig_id : str
        Base file name without extension; also echoed in the progress message.
    tight_layout : bool, default True
        When true, ``plt.tight_layout()`` is called before saving.
    fig_extension : str, default "png"
        File extension; also passed to ``plt.savefig`` as ``format``.
    resolution : int, default 300
        Passed to ``plt.savefig`` as ``dpi``.
    """
    file_name = fig_id + "." + fig_extension
    print("Saving figure", fig_id)
    if tight_layout:
        plt.tight_layout()
    plt.savefig(
        os.path.join(IMAGES_PATH, file_name),
        format=fig_extension,
        dpi=resolution,
    )

In [11]:
# Default directory (relative to the working directory) holding the CSV files.
BIKE_PATH = os.path.join("datasets", "BikeSharing")


def load_bike_data(filename, bike_path=BIKE_PATH):
    """Read one CSV file from the dataset directory.

    Parameters
    ----------
    filename : str
        Name of the CSV file inside ``bike_path``.
    bike_path : str, default BIKE_PATH
        Directory that contains the file.

    Returns
    -------
    pandas.DataFrame
        The file contents as parsed by ``pd.read_csv`` with default options.
    """
    return pd.read_csv(os.path.join(bike_path, filename))

In [12]:
# Read both CSV files from the default dataset directory.
train_data = load_bike_data(filename="train.csv")
test_data = load_bike_data(filename="test.csv")

In [13]:
# DataFrame.size is the total number of cells (rows x columns), not the row count.
train_data.size, test_data.size

(130632, 58437)

In [14]:
# Preview the first rows of the training data.
train_data.head()

Unnamed: 0,datetime,season,holiday,workingday,weather,temp,atemp,humidity,windspeed,casual,registered,count
0,2011-01-01 00:00:00,1,0,0,1,9.84,14.395,81,0.0,3,13,16
1,2011-01-01 01:00:00,1,0,0,1,9.02,13.635,80,0.0,8,32,40
2,2011-01-01 02:00:00,1,0,0,1,9.02,13.635,80,0.0,5,27,32
3,2011-01-01 03:00:00,1,0,0,1,9.84,14.395,75,0.0,3,10,13
4,2011-01-01 04:00:00,1,0,0,1,9.84,14.395,75,0.0,0,1,1


In [15]:
# Column overview: every column is fully populated (no missing values) and all
# are numeric except `datetime`, which is loaded with object dtype.
train_data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 10886 entries, 0 to 10885
Data columns (total 12 columns):
 #   Column      Non-Null Count  Dtype  
---  ------      --------------  -----  
 0   datetime    10886 non-null  object 
 1   season      10886 non-null  int64  
 2   holiday     10886 non-null  int64  
 3   workingday  10886 non-null  int64  
 4   weather     10886 non-null  int64  
 5   temp        10886 non-null  float64
 6   atemp       10886 non-null  float64
 7   humidity    10886 non-null  int64  
 8   windspeed   10886 non-null  float64
 9   casual      10886 non-null  int64  
 10  registered  10886 non-null  int64  
 11  count       10886 non-null  int64  
dtypes: float64(3), int64(8), object(1)
memory usage: 1020.7+ KB


In [16]:
# Summary statistics (count, mean, std, quartiles, min/max) of the numeric columns.
train_data.describe()

Unnamed: 0,season,holiday,workingday,weather,temp,atemp,humidity,windspeed,casual,registered,count
count,10886.0,10886.0,10886.0,10886.0,10886.0,10886.0,10886.0,10886.0,10886.0,10886.0,10886.0
mean,2.506614,0.028569,0.680875,1.418427,20.23086,23.655084,61.88646,12.799395,36.021955,155.552177,191.574132
std,1.116174,0.166599,0.466159,0.633839,7.79159,8.474601,19.245033,8.164537,49.960477,151.039033,181.144454
min,1.0,0.0,0.0,1.0,0.82,0.76,0.0,0.0,0.0,0.0,1.0
25%,2.0,0.0,0.0,1.0,13.94,16.665,47.0,7.0015,4.0,36.0,42.0
50%,3.0,0.0,1.0,1.0,20.5,24.24,62.0,12.998,17.0,118.0,145.0
75%,4.0,0.0,1.0,2.0,26.24,31.06,77.0,16.9979,49.0,222.0,284.0
max,4.0,1.0,1.0,4.0,41.0,45.455,100.0,56.9969,367.0,886.0,977.0


In [37]:
# Parse the timestamp strings, then derive the hour of day with the vectorised
# `.dt` accessor instead of calling a Python lambda once per row.
train_data['datetime'] = pd.to_datetime(train_data['datetime'])
train_data['hour'] = train_data['datetime'].dt.hour

In [45]:
# Target column.
y = train_data['count']

# Feature frame: remove the target, the casual/registered counts, `temp`, and
# the raw timestamp. `drop` returns a new frame, so `train_data` is untouched.
X = train_data.drop(columns=['casual', 'registered', 'count', 'temp', 'datetime'])

# Column subsets of the feature frame.
bike_num = X.loc[:, ['atemp', 'humidity', 'windspeed', 'hour']]
bike_cat = X.loc[:, ['season', 'holiday', 'workingday', 'weather']]

In [75]:
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import FeatureUnion  # Concatenates results of multiple transformer objects.
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import OneHotEncoder
from sklearn.preprocessing import StandardScaler  # for standardisation of data, another option would be the MinMaxScaler

# Columns routed to each branch. `hour` is one-hot encoded rather than scaled.
NUM_ATTRIBS = ['atemp', 'humidity', 'windspeed']
CAT_ATTRIBS = ['season', 'holiday', 'workingday', 'weather', 'hour']

# Pipeline for numerical attributes: standardise each column.
num_pipeline = Pipeline([
    ('std_scaler', StandardScaler()),
])

# Dense one-hot encoder. The `sparse` keyword was renamed to `sparse_output` in
# scikit-learn 1.2 and removed in 1.4, so try the new spelling first and fall
# back to the old one on releases that do not know it yet.
try:
    cat_encoder = OneHotEncoder(sparse_output=False)
except TypeError:
    cat_encoder = OneHotEncoder(sparse=False)

# Pipeline for categorical attributes.
cat_pipeline = Pipeline([
    ("cat_encoder", cat_encoder),
])

# FeatureUnion passes the whole input to every branch, so each branch is
# wrapped in a ColumnTransformer that keeps only its own columns (the default
# remainder='drop' discards the rest). Without this, the scaler and the encoder
# would both be applied to every column, continuous ones included.
preprocess_pipeline = FeatureUnion(transformer_list=[
    ("num_pipeline", ColumnTransformer([("num", num_pipeline, NUM_ATTRIBS)])),
    ("cat_pipeline", ColumnTransformer([("cat", cat_pipeline, CAT_ATTRIBS)])),
])

# Single-step equivalent: ColumnTransformer does the column routing itself.
full_pipeline = ColumnTransformer([
    ("num_pipeline", num_pipeline, NUM_ATTRIBS),
    ("cat_pipeline", cat_pipeline, CAT_ATTRIBS),
])

In [76]:
# Fit each preprocessor on the full feature frame and transform it.
# Only X_prepared_2 (the ColumnTransformer output) is referenced by the later
# cells shown here; X_prepared is not used again in the visible notebook.
# NOTE(review): both are fitted before any train/test split, so rows that are
# later held out still contribute to the fitted scaler/encoder.
X_prepared=preprocess_pipeline.fit_transform(X)
X_prepared_2=full_pipeline.fit_transform(X)

In [77]:
from sklearn.model_selection import cross_val_score
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_log_error

# Random forest with library defaults.
forest_reg = RandomForestRegressor()

# 10-fold cross-validation. The scorer returns the *negated* mean squared log
# error, so flip the sign and take the square root to get one RMSLE per fold.
scores = cross_val_score(
    estimator=forest_reg,
    X=X_prepared_2,
    y=y,
    scoring='neg_mean_squared_log_error',
    cv=10,
)
forest_score = np.sqrt(np.negative(scores))

In [78]:
# One RMSLE value per cross-validation fold (square root of the negated scores).
forest_score

array([1.01504611, 0.7442686 , 0.54019532, 0.48918491, 0.68748496,
       0.74520387, 0.5682251 , 0.47666573, 0.46109112, 0.50048231])

In [81]:
from sklearn.model_selection import train_test_split

# Hold out 15% of the prepared rows; the fixed random_state keeps the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X_prepared_2,
    y,
    test_size=0.15,
    random_state=42,
)

In [82]:
from sklearn.model_selection import GridSearchCV

# Two sub-grids: 3 x 4 = 12 combinations with the default bootstrap setting,
# plus 2 x 3 = 6 combinations with bootstrapping switched off.
param_grid = [
    dict(n_estimators=[3, 10, 30], max_features=[2, 4, 6, 8]),
    dict(bootstrap=[False], n_estimators=[3, 10], max_features=[2, 3, 4]),
]

# 5-fold search scored by negated mean squared log error; training scores are
# recorded as well.
grid_search = GridSearchCV(
    estimator=forest_reg,
    param_grid=param_grid,
    scoring='neg_mean_squared_log_error',
    cv=5,
    return_train_score=True,
)
grid_search.fit(X_train, y_train)

GridSearchCV(cv=5, estimator=RandomForestRegressor(),
             param_grid=[{'max_features': [2, 4, 6, 8],
                          'n_estimators': [3, 10, 30]},
                         {'bootstrap': [False], 'max_features': [2, 3, 4],
                          'n_estimators': [3, 10]}],
             return_train_score=True, scoring='neg_mean_squared_log_error')

In [83]:
# Keep the estimator that the grid search selected as best.
final_model=grid_search.best_estimator_

In [84]:
# Predict the target for the held-out split produced by train_test_split.
y_predictions=final_model.predict(X_test)

In [85]:
# scikit-learn metrics take (y_true, y_pred), in that order.
# This is the plain mean squared log error; the cross-validation scores
# (forest_score) are its square root (RMSLE), so apply np.sqrt to this value
# before comparing the two.
mean_squared_log_error(y_test, y_predictions)

0.30806136112934746