## Prediction of bike rental count hourly

In [1]:
import pandas as pd

df = pd.read_csv('hour.csv')
df.describe()

Unnamed: 0,instant,season,yr,mnth,hr,holiday,weekday,workingday,weathersit,temp,atemp,hum,windspeed,casual,registered,cnt
count,17379.0,17379.0,17379.0,17379.0,17379.0,17379.0,17379.0,17379.0,17379.0,17379.0,17379.0,17379.0,17379.0,17379.0,17379.0,17379.0
mean,8690.0,2.50164,0.502561,6.537775,11.546752,0.02877,3.003683,0.682721,1.425283,0.496987,0.475775,0.627229,0.190098,35.676218,153.786869,189.463088
std,5017.0295,1.106918,0.500008,3.438776,6.914405,0.167165,2.005771,0.465431,0.639357,0.192556,0.17185,0.19293,0.12234,49.30503,151.357286,181.387599
min,1.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.02,0.0,0.0,0.0,0.0,0.0,1.0
25%,4345.5,2.0,0.0,4.0,6.0,0.0,1.0,0.0,1.0,0.34,0.3333,0.48,0.1045,4.0,34.0,40.0
50%,8690.0,3.0,1.0,7.0,12.0,0.0,3.0,1.0,1.0,0.5,0.4848,0.63,0.194,17.0,115.0,142.0
75%,13034.5,3.0,1.0,10.0,18.0,0.0,5.0,1.0,2.0,0.66,0.6212,0.78,0.2537,48.0,220.0,281.0
max,17379.0,4.0,1.0,12.0,23.0,1.0,6.0,1.0,4.0,1.0,1.0,1.0,0.8507,367.0,886.0,977.0


- instant: record index
- dteday : date
- season : season (1:springer, 2:summer, 3:fall, 4:winter)
- yr : year (0: 2011, 1:2012)
- mnth : month ( 1 to 12)
- hr : hour (0 to 23)
- holiday : weather day is holiday or not (extracted from http://dchr.dc.gov/page/holiday-schedule)
- weekday : day of the week
- workingday : if day is neither weekend nor holiday is 1, otherwise is 0.
+ weathersit :
        - 1: Clear, Few clouds, Partly cloudy, Partly cloudy
        - 2: Mist + Cloudy, Mist + Broken clouds, Mist + Few clouds, Mist
        - 3: Light Snow, Light Rain + Thunderstorm + Scattered clouds, Light Rain + Scattered clouds
        - 4: Heavy Rain + Ice Pallets + Thunderstorm + Mist, Snow + Fog
- temp : Normalized temperature in Celsius. The values are divided to 41 (max)
- atemp: Normalized feeling temperature in Celsius. The values are divided to 50 (max)
- hum: Normalized humidity. The values are divided to 100 (max)
- windspeed: Normalized wind speed. The values are divided to 67 (max)
- casual: count of casual users
- registered: count of registered users
- cnt: count of total rental bikes including both casual and registered

Predication of bike rental count daily based on the **environmental** and **seasonal** settings.

In [2]:
y = df['cnt'].values

In [3]:
X = df.loc[:, 'season':'windspeed'].values

In [4]:
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.33, random_state=42)

For using regression algorithms we are going to scale data into 0, 1 segment.

In [5]:
from sklearn.preprocessing import MinMaxScaler

sc = MinMaxScaler()
sc.fit(X_train)
X_test = sc.transform(X_test)
X_train = sc.transform(X_train)

## LinearRegression

In [6]:
from sklearn.linear_model import LinearRegression


lr = LinearRegression()
lr.fit(X_train, y_train)

LinearRegression(copy_X=True, fit_intercept=True, n_jobs=None, normalize=False)

In [7]:
lr.coef_

array([  65.83022086,   83.44855049,   -2.49676759,  173.83028061,
        -25.1769233 ,    9.17011476,    4.33791961,   -8.47641847,
         85.90236835,  211.50245712, -201.86933871,   31.50171543])

In [8]:
print(f'Training accuracy {lr.score(X_train, y_train)}')
print(f'Test accuracy {lr.score(X_test, y_test)}')

Training accuracy 0.38714933403807383
Test accuracy 0.3913916312859003


In [9]:
from sklearn.metrics import mean_squared_error

mean_squared_error(lr.predict(X_test), y_test)

19415.489630642347

## SGDRegressor
Here we use stochastic gradient descent with L1 regularization to remove redundant features.

In [44]:
from sklearn.linear_model import SGDRegressor

lrsgd = SGDRegressor(alpha=0.01, max_iter=10000, penalty='l2', loss='squared_loss')
lrsgd.fit(X_train, y_train)

SGDRegressor(alpha=0.01, average=False, early_stopping=False, epsilon=0.1,
             eta0=0.01, fit_intercept=True, l1_ratio=0.15,
             learning_rate='invscaling', loss='squared_loss', max_iter=10000,
             n_iter_no_change=5, penalty='l2', power_t=0.25, random_state=None,
             shuffle=True, tol=0.001, validation_fraction=0.1, verbose=0,
             warm_start=False)

In [45]:
lrsgd.coef_

array([  58.21242577,   81.0013939 ,    3.04277531,  165.21525476,
        -18.6809341 ,    9.10712142,    5.1438981 ,  -24.37290059,
        135.02718779,  120.86680474, -156.00479856,   31.12892423])

In [46]:
print(f'Training accuracy {lrsgd.score(X_train, y_train)}')
print(f'Test accuracy {lrsgd.score(X_test, y_test)}')

Training accuracy 0.38244775290341243
Test accuracy 0.38693627272837083


In [43]:
from sklearn.metrics import mean_squared_error

mean_squared_error(lrsgd.predict(X_test), y_test)

19404.558130285845

## MLPRegressor

In [14]:
from sklearn.neural_network import MLPRegressor

mlp = MLPRegressor(hidden_layer_sizes=(200,), max_iter=20000)
mlp.fit(X_train, y_train)

MLPRegressor(activation='relu', alpha=0.0001, batch_size='auto', beta_1=0.9,
             beta_2=0.999, early_stopping=False, epsilon=1e-08,
             hidden_layer_sizes=(200,), learning_rate='constant',
             learning_rate_init=0.001, max_fun=15000, max_iter=20000,
             momentum=0.9, n_iter_no_change=10, nesterovs_momentum=True,
             power_t=0.5, random_state=None, shuffle=True, solver='adam',
             tol=0.0001, validation_fraction=0.1, verbose=False,
             warm_start=False)

In [15]:
print(f'Training accuracy {mlp.score(X_train, y_train)}')
print(f'Test accuracy {mlp.score(X_test, y_test)}')

Training accuracy 0.6504687696590317
Test accuracy 0.6499943388840215


In [16]:
from sklearn.metrics import mean_squared_error

mean_squared_error(mlp.predict(X_test), y_test)

11165.68820507901