## Count Data Modeling 

In [9]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.datasets import fetch_openml
from sklearn.model_selection import TimeSeriesSplit

## Bike Sharing Demand dataset

In [4]:
# Download version 2 of the "Bike_Sharing_Demand" dataset from OpenML.
# `as_frame=True` together with `parser="pandas"` returns pandas objects, so
# `bike_sharing.frame` is a single DataFrame holding features and target.
bike_sharing = fetch_openml(
    "Bike_Sharing_Demand", version=2, as_frame=True, parser="pandas"
)
df = bike_sharing.frame

**The target of the prediction problem is the absolute count of bike rentals on an hourly basis:**

In [55]:
# Histogram of the raw rental counts stored in the 'count' column.
# Work on the Axes returned by seaborn instead of the implicit pyplot state.
ax = sns.histplot(df['count'])
ax.set_title('Count data distribution');

<img src='./plots/Count-data-distribution.png'>


## There is a Weekly pattern

In [26]:
# Mean rental count for every (weekday, hour) pair: one point per hour of the week.
weekly_profile = df.groupby(['weekday', 'hour'])['count'].agg(['mean'])
ax = weekly_profile.plot(y=['mean'])

# One tick per day: each day spans 24 consecutive hourly points on the x axis.
day_names = ["Sun", "Mon", "Tue", "Wed", "Thu", "Fri", "Sat"]
ax.set_title("Average hourly bike demand during the week")
ax.set_xticks([24 * day for day in range(len(day_names))])
ax.set_xticklabels(day_names)
ax.set_xlabel("Time of the week")
ax.set_ylabel("Number of bike rentals");

<img src='./plots/weekly-pattern.png'>

#### The target of the prediction problem is the absolute count of bike rentals on an hourly basis:

#### Let us rescale the target variable (number of hourly bike rentals) to predict a relative demand so that the mean absolute error is more easily interpreted as a fraction of the maximum demand.

## Feature and label

In [5]:
# Raw target: absolute number of rentals per hour.
y_count = df['count']

# Relative demand: counts divided by the maximum count, so errors can be read
# as a fraction of the maximum demand.
y = y_count / y_count.max()

# Build the feature matrix as a NEW frame without the target column.
# `df` itself is left untouched, so this cell (and the earlier plotting cells
# that read df['count']) can be re-run without raising KeyError.
X = df.drop(columns='count')

X.head()

Unnamed: 0,season,year,month,hour,holiday,weekday,workingday,weather,temp,feel_temp,humidity,windspeed
0,spring,0,1,0,False,6,False,clear,9.84,14.395,0.81,0.0
1,spring,0,1,1,False,6,False,clear,9.02,13.635,0.8,0.0
2,spring,0,1,2,False,6,False,clear,9.02,13.635,0.8,0.0
3,spring,0,1,3,False,6,False,clear,9.84,14.395,0.75,0.0
4,spring,0,1,4,False,6,False,clear,9.84,14.395,0.75,0.0


In [80]:
# Summarise every categorical feature: number of distinct levels and dtype,
# followed by one line per level with its row count.
for col in X.select_dtypes(include='category'):
    feature = X[col]
    print(feature.nunique(), feature.dtype, 'in', col)
    # Plain loop instead of a list comprehension: the prints are side effects,
    # so there is no reason to build (and discard) a list of None values.
    for level, n_rows in feature.value_counts().items():
        print(f'{level!r:10s}', n_rows)


4 category in season
'fall'     4496
'summer'   4409
'spring'   4242
'winter'   4232
2 category in holiday
'False'    16879
'True'     500
2 category in workingday
'True'     11865
'False'    5514
4 category in weather
'clear'    11413
'misty'    4544
'rain'     1419
'heavy_rain' 3


#### Since there are only `3` "heavy_rain" events, we cannot use this category to train machine learning models with cross validation. Instead, we simplify the representation by collapsing those into the "rain" category.

In [6]:
# Merge the rare 'heavy_rain' level into 'rain'.
# 'weather' is a categorical column, and calling `replace` directly on a
# categorical in a way that changes its set of categories is deprecated in
# recent pandas releases. Going through object dtype and casting back gives a
# categorical whose categories are exactly the remaining levels, and the cell
# stays safe to re-run.
X['weather'] = (
    X['weather']
    .astype(object)
    .replace(to_replace='heavy_rain', value='rain')
    .astype('category')
)
X['weather'].value_counts()

clear    11413
misty     4544
rain      1422
Name: weather, dtype: int64

## Time-based cross-validation

Since the dataset is a time-ordered event log (hourly demand), we will use a time-sensitive cross-validation splitter to evaluate our demand forecasting model as realistically as possible. 

* We use a gap of 2 days between the train and test side of the splits. 
* We also limit the training set size to make the performance of the CV folds more stable.

In [11]:
# Time-ordered splitter: 5 folds, each test window is 1000 rows long.
# The data is hourly, so a gap of 48 rows leaves 2 days between the end of a
# training window and the start of its test window; training windows are
# capped at 10000 rows.
tscv = TimeSeriesSplit(
    n_splits=5,
    test_size=1000,
    max_train_size=10000,
    gap=48,
)

In [12]:
# Materialise the (train_indices, test_indices) pairs produced by the splitter.
all_splits = list(tscv.split(X, y))

# Name the index arrays of each of the five folds in a single unpacking.
(
    (train_id_0, test_id_0),
    (train_id_1, test_id_1),
    (train_id_2, test_id_2),
    (train_id_3, test_id_3),
    (train_id_4, test_id_4),
) = all_splits

## Inspect the splits 

In [13]:
# Feature rows selected for training by the first split (positional indices).
X.iloc[train_id_0]

Unnamed: 0,season,year,month,hour,holiday,weekday,workingday,weather,temp,feel_temp,humidity,windspeed
2331,summer,0,4,1,False,2,True,misty,25.42,31.060,0.50,6.0032
2332,summer,0,4,2,False,2,True,misty,24.60,31.060,0.53,8.9981
2333,summer,0,4,3,False,2,True,misty,23.78,27.275,0.56,8.9981
2334,summer,0,4,4,False,2,True,misty,22.96,26.515,0.64,8.9981
2335,summer,0,4,5,False,2,True,misty,22.14,25.760,0.68,8.9981
...,...,...,...,...,...,...,...,...,...,...,...,...
12326,summer,1,6,19,False,6,False,clear,26.24,31.060,0.36,11.0014
12327,summer,1,6,20,False,6,False,clear,25.42,31.060,0.35,19.0012
12328,summer,1,6,21,False,6,False,clear,24.60,31.060,0.40,7.0015
12329,summer,1,6,22,False,6,False,clear,23.78,27.275,0.46,8.9981


In [14]:
# Feature rows held out for testing by the first split (positional indices).
X.iloc[test_id_0]

Unnamed: 0,season,year,month,hour,holiday,weekday,workingday,weather,temp,feel_temp,humidity,windspeed
12379,summer,1,6,0,False,2,True,clear,22.14,25.760,0.68,27.9993
12380,summer,1,6,1,False,2,True,misty,21.32,25.000,0.77,22.0028
12381,summer,1,6,2,False,2,True,rain,21.32,25.000,0.72,19.9995
12382,summer,1,6,3,False,2,True,rain,20.50,24.240,0.82,12.9980
12383,summer,1,6,4,False,2,True,rain,20.50,24.240,0.82,12.9980
...,...,...,...,...,...,...,...,...,...,...,...,...
13374,fall,1,7,11,False,1,True,clear,34.44,40.150,0.53,15.0013
13375,fall,1,7,12,False,1,True,clear,34.44,39.395,0.49,8.9981
13376,fall,1,7,13,False,1,True,clear,34.44,39.395,0.49,19.0012
13377,fall,1,7,14,False,1,True,clear,36.08,40.910,0.42,7.0015


In [15]:
# Feature rows selected for training by the second split (positional indices).
X.iloc[train_id_1]

Unnamed: 0,season,year,month,hour,holiday,weekday,workingday,weather,temp,feel_temp,humidity,windspeed
3331,summer,0,5,17,False,1,True,rain,29.52,34.090,0.70,26.0027
3332,summer,0,5,18,False,1,True,clear,28.70,33.335,0.74,19.0012
3333,summer,0,5,19,False,1,True,clear,28.70,33.335,0.74,15.0013
3334,summer,0,5,20,False,1,True,clear,26.24,28.790,0.83,15.0013
3335,summer,0,5,21,False,1,True,clear,26.24,28.790,0.89,15.0013
...,...,...,...,...,...,...,...,...,...,...,...,...
13326,fall,1,7,11,False,6,False,misty,29.52,34.090,0.70,7.0015
13327,fall,1,7,12,False,6,False,misty,30.34,34.850,0.66,6.0032
13328,fall,1,7,13,False,6,False,misty,31.16,35.605,0.62,6.0032
13329,fall,1,7,14,False,6,False,clear,32.80,37.880,0.55,7.0015


In [16]:
# Feature rows held out for testing by the second split (positional indices).
X.iloc[test_id_1]

Unnamed: 0,season,year,month,hour,holiday,weekday,workingday,weather,temp,feel_temp,humidity,windspeed
13379,fall,1,7,16,False,1,True,rain,31.16,36.365,0.66,39.0007
13380,fall,1,7,17,False,1,True,rain,31.16,36.365,0.66,39.0007
13381,fall,1,7,18,False,1,True,clear,34.44,40.150,0.53,0.0000
13382,fall,1,7,19,False,1,True,clear,32.80,39.395,0.63,8.9981
13383,fall,1,7,20,False,1,True,clear,32.80,38.635,0.59,8.9981
...,...,...,...,...,...,...,...,...,...,...,...,...
14374,fall,1,8,3,False,1,True,clear,25.42,28.030,0.88,6.0032
14375,fall,1,8,4,False,1,True,clear,25.42,28.790,0.83,0.0000
14376,fall,1,8,5,False,1,True,clear,25.42,28.030,0.88,0.0000
14377,fall,1,8,6,False,1,True,clear,25.42,28.030,0.88,0.0000
