# Prototype
---
Predict the hourly number of bikes rented (the `cnt` column of `hour.csv`).

## Imports

### Standard library

In [3]:
from pathlib import Path

### Third-party

In [5]:
import pandas as pd
import plotly.express as px
import plotly.io as pio
import sklearn as sk
from sklearn import compose, ensemble, metrics, model_selection, pipeline, preprocessing

### Options

In [8]:
# pandas: never truncate displayed frames (no row limit, no column limit)
pd.set_option("display.max_rows", None)
pd.set_option("display.max_columns", None)

# scikit-learn: make transformers return pandas DataFrames
sk.set_config(transform_output="pandas")

# plotly: use the white template as the default for every figure
pio.templates.default = "plotly_white"

## Configs

In [6]:
# Seed value for reproducibility.
# NOTE(review): the configuration cell that follows assigns RANDOM = 42 again;
# keep the two in sync or drop one of the definitions.
RANDOM = 42

In [19]:
# --- Paths ---------------------------------------------------------------
# All locations are derived from ROOT with pathlib and exported as plain
# strings, exactly as before (consumers of DATA/CACHE/HOUR are not visible
# in this section, so their type is left unchanged).
ROOT = Path("../")
DATA = str(ROOT / "data")
CACHE = str(ROOT / ".cache")
# Built with pathlib rather than string concatenation, so the separator is
# always the platform's own and no redundant str() wrapping is needed.
HOUR = str(ROOT / "data" / "hour.csv")

# --- Reproducibility -----------------------------------------------------
# Same value as the standalone seed cell above.
RANDOM = 42

# --- Dataset columns -----------------------------------------------------
INDEX = "instant"  # column used as the DataFrame index when loading
TARGET = "cnt"  # column to predict

# --- Splitting / backtesting ---------------------------------------------
SPLITS = 4  # presumably the number of CV splits; confirm at the use site
SHUFFLE = False  # required (time sensitive)
TEST_SIZE = 24 * 30 * 2  # 1440 hourly rows, i.e. about 2 months for backtesting

# --- Model selection -----------------------------------------------------
SCORING = "neg_mean_squared_error"
# Keys use the "<step>__<param>" form; presumably the pipeline step is named
# "regressor" (verify against the pipeline definition).
PARAM_GRID = {
    "regressor__max_depth": [12, 15, 18, 21],
    "regressor__n_estimators": [150, 200, 250, 300],
}

## Dataset

In [20]:
# Load the hourly CSV, using the INDEX column as the row index
hour = pd.read_csv(filepath_or_buffer=HOUR, index_col=INDEX)
# Report (rows, columns), then preview the first five rows
print("Hour:", hour.shape)
hour.head(n=5)

Hour: (17379, 16)


Unnamed: 0_level_0,dteday,season,yr,mnth,hr,holiday,weekday,workingday,weathersit,temp,atemp,hum,windspeed,casual,registered,cnt
instant,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1
1,2011-01-01,1,0,1,0,0,6,0,1,0.24,0.2879,0.81,0.0,3,13,16
2,2011-01-01,1,0,1,1,0,6,0,1,0.22,0.2727,0.8,0.0,8,32,40
3,2011-01-01,1,0,1,2,0,6,0,1,0.22,0.2727,0.8,0.0,5,27,32
4,2011-01-01,1,0,1,3,0,6,0,1,0.24,0.2879,0.75,0.0,3,10,13
5,2011-01-01,1,0,1,4,0,6,0,1,0.24,0.2879,0.75,0.0,0,1,1


In [21]:
# Column overview: dtype, non-null count per column, and memory usage
hour.info()

<class 'pandas.core.frame.DataFrame'>
Index: 17379 entries, 1 to 17379
Data columns (total 16 columns):
 #   Column      Non-Null Count  Dtype  
---  ------      --------------  -----  
 0   dteday      17379 non-null  object 
 1   season      17379 non-null  int64  
 2   yr          17379 non-null  int64  
 3   mnth        17379 non-null  int64  
 4   hr          17379 non-null  int64  
 5   holiday     17379 non-null  int64  
 6   weekday     17379 non-null  int64  
 7   workingday  17379 non-null  int64  
 8   weathersit  17379 non-null  int64  
 9   temp        17379 non-null  float64
 10  atemp       17379 non-null  float64
 11  hum         17379 non-null  float64
 12  windspeed   17379 non-null  float64
 13  casual      17379 non-null  int64  
 14  registered  17379 non-null  int64  
 15  cnt         17379 non-null  int64  
dtypes: float64(4), int64(11), object(1)
memory usage: 2.3+ MB


In [22]:
# include="all" also summarizes the non-numeric `dteday` column
# (count / unique / top / freq) alongside the numeric statistics
hour.describe(include="all")

Unnamed: 0,dteday,season,yr,mnth,hr,holiday,weekday,workingday,weathersit,temp,atemp,hum,windspeed,casual,registered,cnt
count,17379,17379.0,17379.0,17379.0,17379.0,17379.0,17379.0,17379.0,17379.0,17379.0,17379.0,17379.0,17379.0,17379.0,17379.0,17379.0
unique,731,,,,,,,,,,,,,,,
top,2012-12-31,,,,,,,,,,,,,,,
freq,24,,,,,,,,,,,,,,,
mean,,2.50164,0.502561,6.537775,11.546752,0.02877,3.003683,0.682721,1.425283,0.496987,0.475775,0.627229,0.190098,35.676218,153.786869,189.463088
std,,1.106918,0.500008,3.438776,6.914405,0.167165,2.005771,0.465431,0.639357,0.192556,0.17185,0.19293,0.12234,49.30503,151.357286,181.387599
min,,1.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.02,0.0,0.0,0.0,0.0,0.0,1.0
25%,,2.0,0.0,4.0,6.0,0.0,1.0,0.0,1.0,0.34,0.3333,0.48,0.1045,4.0,34.0,40.0
50%,,3.0,1.0,7.0,12.0,0.0,3.0,1.0,1.0,0.5,0.4848,0.63,0.194,17.0,115.0,142.0
75%,,3.0,1.0,10.0,18.0,0.0,5.0,1.0,2.0,0.66,0.6212,0.78,0.2537,48.0,220.0,281.0


In [25]:
# NOTE(review): disabled exploratory scatter matrix. This cell contains only
# commented-out code; either restore the plot or delete the cell.
# px.scatter_matrix(
#     hour,
#     dimensions=["registered", "casual", "cnt", "mnth", "hr"],
#     color=TARGET,
#     height=800,
#     title="Analysis of top features",
# )