  
# ***The task at hand: bike demand forecasting***  
 
  


 **Introduction**  

 Our goal is to predict the volume of bike rentals on an hourly basis.  
 To do that, we have some data about the season, weather, and day of the week.  
 To analyze our model in production, we will use `Evidently`. It is an open-source tool that generates interactive pre-built reports on model performance.

## Data drift dashboard in a Jupyter notebook

In [80]:
import pandas as pd
import numpy as np
import requests
import zipfile
import io

from datetime import datetime
from sklearn import datasets, ensemble

from evidently.dashboard import Dashboard
from evidently.pipeline.column_mapping import ColumnMapping
from evidently.dashboard.tabs import DataDriftTab, NumTargetDriftTab, RegressionPerformanceTab

from evidently.model_profile import Profile
from evidently.profile_sections import DataDriftProfileSection


model profiles are deprecated, use metrics instead


'import evidently.profile_sections' is deprecated, use 'import evidently.model_profile.sections'



In [25]:
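# Alternative data source (disabled): download the UCI Bike Sharing archive and read hour.csv.
# Note: this variant indexes on `dteday`, whereas the local train.csv loaded below indexes on `datetime`.
# content = requests.get("https://archive.ics.uci.edu/ml/machine-learning-databases/00275/Bike-Sharing-Dataset.zip").content
# with zipfile.ZipFile(io.BytesIO(content)) as arc:
#     raw_data = pd.read_csv(arc.open("hour.csv"), header=0, sep=',', parse_dates=['dteday'], index_col='dteday')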

* **Load Dataset**

In [26]:
# Read the local CSV, parsing the `datetime` column and using it as the row
# index so that later cells can select rows by date-range labels.
raw_data = pd.read_csv(
    './train.csv',
    header=0,
    sep=',',
    parse_dates=['datetime'],
    index_col='datetime',
)


In [27]:
# Preview the first rows of the freshly loaded data.
raw_data.head()

Unnamed: 0_level_0,season,holiday,workingday,weather,temp,atemp,humidity,windspeed,casual,registered,count
datetime,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1
2011-01-01 00:00:00,1,0,0,1,9.84,14.395,81,0.0,3,13,16
2011-01-01 01:00:00,1,0,0,1,9.02,13.635,80,0.0,8,32,40
2011-01-01 02:00:00,1,0,0,1,9.02,13.635,80,0.0,5,27,32
2011-01-01 03:00:00,1,0,0,1,9.84,14.395,75,0.0,3,10,13
2011-01-01 04:00:00,1,0,0,1,9.84,14.395,75,0.0,0,1,1


### Regression Model

**Feature engineering**

In [28]:
# Derive calendar features from the datetime index using pandas' vectorized
# accessors rather than calling a Python lambda once per row.
raw_data['month'] = raw_data.index.month
raw_data['hour'] = raw_data.index.hour
# Weekdays are numbered from Monday=0; shift by one so Monday=1 ... Sunday=7.
raw_data['weekday'] = raw_data.index.weekday + 1

In [29]:
# Preview the data again to check the new month/hour/weekday columns.
raw_data.head()

Unnamed: 0_level_0,season,holiday,workingday,weather,temp,atemp,humidity,windspeed,casual,registered,count,month,hour,weekday
datetime,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1
2011-01-01 00:00:00,1,0,0,1,9.84,14.395,81,0.0,3,13,16,1,0,6
2011-01-01 01:00:00,1,0,0,1,9.02,13.635,80,0.0,8,32,40,1,1,6
2011-01-01 02:00:00,1,0,0,1,9.02,13.635,80,0.0,5,27,32,1,2,6
2011-01-01 03:00:00,1,0,0,1,9.84,14.395,75,0.0,3,10,13,1,3,6
2011-01-01 04:00:00,1,0,0,1,9.84,14.395,75,0.0,0,1,1,1,4,6


**Model Training**

In [38]:
# Column roles shared by model training and the Evidently column mapping.
target = 'count'
prediction = 'prediction'

# Columns passed to the model and declared as numerical in the column mapping.
numerical_features = [
    'temp',
    'atemp',
    'humidity',
    'windspeed',
    'hour',
    'weekday',
]

# Columns passed to the model and declared as categorical in the column mapping.
categorical_features = [
    'season',
    'holiday',
    'workingday',
]

In [39]:
# Split by date label: `reference` is the window the model is trained on,
# `production` is the later window it is monitored on.
# Take explicit copies: both frames get a new column assigned later, and
# writing into a bare `.loc` slice of `raw_data` triggers pandas'
# chained-assignment (SettingWithCopy) warning.
reference = raw_data.loc['2011-01-01 00:00:00':'2011-01-28 23:00:00'].copy()
production = raw_data.loc['2011-01-29 00:00:00':'2011-02-28 23:00:00'].copy()

In [32]:
# Preview the first rows of the reference window.
reference.head()

Unnamed: 0_level_0,season,holiday,workingday,weather,temp,atemp,humidity,windspeed,casual,registered,count,month,hour,weekday
datetime,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1
2011-01-01 00:00:00,1,0,0,1,9.84,14.395,81,0.0,3,13,16,1,0,6
2011-01-01 01:00:00,1,0,0,1,9.02,13.635,80,0.0,8,32,40,1,1,6
2011-01-01 02:00:00,1,0,0,1,9.02,13.635,80,0.0,5,27,32,1,2,6
2011-01-01 03:00:00,1,0,0,1,9.84,14.395,75,0.0,3,10,13,1,3,6
2011-01-01 04:00:00,1,0,0,1,9.84,14.395,75,0.0,0,1,1,1,4,6


In [40]:
# Fit a 50-tree random forest on the reference window only; the fixed
# random_state keeps the run repeatable.
regressor = ensemble.RandomForestRegressor(n_estimators=50, random_state=0)
regressor.fit(
    reference[numerical_features + categorical_features],
    reference[target],
)

In [41]:
# Score both windows with the fitted model, using the same feature columns
# (and column order) that were passed to fit().
ref_prediction = regressor.predict(
    reference[numerical_features + categorical_features]
)
prod_prediction = regressor.predict(
    production[numerical_features + categorical_features]
)

In [42]:
# Store the model output next to the actuals. The column is named via the
# `prediction` constant (not a repeated string literal) so it always matches
# the name given to ColumnMapping.prediction.
reference[prediction] = ref_prediction
production[prediction] = prod_prediction

**Model Performance**

In [45]:
# Tell Evidently which columns hold the target, the model output, and the
# numerical / categorical features.
column_mapping = ColumnMapping(
    target=target,
    prediction=prediction,
    numerical_features=numerical_features,
    categorical_features=categorical_features,
)

In [46]:
# Build the regression-performance dashboard and evaluate it on the reference
# window alone (no second dataset is supplied, hence the explicit None).
regression_perfomance_dashboard = Dashboard(
    tabs=[RegressionPerformanceTab()],
)
regression_perfomance_dashboard.calculate(
    reference,
    None,
    column_mapping=column_mapping,
)

In [47]:
# Render the training-time performance dashboard inline.
regression_perfomance_dashboard.show()

In [48]:
# Write the training-time performance dashboard to an HTML file.
regression_perfomance_dashboard.save('regression_performance_at_training.html')

**Week 1**

In [49]:
# Compare the reference window against the first production window
# (2011-01-29 through 2011-02-07).
regression_perfomance_dashboard.calculate(
    reference,
    production.loc['2011-01-29 00:00:00':'2011-02-07 23:00:00'],
    column_mapping=column_mapping,
)

In [50]:
# Render the week 1 performance dashboard inline.
regression_perfomance_dashboard.show()

In [51]:
# Write the week 1 performance dashboard to an HTML file.
regression_perfomance_dashboard.save('regression_performance_after_week1.html')

**Week 2**

In [52]:
# Compare the reference window against the second production window.
# The window starts on 2011-02-08: the week 1 window already runs through
# 2011-02-07 23:00, so starting on 02-07 would count that day in both weeks.
regression_perfomance_dashboard.calculate(
    reference,
    production.loc['2011-02-08 00:00:00':'2011-02-14 23:00:00'],
    column_mapping=column_mapping,
)

In [53]:
# Render the week 2 performance dashboard inline.
regression_perfomance_dashboard.show()

In [54]:
# Write the week 2 performance dashboard to an HTML file.
regression_perfomance_dashboard.save('regression_performance_after_week2.html')

In [57]:
# Build the numerical target-drift dashboard and evaluate it on the second
# production window. The window starts on 2011-02-08: the week 1 window
# already runs through 2011-02-07 23:00, so starting on 02-07 would count
# that day in both weeks.
target_drift_dashboard = Dashboard(tabs=[NumTargetDriftTab()])
target_drift_dashboard.calculate(
    reference,
    production.loc['2011-02-08 00:00:00':'2011-02-14 23:00:00'],
    column_mapping=column_mapping,
)

In [58]:
# Render the week 2 target-drift dashboard inline.
target_drift_dashboard.show()

In [59]:
# Write the week 2 target-drift dashboard to an HTML file.
target_drift_dashboard.save('target_drift_after_week2.html')

**Week 3**

In [60]:
# Compare the reference window against the third production window
# (2011-02-15 through 2011-02-21).
regression_perfomance_dashboard.calculate(
    reference,
    production.loc['2011-02-15 00:00:00':'2011-02-21 23:00:00'],
    column_mapping=column_mapping,
)

In [61]:
# Render the week 3 performance dashboard inline.
regression_perfomance_dashboard.show()

In [62]:
# Write the week 3 performance dashboard to an HTML file.
regression_perfomance_dashboard.save('regression_performance_after_week3.html')

In [63]:
# Re-evaluate target drift on the third production window
# (2011-02-15 through 2011-02-21).
target_drift_dashboard.calculate(
    reference,
    production.loc['2011-02-15 00:00:00':'2011-02-21 23:00:00'],
    column_mapping=column_mapping,
)

In [64]:
# Render the week 3 target-drift dashboard inline.
target_drift_dashboard.show()

In [65]:
# Write the week 3 target-drift dashboard to an HTML file.
target_drift_dashboard.save('target_drift_after_week3.html')

**Data Drift**

In [74]:
# Replace the earlier mapping with a fresh one that declares only the
# numerical features; every other field keeps its ColumnMapping default.
column_mapping = ColumnMapping(numerical_features=numerical_features)


In [75]:
# Build the data-drift dashboard and compare the reference window against the
# first production window (2011-01-29 through 2011-02-07).
data_drift_dashboard = Dashboard(
    tabs=[DataDriftTab()],
)
data_drift_dashboard.calculate(
    reference,
    production.loc['2011-01-29 00:00:00':'2011-02-07 23:00:00'],
    column_mapping=column_mapping,
)

In [76]:
# Render the week 1 data-drift dashboard inline.
data_drift_dashboard.show()

In [78]:
# Write the week 1 data-drift dashboard to an HTML file.
data_drift_dashboard.save("data_drift_dashboard_after_week1.html")

**Data Drift Profile**

In [82]:
# Compute the data-drift result as a Profile (instead of a Dashboard) for the
# first production window, so it can be exported as JSON.
data_drift_profile = Profile(
    sections=[DataDriftProfileSection()],
)
data_drift_profile.calculate(
    reference,
    production.loc['2011-01-29 00:00:00':'2011-02-07 23:00:00'],
    column_mapping=column_mapping,
)

In [83]:
# Serialize the computed drift profile to a JSON string.
data_drift_profile.json()

'{"data_drift": {"name": "data_drift", "datetime": "2023-02-03 15:28:31.630242", "data": {"utility_columns": {"date": null, "id": null, "target": null, "prediction": "prediction"}, "cat_feature_names": [], "num_feature_names": ["atemp", "hour", "humidity", "temp", "weekday", "windspeed"], "datetime_feature_names": [], "target_names": null, "text_feature_names": [], "options": {"confidence": null, "drift_share": 0.5, "nbinsx": 10, "xbins": null}, "metrics": {"n_features": 7, "n_drifted_features": 4, "share_drifted_features": 0.5714285714285714, "dataset_drift": true, "prediction": {"current_small_hist": [[0.014548558237878624, 0.007435929766026852, 0.007112628471851771, 0.010668942707777658, 0.006466025883501608, 0.001939807765050484, 0.0003233012941750809, 0.0003233012941750804, 0.003233012941750804, 0.0012932051767003236], [2.2, 20.946, 39.69200000000001, 58.43800000000001, 77.18400000000001, 95.93000000000002, 114.67600000000002, 133.422, 152.168, 170.91400000000002, 189.66]], "ref_s