## Improving Linear Models

#### linear regression does not automatically model non-monotonic relationships between the input features and the target. 
#### Non-linear terms have to be engineered in the input.

In [29]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.preprocessing import OneHotEncoder, OrdinalEncoder, MinMaxScaler, SplineTransformer

from sklearn.compose import ColumnTransformer
from sklearn.model_selection import TimeSeriesSplit, cross_validate, cross_val_score
from sklearn.pipeline import make_pipeline

from sklearn.metrics import mean_absolute_error, mean_squared_error, mean_poisson_deviance

from sklearn.linear_model import Ridge, PoissonRegressor, RidgeCV
from sklearn.ensemble import RandomForestRegressor, HistGradientBoostingRegressor

from feature_engine.creation import CyclicalFeatures

## Bike Sharing Demand dataset

In [3]:
from sklearn.datasets import fetch_openml

bike_sharing = fetch_openml(
    "Bike_Sharing_Demand", version=2, as_frame=True, parser="pandas"
)
df = bike_sharing.frame

#### The target of the prediction problem is the absolute count of bike rentals on a hourly basis:

#### Let us rescale the target variable (number of hourly bike rentals) to predict a relative demand so that the mean absolute error is more easily interpreted as a fraction of the maximum demand.

## Feature and label

In [4]:
y = df['count']/df['count'].max()

y_count = df.pop('count')

X = df

X.head()

Unnamed: 0,season,year,month,hour,holiday,weekday,workingday,weather,temp,feel_temp,humidity,windspeed
0,spring,0,1,0,False,6,False,clear,9.84,14.395,0.81,0.0
1,spring,0,1,1,False,6,False,clear,9.02,13.635,0.8,0.0
2,spring,0,1,2,False,6,False,clear,9.02,13.635,0.8,0.0
3,spring,0,1,3,False,6,False,clear,9.84,14.395,0.75,0.0
4,spring,0,1,4,False,6,False,clear,9.84,14.395,0.75,0.0


In [5]:
X['weather'].value_counts()

clear         11413
misty          4544
rain           1419
heavy_rain        3
Name: weather, dtype: int64

In [6]:
# "heavy_rain" cateory is appearing only 3 times in our data, so lets add that to "rain" category
X['weather'] = X['weather'].replace(to_replace='heavy_rain', value='rain')
X['weather'].value_counts()

clear    11413
misty     4544
rain      1422
Name: weather, dtype: int64

## Time-based cross-validation

Since the dataset is a time-ordered event log (hourly demand), we will use a time-sensitive cross-validation splitter to evaluate our demand forecasting model as realistically as possible. 

* We use a gap of 2 days between the train and test side of the splits. 
* We also limit the training set size to make the performance of the CV folds more stable.

In [7]:
tscv = TimeSeriesSplit(
    n_splits=5,
    gap=48, #2day gap
    max_train_size=10000,
    test_size=1000
)

### Model evaluation

In [8]:
def custom_scoring(est, x, y):
    y_pred = est.predict(x)
    mask = y_pred>0
    mae = mean_absolute_error(y[mask], y_pred[mask])
    mse = mean_squared_error(y[mask], y_pred[mask])
    mpd = mean_poisson_deviance(y[mask], y_pred[mask])
    return {'mean_absolute_error': mae, 'mean_squared_error':mse, 'mean_poisson_deviance':mpd}
    


def evaluate_pipeline(pipe, X, y, cv, ):

    score = cross_validate(pipe, X, y, cv=cv, scoring=custom_scoring)
    
    mae = np.mean(score['test_mean_absolute_error'])
    mse = np.mean(score['test_mean_squared_error'])
    mpd = np.mean(score['test_mean_poisson_deviance'])
    
    result = f'Mean absolute error : {mae}\nMean squared error : {mse}\nMean poisson deviance : {mpd}'

    print(result)
    return score  

In [9]:
categorical_columns = [col for col in X.select_dtypes(include='category')]

# STAGE # 1

## Lets Enocode the categroies 
### Preprocessing :  Encode Categorical features

In [10]:
preprocessing_one_hot = ColumnTransformer(transformers=[
    ('categories', OneHotEncoder(handle_unknown='ignore', sparse_output=False), categorical_columns)
], remainder=MinMaxScaler() )

### Ridge Model : Linear Model

In [11]:
ridge = RidgeCV(alphas=np.logspace(-6,6,25))
ridge_pipe = make_pipeline(preprocessing_one_hot, ridge)
score = evaluate_pipeline(ridge_pipe, X, y, tscv)

Mean absolute error : 0.14213437681160074
Mean squared error : 0.03418590523702835
Mean poisson deviance : 0.1295898374336804


# STAGE # 2

## Trigonometric features

### Preprocessing : Encode each of those periodic features using a sine and cosine transformation with the matching period.

In [13]:
time_columns = ['hour', 'month', 'year']

In [20]:
# Using Feature Engine 
cyclic_feat_transformer = CyclicalFeatures(variables=['hour', 'month', 'year'], drop_original=True)

preprocessing_cyclic_and_cat_feat = ColumnTransformer(transformers=[
    ('categories', OneHotEncoder(handle_unknown='ignore', sparse_output=False), categorical_columns),
    ('time', cyclic_feat_transformer, time_columns)
], remainder=MinMaxScaler())

In [17]:
ridge = RidgeCV(alphas=np.logspace(-6,6,25))
ridge_pipe = make_pipeline(preprocessing_cyclic_and_cat_feat, ridge)
score = evaluate_pipeline(ridge_pipe, X, y, tscv)

Mean absolute error : 0.1250363191935827
Mean squared error : 0.0349838551472678
Mean poisson deviance : 0.12370741326519172


### Yes we improved the MAE score from 14% of the maximum demand to 12%

# STAGE # 3

## One Hot Encoding

### Preprocessing :  Time-steps as categories

In [19]:
# one-hot-encode the time 

preprocessing_time_and_cat_feat = ColumnTransformer(transformers=[
    ('categories', OneHotEncoder(handle_unknown='ignore', sparse_output=False), categorical_columns),
    ('time', OneHotEncoder(handle_unknown='ignore', sparse_output=False), time_columns)
], remainder=MinMaxScaler())

### Ridge : Linear model

In [21]:
ridge = RidgeCV(alphas=np.logspace(-6,6,25))
ridge_pipe = make_pipeline(preprocessing_time_and_cat_feat, ridge)
score = evaluate_pipeline(ridge_pipe, X, y, tscv)

Mean absolute error : 0.09917719145125886
Mean squared error : 0.01738646504056925
Mean poisson deviance : 0.06431691125049027


### Yes we improved the MAE score from 12% of the maximum demand to 10%
* Using one-hot encoding for the time features gives the linear model a lot more flexibility as we introduce one additional feature per discrete time level.
### However, by one-hot-encoding the time related features we are ignoring any assumption implied by the ordering of the time values.


# STAGE # 4

## Periodic spline features

In [26]:
def bsplines(period, n_knots=None, degree=3):
    if n_knots is None:
        n_knots = period

    # periodic and include_bias is True   
    n_knots =  n_knots+1

    knots = np.linspace(0, period, n_knots)[:, np.newaxis]
    return SplineTransformer(n_knots=n_knots, degree=degree, knots=knots, extrapolation='periodic')



preprocessing_spline_and_cat_feat = ColumnTransformer(transformers=[
    ('categories', OneHotEncoder(handle_unknown='ignore', sparse_output=False), categorical_columns),
    ('spline_hour', bsplines(period=24, n_knots=12), ['hour']),
    ('spline_week', bsplines(period=7, n_knots=3), ['weekday']),
    ('spline_month', bsplines(period=12, n_knots=6), ['month'])
], remainder=MinMaxScaler())

In [27]:
ridge = RidgeCV(alphas=np.logspace(-6,6,25))
ridge_pipe = make_pipeline(preprocessing_spline_and_cat_feat, ridge)
score = evaluate_pipeline(ridge_pipe, X, y, tscv)

Mean absolute error : 0.09827456551318754
Mean squared error : 0.017873827620043675
Mean poisson deviance : 0.0640415861709067


### Yes we improved the MAE score from 12% of the maximum demand to 10%
Spline features make it possible for the linear model to successfully leverage the periodic time-related features and reduce the error from ~14% to ~10% of the maximum demand, which is similar to what we observed with the one-hot encoded features.

the periodic spline-based features fix those two problems at once: they give more expressivity to the linear model by making it possible to focus on specific hours thanks to the use of 12 splines. Furthermore the extrapolation="periodic" option enforces a smooth representation between hour=23 and hour=0.

# Poisson Regression : Linear Model

* When working with count data the poisson regression usually give better results

In [33]:
poisson = PoissonRegressor(alpha=0.0001)
poisson_pipe = make_pipeline(preprocessing_spline_and_cat_feat, poisson)
score = evaluate_pipeline(poisson_pipe, X, y, tscv)

Mean absolute error : 0.0848698944486918
Mean squared error : 0.014968817306937207
Mean poisson deviance : 0.04675860573113477


Improved the MAE score from ~14% of the maximum demand to ~8%