In [207]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.preprocessing import OneHotEncoder, OrdinalEncoder, MinMaxScaler
from sklearn.preprocessing import FunctionTransformer, SplineTransformer, PolynomialFeatures

from sklearn.compose import ColumnTransformer
from sklearn.model_selection import TimeSeriesSplit, cross_validate, cross_val_score
from sklearn.pipeline import make_pipeline, FeatureUnion

from sklearn.metrics import mean_absolute_error, mean_squared_error, mean_poisson_deviance

from sklearn.kernel_approximation import Nystroem

from sklearn.linear_model import Ridge, PoissonRegressor, RidgeCV
from sklearn.ensemble import RandomForestRegressor, HistGradientBoostingRegressor


from feature_engine.creation import CyclicalFeatures



## Bike Sharing Demand dataset

In [2]:
from sklearn.datasets import fetch_openml

bike_sharing = fetch_openml(
    "Bike_Sharing_Demand", version=2, as_frame=True, parser="pandas"
)
df = bike_sharing.frame

In [3]:
df.head()

Unnamed: 0,season,year,month,hour,holiday,weekday,workingday,weather,temp,feel_temp,humidity,windspeed,count
0,spring,0,1,0,False,6,False,clear,9.84,14.395,0.81,0.0,16
1,spring,0,1,1,False,6,False,clear,9.02,13.635,0.8,0.0,40
2,spring,0,1,2,False,6,False,clear,9.02,13.635,0.8,0.0,32
3,spring,0,1,3,False,6,False,clear,9.84,14.395,0.75,0.0,13
4,spring,0,1,4,False,6,False,clear,9.84,14.395,0.75,0.0,1


**The target of the prediction problem is the absolute count of bike rentals on a hourly basis:**

In [55]:
sns.histplot(df['count']);
plt.title('Count data distribution');

<img src='./plots/Count-data-distribution.png'>


### Yearly pattern

In [41]:
temp_df = df.groupby(['year','month'])['count'].agg(['mean'])
sns.lineplot(data=temp_df, y='mean', x='month', hue='year');

<img src='./plots/avg-monthly-count.png'>

In [14]:
yearly_pattern = df.groupby(['month'])['count'].agg(['mean'])

temp_df = df.copy()
temp_df['yearly-pattern'] = temp_df['month'].apply(lambda m: yearly_pattern.loc[m]['mean'])

In [19]:
temp_df.plot(y=['yearly-pattern'])
plt.xticks([]);

<img src='./plots/yearly-pattern.png'>

## Weekly pattern

In [26]:
ax = df.groupby(['weekday','hour'])['count'].agg(['mean']).plot(y=['mean'])
ax.set(
    title="Average hourly bike demand during the week",
    xticks=[i * 24 for i in range(7)],
    xticklabels=["Sun", "Mon", "Tue", "Wed", "Thu", "Fri", "Sat"],
    xlabel="Time of the week",
    ylabel="Number of bike rentals",
);

<img src='./plots/weekly-pattern.png'>

#### The target of the prediction problem is the absolute count of bike rentals on a hourly basis:

#### Let us rescale the target variable (number of hourly bike rentals) to predict a relative demand so that the mean absolute error is more easily interpreted as a fraction of the maximum demand.

In [27]:
df["count"].max()

977

In [4]:
y = df['count']/df['count'].max()

In [50]:
plt.figure(figsize=(15,4))
plt.subplot(121)
sns.histplot(df['count'])
plt.title('Count')
plt.subplot(122)
sns.histplot(y)
plt.title('Relative demand : count/max(count)');

<img src='./plots/relative-demand.png'>

## Feature and label

In [4]:
y = df['count']/df['count'].max()

y_count = df.pop('count')

X = df

X.head()

Unnamed: 0,season,year,month,hour,holiday,weekday,workingday,weather,temp,feel_temp,humidity,windspeed
0,spring,0,1,0,False,6,False,clear,9.84,14.395,0.81,0.0
1,spring,0,1,1,False,6,False,clear,9.02,13.635,0.8,0.0
2,spring,0,1,2,False,6,False,clear,9.02,13.635,0.8,0.0
3,spring,0,1,3,False,6,False,clear,9.84,14.395,0.75,0.0
4,spring,0,1,4,False,6,False,clear,9.84,14.395,0.75,0.0


In [52]:
X.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 17379 entries, 0 to 17378
Data columns (total 12 columns):
 #   Column      Non-Null Count  Dtype   
---  ------      --------------  -----   
 0   season      17379 non-null  category
 1   year        17379 non-null  int64   
 2   month       17379 non-null  int64   
 3   hour        17379 non-null  int64   
 4   holiday     17379 non-null  category
 5   weekday     17379 non-null  int64   
 6   workingday  17379 non-null  category
 7   weather     17379 non-null  category
 8   temp        17379 non-null  float64 
 9   feel_temp   17379 non-null  float64 
 10  humidity    17379 non-null  float64 
 11  windspeed   17379 non-null  float64 
dtypes: category(4), float64(4), int64(4)
memory usage: 1.1 MB


In [80]:
for col in X.select_dtypes(include='category'):
    print(X[col].nunique(),  X[col].dtype, 'in', col)
    [print(f'{k !r:10s}', v) for k,v in X[col].value_counts().items()]


4 category in season
'fall'     4496
'summer'   4409
'spring'   4242
'winter'   4232
2 category in holiday
'False'    16879
'True'     500
2 category in workingday
'True'     11865
'False'    5514
4 category in weather
'clear'    11413
'misty'    4544
'rain'     1419
'heavy_rain' 3


#### Since there are only `3` "heavy_rain" events, we cannot use this category to train machine learning models with cross validation. Instead, we simplify the representation by collapsing those into the "rain" category.

In [81]:
X['weather'] = X['weather'].replace(to_replace='heavy_rain', value='rain')
X['weather'].value_counts()

clear    11413
misty     4544
rain      1422
Name: weather, dtype: int64

## Time-based cross-validation

Since the dataset is a time-ordered event log (hourly demand), we will use a time-sensitive cross-validation splitter to evaluate our demand forecasting model as realistically as possible. 

* We use a gap of 2 days between the train and test side of the splits. 
* We also limit the training set size to make the performance of the CV folds more stable.

In [5]:


tscv = TimeSeriesSplit(
    n_splits=5,
    gap=48, #2day gap
    max_train_size=10000,
    test_size=1000
)

In [74]:
all_splits = list(tscv.split(X,y))

## Gradient Boosting
Gradient Boosting Regression with decision trees is often flexible enough to efficiently handle heteorogenous tabular data with a mix of categorical and numerical features as long as the number of samples is large enough.

#### Data preprocessing
* Categorical variables are encoded using ordinal encoding and then let the model know that it should treat those as categorical variables by using a dedicated tree splitting rule. 

* Since we use an ordinal encoder, we pass the list of categorical values explicitly to use a logical order when encoding the categories as integers instead of the lexicographical order. This also has the added benefit of preventing any issue with unknown categories when using cross-validation.

* The numerical variables need no preprocessing 

In [7]:
categorical_columns = [col for col in X.select_dtypes(include='category')]
categorical_columns

['season', 'holiday', 'workingday', 'weather']

In [8]:
categories = [list(X[col].value_counts().index) for col in categorical_columns]
categories

[['fall', 'summer', 'spring', 'winter'],
 ['False', 'True'],
 ['True', 'False'],
 ['clear', 'misty', 'rain', 'heavy_rain']]

In [9]:
categories_encoder = OrdinalEncoder(categories=categories)

categories_transformer = ColumnTransformer(transformers=[
    ('categories', categories_encoder, categorical_columns)
], verbose_feature_names_out=False, remainder='passthrough')

### Model Pipeline

In [13]:
gbrt_model = HistGradientBoostingRegressor(loss='poisson', categorical_features=categorical_columns)
gbrt_pipe = make_pipeline(categories_transformer, gbrt_model).set_output(transform="pandas")

### Model Evaluation

In [162]:
def evaluate_pipeline(pipe, X, y, cv, is_ridge=False):

    scoring=["neg_mean_absolute_error", 
             "neg_root_mean_squared_error",
            ]

    scoring = scoring if is_ridge else scoring + ["neg_mean_poisson_deviance"]
    score = cross_validate(pipe, X, y, cv=cv, scoring=scoring)
    
    mae = -1 * np.mean(score['test_neg_mean_absolute_error'])
    mse = -1 * np.mean(score['test_neg_root_mean_squared_error'])

    if not is_ridge:
        mpd = -1 * np.mean(score['test_neg_mean_poisson_deviance'])
    result = f'Mean absolute error : {mae}\nMean squared error : {mse}'
    result = result if is_ridge else result+f'\nMean poisson deviance : {mpd}'
    print(result)
    return score  

In [54]:
score = evaluate_pipeline(gbrt_pipe, X , y, tscv)

Mean absolute error : 0.043542656365207086
Mean squared error : 0.06805682902151214
Mean poisson deviance : 0.015468297214049223


* This model has an average error around 4 to 5% of the maximum demand. 
* This is quite good for a first trial without any hyper-parameter tuning! We just had to make the categorical variables explicit. 

* Note that the time related features are passed as is, i.e. without processing them. But this is not much of a problem for tree-based models as they can learn a non-monotonic relationship between ordinal input features and the target. This is not the case for linear regression models

## Ridge Regression : Linear Model

* For linear models, categorical variables need to be one-hot encoded. 
* For consistency, we scale the numerical features to the same 0-1 range 
    * using :  `sklearn.preprocessing.MinMaxScaler`

In [11]:
preprocessing = {}

preprocessing['one-hot-cat-feats'] = ColumnTransformer(transformers=[
    ('categories', OneHotEncoder(handle_unknown='ignore', sparse_output=False), categorical_columns)
], remainder=MinMaxScaler() )

ridge = RidgeCV(alphas=np.logspace(-6,6,25))

ridge_pipe = make_pipeline(preprocessing['one-hot-cat-feats'], ridge)

In [12]:
score = evaluate_pipeline(ridge_pipe, X, y, tscv, is_ridge=True)

Mean absolute error : 0.14208882769310194
Mean squared error : 0.18377516494744364


* The performance is not good: the average error is around 14% of the maximum demand. 
* This is more than three times higher than the average error of the gradient boosting model. 
* We can suspect that the naive original encoding (merely min-max scaled) of the periodic time-related features might prevent the linear regression model to properly leverage the time information: 
* linear regression does not automatically model non-monotonic relationships between the input features and the target. 
* Non-linear terms have to be engineered in the input.

## Feature Engineering : Time-steps as categories
* Non-linear terms have to be engineered in the input.
* Since the time features are encoded in a discrete manner using integers 
* 24 unique values in the “hours” feature
* We could decide to treat those as categorical variables using a one-hot encoding 
* When we do onehot encoding we are ignoring any assumption implied by the ordering of the hour values.
* Using one-hot encoding for the time features gives the linear model a lot more flexibility as we introduce one additional feature per discrete time level.

In [13]:
time_one_hot_transformer = OneHotEncoder(handle_unknown='ignore')
time_columns = ['hour', 'month', 'year', 'weekday']

preprocessing['time-step-as-cat'] = ColumnTransformer(transformers=[
    ('categories', categories_transformer, categorical_columns),
    ('time', time_one_hot_transformer, time_columns)
], remainder=MinMaxScaler())

### Ridge : Linear model

In [14]:
ridge = RidgeCV(alphas=np.logspace(-6,6,25))
ridge_pipe = make_pipeline(preprocessing['time-step-as-cat'], ridge)

score = evaluate_pipeline(ridge_pipe, X, y, cv=tscv, is_ridge=True)

Mean absolute error : 0.09900738311949747
Mean squared error : 0.1317585290395531


#### The average error rate of this model is 10% which is much better than using the original (ordinal) encoding of the time feature, confirming our intuition that the linear regression model benefits from the added flexibility to not treat time progression in a monotonic manner.

#### However, this introduces a very large number of new features. 
#### This could cause some significant overfitting. 

#### To avoid this we could use `sklearn.preprocessing.KBinsDiscretizer` instead to re-bin the number of levels of fine-grained ordinal or numerical variables while still benefitting from the non-monotonic expressivity advantages of one-hot encoding.

#### The one-hot encoding completely ignores the ordering of the hour levels while this could be an interesting inductive bias to preserve to some level. 

#### Lets try to explore smooth, non-monotonic encoding that locally preserves the relative ordering of time features.

## Trigonometric features
* #### Encode each of those periodic features using a sine and cosine transformation with the matching period.

### Cyclic Feature creation  Using Feature Engine 

In [16]:
# Using Feature Engine 
cyclic_feat_transformer = CyclicalFeatures(variables=['hour', 'month', 'weekday'], drop_original=True)

In [17]:
preprocessing['feature-engine-cyclic-feat'] = ColumnTransformer(transformers=[
    ('categories', categories_transformer, categorical_columns),
    ('time', cyclic_feat_transformer, ['hour', 'month', 'weekday'])
], remainder=MinMaxScaler())

### Ridge Linear model

In [18]:
ridge = RidgeCV(alphas=np.logspace(-6,6,25))
ridge_pipe = make_pipeline(preprocessing['feature-engine-cyclic-feat'], ridge)

score = evaluate_pipeline(ridge_pipe, X, y, cv=tscv, is_ridge=True)

Mean absolute error : 0.12489963757582949
Mean squared error : 0.16896780602464204


### Using python and function transformer : to encode time related features

In [19]:
# using python and function transformer

def sine_transformer(period):
    return FunctionTransformer(lambda x: np.sin(x / (2*np.pi*period)) )

def cosine_transformer(period):
    return FunctionTransformer(lambda x: np.cos(x / (2*np.pi*period)) )

In [20]:
preprocessing['function-transformer-cyclic-feat'] = ColumnTransformer(transformers=[
    ('categories', categories_transformer, categorical_columns),
    ('hour_sine', sine_transformer(period=24), ['hour']),
    ('hour_cosine', cosine_transformer(period=24), ['hour']),
    ('weekday_sine', sine_transformer(period=7), ['weekday']),
    ('weekday_cosine', cosine_transformer(period=7), ['weekday']),
    ('month_sine', sine_transformer(period=12), ['month']),
    ('month_cosine', cosine_transformer(period=12), ['month']),
], remainder=MinMaxScaler())


### Ridge : Linear model

In [21]:
ridge = RidgeCV(alphas=np.logspace(-6,6,25))
ridge_pipe = make_pipeline(preprocessing['function-transformer-cyclic-feat'], ridge)

score = evaluate_pipeline(ridge_pipe, X, y, cv=tscv, is_ridge=True)

Mean absolute error : 0.13267727130310028
Mean squared error : 0.16870052123758555


#### The performance of our linear regression model with this simple feature engineering is a bit better than using the original ordinal time features but worse than using the one-hot encoded time features.

## Periodic spline features

In [22]:
def bsplines(period, n_knots=None, degree=3):
    if n_knots is None:
        n_knots = period

    # periodic and include_bias is True   
    n_knots =  n_knots+1

    knots = np.linspace(0, period, n_knots)[:, np.newaxis]
    return SplineTransformer(n_knots=n_knots, degree=degree, knots=knots, extrapolation='periodic')



preprocessing['splines'] = ColumnTransformer(transformers=[
    ('categories', OneHotEncoder(handle_unknown='ignore', sparse_output=False), categorical_columns),
    ('spline_hour', bsplines(period=24, n_knots=12), ['hour']),
    ('spline_week', bsplines(period=7, n_knots=3), ['weekday']),
    ('spline_month', bsplines(period=12, n_knots=6), ['month'])
], remainder=MinMaxScaler())

### Ridge : Linear model

In [23]:
ridge = RidgeCV(alphas=np.logspace(-6,6,25))
ridge_pipe = make_pipeline(preprocessing['splines'], ridge)

score = evaluate_pipeline(ridge_pipe, X, y, cv=tscv, is_ridge=True)

Mean absolute error : 0.09758859484448687
Mean squared error : 0.13232174908350575


## Visualize prediction

In [189]:
def visualize_prediction(model, prep, split=-1, title='Model performance'):
    train_id, test_id = all_splits[split]
    model = make_pipeline(preprocessing[prep], model)
    model.fit(X.iloc[train_id], y.iloc[train_id])
    y_preds = model.predict(X.iloc[test_id])
    y_true = y.iloc[test_id]

    plt.figure(figsize=(15,4))
    plt.plot(y_preds, c='b', alpha=0.6, linewidth=2, label='y-pred')
    plt.plot(y_true.reset_index(drop=True), c='k', linewidth=3, alpha=0.6, label='y-true')  
    mae = mean_absolute_error(y_true , y_preds)
    plt.title(f'Prediction on Test split : {split}\nMean absolute error : {mae :0.2f}\n{title}')
    plt.legend()



In [77]:
preprocessing.keys()

dict_keys(['one-hot-cat-feats', 'time-step-as-cat', 'feature-engine-cyclic-feat', 'function-transformer-cyclic-feat', 'splines'])

#### The raw ordinal time-related features are problematic because they do not capture the natural periodicity

* we observe a big jump in the predictions at the end of each day when the hour features goes from 23 back to 0. 
* We can expect similar artifacts at the end of each week or each year.

In [174]:
visualize_prediction(model=ridge_pipe, prep='one-hot-cat-feats', split=4, title='Using raw ordianl time-related features') 

<img src='./plots/prediction-on-test-split-4-using-raw-time-feats.png'>

#### The trigonometric features (sine and cosine) do not have these discontinuities at midnight, but the linear regression model fails to leverage those features to properly model intra-day variations. 

* Using trigonometric features for higher harmonics or additional trigonometric features for the natural period with different phases could potentially fix this problem.


In [176]:
visualize_prediction(model=ridge_pipe, prep='feature-engine-cyclic-feat', split=4, title='Using Trigonometric features') 

<img src='./plots/prediction-on-test-split-4-using-trigonometric-fn.png'>

#### The one-hot encoded features behave similarly to the periodic spline-based features but are more spiky: 
* for instance they can better model the morning peak during the week days since this peak lasts shorter than an hour. 

In [178]:
visualize_prediction(model=ridge_pipe, prep='time-step-as-cat', split=4, title='One hot encode the time related features') 

<img src='./plots/prediction-on-test-split-4-using-onhot-time-feats.png'>

#### The periodic spline-based features fix those two problems at once: 
* they give more expressivity to the linear model by making it possible to focus on specific hours thanks to the use of 12 splines.
* Furthermore the extrapolation="periodic" option enforces a smooth representation between hour=23 and hour=0.

In [180]:
visualize_prediction(model=ridge_pipe, prep='splines', split=4, title='Using Bspline') 

<img src='./plots/prediction-on-test-split-4-using-bsplines.png'>

## Poisson : Linear model

for count data poisson regression model perform better

In [104]:
poisson = PoissonRegressor(alpha=0.0001)
poisson_pipe = make_pipeline(preprocessing['splines'], poisson)

score = evaluate_pipeline(poisson_pipe, X, y, cv=tscv)

Mean absolute error : 0.08485637886975902
Mean squared error : 0.12176804729934257
Mean poisson deviance : 0.04677918910686696


In [191]:

visualize_prediction(model=PoissonRegressor(alpha=0.0001), prep='splines', split=4, title='Using spline Feature and Possion regression model') 

<img src='./plots/poisson-prediction-test-split-4.png'>

## Gradient Boosted Trees

Tree models has Better capacity to model the data

In [195]:
gbrt = HistGradientBoostingRegressor(loss='poisson')
gbrt_pipe = make_pipeline(preprocessing['splines'], gbrt)

score = evaluate_pipeline(gbrt_pipe, X, y, cv=tscv)

Mean absolute error : 0.044157374703357066
Mean squared error : 0.06831652920376208
Mean poisson deviance : 0.01591682812154367


In [198]:
visualize_prediction(model=HistGradientBoostingRegressor(loss='poisson'), prep='splines', split=4, title='Using spline Feature and HistGradientBoostingRegressor model') 

<img src='./plots/GBRT-prediction-on-test-split-4-plot.png'>

#### This confirms that the one-hot encoding and the spline encoding strategies create a lot more features for the time representation than the alternatives, which in turn gives the downstream linear model more flexibility (degrees of freedom) to avoid underfitting.

#### Finally, we observe that **none of the linear models** can approximate the true bike rentals demand, especially for the peaks that can be very sharp at rush hours during the working days but much flatter during the week-ends: 

#### Even the most accurate linear models based on splines or one-hot encoding tend to under-estimate the commuting-related events during the working days.

#### These systematic prediction errors reveal a form of under-fitting and can be explained by the lack of interactions terms between features, e.g. “`workingday`” and features derived from “`hours`”. 

## Modeling pairwise interactions with splines and polynomial features
Linear models do not automatically capture interaction effects between input features. 

Features constructed by SplineTransformer or one-hot encoding or binning are  marginally non-linear

However, it is possible to use the PolynomialFeatures class on `coarse grained spline encoded hours` to model the “`workingday`”/”`hours`” interaction explicitly without introducing too many new variables:

In [148]:
# Hour -- feature
n_knots = 8
period = 24
knots = np.linspace(0, period, n_knots)[:, np.newaxis]

spline_transform = SplineTransformer(n_knots=n_knots, knots=knots, extrapolation='periodic')

In [149]:
# working day -- feature
X['workingday'].value_counts()

True     11865
False     5514
Name: workingday, dtype: int64

In [150]:
def working_day_transform():
    return FunctionTransformer(func=lambda x: x=='True')

In [151]:
# features
preprocessing['splines-coarse-grained'] = ColumnTransformer(
    transformers=[
        ('cyclic_hour', spline_transform, ['hour']),
        ('workingDay', working_day_transform(), ['workingday'])
    ]
)

In [152]:
# feature interaction
hour_workday_interaction = make_pipeline(
    preprocessing['splines-coarse-grained'], 
    PolynomialFeatures(degree=2, interaction_only=True, include_bias=False)
)

In [134]:
# example -- output shape after feature processing
preprocessing['splines-coarse-grained'].fit_transform(X[['hour', 'workingday']]).shape

(17379, 8)

In [137]:
# example interaction modeling
hour_workday_interaction.fit_transform(X[['hour', 'workingday']]).shape

(17379, 36)

Those features are then combined with the ones already computed in the previous spline-base pipeline. We can observe a nice performance improvemnt by modeling this pairwise interaction explicitly:

In [163]:
preprocessing['splines_interaction'] = FeatureUnion(transformer_list=[
    ('marginal', preprocessing['splines']),
    ('interaction', hour_workday_interaction)
])

In [171]:
ridge = RidgeCV(alphas=np.logspace(-6,6,25))
ridge_pipe = make_pipeline( preprocessing['splines_interaction'], ridge)
score = evaluate_pipeline(ridge_pipe, X , y, tscv, is_ridge=True)

Mean absolute error : 0.07715996024731678
Mean squared error : 0.10335407739035869


## Linear model : Ridge

In [165]:
ridge = RidgeCV(alphas=np.logspace(-6,6,25))
ridge_pipe = make_pipeline( preprocessing['splines_interaction'], ridge)

ridge_pipe.fit(X.iloc[train_id], y.iloc[train_id])

In [205]:
visualize_prediction(model=ridge, prep='splines_interaction', split=4, title='Linear Model: Modeling pairwise interactions with splines and polynomial features')

<img src='./plots/Ridge-model-improvement-with-interaction-feats.png'>

## Linear Model : Poisson

In [231]:
poisson = PoissonRegressor(alpha=0.0001)
poisson_pipe = make_pipeline(preprocessing['splines_interaction'], poisson)
score = evaluate_pipeline(poisson_pipe, X, y, tscv, is_ridge=False)

Mean absolute error : 0.056289739397765104
Mean squared error : 0.08544269007334207
Mean poisson deviance : 0.02236178176984247


In [206]:
poisson = PoissonRegressor(alpha=0.0001)
visualize_prediction(model=poisson, prep='splines_interaction', split=4, title='Linear Model: Modeling pairwise interactions with splines and polynomial features')

<img src='./plots/Poisson-model-improvement-with-interaction-feats.png'>

## Modeling non-linear feature interactions with kernels

The previous analysis highlighted the need to model the interactions between "`workingday`" and "`hours`".

Another example of a such a non-linear interaction that we would like to model could be the impact of the rain that might not be the same during the working days and the week-ends and holidays for instance.

To model all such interactions, we could either use a polynomial expansion on all marginal features at once, after their spline-based expansion. However, this would create a `quadratic number of features` which can cause overfitting and computational tractability issues.

Alternatively, we can use the Nyström method to compute an approximate polynomial kernel expansion.

In [250]:
# To model all such interactions, we  we can use the Nyström method 
# to compute an approximate polynomial kernel expansion on all marginal features at once
bsplines_and_interaction_pipe = make_pipeline(preprocessing['splines'],
                                Nystroem(kernel='poly', degree=3, n_components=300), 
                                RidgeCV(alphas=np.logspace(-6,6,25)))

In [245]:
bsplines_and_interaction_pipe.fit(X, y)

In [244]:
score = evaluate_pipeline(bsplines_and_interaction_pipe, X ,y, tscv, is_ridge=True)

Mean absolute error : 0.05426023326939152
Mean squared error : 0.07684658790572899


In [269]:
preprocessing['bsplines_and_polynomial_kernel'] = make_pipeline(preprocessing['splines'], Nystroem(kernel='poly', degree=3, n_components=300))
visualize_prediction(model=RidgeCV(alphas=np.logspace(-6,6,25)), prep='bsplines_and_polynomial_kernel', split=-1, title='Ridge regression : Modeling Interaction and bsplines')

<img src='./plots/Ridge-model-improvement-with-interaction-and-splines-with-poly-kernel.png'>

## Linear Model : Poisson

In [268]:
poisson = PoissonRegressor(alpha=0.00001)
poisson_pipe = make_pipeline(preprocessing['bsplines_and_polynomial_kernel'], poisson)
score = evaluate_pipeline(poisson_pipe, X, y, tscv)

Mean absolute error : 0.059064050981979456
Mean squared error : 0.08698063771034835
Mean poisson deviance : 0.02539011814901082


In [270]:
visualize_prediction(model=PoissonRegressor(alpha=0.00001), prep='bsplines_and_polynomial_kernel', split=4, title='Poisson regression : Modeling Interaction and bsplines')

<img src='./plots/Poisson-model-improvement-with-interaction-and-splines-with-poly-kernel.png'>