In [1]:
import numpy as np
import pandas as pd
from neuraxle.base import Identity, MetaStep
from neuraxle.pipeline import Pipeline
from neuraxle.steps.column_transformer import ColumnTransformer
from neuraxle.union import FeatureUnion

from sklearn.datasets import fetch_openml
from sklearn.ensemble import HistGradientBoostingRegressor
from sklearn.kernel_approximation import Nystroem
from sklearn.linear_model import RidgeCV
from sklearn.model_selection import TimeSeriesSplit, cross_validate
from sklearn.preprocessing import (FunctionTransformer, MinMaxScaler,
                                   OneHotEncoder, OrdinalEncoder,
                                   PolynomialFeatures, SplineTransformer)

In [8]:
bike_sharing = fetch_openml("Bike_Sharing_Demand", version=2, as_frame=True)
df = bike_sharing.frame

In [9]:
type(bike_sharing)

sklearn.utils._bunch.Bunch

In [10]:
df.head()

Unnamed: 0,season,year,month,hour,holiday,weekday,workingday,weather,temp,feel_temp,humidity,windspeed,count
0,spring,0.0,1.0,0.0,False,6.0,False,clear,9.84,14.395,0.81,0.0,16.0
1,spring,0.0,1.0,1.0,False,6.0,False,clear,9.02,13.635,0.8,0.0,40.0
2,spring,0.0,1.0,2.0,False,6.0,False,clear,9.02,13.635,0.8,0.0,32.0
3,spring,0.0,1.0,3.0,False,6.0,False,clear,9.84,14.395,0.75,0.0,13.0
4,spring,0.0,1.0,4.0,False,6.0,False,clear,9.84,14.395,0.75,0.0,1.0


In [11]:
y = df["count"] / 1000

In [12]:
y.head()

0    0.016
1    0.040
2    0.032
3    0.013
4    0.001
Name: count, dtype: float64

In [13]:
X_orig = df.copy()
X = df.drop("count", axis="columns")
X

Unnamed: 0,season,year,month,hour,holiday,weekday,workingday,weather,temp,feel_temp,humidity,windspeed
0,spring,0.0,1.0,0.0,False,6.0,False,clear,9.84,14.395,0.81,0.0000
1,spring,0.0,1.0,1.0,False,6.0,False,clear,9.02,13.635,0.80,0.0000
2,spring,0.0,1.0,2.0,False,6.0,False,clear,9.02,13.635,0.80,0.0000
3,spring,0.0,1.0,3.0,False,6.0,False,clear,9.84,14.395,0.75,0.0000
4,spring,0.0,1.0,4.0,False,6.0,False,clear,9.84,14.395,0.75,0.0000
...,...,...,...,...,...,...,...,...,...,...,...,...
17374,spring,1.0,12.0,19.0,False,1.0,True,misty,10.66,12.880,0.60,11.0014
17375,spring,1.0,12.0,20.0,False,1.0,True,misty,10.66,12.880,0.60,11.0014
17376,spring,1.0,12.0,21.0,False,1.0,True,clear,10.66,12.880,0.60,11.0014
17377,spring,1.0,12.0,22.0,False,1.0,True,clear,10.66,13.635,0.56,8.9981


In [25]:
X["weather"].replace(to_replace="heavy_rain", value="rain", inplace=True)

In [26]:
categorical_columns = [
    "weather",
    "season",
    "holiday",
    "workingday",
]

non_categorical_columns = [i for i in X.columns if i not in categorical_columns]

categories = [
    ["clear", "misty", "rain"],
    ["spring", "summer", "fall", "winter"],
    ["False", "True"],
    ["False", "True"],
]

ordinal_encoder = OrdinalEncoder(categories=categories)

In [27]:
ordinal_encoder

In [28]:

gbrt_pipeline = Pipeline([
    ColumnTransformer([
        (categorical_columns, ordinal_encoder),
        (non_categorical_columns, Identity()),
    ], n_dimension=2),
    HistGradientBoostingRegressor(
        categorical_features=range(4),
    ),
])

In [34]:
gbrt_pipeline.fit(X[:15000], y[:15000])

Pipeline([
    ColumnTransformer([
        Pipeline([
            ColumnsSelectorND(ColumnSelector2D()),
            <neuraxle.steps.sklearn.SKLearnWrapper(OrdinalEncoder(...)) object 0x2874c5430>
        ]),
        Pipeline([
            ColumnsSelectorND(ColumnSelector2D()),
            Identity()
        ]),
        NumpyConcatenateInnerFeatures(name='joiner')
    ]),
    <neuraxle.steps.sklearn.SKLearnWrapper(HistGradientBoostingRegressor(...)) object 0x2874b5820>
])

In [35]:
gbrt_pipeline.predict(X[15001:])

array([0.04617299, 0.11563257, 0.21637541, ..., 0.12484598, 0.08502095,
       0.05340398])

In [23]:
categorical_one_hot_encoders = [
    (col_name, OneHotEncoder(handle_unknown="ignore", sparse=False))
    for col_name in categorical_columns
]
alphas = np.logspace(-6, 6, 25)

naive_linear_pipeline = Pipeline([
    ColumnTransformer([
        *categorical_one_hot_encoders,
        (non_categorical_columns, MinMaxScaler()),
    ], n_dimension=2),
    RidgeCV(alphas=alphas),
])

In [30]:
naive_linear_pipeline.fit(X, y)

Pipeline([
    ColumnTransformer([
        Pipeline([
            ColumnsSelectorND(ColumnSelector2D()),
            <neuraxle.steps.sklearn.SKLearnWrapper(OneHotEncoder(...)) object 0x285ecc100>
        ]),
        Pipeline([
            ColumnsSelectorND(ColumnSelector2D()),
            <neuraxle.steps.sklearn.SKLearnWrapper(OneHotEncoder(...)) object 0x285eccbb0>
        ]),
        Pipeline([
            ColumnsSelectorND(ColumnSelector2D()),
            <neuraxle.steps.sklearn.SKLearnWrapper(OneHotEncoder(...)) object 0x285ecc850>
        ]),
        Pipeline([
            ColumnsSelectorND(ColumnSelector2D()),
            <neuraxle.steps.sklearn.SKLearnWrapper(OneHotEncoder(...)) object 0x285ecc4f0>
        ]),
        Pipeline([
            ColumnsSelectorND(ColumnSelector2D()),
            <neuraxle.steps.sklearn.SKLearnWrapper(MinMaxScaler(...)) object 0x285ecc0a0>
        ]),
        NumpyConcatenateInnerFeatures(name='joiner')
    ]),
    <neuraxle.steps.sklearn.SKLearnWrapp

In [33]:
X.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 17379 entries, 0 to 17378
Data columns (total 12 columns):
 #   Column      Non-Null Count  Dtype   
---  ------      --------------  -----   
 0   season      17379 non-null  category
 1   year        17379 non-null  float64 
 2   month       17379 non-null  float64 
 3   hour        17379 non-null  float64 
 4   holiday     17379 non-null  category
 5   weekday     17379 non-null  float64 
 6   workingday  17379 non-null  category
 7   weather     17379 non-null  category
 8   temp        17379 non-null  float64 
 9   feel_temp   17379 non-null  float64 
 10  humidity    17379 non-null  float64 
 11  windspeed   17379 non-null  float64 
dtypes: category(4), float64(8)
memory usage: 1.1 MB
