DOC use polars instead of pandas in plot_time_series_lagged_features (#28601)

Co-authored-by: raisa <>
raisadz committed Apr 8, 2024
1 parent 016670e commit d1d1596
Showing 1 changed file with 57 additions and 68 deletions: examples/applications/plot_time_series_lagged_features.py
@@ -3,7 +3,7 @@
Lagged features for time series forecasting
===========================================
-This example demonstrates how pandas-engineered lagged features can be used
+This example demonstrates how Polars-engineered lagged features can be used
for time series forecasting with
:class:`~sklearn.ensemble.HistGradientBoostingRegressor` on the Bike Sharing
Demand dataset.
@@ -19,27 +19,33 @@
# Analyzing the Bike Sharing Demand dataset
# -----------------------------------------
#
-# We start by loading the data from the OpenML repository.
+# We start by loading the data from the OpenML repository
+# as a pandas dataframe. This will be replaced with Polars
+# once `fetch_openml` adds native support for it.
+# We convert to Polars for feature engineering, as it automatically caches
+# common subexpressions which are reused in multiple expressions
+# (like `pl.col("count").shift(1)` below). See
+# https://docs.pola.rs/user-guide/lazy/optimizations/ for more information.

import numpy as np
-import pandas as pd
+import polars as pl

from sklearn.datasets import fetch_openml

+pl.Config.set_fmt_str_lengths(20)

bike_sharing = fetch_openml(
"Bike_Sharing_Demand", version=2, as_frame=True, parser="pandas"
)
df = bike_sharing.frame
+df = pl.DataFrame({col: df[col].to_numpy() for col in df.columns})
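
To see the subexpression caching mentioned above in action, one can inspect the optimized plan of an equivalent lazy query; a minimal sketch (not part of this commit, with a made-up toy frame and column names):

# Minimal sketch: in the lazy engine, identical subexpressions such as
# pl.col("a").shift(1) can be computed once and reused across expressions.
toy = pl.LazyFrame({"a": [1, 2, 3, 4]})
query = toy.select(
    lag=pl.col("a").shift(1),
    lag_plus_one=pl.col("a").shift(1) + 1,
)
print(query.explain())  # prints the optimized query plan
print(query.collect())  # executes the plan and returns a DataFrame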

# %%
# Next, we take a look at the statistical summary of the dataset
# so that we can better understand the data that we are working with.
-summary = pd.DataFrame(df.describe())
-summary = (
-    summary.style.background_gradient()
-    .set_table_attributes("style = 'display: inline'")
-    .set_caption("Statistics of the Dataset")
-    .set_table_styles([{"selector": "caption", "props": [("font-size", "16px")]}])
-)
+import polars.selectors as cs
+
+summary = df.select(cs.numeric()).describe()
summary

# %%
Expand All @@ -52,32 +58,26 @@


# %%
-# Generating pandas-engineered lagged features
+# Generating Polars-engineered lagged features
# --------------------------------------------
# Let's consider the problem of predicting the demand at the
# next hour given past demands. Since the demand is a continuous
# variable, one could intuitively use any regression model. However, we do
# not have the usual `(X_train, y_train)` dataset. Instead, we just have
# the `y_train` demand data sequentially organized by time.
-count = df["count"]
-lagged_df = pd.concat(
-    [
-        count,
-        count.shift(1).rename("lagged_count_1h"),
-        count.shift(2).rename("lagged_count_2h"),
-        count.shift(3).rename("lagged_count_3h"),
-        count.shift(24).rename("lagged_count_1d"),
-        count.shift(24 + 1).rename("lagged_count_1d_1h"),
-        count.shift(7 * 24).rename("lagged_count_7d"),
-        count.shift(7 * 24 + 1).rename("lagged_count_7d_1h"),
-        count.shift(1).rolling(24).mean().rename("lagged_mean_24h"),
-        count.shift(1).rolling(24).max().rename("lagged_max_24h"),
-        count.shift(1).rolling(24).min().rename("lagged_min_24h"),
-        count.shift(1).rolling(7 * 24).mean().rename("lagged_mean_7d"),
-        count.shift(1).rolling(7 * 24).max().rename("lagged_max_7d"),
-        count.shift(1).rolling(7 * 24).min().rename("lagged_min_7d"),
-    ],
-    axis="columns",
+lagged_df = df.select(
+    "count",
+    *[pl.col("count").shift(i).alias(f"lagged_count_{i}h") for i in [1, 2, 3]],
+    lagged_count_1d=pl.col("count").shift(24),
+    lagged_count_1d_1h=pl.col("count").shift(24 + 1),
+    lagged_count_7d=pl.col("count").shift(7 * 24),
+    lagged_count_7d_1h=pl.col("count").shift(7 * 24 + 1),
+    lagged_mean_24h=pl.col("count").shift(1).rolling_mean(24),
+    lagged_max_24h=pl.col("count").shift(1).rolling_max(24),
+    lagged_min_24h=pl.col("count").shift(1).rolling_min(24),
+    lagged_mean_7d=pl.col("count").shift(1).rolling_mean(7 * 24),
+    lagged_max_7d=pl.col("count").shift(1).rolling_max(7 * 24),
+    lagged_min_7d=pl.col("count").shift(1).rolling_min(7 * 24),
)
lagged_df.tail(10)
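
For readers less familiar with these Polars primitives, a minimal sketch (illustrative only, with made-up numbers) of what `shift` and `rolling_mean` produce:

# Minimal sketch: shift(1) lags the column by one row (filling with null),
# and rolling_mean(3) takes the trailing mean once a full 3-value window exists.
demo = pl.DataFrame({"count": [3, 5, 7, 9, 11]})
demo.select(
    "count",
    lag_1=pl.col("count").shift(1),  # null, 3, 5, 7, 9
    lagged_mean_3=pl.col("count").shift(1).rolling_mean(3),
)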

@@ -89,8 +89,8 @@
# %%
# We can now separate the lagged features in a matrix `X` and the target variable
# (the counts to predict) in an array of the same first dimension `y`.
-lagged_df = lagged_df.dropna()
-X = lagged_df.drop("count", axis="columns")
+lagged_df = lagged_df.drop_nulls()
+X = lagged_df.drop("count")
y = lagged_df["count"]
print("X shape: {}\ny shape: {}".format(X.shape, y.shape))

@@ -141,8 +141,8 @@
# %%
# Training the model and evaluating its performance based on MAPE.
train_idx, test_idx = all_splits[0]
-X_train, X_test = X.iloc[train_idx], X.iloc[test_idx]
-y_train, y_test = y.iloc[train_idx], y.iloc[test_idx]
+X_train, X_test = X[train_idx, :], X[test_idx, :]
+y_train, y_test = y[train_idx], y[test_idx]

model = HistGradientBoostingRegressor().fit(X_train, y_train)
y_pred = model.predict(X_test)
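
The MAPE computation itself sits in unchanged lines outside this diff. Note that `X[train_idx, :]` relies on Polars' positional row indexing with integer arrays, replacing pandas' `.iloc` here. For reference, a hedged sketch of such an evaluation using scikit-learn's built-in metric:

# Hedged sketch (the example's actual evaluation code is not shown in this diff).
from sklearn.metrics import mean_absolute_percentage_error

mape = mean_absolute_percentage_error(y_test, y_pred)
print(f"MAPE on the first split: {mape:.3f}")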
@@ -258,40 +258,29 @@ def consolidate_scores(cv_results, scores, metric):
            metric = key.split("test_")[1]
            scores = consolidate_scores(cv_results, scores, metric)

-df = pd.DataFrame(scores)
-
-styled_df_copy = df.copy()
-
-
-def extract_numeric(value):
-    parts = value.split("±")
-    mean_value = float(parts[0])
-    std_value = float(parts[1].split()[0])
-
-    return mean_value, std_value
+scores_df = pl.DataFrame(scores)
+scores_df


-# Convert columns containing "±" to tuples of numerical values
-cols_to_convert = df.columns[1:]  # Exclude the "loss" column
-for col in cols_to_convert:
-    df[col] = df[col].apply(extract_numeric)
-
-min_values = df.min()
-
-# Create a mask for highlighting minimum values
-mask = pd.DataFrame("", index=df.index, columns=df.columns)
-for col in cols_to_convert:
-    mask[col] = df[col].apply(
-        lambda x: "font-weight: bold" if x == min_values[col] else ""
-    )
-
-styled_df_copy = styled_df_copy.style.apply(lambda x: mask, axis=None)
-styled_df_copy

# %%
+# Let us take a look at the losses that minimise each metric.
+def min_arg(col):
+    col_split = pl.col(col).str.split(" ")
+    return pl.arg_sort_by(
+        col_split.list.get(0).cast(pl.Float64),
+        col_split.list.get(2).cast(pl.Float64),
+    ).first()
+
+
+scores_df.select(
+    pl.col("loss").get(min_arg(col_name)).alias(col_name)
+    for col_name in scores_df.columns
+    if col_name != "loss"
+)
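
To see why `min_arg` sorts on two keys, consider a toy column of `"mean ± std"` strings (illustrative only; the values are made up):

# Illustrative sketch: min_arg returns the row index of the smallest mean,
# breaking ties on the smaller standard deviation.
toy_scores = pl.DataFrame({"rmse": ["64.0 ± 2.0", "62.5 ± 1.5", "62.5 ± 1.0"]})
toy_scores.select(best_idx=min_arg("rmse"))  # -> 2 (same mean as row 1, smaller std)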

# %%
-# Even if the score distributions overlap due to the variance in the dataset, it
-# is true that the average RMSE is lower when `loss="squared_error"`, whereas
+# Even if the score distributions overlap due to the variance in the dataset,
+# it is true that the average RMSE is lower when `loss="squared_error"`, whereas
# the average MAPE is lower when `loss="absolute_error"` as expected. That is
# also the case for the Mean Pinball Loss with the quantiles 5 and 95. The score
# corresponding to the 50 quantile loss is overlapping with the score obtained
@@ -304,8 +293,8 @@ def extract_numeric(value):
all_splits = list(ts_cv.split(X, y))
train_idx, test_idx = all_splits[0]

-X_train, X_test = X.iloc[train_idx], X.iloc[test_idx]
-y_train, y_test = y.iloc[train_idx], y.iloc[test_idx]
+X_train, X_test = X[train_idx, :], X[test_idx, :]
+y_train, y_test = y[train_idx], y[test_idx]

max_iter = 50
gbrt_mean_poisson = HistGradientBoostingRegressor(loss="poisson", max_iter=max_iter)
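
The companion median and quantile regressors are fitted in lines not shown in this diff; a hedged sketch of how such a model could be parameterized (the parameter values here are assumptions, not the commit's code):

# Hedged sketch: loss="quantile" with quantile=0.5 targets the median demand;
# quantile=0.05 / 0.95 would bound a 90% prediction interval.
gbrt_median = HistGradientBoostingRegressor(
    loss="quantile", quantile=0.5, max_iter=max_iter
).fit(X_train, y_train)
median_predictions = gbrt_median.predict(X_test)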
@@ -336,7 +325,7 @@ def extract_numeric(value):
fig, ax = plt.subplots(figsize=(15, 7))
plt.title("Predictions by regression models")
ax.plot(
-    y_test.values[last_hours],
+    y_test[last_hours],
    "x-",
    alpha=0.2,
    label="Actual demand",
@@ -399,7 +388,7 @@ def extract_numeric(value):
]
for ax, pred, label in zip(axes, predictions, labels):
    PredictionErrorDisplay.from_predictions(
-        y_true=y_test.values,
+        y_true=y_test,
        y_pred=pred,
        kind="residual_vs_predicted",
        scatter_kwargs={"alpha": 0.3},