Skip to content

Commit

Permalink
replace pandas with Polars for forecasting co2 example
Browse files Browse the repository at this point in the history
  • Loading branch information
raisa committed Apr 10, 2024
1 parent 77f8731 commit 77bdbd8
Showing 1 changed file with 23 additions and 19 deletions.
42 changes: 23 additions & 19 deletions examples/gaussian_process/plot_gpr_co2.py
Expand Up @@ -33,32 +33,33 @@
# We will derive a dataset from the Mauna Loa Observatory that collected air
# samples. We are interested in estimating the concentration of CO2 and
# extrapolating it for further years. First, we load the original dataset available
# in OpenML.
# in OpenML as a pandas dataframe. This will be replaced with Polars
# once `fetch_openml` adds native support for it.
from sklearn.datasets import fetch_openml

co2 = fetch_openml(data_id=41187, as_frame=True)
co2.frame.head()

# %%
# First, we process the original dataframe to create a date index and select
# only the CO2 column.
import pandas as pd
# First, we process the original dataframe to create a date column and select
# it along with the CO2 column.
import polars as pl

co2_data = co2.frame
co2_data["date"] = pd.to_datetime(co2_data[["year", "month", "day"]])
co2_data = co2_data[["date", "co2"]].set_index("date")
co2_data = pl.DataFrame({col: co2.frame[col].to_numpy() for col in co2.frame.columns})
co2_data = co2_data.select(pl.date("year", "month", "day"), "co2")
co2_data.head()

# %%
co2_data.index.min(), co2_data.index.max()
co2_data["date"].min(), co2_data["date"].max()

# %%
# We see that we get CO2 concentration for some days from March, 1958 to
# December, 2001. We can plot this raw information to get a better
# understanding.
import matplotlib.pyplot as plt

co2_data.plot()
plt.plot(co2_data["date"], co2_data["co2"])
plt.xlabel("date")
plt.ylabel("CO$_2$ concentration (ppm)")
_ = plt.title("Raw air samples measurements from the Mauna Loa Observatory")

Expand All @@ -67,15 +68,14 @@
# for which no measurements were collected. Such processing will have a
# smoothing effect on the data.

try:
co2_data_resampled_monthly = co2_data.resample("ME")
except ValueError:
# pandas < 2.2 uses M instead of ME
co2_data_resampled_monthly = co2_data.resample("M")


co2_data = co2_data_resampled_monthly.mean().dropna(axis="index", how="any")
co2_data.plot()
co2_data = (
co2_data.sort(by="date")
.group_by_dynamic("date", every="1mo")
.agg(pl.col("co2").mean())
.drop_nulls()
)
plt.plot(co2_data["date"], co2_data["co2"])
plt.xlabel("date")
plt.ylabel("Monthly average of CO$_2$ concentration (ppm)")
_ = plt.title(
"Monthly average of air samples measurements\nfrom the Mauna Loa Observatory"
Expand All @@ -88,7 +88,11 @@
#
# As a first step, we will separate the data from the target to estimate. Since
# the data is a date, we will convert it into a numeric value.
X = (co2_data.index.year + co2_data.index.month / 12).to_numpy().reshape(-1, 1)
X = (
co2_data.select(pl.col("date").dt.year() + pl.col("date").dt.month() / 12)
.to_numpy()
.reshape(-1, 1)
)
y = co2_data["co2"].to_numpy()

# %%
Expand Down

0 comments on commit 77bdbd8

Please sign in to comment.