In [None]:
import datetime
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd

from sklearn.datasets import fetch_openml
from sklearn.gaussian_process.kernels import RBF, ExpSineSquared, RationalQuadratic, WhiteKernel
from sklearn.gaussian_process import GaussianProcessRegressor

## Load Data

In [None]:
# Fetch OpenML dataset 41187 (Mauna Loa air-sample CO2 measurements) as a
# pandas-backed Bunch, then preview the first rows of its DataFrame.
co2 = fetch_openml(
    data_id=41187,
    as_frame=True,
    parser="pandas",
)
co2.frame.head()

In [None]:
# Build a date-indexed frame holding only the CO2 readings.
# `.assign` returns a new DataFrame, so the fetched `co2.frame` is left
# untouched (the previous version added a `date` column to it in place).
co2_data_raw = (
    co2.frame
    # Combine the separate year / month / day columns into one timestamp.
    .assign(date=lambda frame: pd.to_datetime(frame[["year", "month", "day"]]))
    .loc[:, ["date", "co2"]]
    .set_index("date")
)
co2_data_raw.head()

In [None]:
# Aggregate the raw samples to one value per calendar month ("ME" = month-end
# bins). Months without any measurement average to NaN and are dropped.
co2_data = (
    co2_data_raw
    .resample("ME")
    .mean()
    .dropna(axis="index", how="any")
)

In [None]:
# Overlay the raw samples and their monthly means on the same axes.
for frame, series_label in ((co2_data_raw, "raw"), (co2_data, "mean")):
    plt.plot(frame.index, frame["co2"], alpha=0.5, label=series_label)
plt.xlabel("Date")
plt.ylabel("CO$_2$ concentration (ppm)")
plt.title("Air samples measurements from the Mauna Loa Observatory")
plt.legend()

In [None]:
# Encode each monthly timestamp as a fractional year (year + month / 12, so
# December maps to year + 1.0) and shape it as a single-feature column.
X_raw = (co2_data.index.year + co2_data.index.month / 12).to_numpy()[:, np.newaxis]
# Target: the monthly mean CO2 concentration as a 1-D array.
y_raw = co2_data.loc[:, "co2"].to_numpy()

In [None]:
# Chronological hold-out: the first 80% of the rows are used for training and
# the remaining rows for testing (no shuffling).
n = int(0.8 * len(X_raw))
X_train, X_test = X_raw[:n], X_raw[n:]
y_train, y_test = y_raw[:n], y_raw[n:]

In [None]:
# Visual check of the chronological split. Series labels, axis labels, a title
# and a legend are set so the figure can be read on its own, like the other
# figures in this notebook.
plt.plot(X_train, y_train, label="train")
plt.plot(X_test, y_test, label="test")
plt.xlabel("Year")
plt.ylabel("Monthly average of CO$_2$ concentration (ppm)")
plt.title("Train / test split")
_ = plt.legend()

## Define GP Kernel

In [None]:
# Smooth long-term trend component. Both numbers are only initial values for
# the hyperparameters; X is expressed in years.
long_term_trend_kernel = (
    50.0**2  # initial constant (variance) factor
    * RBF(length_scale=50.0)  # initial length-scale
)

In [None]:
# Seasonal component: a periodic kernel damped by an RBF envelope.
seasonal_kernel = (
    2.0**2
    # RBF factor: lets the pattern decay away from exact periodicity.
    * RBF(length_scale=100.0)
    # Periodic factor: seasonal variation with the period held fixed at 1.0
    # (X is in years), so it is not tuned during fitting.
    * ExpSineSquared(
        length_scale=1.0,
        periodicity=1.0,
        periodicity_bounds="fixed",
    )
)

In [None]:
# Small irregularities, modelled with a rational-quadratic kernel scaled by a
# constant factor (all values are initial hyperparameters).
irregularities_kernel = (
    0.5**2
    * RationalQuadratic(length_scale=1.0, alpha=1.0)
)

In [None]:
# Noise: a short length-scale RBF term (correlated noise) plus a white-noise
# term (uncorrelated noise).
noise_kernel = (
    0.1**2 * RBF(length_scale=0.1)
    + WhiteKernel(
        noise_level=0.1**2,
        noise_level_bounds=(1e-5, 1e5),
    )
)

In [None]:
# Full model kernel: the sum of the four components defined above.
co2_kernel = (
    noise_kernel
    + long_term_trend_kernel
    + seasonal_kernel
    + irregularities_kernel
)

## Fit GP

In [None]:
# Centre the targets by hand: the training mean is subtracted before fitting
# and added back to the predictions later. `normalize_y=False` keeps
# scikit-learn from applying its own target normalisation on top.
y_mean = np.mean(y_train)
gaussian_process = GaussianProcessRegressor(
    kernel=co2_kernel,
    normalize_y=False,
).fit(X_train, y_train - y_mean)
gaussian_process

In [None]:
# Predict on the held-out inputs, also requesting the predictive standard
# deviation, then undo the centring applied to the training targets.
mean_y_pred, std_y_pred = gaussian_process.predict(
    X_test,
    return_std=True,
)
mean_y_pred = mean_y_pred + y_mean

In [None]:
# Compare the GP forecast with the held-out data.
plt.plot(X_train, y_train, label="train")
plt.plot(X_test, y_test, label="test")
plt.plot(X_test, mean_y_pred, alpha=0.4, label="GP")
# Shade one predictive standard deviation around the GP mean. The band is
# labelled so the legend explains it, and "C2" matches the colour of the GP
# line (the third line drawn on these axes).
plt.fill_between(
    X_test.ravel(),
    mean_y_pred - std_y_pred,
    mean_y_pred + std_y_pred,
    color="C2",
    alpha=0.2,
    label=r"GP $\pm$ 1 std",
)
plt.legend()
plt.xlabel("Year")
plt.ylabel("Monthly average of CO$_2$ concentration (ppm)")
_ = plt.title(
    "Monthly average of air samples measurements\nfrom the Mauna Loa Observatory"
)

In [None]:
# Display the fitted kernel: `kernel_` is the kernel used for prediction, with
# the hyperparameters found while fitting.
gaussian_process.kernel_

Thus, most of the target signal, with the mean subtracted, is explained by a long-term rising trend with an amplitude of ~45 ppm and a length-scale of ~52 years. The periodic component has an amplitude of ~2.6 ppm, a decay time of ~90 years and a length-scale of ~1.5. The long decay time indicates that this component is very close to a strictly periodic seasonal pattern. The correlated noise has an amplitude of ~0.2 ppm with a length-scale of ~0.12 years, and the white-noise contribution is ~0.04 ppm. Thus, the overall noise level is very small, indicating that the data can be very well explained by the model.

[Source: scikit-learn](https://scikit-learn.org/stable/auto_examples/gaussian_process/plot_gpr_co2.html#sphx-glr-auto-examples-gaussian-process-plot-gpr-co2-py)