In [112]:
import pandas as pd
import numpy as np

cycling = pd.read_csv("../datasets/bike_rides.csv", index_col=0,
                      parse_dates=True)
cycling.index.name = ""
target_name = "power"
data, target = cycling.drop(columns=target_name), cycling[target_name]
data

Unnamed: 0,heart-rate,cadence,speed,acceleration,slope
,,,,,
2020-08-18 14:43:19,102.0,64.0,4.325,0.0880,-0.033870
2020-08-18 14:43:20,103.0,64.0,4.336,0.0842,-0.033571
2020-08-18 14:43:21,105.0,66.0,4.409,0.0234,-0.033223
2020-08-18 14:43:22,106.0,66.0,4.445,0.0016,-0.032908
2020-08-18 14:43:23,106.0,67.0,4.441,0.1144,0.000000
...,...,...,...,...,...
2020-09-13 14:55:57,130.0,0.0,1.054,0.0234,0.000000
2020-09-13 14:55:58,130.0,0.0,0.829,0.0258,0.000000
2020-09-13 14:55:59,129.0,0.0,0.616,-0.1686,0.000000


In [113]:
data_linear_model = pd.concat(
    {"speed^3": data["speed"] ** 3,
     "speed": data["speed"],
     "speed*sin(alpha)": data["speed"] * np.sin(np.arctan(data["slope"])),
     "speed*acceleration": data["speed"] * data["acceleration"].clip(lower=0)},
    axis=1
)
data_linear_model.mean()["speed*sin(alpha)"]

-0.0027097096125663312

In [114]:
data_linear_model

Unnamed: 0,speed^3,speed,speed*sin(alpha),speed*acceleration
,,,,
2020-08-18 14:43:19,80.901828,4.325,-0.146402,0.380600
2020-08-18 14:43:20,81.520685,4.336,-0.145482,0.365091
2020-08-18 14:43:21,85.707790,4.409,-0.146398,0.103171
2020-08-18 14:43:22,87.824421,4.445,-0.146198,0.007112
2020-08-18 14:43:23,87.587538,4.441,0.000000,0.508050
...,...,...,...,...
2020-09-13 14:55:57,1.170905,1.054,0.000000,0.024664
2020-09-13 14:55:58,0.569723,0.829,0.000000,0.021388
2020-09-13 14:55:59,0.233745,0.616,0.000000,0.000000


In [115]:
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import RidgeCV
from sklearn.model_selection import cross_validate
from sklearn.model_selection import ShuffleSplit

# create the model
linear_model = make_pipeline(StandardScaler(), RidgeCV())

# evaluate the model with cross-validation
cv = ShuffleSplit(n_splits=4, random_state=0)
cv_results_linear_model = cross_validate(
    linear_model, data_linear_model, target, cv=cv,
    scoring="neg_mean_absolute_error",
    return_estimator=True, return_train_score=True)
errors = -cv_results_linear_model["test_score"]
print(f"MAE on test sets:\n {errors}\n",
      f"mean +/- std: {errors.mean():.3f} +/- {errors.std():.3f} Watts")

MAE on test sets:
 [73.23006461 72.1311734  72.89061823 71.2370263 ]
 mean +/- std: 72.372 +/- 0.767 Watts


In [116]:
linear_model

Pipeline(steps=[('standardscaler', StandardScaler()),
                ('ridgecv', RidgeCV(alphas=array([ 0.1,  1. , 10. ])))])

In [117]:
for model in cv_results_linear_model['estimator']:
    print(model[1].coef_)

[ 5.66427806 32.84904152 80.08105928 10.85618779]
[ 5.68897463 32.83434375 80.99005594 11.34425   ]
[ 6.28736152 32.19112942 80.92397865 11.23297157]
[ 6.18278112 32.4035743  80.61344415 11.27427489]


In [118]:
cv_results_linear_model['estimator'][0][1].coef_

array([ 5.66427806, 32.84904152, 80.08105928, 10.85618779])

In [119]:
from sklearn.ensemble import HistGradientBoostingRegressor

est = HistGradientBoostingRegressor(early_stopping=True, max_iter=1000)


In [120]:
cv = ShuffleSplit(n_splits=4, random_state=0)
cv_results_linear_model = cross_validate(
    est, data, target, cv=cv,
    scoring="neg_mean_absolute_error",
    return_estimator=True, return_train_score=True)
errors = -cv_results_linear_model["test_score"]
print(f"MAE on test sets:\n {errors}\n",
      f"mean +/- std: {errors.mean():.3f} +/- {errors.std():.3f} Watts")

MAE on test sets:
 [44.35647428 43.91321636 43.3975188  43.587429  ]
 mean +/- std: 43.814 +/- 0.364 Watts


In [121]:
np.unique(data.index.date)

array([datetime.date(2020, 8, 18), datetime.date(2020, 8, 20),
       datetime.date(2020, 8, 26), datetime.date(2020, 9, 13)],
      dtype=object)

In [124]:
from sklearn.ensemble import HistGradientBoostingRegressor

hgbdt = HistGradientBoostingRegressor(
    max_iter=1000, early_stopping=True, random_state=42
)

cv = ShuffleSplit(n_splits=4, random_state=0)
cv_results_hgbdt = cross_validate(
    hgbdt, data, target, cv=cv, scoring="neg_mean_absolute_error",
    return_estimator=True, return_train_score=True, n_jobs=2)

errors = -cv_results_hgbdt["test_score"]
print(f"MAE on test sets:\n {errors}\n",
      f"mean +/- std: {errors.mean():.3f} +/- {errors.std():.3f} Watts")

MAE on test sets:
 [44.39516221 44.13036203 43.75817    43.06094014]
 mean +/- std: 43.836 +/- 0.502 Watts


In [125]:
errors_SS_lm = -cv_results_linear_model["train_score"]
print(f"Linear model - MAE on train sets:\t",
      f"{errors_SS_lm.mean():.3f} +/- {errors_SS_lm.std():.3f} Watts")
errors_SS_lm = -cv_results_linear_model["test_score"]
print(f"Linear model - MAE on test sets:\t",
      f"{errors_SS_lm.mean():.3f} +/- {errors_SS_lm.std():.3f} Watts")

errors_SS_hgbdt = -cv_results_hgbdt["train_score"]
print(f"Histogram GBDT - MAE on train sets:\t",
      f"{errors_SS_hgbdt.mean():.3f} +/- {errors_SS_hgbdt.std():.3f} Watts")
errors_SS_hgbdt = -cv_results_hgbdt["test_score"]
print(f"Histogram GBDT - MAE on test sets:\t",
      f"{errors_SS_hgbdt.mean():.3f} +/- {errors_SS_hgbdt.std():.3f} Watts")

Linear model - MAE on train sets:	 39.907 +/- 1.239 Watts
Linear model - MAE on test sets:	 43.814 +/- 0.364 Watts
Histogram GBDT - MAE on train sets:	 40.561 +/- 0.686 Watts
Histogram GBDT - MAE on test sets:	 43.836 +/- 0.502 Watts


In [126]:
import numpy as np

unique_ride_dates = np.unique(cycling.index.date)
print(f"There are {len(unique_ride_dates)} bike rides")

There are 4 bike rides


In [127]:
from sklearn.model_selection import LeaveOneGroupOut

groups, _ = pd.factorize(data_linear_model.index.date)
cv = LeaveOneGroupOut()

cv_results_linear_model = cross_validate(
    linear_model, data_linear_model, target, groups=groups, cv=cv,
    scoring="neg_mean_absolute_error", return_estimator=True,
    return_train_score=True, n_jobs=2)
cv_results_hgbdt = cross_validate(
    hgbdt, data, target, groups=groups, cv=cv,
    scoring="neg_mean_absolute_error", return_estimator=True,
    return_train_score=True, n_jobs=2)

errors_LOGO_lm = -cv_results_linear_model["train_score"]
print(f"Linear model - MAE on train sets:\t",
      f"{errors_LOGO_lm.mean():.3f} +/- {errors_LOGO_lm.std():.3f} Watts")
errors_LOGO_lm = -cv_results_linear_model["test_score"]
print(f"Linear model - MAE on test sets:\t",
      f"{errors_LOGO_lm.mean():.3f} +/- {errors_LOGO_lm.std():.3f} Watts")

errors_LOGO_hgbdt = -cv_results_hgbdt["train_score"]
print(f"Histogram GBDT - MAE on train sets:\t",
      f"{errors_LOGO_hgbdt.mean():.3f} +/- {errors_LOGO_hgbdt.std():.3f} Watts")
errors_LOGO_hgbdt = -cv_results_hgbdt["test_score"]
print(f"Histogram GBDT - MAE on test sets:\t",
      f"{errors_LOGO_hgbdt.mean():.3f} +/- {errors_LOGO_hgbdt.std():.3f} Watts")

Linear model - MAE on train sets:	 72.249 +/- 2.247 Watts
Linear model - MAE on test sets:	 73.015 +/- 5.779 Watts
Histogram GBDT - MAE on train sets:	 40.131 +/- 1.295 Watts
Histogram GBDT - MAE on test sets:	 49.204 +/- 2.694 Watts


In [128]:
print(
    "LM with LeaveOneGroupOut has a bigger test error than LM with ShuffleSplit by "
    f"{errors_LOGO_lm.mean() - errors_SS_lm.mean()}"
    " Watts."
)

LM with LeaveOneGroupOut has a bigger test error than LM with ShuffleSplit by 29.201115640096738 Watts.


In [129]:
print(
    "HGBDT with LeaveOneGroupOut has a bigger test error than HGBDT with ShuffleSplit by "
    f"{errors_LOGO_hgbdt.mean() - errors_SS_hgbdt.mean()}"
    " Watts."
)

HGBDT with LeaveOneGroupOut has a bigger test error than HGBDT with ShuffleSplit by 5.367504740249601 Watts.


In [130]:
cv = LeaveOneGroupOut()
train_indices, test_indices = list(cv.split(data, target, groups=groups))[0]

data_linear_model_train = data_linear_model.iloc[train_indices]
data_linear_model_test = data_linear_model.iloc[test_indices]

data_train = data.iloc[train_indices]
data_test = data.iloc[test_indices]

target_train = target.iloc[train_indices]
target_test = target.iloc[test_indices]

In [None]:
import matplotlib.pyplot as plt
