# VO2max Prediction Using Treadmill Maximal Exercise Tests and Machine Learning Techniques

In [36]:
# import packages
import numpy as np 
import matplotlib.pyplot as plt
import pandas as pd
import seaborn as sns
from scipy import signal
from collections import Counter
from sklearn.utils import indexable
from sklearn.utils import resample
from sklearn.utils.validation import _num_samples
from sklearn.model_selection import ShuffleSplit, StratifiedShuffleSplit
from itertools import chain
from sklearn.model_selection import train_test_split, cross_val_score, cross_val_predict
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import mean_squared_error
from sklearn.preprocessing import PolynomialFeatures
from sklearn.linear_model import LinearRegression
from sklearn.neighbors import KNeighborsRegressor
from sklearn.pipeline import Pipeline
from sklearn.linear_model import Ridge, RidgeCV, Lasso, LassoCV, MultiTaskLassoCV
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import MinMaxScaler, StandardScaler
from math import ceil, floor

In [37]:
# load subject dataset
subject_data = pd.read_csv('subject-info.csv')

# basic information
display(subject_data.head())
print("Initial Dataset Info:")
print(subject_data.info())
print(subject_data.describe())

Unnamed: 0,Age,Weight,Height,Humidity,Temperature,Sex,ID,ID_test
0,10.8,48.8,163.0,39.0,20.7,1,543,543_1
1,11.8,41.0,150.0,41.0,22.3,1,11,11_1
2,12.2,46.0,160.0,37.0,21.5,0,829,829_1
3,13.2,71.0,190.0,49.0,23.8,1,284,284_1
4,13.7,53.8,169.7,40.0,25.3,0,341,341_1


Initial Dataset Info:
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 992 entries, 0 to 991
Data columns (total 8 columns):
 #   Column       Non-Null Count  Dtype  
---  ------       --------------  -----  
 0   Age          992 non-null    float64
 1   Weight       992 non-null    float64
 2   Height       992 non-null    float64
 3   Humidity     962 non-null    float64
 4   Temperature  962 non-null    float64
 5   Sex          992 non-null    int64  
 6   ID           992 non-null    int64  
 7   ID_test      992 non-null    object 
dtypes: float64(5), int64(2), object(1)
memory usage: 62.1+ KB
None
              Age      Weight      Height    Humidity  Temperature  \
count  992.000000  992.000000  992.000000  962.000000   962.000000   
mean    28.979133   73.383367  174.913508   48.211435    22.818565   
std     10.076653   12.005361    7.950027    8.560991     2.784066   
min     10.800000   41.000000  150.000000   23.700000    15.000000   
25%     21.100000   66.000000  170.0

In [38]:
# load measurement dataset
measurement_data = pd.read_csv('test_measure.csv')

# basic information
display(measurement_data.head())
print("Initial Dataset Info:")
print(measurement_data.info())
print(measurement_data.describe())

Unnamed: 0,time,Speed,HR,VO2,VCO2,RR,VE,ID_test,ID
0,0,5.0,63.0,478.0,360.0,27,13.3,2_1,2
1,2,5.0,75.0,401.0,295.0,23,10.3,2_1,2
2,4,5.0,82.0,449.0,319.0,29,12.2,2_1,2
3,7,5.0,87.0,461.0,340.0,28,12.8,2_1,2
4,9,5.0,92.0,574.0,417.0,28,14.6,2_1,2


Initial Dataset Info:
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 575087 entries, 0 to 575086
Data columns (total 9 columns):
 #   Column   Non-Null Count   Dtype  
---  ------   --------------   -----  
 0   time     575087 non-null  int64  
 1   Speed    575087 non-null  float64
 2   HR       574106 non-null  float64
 3   VO2      570216 non-null  float64
 4   VCO2     570216 non-null  float64
 5   RR       575087 non-null  int64  
 6   VE       575087 non-null  float64
 7   ID_test  575087 non-null  object 
 8   ID       575087 non-null  int64  
dtypes: float64(5), int64(3), object(1)
memory usage: 39.5+ MB
None
                time          Speed             HR            VO2  \
count  575087.000000  575087.000000  574106.000000  570216.000000   
mean      628.126172       9.607958     146.940892    2313.617768   
std       325.588844       4.520384      32.206372     978.103888   
min         0.000000       0.000000       0.000000      -5.000000   
25%       375.000000      

In [39]:
# merge both datasets on ID
merged_data = pd.merge(measurement_data, subject_data, on='ID', how='inner')

display(merged_data.head())
print(merged_data.info())

Unnamed: 0,time,Speed,HR,VO2,VCO2,RR,VE,ID_test_x,ID,Age,Weight,Height,Humidity,Temperature,Sex,ID_test_y
0,0,5.0,63.0,478.0,360.0,27,13.3,2_1,2,33.8,68.0,171.1,,,0,2_1
1,2,5.0,75.0,401.0,295.0,23,10.3,2_1,2,33.8,68.0,171.1,,,0,2_1
2,4,5.0,82.0,449.0,319.0,29,12.2,2_1,2,33.8,68.0,171.1,,,0,2_1
3,7,5.0,87.0,461.0,340.0,28,12.8,2_1,2,33.8,68.0,171.1,,,0,2_1
4,9,5.0,92.0,574.0,417.0,28,14.6,2_1,2,33.8,68.0,171.1,,,0,2_1


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 773618 entries, 0 to 773617
Data columns (total 16 columns):
 #   Column       Non-Null Count   Dtype  
---  ------       --------------   -----  
 0   time         773618 non-null  int64  
 1   Speed        773618 non-null  float64
 2   HR           772394 non-null  float64
 3   VO2          768747 non-null  float64
 4   VCO2         768747 non-null  float64
 5   RR           773618 non-null  int64  
 6   VE           773618 non-null  float64
 7   ID_test_x    773618 non-null  object 
 8   ID           773618 non-null  int64  
 9   Age          773618 non-null  float64
 10  Weight       773618 non-null  float64
 11  Height       773618 non-null  float64
 12  Humidity     753579 non-null  float64
 13  Temperature  753579 non-null  float64
 14  Sex          773618 non-null  int64  
 15  ID_test_y    773618 non-null  object 
dtypes: float64(10), int64(4), object(2)
memory usage: 94.4+ MB
None


## Feature Selection

The feature selection process is based on the Paper: ... .

### Data Cleaning and Pre-processing

In [40]:
# check if there are missing values
print(merged_data.isnull().sum())

time               0
Speed              0
HR              1224
VO2             4871
VCO2            4871
RR                 0
VE                 0
ID_test_x          0
ID                 0
Age                0
Weight             0
Height             0
Humidity       20039
Temperature    20039
Sex                0
ID_test_y          0
dtype: int64


In [41]:
# remove rows with missing target variable (HR, VO2)
merged_data_cleaned = merged_data.dropna(subset=['HR', 'VO2'])

# Check how many rows are removed
print(f'Rows removed due to missing HR or VO2: {len(merged_data) - len(merged_data_cleaned)}')

Rows removed due to missing HR or VO2: 6083


In [46]:
# remove RR intervals exclusive to the range 300-2000 ms
# those are considered as outliers
merged_data_cleaned['RR'] = np.where((merged_data_cleaned['RR'] < 300) | (merged_data_cleaned['RR'] > 2000), np.nan, merged_data_cleaned['RR'])

# linear interpolation to fill missing values
merged_data_cleaned['RR'] = merged_data_cleaned['RR'].interpolate(method='linear')

# Check for NaN values after interpolation
print(merged_data_cleaned.isnull().sum())

time                0
Speed               0
HR                  0
VO2                 0
VCO2                0
RR             767535
VE                  0
ID_test_x           0
ID                  0
Age                 0
Weight              0
Height              0
Humidity        19997
Temperature     19997
Sex                 0
ID_test_y           0
dtype: int64


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  merged_data_cleaned['RR'] = np.where((merged_data_cleaned['RR'] < 300) | (merged_data_cleaned['RR'] > 2000), np.nan, merged_data_cleaned['RR'])
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  merged_data_cleaned['RR'] = merged_data_cleaned['RR'].interpolate(method='linear')


In [49]:
# remove if HR and VO2max are out of phase
merged_data_cleaned = merged_data_cleaned[(merged_data_cleaned['HR'] > 0) & (merged_data_cleaned['VO2'] > 0)]

# remove where consecutive HR differs more by than 30 bpm_
merged_data_cleaned['HR_diff'] = merged_data_cleaned['HR'].diff().abs()
merged_data_cleaned = merged_data_cleaned[merged_data_cleaned['HR_diff'] <= 30]

# remove objects with less than 5 minutes of data
merged_data_cleaned['time'] = merged_data_cleaned['time'].astype(float)  # make it numeirc
merged_data_cleaned = merged_data_cleaned[merged_data_cleaned['time'] >= 300]

# check dataset
print(f'Dataset size after cleaning: {merged_data_cleaned.shape}')

Dataset size after cleaning: (622332, 17)


In [50]:
# split data into exercise and recovery phases
exercise_data = merged_data_cleaned[merged_data_cleaned['Speed'] >= 5]  # Exercise phase
recovery_data = merged_data_cleaned[merged_data_cleaned['Speed'] < 5]  # Recovery phase

# shapes of dataframes
print(f'Exercise data size: {exercise_data.shape}')
print(f'Recovery data size: {recovery_data.shape}')

Exercise data size: (573333, 17)
Recovery data size: (48999, 17)
