# VO2max Prediction Using Treadmill Maximal Exercise Tests and Machine Learning Techniques

In [1]:
# import packages
import numpy as np 
import matplotlib.pyplot as plt
import pandas as pd
import seaborn as sns
from scipy import signal
from collections import Counter
from sklearn.utils import indexable
from sklearn.utils import resample
from sklearn.utils.validation import _num_samples
from sklearn.model_selection import ShuffleSplit, StratifiedShuffleSplit
from itertools import chain
from sklearn.model_selection import train_test_split, cross_val_score, cross_val_predict
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import mean_squared_error
from sklearn.preprocessing import PolynomialFeatures
from sklearn.linear_model import LinearRegression
from sklearn.neighbors import KNeighborsRegressor
from sklearn.pipeline import Pipeline
from sklearn.linear_model import Ridge, RidgeCV, Lasso, LassoCV, MultiTaskLassoCV
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import MinMaxScaler, StandardScaler
from math import ceil, floor

In [2]:
# load subject dataset
subject_data = pd.read_csv('physionet.org/files/treadmill-exercise-cardioresp/1.0.1/subject-info.csv')

# basic information
display(subject_data.head())
print("Initial Dataset Info:")
print(subject_data.info())
print(subject_data.describe())

Unnamed: 0,Age,Weight,Height,Humidity,Temperature,Sex,ID,ID_test
0,10.8,48.8,163.0,39.0,20.7,1,543,543_1
1,11.8,41.0,150.0,41.0,22.3,1,11,11_1
2,12.2,46.0,160.0,37.0,21.5,0,829,829_1
3,13.2,71.0,190.0,49.0,23.8,1,284,284_1
4,13.7,53.8,169.7,40.0,25.3,0,341,341_1


Initial Dataset Info:
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 992 entries, 0 to 991
Data columns (total 8 columns):
 #   Column       Non-Null Count  Dtype  
---  ------       --------------  -----  
 0   Age          992 non-null    float64
 1   Weight       992 non-null    float64
 2   Height       992 non-null    float64
 3   Humidity     962 non-null    float64
 4   Temperature  962 non-null    float64
 5   Sex          992 non-null    int64  
 6   ID           992 non-null    int64  
 7   ID_test      992 non-null    object 
dtypes: float64(5), int64(2), object(1)
memory usage: 62.1+ KB
None
              Age      Weight      Height    Humidity  Temperature  \
count  992.000000  992.000000  992.000000  962.000000   962.000000   
mean    28.979133   73.383367  174.913508   48.211435    22.818565   
std     10.076653   12.005361    7.950027    8.560991     2.784066   
min     10.800000   41.000000  150.000000   23.700000    15.000000   
25%     21.100000   66.000000  170.0

In [76]:
# load measurement dataset
measurement_data = pd.read_csv('physionet.org/files/treadmill-exercise-cardioresp/1.0.1/test_measure.csv')

# basic information
measurement_data

Unnamed: 0,time,Speed,HR,VO2,VCO2,RR,VE,ID_test,ID
0,0,5.0,63.0,478.0,360.0,27,13.3,2_1,2
1,2,5.0,75.0,401.0,295.0,23,10.3,2_1,2
2,4,5.0,82.0,449.0,319.0,29,12.2,2_1,2
3,7,5.0,87.0,461.0,340.0,28,12.8,2_1,2
4,9,5.0,92.0,574.0,417.0,28,14.6,2_1,2
...,...,...,...,...,...,...,...,...,...
575082,926,5.0,148.0,1350.0,1699.0,35,65.7,857_1,857
575083,927,5.0,147.0,1275.0,1605.0,32,60.1,857_1,857
575084,929,5.0,147.0,1259.0,1566.0,29,57.0,857_1,857
575085,931,5.0,147.0,1278.0,1587.0,31,58.5,857_1,857


In [144]:
# merge both datasets on ID
#merged_data = pd.merge(measurement_data, subject_data, on='ID', how='inner')
# merged_data = measurement_data
merged_data

Unnamed: 0,time,Speed,HR,VO2,VCO2,RR,VE,ID_test_x,ID,Age,Weight,Height,Humidity,Temperature,Sex,ID_test_y
0,0,5.0,63.0,478.0,360.0,27,13.3,2_1,2,33.8,68.0,171.1,,,0,2_1
1,2,5.0,75.0,401.0,295.0,23,10.3,2_1,2,33.8,68.0,171.1,,,0,2_1
2,4,5.0,82.0,449.0,319.0,29,12.2,2_1,2,33.8,68.0,171.1,,,0,2_1
3,7,5.0,87.0,461.0,340.0,28,12.8,2_1,2,33.8,68.0,171.1,,,0,2_1
4,9,5.0,92.0,574.0,417.0,28,14.6,2_1,2,33.8,68.0,171.1,,,0,2_1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
773613,926,5.0,148.0,1350.0,1699.0,35,65.7,857_1,857,29.0,80.1,188.0,53.0,26.3,0,857_1
773614,927,5.0,147.0,1275.0,1605.0,32,60.1,857_1,857,29.0,80.1,188.0,53.0,26.3,0,857_1
773615,929,5.0,147.0,1259.0,1566.0,29,57.0,857_1,857,29.0,80.1,188.0,53.0,26.3,0,857_1
773616,931,5.0,147.0,1278.0,1587.0,31,58.5,857_1,857,29.0,80.1,188.0,53.0,26.3,0,857_1


## Feature Selection

The feature selection process is based on the Paper: ... .

### Data Cleaning and Pre-processing

In [145]:
# check if there are missing values
print(merged_data.isnull().sum())

time               0
Speed              0
HR              1224
VO2             4871
VCO2            4871
RR                 0
VE                 0
ID_test_x          0
ID                 0
Age                0
Weight             0
Height             0
Humidity       20039
Temperature    20039
Sex                0
ID_test_y          0
dtype: int64


In [146]:
# remove rows with missing target variable (HR, VO2)
merged_data_cleaned = merged_data.dropna(subset=['HR', 'VO2'])

# Check how many rows are removed
print(f'Rows removed due to missing HR or VO2: {len(merged_data) - len(merged_data_cleaned)}')

Rows removed due to missing HR or VO2: 6083


In [147]:
print("Exercise Data RR Intervals:")
print(merged_data_cleaned['RR'])

Exercise Data RR Intervals:
0         27
1         23
2         29
3         28
4         28
          ..
773613    35
773614    32
773615    29
773616    31
773617    31
Name: RR, Length: 767535, dtype: int64


In [148]:
# remove RR intervals exclusive to the range 300-2000 ms
# those are considered as outliers
# Entferne RR-Werte, die außerhalb des Bereichs von 5-50 Atemzügen pro Minute liegen (Ausreißer)
merged_data_cleaned['RR'] = np.where((merged_data_cleaned['RR'] < 5) | (merged_data_cleaned['RR'] > 50), np.nan, merged_data_cleaned['RR'])

# linear interpolation to fill missing values
merged_data_cleaned['RR'] = merged_data_cleaned['RR'].interpolate(method='linear')

# Check for NaN values after interpolation
print(merged_data_cleaned.isnull().sum())

time               0
Speed              0
HR                 0
VO2                0
VCO2               0
RR                 0
VE                 0
ID_test_x          0
ID                 0
Age                0
Weight             0
Height             0
Humidity       19997
Temperature    19997
Sex                0
ID_test_y          0
dtype: int64


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  merged_data_cleaned['RR'] = np.where((merged_data_cleaned['RR'] < 5) | (merged_data_cleaned['RR'] > 50), np.nan, merged_data_cleaned['RR'])
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  merged_data_cleaned['RR'] = merged_data_cleaned['RR'].interpolate(method='linear')


In [149]:
# remove if HR and VO2max are out of phase
merged_data_cleaned = merged_data_cleaned[(merged_data_cleaned['HR'] > 0) & (merged_data_cleaned['VO2'] > 0)]

# remove where consecutive HR differs more by than 30 bpm_
merged_data_cleaned['HR_diff'] = merged_data_cleaned['HR'].diff().abs()
merged_data_cleaned = merged_data_cleaned[merged_data_cleaned['HR_diff'] <= 30]

# remove objects with less than 5 minutes of data
merged_data_cleaned['time'] = merged_data_cleaned['time'].astype(float)  # make it numeirc
merged_data_cleaned = merged_data_cleaned[merged_data_cleaned['time'] >= 300]

# check dataset
print(f'Dataset size after cleaning: {merged_data_cleaned.shape}')

Dataset size after cleaning: (622931, 17)


In [150]:
# split data into exercise and recovery phases
exercise_data = merged_data_cleaned[merged_data_cleaned['Speed'] >= 5]  # Exercise phase
recovery_data = merged_data_cleaned[merged_data_cleaned['Speed'] < 5]  # Recovery phase

# shapes of dataframes
print(f'Exercise data size: {exercise_data.shape}')
print(f'Recovery data size: {recovery_data.shape}')

Exercise data size: (573861, 17)
Recovery data size: (49070, 17)


In [151]:
rr_intervals = merged_data_cleaned['RR'].values  # Annahme: RR-Intervalle liegen als numpy-Array oder pandas-Serie vor

# Überprüfe auf NaN/Inf-Werte und bereinige die Daten
rr_intervals = rr_intervals[np.isfinite(rr_intervals)]
    


## Feature Extraction

In [152]:
from scipy import stats
from scipy.signal import detrend

In [153]:
# auch mit chat GPT lol

def calculate_slopes(df, feature):
    slope = linregress(df['time'], df[feature]).slope
    return slope

# all HR Features

def compute_hr_features(segment: pd.DataFrame):
    """Berechnet HR-Features für einen gegebenen Segment-Datensatz."""
    hr = segment['HR'].values
    time = segment['time'].values
    
    # Extrahiere die RR-Intervalle aus dem Segment
    rr_intervals = segment['RR'].values  # Annahme: RR-Intervalle liegen als numpy-Array oder pandas-Serie vor

    # Überprüfe auf NaN/Inf-Werte und bereinige die Daten
    rr_intervals = rr_intervals[np.isfinite(rr_intervals)]
    if len(rr_intervals) == 0:
        raise ValueError("Alle RR-Intervall-Daten wurden aufgrund von NaN/Inf-Werten entfernt.")

    # Zeit-Domain-Features
    rmssd = np.sqrt(np.mean(np.diff(rr_intervals) ** 2))  # Berechnung von RMSSD
    pnn20 = np.mean(np.abs(np.diff(rr_intervals)) > 20) * 100  # Prozentsatz der Differenzen > 20 ms
    pnn50 = np.mean(np.abs(np.diff(rr_intervals)) > 50) * 100  # Prozentsatz der Differenzen > 50 ms
    nni20 = np.sum(np.abs(np.diff(rr_intervals)) > 20)      # Anzahl der Differenzen > 20 ms
    nni50 = np.sum(np.abs(np.diff(rr_intervals)) > 50)      # Anzahl der Differenzen > 50 ms
    nni_range = np.max(rr_intervals) - np.min(rr_intervals)  # Bereich zwischen max. und min. RR-Intervall

    # Frequenz-Domain-Features
    
    # total_power = np.var(detrend(rr_intervals))  # Gesamtvarianz nach Detrend
    # vlf_power = total_power * 0.1  # Dummy-Verhältnis für VLF-Power (simuliert)
    # lf_power = total_power * 0.3  # Dummy-Verhältnis für LF-Power (simuliert)
    # hf_power = total_power * 0.6  # Dummy-Verhältnis für HF-Power (simuliert)
    
    fs = 4  # Annahme: Sampling-Frequenz von 4 Hz
    f, pxx = welch(rr_intervals, fs, nperseg=len(rr_intervals))
    vlf_power = np.trapz(pxx[(f >= 0.003) & (f < 0.04)], f[(f >= 0.003) & (f < 0.04)])
    lf_power = np.trapz(pxx[(f >= 0.04) & (f < 0.15)], f[(f >= 0.04) & (f < 0.15)])
    hf_power = np.trapz(pxx[(f >= 0.15) & (f < 0.4)], f[(f >= 0.15) & (f < 0.4)])
    lf_nu_power = lf_power / (vlf_power + lf_power + hf_power)
    #  #welches von den beiden sd1 und sd2 ist richtig?
    sd1 = np.std(rr_intervals[:len(rr_intervals)//2])  # Kurzfristige Variation (SD1)
    sd2 = np.std(rr_intervals[len(rr_intervals)//2:])  # Langfristige Variation (SD2)
    print(sd1, sd2)
    # Nichtlineare Merkmale (Poincare Plot, DFA)
    # Für die Poincare-Plot-Merkmale verwenden wir die Standardabweichungen
    sd1 = np.std(hr[:len(hr)//2])
    sd2 = np.std(hr[len(hr)//2:])
    print(sd1, sd2)
    
    # DFA (Detrended Fluctuation Analysis) als Platzhalter (echte Implementierung erfordert spezialisierte Bibliotheken)
    # Dummy-Wert für DFA a1 (z.B. von einer DFA-Analyse)
    #dfa_a1 = 0.75  # Beispielwert für DFA a1
    dfa_a1 = np.std(rr_intervals[:len(rr_intervals)//2]) / np.std(rr_intervals) # placeholder
    

    # Berechnung der Steigungen der Merkmale über die Zeit
    # Beispiel: Steigung des HRs über Zeit
    #slope_nni = np.polyfit(time, hr, 1)[0]
    
    # # Da vlf_power, lf_power und hf_power konstante Werte sind, berechnen wir ihre Steigung nicht
    # slope_vlf = np.nan
    # slope_lf = np.nan
    # slope_hf = np.nan
    
    # # Steigungen für Poincare Plot-Merkmale
    # slope_sd1 = np.polyfit(time, [sd1] * len(time), 1)[0]
    # slope_sd2 = np.polyfit(time, [sd2] * len(time), 1)[0]

    # Zeitbasierte Merkmale (Beispiel)
    max_speed = segment['Speed'].max()
    # time_to_25_speed = segment[segment['Speed'] >= 0.25 * max_speed]['time'].iloc[0]
    # time_to_25_hr = segment[segment['HR'] >= 0.25 * max(segment['HR'])]['time'].iloc[0]
    
    max_hr = max(segment['HR'])
    min_hr = min(segment['HR'])
    
    # slope_sp25 = calculate_slopes(segment[segment['Speed'] <= max_speed * 0.25], 'HR')
    # slope_sp50 = calculate_slopes(segment[(segment['Speed'] <= max_speed * 0.5) & (segment['Speed'] > max_speed * 0.25)], 'HR')
    # slope_sp75 = calculate_slopes(segment[(segment['Speed'] <= max_speed * 0.75) & (segment['Speed'] > max_speed * 0.5)], 'HR')
    # slope_sp100 = calculate_slopes(segment[(segment['Speed'] > max_speed * 0.75)], 'HR')
    rt25 = segment.loc[segment['time'] <= segment['time'].max() * 0.25, 'HR'].corr(segment['time'])
    rt50 = segment.loc[(segment['time'] <= segment['time'].max() * 0.25) & (segment['time'] < segment['time'].max() * 0.5), 'HR'].corr(segment['time'])
    rt75 = segment.loc[(segment['time'] <= segment['time'].max() * 0.5) & (segment['time'] < segment['time'].max() * 0.75), 'HR'].corr(segment['time'])
    rsp75 = segment.loc[segment['Speed'] <= max_speed * 0.75, 'HR'].corr(segment['Speed'])
    rsp25 = segment.loc[segment['Speed'] <= max_speed * 0.25, 'HR'].corr(segment['Speed'])
    time_sp25 = segment[segment['Speed'] >= max_speed * 0.25]['time'].min()
    time_hr25 = segment[segment['HR'] >= max_hr * 0.25]['time'].min()
    time_hr50 = segment[segment['HR'] >= max_hr * 0.5]['time'].min()
    time_hr75 = segment[segment['HR'] >= max_hr * 0.75]['time'].min()
    duration_hr60 = len(segment[(segment['HR'] >= max_hr * 0.5) & (segment['HR'] < max_hr * 0.6)]) * (segment['time'].diff().mean())
    duration_hr70 = len(segment[(segment['HR'] >= max_hr * 0.6) & (segment['HR'] < max_hr * 0.7)]) * (segment['time'].diff().mean())
    duration_hr80 = len(segment[(segment['HR'] >= max_hr * 0.7) & (segment['HR'] < max_hr * 0.8)]) * (segment['time'].diff().mean())
    duration_hr90 = len(segment[(segment['HR'] >= max_hr * 0.8) & (segment['HR'] < max_hr * 0.9)]) * (segment['time'].diff().mean())
    duration_hr100 = len(segment[(segment['HR'] >= max_hr * 0.9) & (segment['HR'] <= max_hr)]) * (segment['time'].diff().mean())

    
    # Feature Dictionary
    Features = {'RMSSD': rmssd,
        'PNN20': pnn20,
        'PNN50': pnn50,
        'NNI20': nni20,
        'NNI50': nni50,
        'NNI_Range': nni_range,
        'VLF_Power': vlf_power,
        'LF_Power': lf_power,
        'HF_Power': hf_power,
        'LF_Nu_Power': lf_nu_power,
        'SD1': sd1,
        'SD2': sd2,
        'DFA_A1': dfa_a1,
        #'Slope_NNI': slope_nni,
        # 'Slope_Speed25': slope_sp25,
        # 'Slope_Speed50': slope_sp50,
        # 'Slope_Speed75': slope_sp75,
        # 'Slope_Speed100': slope_sp100,
        'RT25': rt25,
        'RT50': rt50,
        'RT75': rt75,
        'RSP75': rsp75,
        'RSP25': rsp25,
        'Time_SP25': time_sp25,
        'Time_HR25': time_hr25,
        'Time_HR50': time_hr50,
        'Time_HR75': time_hr75,
        'DurationHR60': duration_hr60,
        'DurationHR70': duration_hr70,
        'DurationHR80': duration_hr80,
        'DurationHR90': duration_hr90,
        'DurationHR100': duration_hr100
    }
    
    #feature_df = pd.DataFrame(Features, index=[0])
    return Features
    



In [178]:
def calculate_features_per_segment(df, test_column):
    """
    Berechnet Features für jedes Segment basierend auf der test_ID.
    
    Parameters:
    df (pd.DataFrame): Der DataFrame, der die Daten enthält.
    test_column (str): Der Name der Spalte, die die Test-IDs enthält.
    
    Returns:
    pd.DataFrame: Ein DataFrame mit den berechneten Features für jedes Segment.
    """
    # Leere Liste für die Features pro Segment
    all_features = []

    # Gehe durch jede Gruppe (Segment) basierend auf der test_ID
    for test_id, df_segment in df.groupby(test_column):
        # Berechne die Features für das aktuelle Segment
        features = compute_hr_features(df_segment)
        # Füge die Test-ID zu den Features hinzu
        features[test_column] = test_id
        # Hänge die Features für dieses Segment an die Liste an
        all_features.append(features)

    # Erstelle einen DataFrame aus der Liste der Features
    #df_features = pd.DataFrame(all_features)
    
    return all_features

#new_features = calculate_features_per_segment(merged_data_cleaned, 'ID_test_y')

In [179]:
new_features = pd.DataFrame(new_features)
new_features['ID_test'] = new_features['ID_test_y']
new_features

Unnamed: 0,RMSSD,PNN20,PNN50,NNI20,NNI50,NNI_Range,VLF_Power,LF_Power,HF_Power,LF_Nu_Power,...,Time_HR25,Time_HR50,Time_HR75,DurationHR60,DurationHR70,DurationHR80,DurationHR90,DurationHR100,ID_test_y,ID_test
0,1.481071,0.0,0.0,0,0,24.0,7.277623,4.303086,2.771932,0.299811,...,301.0,301.0,361.0,23.796499,80.908096,103.118162,174.507659,341.083151,100_1,100_1
1,1.153176,0.0,0.0,0,0,27.0,13.721111,2.372173,0.633813,0.141816,...,300.0,300.0,306.0,0.000000,0.000000,110.796834,338.121372,276.992084,101_1,101_1
2,0.973329,0.0,0.0,0,0,31.0,23.083816,1.677281,1.060094,0.064958,...,300.0,300.0,300.0,0.000000,0.000000,84.000000,164.000000,324.000000,102_1,102_1
3,1.505314,0.0,0.0,0,0,20.0,8.501543,0.682572,0.598572,0.069774,...,300.0,300.0,300.0,0.000000,49.621622,51.000000,188.837838,272.918919,103_1,103_1
4,1.300821,0.0,0.0,0,0,29.0,10.853108,1.639056,1.747539,0.115105,...,301.0,301.0,303.0,0.000000,56.831461,155.339326,339.094382,293.629213,104_1,104_1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
976,1.757981,0.0,0.0,0,0,32.0,58.450279,1.908188,1.116328,0.031040,...,301.0,301.0,490.0,18.407240,243.384615,169.755656,145.212670,329.285068,98_1,98_1
977,1.122955,0.0,0.0,0,0,33.0,49.346644,3.473053,2.165212,0.063164,...,300.0,300.0,300.0,25.696721,66.811475,102.215847,255.825137,177.021858,99_1,99_1
978,1.122946,0.0,0.0,0,0,33.0,49.341144,3.472413,2.164424,0.063160,...,300.0,300.0,300.0,25.696721,66.811475,102.215847,255.825137,177.021858,99_47,99_47
979,1.122955,0.0,0.0,0,0,33.0,49.335655,3.471803,2.163677,0.063157,...,300.0,300.0,300.0,25.696721,66.811475,102.215847,255.825137,177.021858,99_54,99_54


In [177]:
merged_data = pd.merge(subject_data, new_features, on='ID_test', how='inner')
merged_data
# save the merged data
merged_data.to_csv('merged_data.csv')