In [2]:
import pandas as pd
import numpy as np

# Load the data (have to change the name for each sport
cycling_path = 'clean_data_cycling.csv'
strength_path = 'clean_data_strength.csv'
elyptic_path = 'clean_data_elyptic.csv'

cycling_data = pd.read_csv(cycling_path)
strength_data = pd.read_csv(strength_path)
elyptic_data = pd.read_csv(elyptic_path)

In [23]:
data = pd.concat([cycling_data, strength_data, elyptic_data], axis=0, ignore_index=True)
data

Unnamed: 0,time,rotationRateX,rotationRateY,rotationRateZ,gravityX,gravityY,gravityZ,accelerationX,accelerationY,accelerationZ,...,quaternionY,quaternionZ,z_gyro,y_gyro,x_gyro,z_acc,y_acc,x_acc,bpm,sport
0,171829243429,-0.246384,-0.081519,0.110493,0.317334,0.813489,-0.487376,-0.042294,-0.031018,0.024852,...,-0.437900,0.852192,-0.138021,-0.582045,-0.219524,-0.243557,-0.577762,-0.190925,105.0,1
1,171829243434,-0.138480,-0.123311,0.156557,0.319853,0.817145,-0.479550,-0.028440,0.049691,-0.053638,...,-0.442362,0.850672,-0.203224,-0.220469,-0.020842,-0.788626,-0.268524,0.834922,105.0,1
2,171829243439,-0.078834,-0.117711,0.288981,0.324156,0.815928,-0.478733,-0.063689,-0.070124,-0.042721,...,-0.443388,0.851395,-0.172186,-0.185771,0.024884,-0.398124,-0.406848,0.431240,105.0,1
3,171829243444,0.135324,-0.119591,0.278078,0.331465,0.810568,-0.482816,-0.048292,0.014734,-0.036853,...,-0.441038,0.853461,-0.156555,-0.071315,0.144891,-0.001588,-0.409412,-0.190077,105.0,1
4,171829243449,0.171445,-0.084271,0.341000,0.341875,0.801126,-0.491243,0.001311,0.071738,-0.023100,...,-0.435520,0.856728,-0.313300,0.043542,0.165136,0.145781,-0.103795,-0.076771,105.0,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
94674,171831230601,-0.048556,-0.015872,0.065269,-0.096203,0.884213,-0.457069,0.000530,-0.030408,0.002205,...,-0.148407,0.153197,-0.040254,0.185446,-0.113402,-0.374216,0.188504,-0.186484,160.0,3
94675,171831230606,0.147562,-0.068622,0.042911,-0.095310,0.883170,-0.459267,0.018817,0.034326,0.003167,...,-0.148382,0.154511,-0.007232,-0.051257,0.150439,0.158708,0.182139,0.025271,160.0,3
94676,171831230611,-0.053198,-0.030381,0.020455,-0.095181,0.882081,-0.461383,-0.004535,-0.012269,-0.014447,...,-0.148771,0.155782,0.081558,0.102681,0.044035,-0.121265,0.281203,-0.067331,160.0,3
94677,171831230616,-0.084135,-0.054557,0.009571,-0.094891,0.883654,-0.458422,0.006162,-0.007907,0.002276,...,-0.149613,0.156787,0.099620,-0.103260,0.035884,0.277111,-0.019610,-0.156452,160.0,3


Feature Engineering

In [19]:
# Mean with windows
def mean_features(df, window_size):
    feature_df = pd.DataFrame()
    for col in df.columns:
        if col not in ['time', 'sport']:
            feature_df[col + f'_mean{window_size}'] = df[col].rolling(window=window_size).mean()
    return feature_df

# Example usage
# Assuming df is your DataFrame

window5 = mean_features(data, 5)
window20 = mean_features(data, 20)
window100 = mean_features(data, 100)


In [32]:
# Frequency domain
from scipy.fftpack import fft
from scipy.signal import welch
import numpy as np

def frequency_features(df, window_size, step_size, fs=1.0):
    """
    Extract frequency domain features for each sliding window.

    Parameters:
    - df: pandas DataFrame with the data
    - window_size: size of each window
    - step_size: step size between windows
    - fs: sampling frequency (default 1.0)

    Returns:
    - feature_df: DataFrame with frequency features for each window
    """
    feature_list = []

    for start in range(0, len(df) - window_size + 1, step_size):
        window_df = df.iloc[start:start + window_size]
        feature_dict = {}

        for col in df.columns:
            if col not in ['time', 'seconds_elapsed', 'seconds_elapsed_gyro', 'seconds_elapsed_acc', 'seconds_elapsed_hr']:
                col_data = window_df[col].dropna()
                if len(col_data) > 0:
                    freqs, psd = welch(col_data, fs=fs, nperseg=min(window_size, len(col_data)))
                    dominant_freq = freqs[np.argmax(psd)]
                    spectral_entropy = -np.sum(psd * np.log2(psd + 1e-10)) / np.log2(len(psd))
                    feature_dict[col + '_dominant_freq'] = dominant_freq
                    feature_dict[col + '_spectral_entropy'] = spectral_entropy

        feature_list.append(feature_dict)

    feature_df = pd.DataFrame(feature_list)
    return feature_df


frequency_features_df = frequency_features(data, 50,100)
frequency_features_df

Unnamed: 0,rotationRateX_dominant_freq,rotationRateX_spectral_entropy,rotationRateY_dominant_freq,rotationRateY_spectral_entropy,rotationRateZ_dominant_freq,rotationRateZ_spectral_entropy,gravityX_dominant_freq,gravityX_spectral_entropy,gravityY_dominant_freq,gravityY_spectral_entropy,...,z_acc_dominant_freq,z_acc_spectral_entropy,y_acc_dominant_freq,y_acc_spectral_entropy,x_acc_dominant_freq,x_acc_spectral_entropy,bpm_dominant_freq,bpm_spectral_entropy,sport_dominant_freq,sport_spectral_entropy
0,0.02,1.250353,0.04,1.065136,0.02,0.024231,0.02,0.170653,0.02,0.295288,...,0.22,1.883183,0.12,1.273326,0.20,0.807572,0.00,-0.000000,0.0,-0.0
1,0.10,0.705878,0.02,0.554046,0.04,0.574681,0.02,0.025264,0.02,0.007683,...,0.02,1.822456,0.02,0.463522,0.02,0.556914,0.02,0.485121,0.0,-0.0
2,0.06,-0.033529,0.16,1.871516,0.06,-0.287566,0.02,0.198046,0.06,0.092824,...,0.06,-683.788184,0.06,-281.356606,0.16,-18.664952,0.00,-0.000000,0.0,-0.0
3,0.08,-3.487442,0.40,1.871945,0.06,-5.970553,0.06,0.146250,0.02,0.335014,...,0.06,-1161.631009,0.12,-242.543217,0.18,-51.783547,0.00,-0.000000,0.0,-0.0
4,0.06,0.377963,0.44,1.937784,0.06,-4.850922,0.06,0.156143,0.06,0.104842,...,0.06,-1980.062443,0.14,-381.499060,0.20,-56.372320,0.00,-0.000000,0.0,-0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
942,0.12,-1.153989,0.06,-8.419217,0.06,-88.480863,0.06,0.395787,0.06,0.021286,...,0.06,-2200.370394,0.06,-1430.809332,0.16,-1780.939851,0.00,-0.000000,0.0,-0.0
943,0.10,1.242576,0.06,1.229809,0.06,-111.158361,0.06,0.338019,0.02,0.029037,...,0.06,-1640.153441,0.06,-539.474522,0.16,-865.478046,0.02,-1.109190,0.0,-0.0
944,0.06,-11.186591,0.06,-3.412632,0.06,-83.674915,0.06,0.394511,0.06,0.070496,...,0.06,-1789.478516,0.06,-1079.360617,0.16,-1843.220415,0.02,-3.369145,0.0,-0.0
945,0.06,-32.752915,0.10,-0.465000,0.06,-105.642170,0.06,0.383586,0.04,0.114503,...,0.06,-1605.472369,0.06,-836.457229,0.16,-694.014513,0.02,-3.613189,0.0,-0.0


In [17]:
data = pd.concat([data, window5, window20, window100,frequency_features_df], axis=1, ignore_index=True)
data=data.dropna()
data

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,117,118,119,120,121,122,123,124,125,126


In [35]:
import pandas as pd
import numpy as np
from scipy.signal import welch

def interpolate_features(feature_df, original_length, window_size, step_size):
    """
    Interpolate features to match the length of the original DataFrame.

    Parameters:
    - feature_df: DataFrame with frequency features for each rolling window
    - original_length: length of the original DataFrame
    - window_size: size of each window
    - step_size: step size for rolling computation

    Returns:
    - interpolated_df: DataFrame with interpolated features
    """
    interpolated_features = []

    for i in range(0, original_length, step_size):
        start_idx = i
        end_idx = min(i + window_size, original_length)
        window_features = feature_df.iloc[i // step_size].copy()

        # Interpolate features to match the length of the original DataFrame
        window_features.index = [f"{col}_{start_idx}" for col in window_features.index]
        window_features = window_features.reindex([f"{col}_{j}" for col in window_features.index for j in range(start_idx, end_idx)])
        window_features.interpolate(method='linear', inplace=True)
        interpolated_features.append(window_features)

    interpolated_df = pd.DataFrame(interpolated_features)
    return interpolated_df

def rolling_frequency_features(df, window_size, step_size, fs=1.0):
    """
    Extract frequency domain features on a rolling basis.

    Parameters:
    - df: pandas DataFrame with the data
    - window_size: size of each window
    - step_size: step size for rolling computation
    - fs: sampling frequency (default 1.0)

    Returns:
    - feature_df: DataFrame with frequency features for each rolling window
    """
    feature_list = []

    for end in range(window_size, len(df) + 1, step_size):
        window_df = df.iloc[max(0, end - window_size):end]
        feature_dict = {}

        for col in df.columns:
            if col not in ['time', 'seconds_elapsed', 'seconds_elapsed_gyro', 'seconds_elapsed_acc', 'seconds_elapsed_hr']:
                col_data = window_df[col].dropna()
                if len(col_data) > 0:
                    freqs, psd = welch(col_data, fs=fs, nperseg=min(window_size, len(col_data)))
                    dominant_freq = freqs[np.argmax(psd)]
                    spectral_entropy = -np.sum(psd * np.log2(psd + 1e-10)) / np.log2(len(psd))
                    feature_dict[col + '_dominant_freq'] = dominant_freq
                    feature_dict[col + '_spectral_entropy'] = spectral_entropy

        feature_list.append(feature_dict)

    feature_df = pd.DataFrame(feature_list)
    return feature_df

# Example usage
# Assuming df is your DataFrame and you have handled NaNs
window_size = 50  # Example window size
step_size = 100    # Example step size for rolling computation
data = data.dropna()  # Drop rows with NaN values

# Extract rolling frequency features
rolling_frequency_features_df = rolling_frequency_features(data, window_size, step_size)

# Interpolate features to match the length of the original DataFrame
interpolated_features_df = interpolate_features(rolling_frequency_features_df, len(data), window_size, step_size)

# Verify the resulting DataFrame
interpolated_features_df




Unnamed: 0,rotationRateX_dominant_freq_0_0,rotationRateX_dominant_freq_0_1,rotationRateX_dominant_freq_0_2,rotationRateX_dominant_freq_0_3,rotationRateX_dominant_freq_0_4,rotationRateX_dominant_freq_0_5,rotationRateX_dominant_freq_0_6,rotationRateX_dominant_freq_0_7,rotationRateX_dominant_freq_0_8,rotationRateX_dominant_freq_0_9,...,sport_spectral_entropy_94600_94640,sport_spectral_entropy_94600_94641,sport_spectral_entropy_94600_94642,sport_spectral_entropy_94600_94643,sport_spectral_entropy_94600_94644,sport_spectral_entropy_94600_94645,sport_spectral_entropy_94600_94646,sport_spectral_entropy_94600_94647,sport_spectral_entropy_94600_94648,sport_spectral_entropy_94600_94649
0,,,,,,,,,,,...,,,,,,,,,,
1,,,,,,,,,,,...,,,,,,,,,,
2,,,,,,,,,,,...,,,,,,,,,,
3,,,,,,,,,,,...,,,,,,,,,,
4,,,,,,,,,,,...,,,,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
942,,,,,,,,,,,...,,,,,,,,,,
943,,,,,,,,,,,...,,,,,,,,,,
944,,,,,,,,,,,...,,,,,,,,,,
945,,,,,,,,,,,...,,,,,,,,,,


In [ ]:
interpolated_features_df