# Data Preprocessing for Runner Injury Prediction

This notebook preprocesses both daily and weekly time-series data for training GRU/LSTM models to predict runner injuries. The preprocessing steps include:

1. Data loading and initial exploration
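2. Data cleaning and handling missing values (the `-0.01` placeholder is treated as missing, forward-filled per athlete, and any remaining gaps are set to 0)
3. Feature standardization (a `StandardScaler` fit on the training split and reused for the validation split)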
4. Sequence creation for time-series modeling (athlete-aware)
5. Temporal train/validation split
6. Saving preprocessed datasets

In [1]:
# Import required libraries
import pandas as pd
import numpy as np
from sklearn.preprocessing import StandardScaler
import os
from datetime import datetime

# Seed NumPy's global (legacy) random number generator so that any later
# np.random-based operation run in this kernel is repeatable across runs.
np.random.seed(42)


In [2]:
# Read both CSV exports, converting the 'Date' column to datetime as part of
# the same expression so each frame is bound to its name exactly once.
day_df = pd.read_csv('data/day_approach_maskedID_timeseries.csv').assign(
    Date=lambda frame: pd.to_datetime(frame['Date'])
)
week_df = pd.read_csv('data/week_approach_maskedID_timeseries.csv').assign(
    Date=lambda frame: pd.to_datetime(frame['Date'])
)

# Dimensions of each dataset as (rows, columns)
print("Daily approach dataset shape:", day_df.shape)
print("\nWeekly approach dataset shape:", week_df.shape)

# Column listings: heading on one line, the list of names on the next
print("\nDaily approach columns:", day_df.columns.tolist(), sep="\n")
print("\nWeekly approach columns:", week_df.columns.tolist(), sep="\n")

# Earliest and latest date present in the daily dataset
print(
    "\nDaily approach date range:",
    f"Start: {day_df['Date'].min()}",
    f"End: {day_df['Date'].max()}",
    sep="\n",
)


Daily approach dataset shape: (42766, 73)

Weekly approach dataset shape: (42798, 72)

Daily approach columns:
['nr. sessions', 'total km', 'km Z3-4', 'km Z5-T1-T2', 'km sprinting', 'strength training', 'hours alternative', 'perceived exertion', 'perceived trainingSuccess', 'perceived recovery', 'nr. sessions.1', 'total km.1', 'km Z3-4.1', 'km Z5-T1-T2.1', 'km sprinting.1', 'strength training.1', 'hours alternative.1', 'perceived exertion.1', 'perceived trainingSuccess.1', 'perceived recovery.1', 'nr. sessions.2', 'total km.2', 'km Z3-4.2', 'km Z5-T1-T2.2', 'km sprinting.2', 'strength training.2', 'hours alternative.2', 'perceived exertion.2', 'perceived trainingSuccess.2', 'perceived recovery.2', 'nr. sessions.3', 'total km.3', 'km Z3-4.3', 'km Z5-T1-T2.3', 'km sprinting.3', 'strength training.3', 'hours alternative.3', 'perceived exertion.3', 'perceived trainingSuccess.3', 'perceived recovery.3', 'nr. sessions.4', 'total km.4', 'km Z3-4.4', 'km Z5-T1-T2.4', 'km sprinting.4', 'strengt

In [3]:
def create_sequences_by_athlete(df, sequence_length, features, scaler=None):
    """
    Build fixed-length sliding windows per athlete so no window spans two athletes.

    Each athlete's rows are sorted by 'Date', the feature columns are scaled,
    and every run of `sequence_length` consecutive rows becomes one sample.
    The label of a sample is the 'injury' value of the row immediately
    following its window. Athletes with `sequence_length` rows or fewer
    contribute no samples.

    Args:
        df: pandas DataFrame with 'Athlete ID', 'Date', 'injury' and the
            columns listed in `features`
        sequence_length: number of time steps in each window (must be >= 1)
        features: list of feature column names
        scaler: object exposing `transform` (e.g. a fitted StandardScaler).
            When None, a new StandardScaler is fit on all rows of
            `df[features]` and used instead.

    Returns:
        X: array of shape (n_samples, sequence_length, n_features); the shape
           is (0, sequence_length, n_features) when no window can be built
        y: labels array of shape (n_samples,)
        scaler: the scaler that was used (the one passed in, or the newly
                fitted one)

    Raises:
        ValueError: if `sequence_length` is smaller than 1
    """
    if sequence_length < 1:
        raise ValueError("sequence_length must be a positive integer")

    features = list(features)

    # If scaler not provided, fit on this data
    if scaler is None:
        scaler = StandardScaler()
        scaler.fit(df[features])

    window_blocks = []
    label_blocks = []

    # sort=False keeps athletes in order of first appearance, which matches
    # iterating over df['Athlete ID'].unique() but avoids rescanning the whole
    # frame with a boolean mask once per athlete.
    for _, athlete_rows in df.groupby('Athlete ID', sort=False):
        athlete_rows = athlete_rows.sort_values('Date')

        n_windows = len(athlete_rows) - sequence_length
        if n_windows <= 0:
            # Not enough history to form a window plus the row that labels it
            continue

        scaled = np.asarray(scaler.transform(athlete_rows[features]))

        # Window k covers rows [k, k + sequence_length)
        window_blocks.append(
            np.stack([scaled[start:start + sequence_length] for start in range(n_windows)])
        )
        # ... and is labelled by row k + sequence_length
        label_blocks.append(athlete_rows['injury'].to_numpy()[sequence_length:])

    if not window_blocks:
        # Keep the 3-D layout even when empty so downstream code can rely on
        # X.shape[1:] regardless of how many samples were produced.
        empty_X = np.empty((0, sequence_length, len(features)))
        empty_y = np.empty((0,), dtype=df['injury'].dtype)
        return empty_X, empty_y, scaler

    return np.concatenate(window_blocks), np.concatenate(label_blocks), scaler

def preprocess_timeseries(df, sequence_length=7, is_daily=True, train_end_date=None):
    """
    Clean a timeseries frame, split it by date, and build train/validation sequences.

    Steps, in order:
      1. Every column except 'injury', 'Athlete ID' and 'Date' is treated as a
         feature.
      2. The value -0.01 in feature columns is treated as missing.
      3. Missing values are forward-filled within each athlete in
         chronological ('Date') order; anything still missing is set to 0.
      4. Rows with Date <= train_end_date form the training split, rows with
         a later Date form the validation split.
      5. Sequences are built per athlete for each split; the scaler is fit on
         the training split and reused for the validation split.

    The input frame is not modified; all cleaning is done on a copy.

    Args:
        df: pandas DataFrame containing the timeseries data
        sequence_length: number of time steps to use for each sequence
        is_daily: boolean indicating if this is daily data (True) or weekly
            data (False). Accepted for API compatibility; this function does
            not read it.
        train_end_date: last date (inclusive) of the training split. If None,
            the cut-off is placed 80% of the way through the date range.

    Returns:
        X_train: training features array
        X_val: validation features array
        y_train: training labels array
        y_val: validation labels array
        scaler: scaler fitted on the training split
        feature_cols: list of feature column names, in column order
    """
    feature_cols = df.columns.drop(['injury', 'Athlete ID', 'Date']).tolist()

    # reset_index returns a new frame, so the caller's data is never touched;
    # the fresh unique index also lets the fill result be aligned back below.
    data = df.reset_index(drop=True)

    # Replace -0.01 values (missing data indicators) with NaN
    data[feature_cols] = data[feature_cols].replace(-0.01, np.nan)

    # Forward fill within each athlete in date order, so a gap is only ever
    # filled from an earlier observation even if the rows arrive unsorted.
    # A stable sort keeps the original relative order of same-date rows.
    chronological = data.sort_values('Date', kind='stable')
    filled = chronological.groupby('Athlete ID')[feature_cols].ffill()

    # Align back to the original row order; any remaining NaN (no earlier
    # value available for that athlete) is filled with 0.
    data[feature_cols] = filled.reindex(data.index).fillna(0)

    # If train_end_date not provided, use 80% of the date range
    if train_end_date is None:
        first_date = data['Date'].min()
        last_date = data['Date'].max()
        train_end_date = first_date + (last_date - first_date) * 0.8

    # Split data temporally
    train_df = data[data['Date'] <= train_end_date].copy()
    val_df = data[data['Date'] > train_end_date].copy()

    # Create sequences for training data (this also fits the scaler)
    X_train, y_train, scaler = create_sequences_by_athlete(
        train_df, sequence_length, feature_cols
    )

    # Create sequences for validation data using the same scaler
    X_val, y_val, _ = create_sequences_by_athlete(
        val_df, sequence_length, feature_cols, scaler
    )

    return X_train, X_val, y_train, y_val, scaler, feature_cols


In [4]:
def _print_shapes(heading, labelled_arrays):
    """Print `heading`, then one '<label> <shape>' line per (label, array) pair."""
    print(heading)
    for label, array in labelled_arrays:
        print(label, array.shape)


def _print_injury_rates(heading, y_train, y_val):
    """Print `heading`, then the mean of each label array as a percentage."""
    print(heading)
    print("Train - Injury rate: {:.2f}%".format(y_train.mean() * 100))
    print("Val - Injury rate: {:.2f}%".format(y_val.mean() * 100))


# Daily approach: windows of 7 days are used to predict the following day
(X_daily_train, X_daily_val,
 y_daily_train, y_daily_val,
 scaler_daily, daily_features) = preprocess_timeseries(
    day_df, sequence_length=7, is_daily=True
)

# Weekly approach: windows of 4 weeks are used to predict the following week
(X_weekly_train, X_weekly_val,
 y_weekly_train, y_weekly_val,
 scaler_weekly, weekly_features) = preprocess_timeseries(
    week_df, sequence_length=4, is_daily=False
)

# Array shapes for both approaches
_print_shapes("Daily approach preprocessed data shapes:", [
    ("X_daily_train shape:", X_daily_train),
    ("X_daily_val shape:", X_daily_val),
    ("y_daily_train shape:", y_daily_train),
    ("y_daily_val shape:", y_daily_val),
])
_print_shapes("\nWeekly approach preprocessed data shapes:", [
    ("X_weekly_train shape:", X_weekly_train),
    ("X_weekly_val shape:", X_weekly_val),
    ("y_weekly_train shape:", y_weekly_train),
    ("y_weekly_val shape:", y_weekly_val),
])

# Share of positive (injury) labels in each split
_print_injury_rates("\nDaily approach class distribution:", y_daily_train, y_daily_val)
_print_injury_rates("\nWeekly approach class distribution:", y_weekly_train, y_weekly_val)


Daily approach preprocessed data shapes:
X_daily shape: (42759, 7, 70)
y_daily shape: (42759,)

Weekly approach preprocessed data shapes:
X_weekly shape: (42794, 4, 69)
y_weekly shape: (42794,)


In [5]:
# Write every preprocessed artifact to the 'preprocessed_data' directory.
import pickle

# exist_ok=True makes this a no-op when the directory is already there
os.makedirs('preprocessed_data', exist_ok=True)

# Feature/label arrays for both approaches, saved in NumPy's .npy format
for npy_path, array in [
    ('preprocessed_data/X_daily_train.npy', X_daily_train),
    ('preprocessed_data/X_daily_val.npy', X_daily_val),
    ('preprocessed_data/y_daily_train.npy', y_daily_train),
    ('preprocessed_data/y_daily_val.npy', y_daily_val),
    ('preprocessed_data/X_weekly_train.npy', X_weekly_train),
    ('preprocessed_data/X_weekly_val.npy', X_weekly_val),
    ('preprocessed_data/y_weekly_train.npy', y_weekly_train),
    ('preprocessed_data/y_weekly_val.npy', y_weekly_val),
]:
    np.save(npy_path, array)

# Feature-name lists and fitted scalers, pickled one object per file
for pkl_path, payload in [
    ('preprocessed_data/daily_features.pkl', daily_features),
    ('preprocessed_data/weekly_features.pkl', weekly_features),
    ('preprocessed_data/scaler_daily.pkl', scaler_daily),
    ('preprocessed_data/scaler_weekly.pkl', scaler_weekly),
]:
    with open(pkl_path, 'wb') as f:
        pickle.dump(payload, f)

# Confirm what ended up on disk
print("Preprocessed data saved to 'preprocessed_data' directory")
print("\nFiles saved:", os.listdir('preprocessed_data'), sep="\n")


Daily approach train/val shapes:
X_daily_train: (34207, 7, 70)
X_daily_val: (8552, 7, 70)
y_daily_train: (34207,)
y_daily_val: (8552,)

Weekly approach train/val shapes:
X_weekly_train: (34235, 4, 69)
X_weekly_val: (8559, 4, 69)
y_weekly_train: (34235,)
y_weekly_val: (8559,)

Daily approach class distribution:
Train - Injury rate: 1.36%
Val - Injury rate: 1.37%

Weekly approach class distribution:
Train - Injury rate: 1.34%
Val - Injury rate: 1.34%


In [6]:
# Write every preprocessed artifact to the 'preprocessed_data' directory.
# NOTE(review): this cell writes the same files, from the same variables, as
# the save cell directly above it; running both simply overwrites the outputs.
import pickle

# exist_ok=True makes this a no-op when the directory is already there
os.makedirs('preprocessed_data', exist_ok=True)

# Feature/label arrays for both approaches, saved in NumPy's .npy format
for npy_path, array in [
    ('preprocessed_data/X_daily_train.npy', X_daily_train),
    ('preprocessed_data/X_daily_val.npy', X_daily_val),
    ('preprocessed_data/y_daily_train.npy', y_daily_train),
    ('preprocessed_data/y_daily_val.npy', y_daily_val),
    ('preprocessed_data/X_weekly_train.npy', X_weekly_train),
    ('preprocessed_data/X_weekly_val.npy', X_weekly_val),
    ('preprocessed_data/y_weekly_train.npy', y_weekly_train),
    ('preprocessed_data/y_weekly_val.npy', y_weekly_val),
]:
    np.save(npy_path, array)

# Feature-name lists and fitted scalers, pickled one object per file
for pkl_path, payload in [
    ('preprocessed_data/daily_features.pkl', daily_features),
    ('preprocessed_data/weekly_features.pkl', weekly_features),
    ('preprocessed_data/scaler_daily.pkl', scaler_daily),
    ('preprocessed_data/scaler_weekly.pkl', scaler_weekly),
]:
    with open(pkl_path, 'wb') as f:
        pickle.dump(payload, f)

# Confirm what ended up on disk
print("Preprocessed data saved to 'preprocessed_data' directory")
print("\nFiles saved:", os.listdir('preprocessed_data'), sep="\n")


Preprocessed data saved to 'preprocessed_data' directory

Files saved:
['weekly_features.pkl', 'X_daily_val.npy', 'y_weekly_val.npy', 'daily_features.pkl', 'y_daily_train.npy', 'y_weekly_train.npy', 'X_daily_train.npy', 'scaler_weekly.pkl', 'X_weekly_train.npy', 'scaler_daily.pkl', 'y_daily_val.npy', 'X_weekly_val.npy']
