In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import dill
import os

from sklearn.ensemble import RandomForestRegressor, RandomForestClassifier
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.metrics import mean_squared_error

### Load Data

In [2]:
if os.path.exists('final_project.db'):
    dill.load_session('final_project.db')

In [3]:
if not os.path.exists('final_project.db'):
    df_2019 = pd.read_csv('../run_ww_2019_w.csv')
    df_2020 = pd.read_csv('../run_ww_2020_w.csv')

In [4]:
# convert 2019 objects to correct types
df_2019['datetime'] = pd.to_datetime(df_2019['datetime'], format='%Y-%m-%d')
df_2019['gender'] = df_2019['gender'].astype('category')
df_2019['age_group'] = df_2019['age_group'].astype('category')
df_2019['country'] = df_2019['country'].astype('category')
df_2019['major'] = df_2019['major'].astype('category')
df_2019.drop(columns=['Unnamed: 0'], inplace=True)

# convert 2020 objects to correct types
df_2020['datetime'] = pd.to_datetime(df_2020['datetime'], format='%Y-%m-%d')
df_2020['gender'] = df_2020['gender'].astype('category')
df_2020['age_group'] = df_2020['age_group'].astype('category')
df_2020['country'] = df_2020['country'].astype('category')
df_2020['major'] = df_2020['major'].astype('category')
df_2020.drop(columns=['Unnamed: 0'], inplace=True)

display(df_2019.head())
display(df_2020.head())

Unnamed: 0,datetime,athlete,distance,duration,gender,age_group,country,major
0,2019-01-01,0,0.0,0.0,F,18 - 34,United States,CHICAGO 2019
1,2019-01-01,1,5.27,30.2,M,35 - 54,Germany,BERLIN 2016
2,2019-01-01,2,9.3,98.0,M,35 - 54,United Kingdom,"LONDON 2018,LONDON 2019"
3,2019-01-01,3,103.13,453.4,M,18 - 34,United Kingdom,LONDON 2017
4,2019-01-01,4,34.67,185.65,M,35 - 54,United States,BOSTON 2017


Unnamed: 0,datetime,athlete,distance,duration,gender,age_group,country,major
0,2020-01-01,0,0.0,0.0,F,18 - 34,United States,CHICAGO 2019
1,2020-01-01,1,70.33,394.2,M,35 - 54,Germany,BERLIN 2016
2,2020-01-01,2,14.65,79.066667,M,35 - 54,United Kingdom,"LONDON 2018,LONDON 2019"
3,2020-01-01,3,41.41,195.666667,M,18 - 34,United Kingdom,LONDON 2017
4,2020-01-01,4,41.34,209.1,M,35 - 54,United States,BOSTON 2017


### Add marathon features

In [5]:
marathon_map = {
    'CHICAGO': '10-12',
    'BERLIN': '09-21',
    'LONDON': '04-27',
    'BOSTON': '04-21',
    'NEW YORK': '11-02'
}
from datetime import timedelta
df_2019['datetime'] = pd.to_datetime(df_2019['datetime'])

df_expanded = df_2019.copy()
df_expanded['major_split'] = df_expanded['major'].str.split(',')
df_expanded = df_expanded.explode('major_split')

df_expanded[['event', 'year']] = df_expanded['major_split'].str.extract(r'(\D+)\s+(\d{4})')
df_expanded['event'] = df_expanded['event'].str.strip()
df_expanded['year'] = df_expanded['year'].astype(int)
df_expanded['major_date'] = pd.to_datetime(
    df_expanded['year'].astype(str) + '-' + df_expanded['event'].map(marathon_map),
    errors='coerce'
)

In [6]:
one_month = pd.Timedelta(days=30)

# Check conditions
df_expanded['within-month-before'] = (
    (df_expanded['datetime'] > df_expanded['major_date'] - one_month) &
    (df_expanded['datetime'] <= df_expanded['major_date'])
)

df_expanded['within-month-after'] = (
    (df_expanded['datetime'] > df_expanded['major_date']) &
    (df_expanded['datetime'] <= df_expanded['major_date'] + one_month)
)

# Group back to original rows and aggregate using any()
df_result = df_expanded.groupby(df_expanded.index)[['within-month-before', 'within-month-after']].any()
df_result

Unnamed: 0,within-month-before,within-month-after
0,False,False
1,False,False
2,False,False
3,False,False
4,False,False
...,...,...
1893419,False,False
1893420,False,False
1893421,False,False
1893422,False,False


In [7]:
df_2019 = df_2019.join(df_result)

In [11]:
df_2019_new = df_2019.pivot_table(
    index='athlete',
    columns='datetime',
    values=['distance', 'duration', 'within-month-before', 'within-month-after'],
    aggfunc='sum',
    fill_value=0
)
df_2019_new.columns = [
    f'{val}_week_{date.isocalendar()[1]}' for val, date in df_2019_new.columns
]


df_2019_new = df_2019_new.reset_index()
mask = ~df_2019['athlete'].duplicated()
df_2019_new['age_group'] = df_2019[mask]['age_group']
df_2019_new['country'] = df_2019[mask]['country']
df_2019_new['gender'] = df_2019[mask]['gender']
df_2019_new['major'] = df_2019[mask]['major']
age_map = {}
# compute mean age for each age group to convert to numeric
for age_group in df_2019_new['age_group'].unique():
    ages_split = age_group.split()
    mean_age = 0
    if ages_split[1] == '-':
        mean_age = (int(ages_split[0]) + int(ages_split[2])) / 2
    else:
        mean_age = (55 + 75) / 2
    age_map[age_group] = mean_age
df_2019_new['age_group'] = pd.Series(df_2019_new['age_group'].map(age_map), dtype=float)
df_2019_new = pd.get_dummies(df_2019_new, columns=['country'])
df_2019_new

Unnamed: 0,athlete,distance_week_1,distance_week_2,distance_week_3,distance_week_4,distance_week_5,distance_week_6,distance_week_7,distance_week_8,distance_week_9,...,country_Uganda,country_Ukraine,country_United Arab Emirates,country_United Kingdom,country_United States,country_Uruguay,country_Uzbekistan,country_Venezuela,country_Vietnam,country_Zimbabwe
0,0,0.00,0.000,0.00,0.000,0.000,0.00,0.000,0.00,0.000,...,False,False,False,False,True,False,False,False,False,False
1,1,5.27,59.860,55.99,58.500,58.180,51.59,63.710,62.04,52.480,...,False,False,False,False,False,False,False,False,False,False
2,2,9.30,30.820,10.01,54.340,37.099,58.28,61.690,61.16,71.319,...,False,False,False,True,False,False,False,False,False,False
3,3,103.13,93.100,87.40,97.840,54.870,9.76,87.260,4.88,41.060,...,False,False,False,True,False,False,False,False,False,False
4,4,34.67,0.000,30.51,38.680,0.000,38.30,0.000,8.66,10.160,...,False,False,False,False,True,False,False,False,False,False
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
36407,37594,168.05,113.140,163.52,161.509,163.320,123.18,66.189,88.89,149.859,...,False,False,False,True,False,False,False,False,False,False
36408,37595,79.81,114.879,113.51,91.680,128.270,136.32,121.530,127.39,134.540,...,False,False,False,False,True,False,False,False,False,False
36409,37596,118.89,111.070,117.22,136.400,134.308,136.25,118.340,90.93,92.400,...,False,False,False,False,True,False,False,False,False,False
36410,37597,28.67,54.410,49.88,41.220,48.930,50.09,75.060,23.43,72.260,...,False,False,False,False,True,False,False,False,False,False


In [None]:
country_columns = df_2019_new.columns[df_2019_new.columns.str.startswith('country_')]

X_2019 = df_2019_new.drop(columns=['gender','major','athlete'] + list(country_columns))
y_2019 = df_2019_new['gender']
X_2019_train, X_2019_test, y_2019_train, y_2019_test = train_test_split(X_2019, y_2019, test_size=0.3)

In [None]:
X_2019

In [None]:
rf = RandomForestClassifier(max_depth=5)
rf_cv_score = cross_val_score(rf, X_2019_train, y_2019_train, cv=5)
print(f'Random Forest Classifier CV Score: {rf_cv_score.mean()}')
rf.fit(X_2019_train, y_2019_train)
print(f'Random Forest Classifier Train Score: {rf.score(X_2019_train, y_2019_train)}')
print(f'Random Forest Classifier Test Score: {rf.score(X_2019_test, y_2019_test)}')

In [None]:
df_2019_new

### Predicting Next Week's Distance based on previous X weeks

In [None]:
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import KFold
from sklearn.metrics import mean_squared_error, r2_score

from xgboost import XGBRegressor
import numpy as np

def get_features_and_target(week_x, df=df_2019_new, x=3, use_season=True):
    target = f'distance_week_{week_x}'
    
    # Select time-based features
    features = [
        col for col in df.columns
        if 'week' in col
        # and 'within' not in col
        # and 'duration' not in col
        and (week_x - int(col.split('_')[2])) <= x
        and (week_x - int(col.split('_')[2])) > 0
    ]
    if use_season:
        # Base feature set
        X = df[features + ['gender', 'age_group']].copy()

        X['gender'] = X['gender'].eq('M')

        # One-hot encode age_group
        X = pd.get_dummies(X, columns=['gender'], drop_first=False)
    else:
        X = df[features]

    y = df[target]
    return X, y


def train_model_with_kfold(X, y, model_type='rf', k=5):
    kf = KFold(n_splits=k, shuffle=True, random_state=42)
    models = []
    rmses = []
    r2_scores = []

    for train_idx, val_idx in kf.split(X):
        X_train, X_val = X.iloc[train_idx], X.iloc[val_idx]
        y_train, y_val = y.iloc[train_idx], y.iloc[val_idx]

        if model_type == 'rf':
            model = RandomForestRegressor(
                n_estimators=100,
                max_depth=5,
                min_samples_leaf=5,
                random_state=42
            )
        elif model_type == 'xgboost':
            model = XGBRegressor(
                n_estimators=100,
                max_depth=5,
                learning_rate=0.1,
                objective='reg:squarederror',
                random_state=42
            )
        else:
            raise ValueError("model_type must be 'rf' or 'xgboost'")

        model.fit(X_train, y_train)
        y_pred = model.predict(X_val)

        rmse = mean_squared_error(y_val, y_pred) ** 0.5
        r2 = r2_score(y_val, y_pred)

        rmses.append(rmse)
        r2_scores.append(r2)
        models.append(model)

    avg_rmse = np.mean(rmses)
    avg_r2 = np.mean(r2_scores)

    print(f'Average RMSE across {k} folds ({model_type}): {avg_rmse:.4f}')
    print(f'Average R² Score across {k} folds ({model_type}): {avg_r2:.4f}')

    return models[0]  # return the first trained model

def predict_distance(week_x, model, X, y, use_average=False):
    if use_average:
        # Predict using the average of each row's 'distance' columns
        distance_cols = [col for col in X.columns if 'distance' in col]
        y_pred = X[distance_cols].mean(axis=1)
        
        # Compute RMSE and R² using per-row averages
        rmse = mean_squared_error(y, y_pred) ** 0.5
        score = r2_score(y, y_pred)
        
        print(f'Per-athlete (row-wise average) RMSE for week {week_x}: {rmse}')
        print(f'Per-athlete (row-wise average) R² score for week {week_x}: {score}')
    else:
        y_pred = model.predict(X)
        rmse = mean_squared_error(y, y_pred) ** 0.5
        score = model.score(X, y)
        
        print(f'Model RMSE for week {week_x}: {rmse}')
        print(f'Model R² score for week {week_x}: {score}')




In [None]:
X, y = get_features_and_target(35, x=5)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3)

In [None]:
model_rf = train_model_with_kfold(X_train, y_train, model_type='rf')

In [None]:
model_xg = train_model_with_kfold(X_train, y_train, model_type='xgboost')

In [None]:
predict_distance(35, model_rf, X=X_test, y=y_test)

In [None]:
predict_distance(35, model_xg, X=X_test, y=y_test)

In [None]:
predict_distance(35, None, X=X_test, y=y_test, use_average=True)

### LSTM model

In [8]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import MinMaxScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import LSTM, Dense
from tqdm import tqdm

In [9]:
df = df_2019
df['week'] = df['datetime'].apply(lambda date: date.isocalendar()[1])
df['gender'] = df['gender'].eq('M')

In [12]:
df['datetime'] = pd.to_datetime(df['datetime'])
df = df.sort_values(['athlete', 'datetime'])

# Drop any row with missing required values (or handle them differently)
df = df.dropna(subset=['distance', 'duration', 'gender', 'age_group', 'country'])


target_col = 'distance'
df['age_group'] = df['age_group'].apply(lambda x: age_map[x])

In [13]:
df = df.drop(columns=['country', 'major'])

In [18]:
lookback = 5
week = 35

In [19]:
# Filter your DataFrame
X = df[(df['week'] <= week) & (df['week'] >= week - lookback)]

# Get unique athletes
unique_athletes = X['athlete'].unique()
np.random.seed(42)
np.random.shuffle(unique_athletes)  # Shuffle to randomize the split

# Split indices
split_idx = int(len(unique_athletes) * 0.7)
train_athletes = unique_athletes[:split_idx]
test_athletes = unique_athletes[split_idx:]

# Create train and test sets based on athlete inclusion
X_train = X[X['athlete'].isin(train_athletes)]
X_test = X[X['athlete'].isin(test_athletes)]

In [20]:

feature_cols = [col for col in X_train.columns if col not in ['athlete', 'datetime', 'within-month-before', 'within-month-after', 'week', 'gender', 'duration']]

In [21]:
X_seqs, y_targets = [], []
scalers = {}

for athlete_id, group in tqdm(X.groupby('athlete')):
    group = group.sort_values('datetime')
    if len(group) < lookback+1:
        print("BAD")
        continue
    
    
    scaler = MinMaxScaler()
    scaled = scaler.fit_transform(group[feature_cols])
    scalers[athlete_id] = scaler

    target_vals = group[target_col].values

    # Create non-overlapping sequences
    for i in range(0, len(group) - lookback + 2, lookback + 1):
        X_seqs.append(scaled[i:i+lookback])              # first 5
        y_targets.append(target_vals[i+lookback])        # 6th

100%|██████████| 36083/36083 [01:30<00:00, 397.37it/s]


In [22]:
# Convert to numpy arrays
X = np.array(X_seqs)  # shape: (samples, time_steps, features)
y = np.array(y_targets).reshape(-1, 1)

print(f"LSTM Input Shape: {X.shape}")  # (samples, time_steps, features)

# --- LSTM Model ---
model = Sequential()

model.add(LSTM(256, input_shape=(X.shape[1], X.shape[2]), activation='sigmoid'))
model.add(Dense(1))  # Output: predict next duration
model.compile(loss='mse', optimizer='adam')
model.summary()

# Train the model
model.fit(X, y, epochs=5, batch_size=64, verbose=1)

LSTM Input Shape: (36083, 5, 2)


  super().__init__(**kwargs)


Epoch 1/5
[1m564/564[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m10s[0m 14ms/step - loss: 1283.1725
Epoch 2/5
[1m564/564[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m8s[0m 15ms/step - loss: 1031.0537
Epoch 3/5
[1m564/564[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m6s[0m 11ms/step - loss: 1046.1333
Epoch 4/5
[1m564/564[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m7s[0m 12ms/step - loss: 972.0887
Epoch 5/5
[1m564/564[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m8s[0m 14ms/step - loss: 881.0052


<keras.src.callbacks.history.History at 0x2558fc42060>

In [23]:
X_test_seqs = []
y_test_targets = []

for athlete_id, group in X_test.groupby('athlete'):
    group = group.sort_values('datetime')
    if len(group) < 5:  # Ensure each athlete has enough data
        continue
    
    # Scale using the scaler trained on the training data
    scaled = scalers[athlete_id].transform(group[feature_cols])
    
    # Create sequences of 5 time steps
    for i in range(len(group) - lookback):
        X_test_seqs.append(scaled[i:i+lookback])  # 5 time steps for each sample
        y_test_targets.append(group[target_col].values[i+lookback])  # Actual value to predict (next value)

# Convert to numpy arrays
X_test_input = np.array(X_test_seqs)
y_test_input = np.array(y_test_targets).reshape(-1, 1)

print(f"X_test Shape: {X_test_input.shape}")


X_test Shape: (10825, 5, 2)


In [24]:
# Predict on the test set
y_pred = model.predict(X_test_input)


[1m339/339[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 5ms/step


In [25]:
from sklearn.metrics import mean_squared_error
import numpy as np

# Calculate RMSE
rmse = np.sqrt(mean_squared_error(y_test_input, y_pred))
print(f"RMSE: {rmse}")


RMSE: 29.61234964064593


In [27]:
# Accuracy within multiple tolerance levels
tolerances = [round(t, 1) for t in np.arange(0.1, 1.1, 0.1)]

for tolerance in tolerances:
    within_tolerance = np.abs(y_test_input - y_pred) <= tolerance
    accuracy = np.mean(within_tolerance)
    print(f"Accuracy within ±{tolerance}: {accuracy * 100:.2f}%")


Accuracy within ±0.1: 0.21%
Accuracy within ±0.2: 0.42%
Accuracy within ±0.3: 0.70%
Accuracy within ±0.4: 0.98%
Accuracy within ±0.5: 1.24%
Accuracy within ±0.6: 1.48%
Accuracy within ±0.7: 1.73%
Accuracy within ±0.8: 1.93%
Accuracy within ±0.9: 2.11%
Accuracy within ±1.0: 2.24%


In [None]:
X_test[X_test['athlete'] == 1]

In [None]:
X_input.shape

In [None]:
X_train.shape