# CSC 2621 Final Project: Running (Away)
### Members: Alex Ewart, Mikhail Filippov, Benjamin Liebl

In this Final Project, we will perform statistical analyses and use different models for a [Running](https://www.kaggle.com/datasets/mexwell/long-distance-running-dataset?resource=download&select=run_ww_2019_w.csv) dataset. Our hypotheses are:
1. During the year 2020, athletes ran **less** distance overall than the same athletes in the year 2019.
2. We can predict a runner's mileage in a given week by featurizing time-series data from previous weeks/years.

In [59]:
#imports
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import dill
import os

# Restore a previously saved kernel session (the file written by
# dill.dump_session in the last cell) instead of rebuilding everything.
# NOTE(review): dill sessions are pickle-based, so loading one can execute
# arbitrary code -- only load a file you created yourself.
if os.path.exists('final_project.db'):
    dill.load_session('final_project.db')

In [60]:
# Read the raw CSVs only when there is no saved session on disk; otherwise
# `df` is expected to come from the session restored in the previous cell.
if not os.path.exists('final_project.db'):
    df_2019, df_2020 = (
        pd.read_csv(csv_path)
        for csv_path in ('../run_ww_2019_w.csv', '../run_ww_2020_w.csv')
    )

    # Stack both years into a single frame with a fresh 0..n-1 index
    df = pd.concat([df_2019, df_2020], ignore_index=True)

df

Unnamed: 0.1,Unnamed: 0,datetime,athlete,distance,duration,gender,age_group,country,major
0,0,2019-01-01,0,0.000000,0.000000,F,18 - 34,United States,CHICAGO 2019
1,1,2019-01-01,1,5.270000,30.200000,M,35 - 54,Germany,BERLIN 2016
2,2,2019-01-01,2,9.300000,98.000000,M,35 - 54,United Kingdom,"LONDON 2018,LONDON 2019"
3,3,2019-01-01,3,103.130000,453.400000,M,18 - 34,United Kingdom,LONDON 2017
4,4,2019-01-01,4,34.670000,185.650000,M,35 - 54,United States,BOSTON 2017
...,...,...,...,...,...,...,...,...,...
3786843,1893419,2020-12-23,37594,128.154444,572.587037,M,18 - 34,United Kingdom,BERLIN 2017
3786844,1893420,2020-12-23,37595,20.051111,87.461111,M,18 - 34,United States,"BERLIN 2019,NEW YORK 2015"
3786845,1893421,2020-12-23,37596,144.635556,625.774074,M,18 - 34,United States,BOSTON 2017
3786846,1893422,2020-12-23,37597,0.000000,0.000000,F,18 - 34,United States,BOSTON 2015


In [61]:
# Show the column names of the combined 2019 + 2020 frame
df.columns

Index(['Unnamed: 0', 'datetime', 'athlete', 'distance', 'duration', 'gender',
       'age_group', 'country', 'major'],
      dtype='object')

### Data Modeling and Prediction

### XGBoost and Random Forest
TODO

### ARIMA, SARIMA, and LSTM

In [62]:
# Event name -> race day as 'MM-DD'. The year parsed from the `major` column
# is prefixed to this string later to build a full date.
# NOTE(review): a single month-day is reused for every year of an event;
# real race dates move from year to year -- confirm this approximation is OK.
marathon_map = dict([
    ('CHICAGO', '10-12'),
    ('BERLIN', '09-21'),
    ('LONDON', '04-27'),
    ('BOSTON', '04-21'),
    ('NEW YORK', '11-02'),
])

In [63]:
from datetime import timedelta
df['datetime'] = pd.to_datetime(df['datetime'])

# One row per (training row, marathon entry): split the comma-separated
# `major` string into a list and explode it. Exploded rows keep the index
# label of the row they came from.
df_expanded = (
    df.assign(major_split=df['major'].str.split(','))
      .explode('major_split')
)

In [64]:
# Split each entry such as "LONDON 2018" into an event name and a 4-digit year.
# Entries that do not match the pattern yield NaN in both columns.
df_expanded[['event', 'year']] = df_expanded['major_split'].str.extract(r'(\D+)\s+(\d{4})')
df_expanded['event'] = df_expanded['event'].str.strip()

# Use the nullable Int64 dtype: a plain astype(int) raises as soon as one row
# has a missing or unparseable `major`, which would make the errors='coerce'
# below unreachable for exactly the rows it is meant to handle.
df_expanded['year'] = pd.to_numeric(df_expanded['year'], errors='coerce').astype('Int64')

# Year + '-' + 'MM-DD' from marathon_map -> full race date. Unknown events or
# missing years do not form a valid date string and become NaT.
# The explicit format avoids per-element format guessing on a very long column.
df_expanded['major_date'] = pd.to_datetime(
    df_expanded['year'].astype(str) + '-' + df_expanded['event'].map(marathon_map),
    format='%Y-%m-%d',
    errors='coerce'
)

In [65]:
one_month = pd.Timedelta(days=30)

# Signed offset of each training row from the race date (negative = before the
# race). Rows without a race date give NaT, and every comparison on NaT is False.
time_since_race = df_expanded['datetime'] - df_expanded['major_date']

# Up to 30 days before the race, race day included
df_expanded['within_month_before'] = (
    (time_since_race > -one_month) & (time_since_race <= pd.Timedelta(0))
)

# Strictly after the race, up to and including 30 days later
df_expanded['within_month_after'] = (
    (time_since_race > pd.Timedelta(0)) & (time_since_race <= one_month)
)

# Collapse the exploded rows back to one row per original index label: a flag
# is True if it holds for any of that row's marathon entries.
df_result = (
    df_expanded
    .groupby(level=0)[['within_month_before', 'within_month_after']]
    .any()
)
df_result

Unnamed: 0,within_month_before,within_month_after
0,False,False
1,False,False
2,False,False
3,False,False
4,False,False
...,...,...
3786843,False,False
3786844,False,False
3786845,False,False
3786846,False,False


In [66]:
# Merge back into original df (aligned on the index).
# Any flag columns left over from an earlier run are dropped first so the cell
# can be re-executed: DataFrame.join raises ValueError when column names overlap.
df = df.drop(columns=df_result.columns, errors='ignore').join(df_result)

In [67]:
# The raw `major` string is no longer needed once the proximity flags exist.
# errors='ignore' keeps the cell re-runnable (a second run would otherwise
# raise KeyError because the column is already gone).
df = df.drop(columns='major', errors='ignore')

In [69]:
# Check the result: `major` removed, the two within_month_* flags added
df.columns

Index(['Unnamed: 0', 'datetime', 'athlete', 'distance', 'duration', 'gender',
       'age_group', 'country', 'within_month_before', 'within_month_after'],
      dtype='object')

In [70]:
!pip install keras
!pip install tensorflow
!pip install 




[notice] A new release of pip is available: 24.2 -> 25.1
[notice] To update, run: python.exe -m pip install --upgrade pip





[notice] A new release of pip is available: 24.2 -> 25.1
[notice] To update, run: python.exe -m pip install --upgrade pip





In [72]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import MinMaxScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import LSTM, Dense
from tqdm import tqdm

# --- Load & prepare ---
df['datetime'] = pd.to_datetime(df['datetime'])

# Order rows chronologically within each athlete, then discard every row that
# lacks one of the values the model needs.
df = (
    df.sort_values(['athlete', 'datetime'])
      .dropna(subset=['distance', 'duration', 'gender', 'age_group', 'country'])
)

# --- Feature columns setup ---
# time_steps: number of consecutive rows in one input window
# target_col: column whose next value is predicted
time_steps, target_col = 10, 'distance'


In [80]:
# NOTE(review): `encoded_static` is only assigned further down (in the cell that
# fits the ColumnTransformer). On a fresh top-to-bottom run this cell raises
# NameError unless the name was restored from the saved session.
encoded_static

<Compressed Sparse Row sparse matrix of dtype 'float64'
	with 16861111 stored elements and shape (3752632, 138)>

In [77]:
# Number of columns produced by the encoder.
# NOTE(review): `encoded_feature_names` is only assigned further down (in the
# cell that fits the ColumnTransformer); on a fresh top-to-bottom run this cell
# raises NameError unless the name was restored from the saved session.
len(encoded_feature_names)

138

In [83]:

# Categorical columns that get one-hot encoded, and the per-row columns that
# are passed through the transformer unchanged.
static_features = ['gender', 'age_group', 'country']
dynamic_features = ['distance', 'duration', 'within_month_before', 'within_month_after']

# --- Categorical encoding ---
encoder = ColumnTransformer(
    [('onehot', OneHotEncoder(), static_features)],
    remainder='passthrough',
)

# Fit on every row, then read back the names of the generated output columns
encoded_static = encoder.fit_transform(df[[*static_features, *dynamic_features]])
encoded_feature_names = encoder.get_feature_names_out()



In [84]:
# The encoded matrix and every per-row input column must have the same length
assert len({
    encoded_static.shape[0],
    df['athlete'].shape[0],
    df['datetime'].shape[0],
    df[target_col].shape[0],
}) == 1


In [88]:
# Replace features with encoded versions
# NOTE(review): the resulting frame contains only the encoder's output columns
# (so no 'athlete' or 'datetime' column) and gets a fresh default index.
# .toarray() turns the sparse matrix into a dense one; at the size reported
# for encoded_static (3752632 x 138, float64) that is roughly 4 GB of memory.
processed_df = pd.DataFrame(encoded_static.toarray(), columns=encoded_feature_names)


In [None]:
# --- Build per-athlete sliding windows ---
# processed_df is built from the encoder output only, so the grouping key, the
# time order and the target are read from df instead. processed_df was created
# row-for-row from df, so row *positions* identify the same observation in both.
assert len(processed_df) == len(df), "processed_df and df must be row-aligned"

row_keys = df[['athlete', 'datetime']].reset_index(drop=True)  # index == row position
target_values = df[target_col].to_numpy()

# Model inputs: every encoded column. Key/target names are excluded in case
# processed_df carries them as plain columns. Computed once, not per athlete.
feature_cols = [col for col in processed_df.columns if col not in ['athlete', 'datetime', target_col]]

# Columns produced by the 'onehot' transformer carry the 'onehot__' prefix.
# They are expected to be constant for a given athlete, and per-athlete min-max
# scaling maps a constant column to all zeros -- which would erase gender /
# age group / country from the input. They are therefore passed through
# unscaled; only the remaining (dynamic) columns are scaled per athlete.
static_cols = [col for col in feature_cols if col.startswith('onehot__')]
scaled_cols = [col for col in feature_cols if not col.startswith('onehot__')]

X_seqs, y_targets = [], []
scalers = {}  # athlete id -> MinMaxScaler fitted on that athlete's scaled_cols

for athlete_id, athlete_rows in tqdm(row_keys.groupby('athlete')):
    # Need at least time_steps inputs plus one following row as the target
    if len(athlete_rows) <= time_steps:
        continue

    # Row positions of this athlete, oldest first
    row_pos = athlete_rows.sort_values('datetime').index.to_numpy()
    group = processed_df.iloc[row_pos]
    group_targets = target_values[row_pos]

    scaler = MinMaxScaler()
    scaled_part = scaler.fit_transform(group[scaled_cols])
    scalers[athlete_id] = scaler

    # Feature order per time step: static_cols followed by scaled_cols.
    # float32 halves the memory of the stacked windows.
    window_source = np.hstack([group[static_cols].to_numpy(), scaled_part]).astype('float32')

    for start in range(len(row_pos) - time_steps):
        X_seqs.append(window_source[start:start + time_steps])
        # Target: the unscaled target_col value of the row right after the window
        y_targets.append(group_targets[start + time_steps])

# Final arrays
# NOTE(review): X holds time_steps copies of (almost) every feature row, so it is
# about time_steps times larger than the dense feature table -- make sure it
# fits in memory, or train on a subset of athletes.
X = np.array(X_seqs, dtype='float32')
y = np.array(y_targets).reshape(-1, 1)

print(f"LSTM Input Shape: {X.shape}")  # (samples, time_steps, features)

# --- LSTM Model ---
model = Sequential()
model.add(LSTM(64, input_shape=(X.shape[1], X.shape[2]), activation='relu'))
model.add(Dense(1))  # Predict the next value of target_col
model.compile(loss='mse', optimizer='adam')
model.summary()

model.fit(X, y, epochs=10, batch_size=64, verbose=1)

In [None]:
import pandas as pd
from statsmodels.tsa.arima.model import ARIMA
import matplotlib.pyplot as plt


# df has no 'value' column (the original lookup raised KeyError). Model the
# mean distance over all athletes at each timestamp instead.
# NOTE(review): aggregating across athletes is an assumption about the intended
# series -- swap in a single athlete's distances if that is the real target.
ts = df.groupby('datetime')['distance'].mean()

# Use a plain 0..n-1 index: the grouped timestamps carry no frequency
# information, and statsmodels ignores such a date index when forecasting,
# which would put the forecast on a different x-axis than the history.
ts = ts.reset_index(drop=True)

# Fit ARIMA(p,d,q): ARIMA(1,1,1) as example
model = ARIMA(ts, order=(1, 1, 1))  # p=AR, d=diff, q=MA
results = model.fit()

# Forecasting: the next 10 steps after the end of ts
forecast = results.forecast(steps=10)
print(forecast)

# Plot history and forecast on the same axes
fig, ax = plt.subplots()
ts.plot(ax=ax, label='Original')
forecast.plot(ax=ax, label='Forecast')
ax.set(title='ARIMA(1,1,1) forecast', xlabel='time step', ylabel='mean distance')
ax.legend()
plt.show()


In [None]:
from statsmodels.tsa.statespace.sarimax import SARIMAX

# Seasonal ARIMA: non-seasonal (p,d,q) = (1,1,1), seasonal (P,D,Q,s) = (1,1,1,12).
# NOTE(review): s=12 describes a yearly cycle for monthly data -- confirm that
# it matches the sampling interval of `ts`.
model = SARIMAX(
    endog=ts,
    order=(1, 1, 1),
    seasonal_order=(1, 1, 1, 12),
)
results = model.fit()

# Predict the 12 steps after the end of ts and draw them with the history
forecast = results.forecast(steps=12)
forecast.plot(label='Forecast')
ts.plot(label='Original', legend=True)
plt.show()


In [None]:
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import LSTM, Dense

# The LSTM expects 3-D input: (samples, time_steps, features).
# Only add a trailing feature axis when X is 2-D. An unconditional reshape to
# (..., 1) raises ValueError when X is already 3-D with more than one feature.
if X.ndim == 2:
    X = X[..., np.newaxis]

model = Sequential()
model.add(LSTM(50, activation='relu', input_shape=(X.shape[1], X.shape[2])))
model.add(Dense(1))  # single regression output per sequence
model.compile(optimizer='adam', loss='mse')

model.fit(X, y, epochs=20, verbose=1)


### Results
TODO

In [None]:
# run to save state
# Writes the whole interpreter session to 'final_project.db'. While that file
# exists, the first cell restores it and the CSV-loading cell skips reading
# the raw data.
dill.dump_session('final_project.db')