# Time Prediction Models and Training
Run the cells below to train the time prediction models on the parquet activities, inspect the fitted coefficients, and persist weights for the CLI.

In [1]:
import sys
from pathlib import Path

import plotly.express as px

# Make the project importable: look in the working directory and its two
# nearest ancestors, and put the first one that contains utils/ at the
# front of sys.path.
cwd = Path.cwd()
candidate_roots = (cwd, cwd.parent, cwd.parent.parent)
for base in candidate_roots:
    if not (base / "utils").exists():
        continue
    sys.path.insert(0, str(base))
    break

from config import PARQUET_RUN_ACTIVITIES_PATH
from utils import activity
from models import time_linear

In [2]:
# The activity directory is resolved relative to the parent of the working
# directory. Building from an explicit Path (rather than the bare string
# "..") keeps the join valid whether the config constant is a Path or a str.
ACTIVITY_DIR = Path("..") / PARQUET_RUN_ACTIVITIES_PATH

# Load the per-activity summary table and preview its first rows.
summaries = activity.load_activity_summaries(ACTIVITY_DIR)
summaries.head()

Unnamed: 0,activity_path,activity_date,activity_type,activity_subtype,elapsed_time_hours,distance_m,distance_mi,elevation_gain_m,elevation_gain_ft,average_hr,avg_cadence,avg_power
0,../data/parquet_run_activities/2024-12-20-14-4...,2024-12-20 22:47:54,running,trail,1.785833,11609.04,7.213521,494.6,1622.703464,160.349767,51.336081,228.303577
1,../data/parquet_run_activities/2024-12-30-14-4...,2024-12-30 22:40:32,running,generic,0.539167,5249.27,3.261744,68.8,225.721792,149.572606,82.396756,346.368177
2,../data/parquet_run_activities/2025-01-01-13-3...,2025-01-01 21:38:04,running,trail,1.450833,2125.1,1.320476,240.8,790.026272,101.085949,11.212002,28.032542
3,../data/parquet_run_activities/2025-01-02-14-4...,2025-01-02 22:42:38,running,trail,1.305,10725.48,6.664502,359.6,1179.790064,177.811236,75.026069,309.50798
4,../data/parquet_run_activities/2025-01-03-11-3...,2025-01-03 19:31:53,running,generic,0.9225,8314.35,5.166296,77.0,252.62468,159.863335,79.612733,308.890126


In [3]:
# Fit the linear time model on the activity summaries loaded above.
model = time_linear.train_linear_time_model(summaries)
# Display the fitted parameters (intercept plus one coef_* entry per feature).
model.to_series()

intercept                 2.674574
coef_distance_mi          0.235853
coef_elevation_gain_ft   -0.000130
coef_average_hr          -0.002256
coef_avg_cadence         -0.023518
coef_avg_power           -0.002325
dtype: float64

In [4]:
# Score the fitted model on every activity that has the target and all
# model features present (rows with a missing value in any are dropped).
feature_names = list(time_linear.LINEAR_FEATURES)
eval_columns = ['elapsed_time_hours', *feature_names]

complete_rows = summaries.dropna(subset=eval_columns).copy()
feature_matrix = complete_rows[feature_names].to_numpy(dtype=float)

# Linear prediction: intercept + X @ coefficients; residual = actual - predicted.
evaluation_df = complete_rows.assign(
    predicted_hours=model.intercept + feature_matrix @ model.coefficients,
    residual_hours=lambda frame: frame['elapsed_time_hours'] - frame['predicted_hours'],
)

summary_columns = ['elapsed_time_hours', 'predicted_hours', 'residual_hours']
evaluation_df[summary_columns].describe()

Unnamed: 0,elapsed_time_hours,predicted_hours,residual_hours
count,110.0,110.0,110.0
mean,1.525189,1.525189,1.616687e-14
std,0.90965,0.745003,0.5219515
min,0.069167,0.261172,-0.8752581
25%,1.080972,1.077968,-0.16291
50%,1.277639,1.416276,-0.04234075
75%,1.644097,1.763187,0.04430149
max,6.449167,4.295529,4.698566


In [5]:
# Preview the evaluation frame, including the predicted_hours and residual_hours columns.
evaluation_df.head()

Unnamed: 0,activity_path,activity_date,activity_type,activity_subtype,elapsed_time_hours,distance_m,distance_mi,elevation_gain_m,elevation_gain_ft,average_hr,avg_cadence,avg_power,predicted_hours,residual_hours
0,../data/parquet_run_activities/2024-12-20-14-4...,2024-12-20 22:47:54,running,trail,1.785833,11609.04,7.213521,494.6,1622.703464,160.349767,51.336081,228.303577,2.064402,-0.278569
1,../data/parquet_run_activities/2024-12-30-14-4...,2024-12-30 22:40:32,running,generic,0.539167,5249.27,3.261744,68.8,225.721792,149.572606,82.396756,346.368177,0.333735,0.205432
2,../data/parquet_run_activities/2025-01-01-13-3...,2025-01-01 21:38:04,running,trail,1.450833,2125.1,1.320476,240.8,790.026272,101.085949,11.212002,28.032542,2.326091,-0.875258
3,../data/parquet_run_activities/2025-01-02-14-4...,2025-01-02 22:42:38,running,trail,1.305,10725.48,6.664502,359.6,1179.790064,177.811236,75.026069,309.50798,1.207276,0.097724
4,../data/parquet_run_activities/2025-01-03-11-3...,2025-01-03 19:31:53,running,generic,0.9225,8314.35,5.166296,77.0,252.62468,159.863335,79.612733,308.890126,0.908828,0.013672


In [6]:
# Largest value on either axis, so the y = x reference line spans the data.
max_hours = float(max(
    evaluation_df[column].max()
    for column in ('elapsed_time_hours', 'predicted_hours')
))

# Human-readable names for the axes and for the hover tooltip fields.
fit_labels = {
    'elapsed_time_hours': 'Actual time (hours)',
    'predicted_hours': 'Predicted time (hours)',
    'activity_date': 'Activity date',
    'distance_mi': 'Distance (miles)',
    'elevation_gain_ft': 'Elevation gain (feet)',
    'activity_type': 'Activity type',
    'activity_subtype': 'Activity subtype',
}
fit_hover_columns = [
    'activity_date',
    'distance_mi',
    'elevation_gain_ft',
    'activity_type',
    'activity_subtype',
]

# Actual vs. predicted duration; points on the dashed diagonal are perfect predictions.
fig = px.scatter(
    evaluation_df,
    x='elapsed_time_hours',
    y='predicted_hours',
    title='Linear model fit',
    labels=fit_labels,
    hover_data=fit_hover_columns,
    opacity=0.7,
    width=600,
    height=600,
)

fig.add_shape(
    type='line',
    x0=0, y0=0,
    x1=max_hours, y1=max_hours,
    line={'dash': 'dash', 'color': 'black', 'width': 1},
)

fig.show()

In [7]:
# Residuals against distance, with a dashed zero line as the reference.
residual_labels = {
    'distance_mi': 'Distance (mi)',
    'residual_hours': 'Residual (hours)',
}

fig = px.scatter(
    evaluation_df,
    x='distance_mi',
    y='residual_hours',
    title='Residuals vs distance',
    labels=residual_labels,
    width=700,
    height=400,
    opacity=0.7,
)

# Zero-residual reference line.
fig.add_hline(y=0, line_width=1, line_color="black", line_dash="dash")

fig.show()

In [None]:
# Save linear model weights
WEIGHTS_PATH = Path("../models/weights/time_linear_weights.json")

# Create the target directory if needed so the write cannot fail on a fresh checkout.
WEIGHTS_PATH.parent.mkdir(parents=True, exist_ok=True)

# The save call used to be commented out while the cell still reported
# success, so no weights were written. Restore it and only print the
# confirmation after it returns.
# NOTE(review): the (model, path) signature is taken from the previously
# commented-out call -- confirm against models/time_linear.py.
time_linear.save_model(model, WEIGHTS_PATH)

print(model.to_series())
print("Saved linear model weights to", WEIGHTS_PATH)

intercept                 2.674574
coef_distance_mi          0.235853
coef_elevation_gain_ft   -0.000130
coef_average_hr          -0.002256
coef_avg_cadence         -0.023518
coef_avg_power           -0.002325
dtype: float64
Saved linear model weights to ../models/weights/time_linear_weights.json
