# Data Ingestion for My Run Forecast

Loading in Garmin's .fit files and quick heart rate, pace, and elevation plots.

## Data Sources
### Input: 
.fit activity files from Garmin Connect.

.gpx route files from Strava.

### Output:
Parquet files containing the activity data.

In [1]:
import os, sys
from pathlib import Path

import numpy as np
from fitparse import FitFile

# Add utils/ to path
cwd = Path.cwd()
for base in (cwd, cwd.parent, cwd.parent.parent):
    if (base / "utils").exists():
        sys.path.insert(0, str(base))
        break

from utils.features import elapsed_time, semicircle_to_degrees
from utils.fit import fit_to_df
from config import (
    DATA_PATH,
    EXPECTED_FIT_COLUMNS,
    GARMIN_FIT_ACTIVITIES_PATH,
    PARQUET_RUN_ACTIVITIES_PATH
)

In [2]:
activity_file_name = sorted(
    [f for f in os.listdir(".." / DATA_PATH) if f.endswith('.fit')]
)[-1]
print(f"Loading {activity_file_name}...")
df = fit_to_df(
    FitFile(str(".." / DATA_PATH / activity_file_name))
)

# Remove 'unknown' columns not in EXPECTED_FIT_COLUMNS
df = df[[col for col in df.columns if col in EXPECTED_FIT_COLUMNS]]
df.head()

Loading 2025-06-30-15-07-06.fit...


Unnamed: 0,accumulated_power,activity_type,cadence,distance,enhanced_altitude,enhanced_speed,fractional_cadence,heart_rate,position_lat,position_long,power,stance_time,stance_time_balance,stance_time_percent,step_length,temperature,timestamp,vertical_oscillation,vertical_ratio
0,106,running,48,1.99,54.0,1.318,0.5,101,457455223,-1455557466,106,,,,822.0,34,2025-06-30 22:07:06,57.9,7.05
1,198,running,48,3.91,54.0,1.318,0.5,100,457455161,-1455557734,92,,,,822.0,34,2025-06-30 22:07:07,57.9,7.05
2,317,running,48,6.13,53.8,1.318,0.5,100,457455054,-1455558061,119,,,,822.0,34,2025-06-30 22:07:08,57.9,7.05
3,467,running,48,8.69,53.6,1.318,0.5,100,457454958,-1455558381,150,,,,1125.0,34,2025-06-30 22:07:09,61.9,5.5
4,616,running,48,11.54,53.6,1.802,0.5,100,457454792,-1455558667,149,,,,1323.0,34,2025-06-30 22:07:10,64.4,4.87


## Basic Data Exploration

In [3]:
# Basic data info
print(f"Dataset shape: {df.shape}")
print("\nColumn names:")
print(df.columns.tolist())
print("\nData types:")
print(df.dtypes)

Dataset shape: (4658, 19)

Column names:
['accumulated_power', 'activity_type', 'cadence', 'distance', 'enhanced_altitude', 'enhanced_speed', 'fractional_cadence', 'heart_rate', 'position_lat', 'position_long', 'power', 'stance_time', 'stance_time_balance', 'stance_time_percent', 'step_length', 'temperature', 'timestamp', 'vertical_oscillation', 'vertical_ratio']

Data types:
accumulated_power                int64
activity_type                   object
cadence                          int64
distance                       float64
enhanced_altitude              float64
enhanced_speed                 float64
fractional_cadence             float64
heart_rate                       int64
position_lat                     int64
position_long                    int64
power                            int64
stance_time                    float64
stance_time_balance             object
stance_time_percent             object
step_length                    float64
temperature                      int

In [4]:
# Quick check for missing values
missing_data = df.isnull().sum()
print("Missing values per column:")
print(missing_data[missing_data > 0])

Missing values per column:
stance_time             3041
stance_time_balance     4658
stance_time_percent     4658
step_length              929
vertical_oscillation     146
vertical_ratio           929
dtype: int64


This may seem alarming but features like `stance_time` and `step_length` may have missing values when in motion or stopped at rest. 

In [5]:
# Basic statistics for numeric columns
print("Basic statistics:")
print(df.describe())

Basic statistics:
       accumulated_power      cadence     distance  enhanced_altitude  \
count       4.658000e+03  4658.000000  4658.000000        4658.000000   
mean        5.592561e+05    59.425505  4288.118029         140.811249   
min         1.060000e+02     0.000000     1.990000          51.200000   
25%         3.036318e+05    51.000000  2240.080000          76.600000   
50%         5.383315e+05    55.000000  3890.575000         131.900000   
75%         8.525650e+05    77.000000  6740.125000         194.550000   
max         1.079569e+06   109.000000  9027.470000         292.800000   
std         3.150076e+05    16.799329  2654.424535          68.171048   

       enhanced_speed  fractional_cadence   heart_rate  position_lat  \
count     4658.000000         4658.000000  4658.000000  4.658000e+03   
mean         1.934871            0.304745   169.650279  4.573198e+08   
min          0.000000            0.000000   100.000000  4.571618e+08   
25%          1.339000            0.0

In [6]:
# GPS check
if 'position_lat' in df.columns and 'position_long' in df.columns:
    # Convert GPS coordinates from semicircles to degrees
    df['lat_deg'] = semicircle_to_degrees(df['position_lat'])
    df['long_deg'] = semicircle_to_degrees(df['position_long'])

    print(f"GPS data available: {df[['lat_deg', 'long_deg']].dropna().shape[0]} points")
    print(f"Lat range: {df['position_lat'].min():.4f} to {df['position_lat'].max():.4f}")
    print(f"Lon range: {df['position_long'].min():.4f} to {df['position_long'].max():.4f}")
else:
    print("No GPS data found")

GPS data available: 4658 points
Lat range: 457161837.0000 to 457458197.0000
Lon range: -1455643108.0000 to -1455427065.0000


In [7]:
df['elapsed_time'] = elapsed_time(df['timestamp'])

if 'enhanced_speed' in df.columns:
    # Convert from m/s to pace (min/mi)
    df['pace_min_mi'] = np.where(
        df['enhanced_speed'] > 0,
        (1609.34 / 60) / df['enhanced_speed'],
        np.nan
    )

print(f"\nDataFrame now has {df.shape[1]} columns")


DataFrame now has 23 columns


In [8]:
from plotly.subplots import make_subplots

import plotly.graph_objects as go

fig = make_subplots(
    rows=2, cols=2,
    subplot_titles=(
        "Heart Rate Over Time",
        "Pace Over Distance",
        "GPS Track",
        "Elevation Profile"
    )
)

# Heart rate over time
if 'heart_rate' in df.columns and 'elapsed_time' in df.columns:
    fig.add_trace(
        go.Scatter(
            x=df['elapsed_time'] / 60,
            y=df['heart_rate'],
            mode='lines',
            name='Heart Rate'
        ),
        row=1, col=1
    )

# Pace over distance
if 'pace_min_mi' in df.columns and 'distance' in df.columns:
    fig.add_trace(
        go.Scatter(
            x=df['distance'] / 1000,
            y=df['pace_min_mi'],
            mode='lines',
            name='Pace'
        ),
        row=1, col=2
    )
    fig.update_yaxes(autorange="reversed", row=1, col=2)

# GPS track
if 'lat_deg' in df.columns and 'long_deg' in df.columns:
    fig.add_trace(
        go.Scatter(
            x=df['long_deg'],
            y=df['lat_deg'],
            mode='lines',
            name='GPS Track'
        ),
        row=2, col=1
    )
    # Set axis ranges and aspect ratio for GPS subplot (x3/y3)
    fig.update_yaxes(
        scaleanchor="x3",
        scaleratio=1,
        row=2, col=1,
        range=[df['lat_deg'].min(), df['lat_deg'].max()]
    )
    fig.update_xaxes(
        row=2, col=1,
        range=[df['long_deg'].min(), df['long_deg'].max()]
    )

# Elevation profile
if 'enhanced_altitude' in df.columns and 'distance' in df.columns:
    fig.add_trace(
        go.Scatter(
            x=df['distance'] / 1000,
            y=df['enhanced_altitude'],
            mode='lines',
            name='Elevation'
        ),
        row=2, col=2
    )

fig.update_layout(height=800, width=1200, showlegend=False)
fig.show()

In [9]:
# Save full data to parquet
df.to_parquet(
    ".." / DATA_PATH / activity_file_name.replace(".fit", ".parquet"),
    index=False
)