# Data Ingestion for My Run Forecast

## Data Sources
### Input: 
.fit activity files from Garmin Connect.

.gpx route files from Strava.

### Output:
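A Parquet file containing the activity data, saved next to the source `.fit` file in the data directory and limited to the columns listed in `EXPECTED_FIT_COLUMNS`.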

In [1]:
import os, sys
from pathlib import Path

import pandas as pd

# Make utils/ importable: search the working directory, its parent and its
# grandparent (in that order) and put the first one that contains a utils/
# entry at the front of sys.path. Nothing is added if none of them does.
cwd = Path.cwd()
project_root = next(
    (
        candidate
        for candidate in (cwd, cwd.parent, cwd.parent.parent)
        if (candidate / "utils").exists()
    ),
    None,
)
if project_root is not None:
    sys.path.insert(0, str(project_root))

from utils.fit import fit_to_df
from config import EXPECTED_FIT_COLUMNS

# Directory (relative to the kernel's working directory) that is listed for
# input activity files and that also receives the parquet export. Kept as a
# plain string with a trailing slash because file names are appended to it
# with `+` in the cells below.
routes_path = "../data/"

In [2]:
# Pick the first .fit activity in the data directory (alphabetical order).
# The directory can also contain other files (e.g. .gpx routes, previously
# exported .parquet files, hidden OS files), so filter by extension instead of
# taking whatever happens to sort first.
fit_file_names = sorted(
    name for name in os.listdir(routes_path) if name.endswith(".fit")
)
if not fit_file_names:
    # Fail with an actionable message rather than a bare IndexError.
    raise FileNotFoundError(f"No .fit activity files found in {routes_path!r}")

activity_file_name = fit_file_names[0]
df_1 = fit_to_df(routes_path + activity_file_name)

# Keep only the columns listed in EXPECTED_FIT_COLUMNS (drops the 'unknown'
# columns), preserving the column order of the parsed frame.
df_1 = df_1[[col for col in df_1.columns if col in EXPECTED_FIT_COLUMNS]]
df_1.head()

Unnamed: 0,accumulated_power,activity_type,cadence,distance,enhanced_altitude,enhanced_speed,fractional_cadence,heart_rate,position_lat,position_long,power,stance_time,stance_time_balance,stance_time_percent,step_length,temperature,timestamp,vertical_oscillation,vertical_ratio
0,106,running,48,1.99,54.0,1.318,0.5,101,457455223,-1455557466,106,,,,822.0,34,2025-06-30 22:07:06,57.9,7.05
1,198,running,48,3.91,54.0,1.318,0.5,100,457455161,-1455557734,92,,,,822.0,34,2025-06-30 22:07:07,57.9,7.05
2,317,running,48,6.13,53.8,1.318,0.5,100,457455054,-1455558061,119,,,,822.0,34,2025-06-30 22:07:08,57.9,7.05
3,467,running,48,8.69,53.6,1.318,0.5,100,457454958,-1455558381,150,,,,1125.0,34,2025-06-30 22:07:09,61.9,5.5
4,616,running,48,11.54,53.6,1.802,0.5,100,457454792,-1455558667,149,,,,1323.0,34,2025-06-30 22:07:10,64.4,4.87


In [3]:
# Save full data to parquet, next to the source activity file.
# Path.with_suffix swaps only the trailing extension. A plain
# str.replace(".fit", ".parquet") would also rewrite ".fit" elsewhere in the
# name, and would leave a name with a different extension (e.g. ".FIT")
# unchanged, so the export would overwrite the source activity file.
parquet_path = (Path(routes_path) / activity_file_name).with_suffix(".parquet")
df_1.to_parquet(parquet_path, index=False)