<!-- Fancy Notebook Header with Strava styling -->

<div style="background: linear-gradient(to right, #fc4c02, #ff814c); padding: 20px; border-radius: 10px; display: flex; align-items: center;">

  <img src="../assets/strava_logo.png" alt="Strava Logo" style="width: 80px; margin-right: 20px;">

  <div style="color: white;">
    <h1 style="margin: 0;">Strava Data Munge & Export</h1>
    <h3 style="margin: 0;">Cycling 🚴‍♂️ | Running 🏃‍♀️ | Walking 🚶‍♂️</h3>
    <p style="margin: 5px 0 0 0;"><em>Data sourced from my personal Strava activity logs</em></p>
  </div>

</div>

---


- Point at directory of new export
- Pull in 'activities'
- Build a top-level activity dataframe
- For run, walk and ride create 3 lists of files
- For each activity:
    - create a dataframe of the time, heart rate, speed and lat/long, then output it


### Using the csv dataset exportable from Strava
   - Explore the datasource:
      - contains key 'activities' file; a high level overview of all activities I've tracked in Strava
      - subfiles (see below) listing detailed tracking information for each activity
   - Subfile datasets contain tracking in 1 of 3 formats:
      - .gpx: basic gps-tracking file, with no bio-stat data
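      - .tcx.gz: gzip-compressed Training Center XML (TCX) file
      - .fit.gz: gzip-compressed FIT (Flexible and Interoperable Data Transfer) binary file
      - The most interesting and relevant file type is `.fit.gz` (TODO: confirm); this corresponds with when I started wearing an Apple Watch to track my training, and as such has the most valuable information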

# Preamble & Imports

In [None]:
import os
import pandas as pd
import gpxpy
import gzip

from fitparse import FitFile

Locate the main export directory as downloaded from Strava

In [2]:
# Root directory of the Strava export; activities.csv and the activities/
# subfolder are read relative to this path by later cells.
main_dir = '../strava_data_dumps/STRAVA+export_8029714'

## Pull in the Main "Activities.csv" File & Explore

In [None]:
# pull in file
head_df = pd.read_csv(f'{main_dir}/activities.csv')

# convert date times (the format string below expects e.g. "Jan 05, 2022, 07:15:00 AM")
head_df['Activity Date'] = pd.to_datetime(head_df['Activity Date'], format='%b %d, %Y, %I:%M:%S %p')

# some standard formatting I like to do to column names: upper-case, spaces -> underscores
head_df.columns = head_df.columns.str.upper().str.replace(' ', '_', regex=False)

# Drop sparsely populated columns: keep only those with at least 1000 non-null values.
# NOTE(review): with fewer than 1000 rows this threshold removes every column
# (including FILENAME, used just below) — confirm it suits the export size.
head_df = head_df.dropna(thresh=1000, axis=1)

# Derive a file-type tag from the last six characters of FILENAME by stripping every
# "digit + any one character" pair (e.g. '10.gpx' -> '.gpx', 'fit.gz' stays 'fit.gz').
# The pattern is a raw string so `\d` reaches the regex engine without relying on
# Python's handling of an invalid string escape.
head_df['FILE_SUFFIX'] = head_df['FILENAME'].str[-6:].str.replace(r'\d.', '', regex=True)


In [None]:
# Activities recorded after 1 January 2022.
# NOTE(review): the `_25` suffix does not match the 2022 cutoff — confirm which is intended.
is_recent = head_df['ACTIVITY_DATE'] > '2022-01-01'
head_df_25 = head_df.loc[is_recent]

In [None]:
# Narrow the overview frame to the identifying, effort, timing/speed and elevation columns.
# The '.1'-suffixed names are selected exactly as they appear in the frame.
head_df = head_df.loc[:, [
    # identification
    'ACTIVITY_ID', 'FILENAME', 'FILE_SUFFIX', 'ACTIVITY_DATE', 'ACTIVITY_NAME', 'ACTIVITY_TYPE',
    # effort
    'CALORIES', 'AVERAGE_HEART_RATE', 'MAX_HEART_RATE',
    # time, distance and speed
    'ELAPSED_TIME.1', 'MOVING_TIME', 'DISTANCE.1',
    'MAX_SPEED', 'AVERAGE_SPEED', 'AVERAGE_ELAPSED_SPEED',
    # elevation and grade
    'ELEVATION_GAIN', 'ELEVATION_LOSS', 'ELEVATION_LOW', 'ELEVATION_HIGH',
    'MAX_GRADE', 'AVERAGE_GRADE',
]]

In [None]:
# Restrict to rides, walks and runs, then count distinct activities
# per (file suffix, activity type) pair; the count is the cell's displayed output.
tracked_types = ['Ride', 'Walk', 'Run']
head_df = head_df[head_df['ACTIVITY_TYPE'].isin(tracked_types)].copy()
head_df.groupby(['FILE_SUFFIX', 'ACTIVITY_TYPE'])['ACTIVITY_ID'].nunique()

In [None]:
# List the source filenames belonging to each (file suffix, activity type) group.
for (suffix, activity_type), group in head_df.groupby(['FILE_SUFFIX', 'ACTIVITY_TYPE']):
    print(suffix, '--', activity_type)
    print(group['FILENAME'])

In [None]:
def semicircles_to_degrees(semicircles):
    """
    Converts lat/long from the fit format of semicircle --> normal lat/longs

    The scale maps 2**31 semicircles onto 180 degrees.

    Parameters
    ----------
    semicircles : int | float | None
        Raw position value. ``None`` (e.g. a record carrying no position
        value) is passed through unchanged.

    Returns
    -------
    float | None
        The angle in decimal degrees, or ``None`` when the input is ``None``.
    """
    if semicircles is None:
        # A missing position used to raise TypeError on the multiplication;
        # propagate the missing value instead.
        return None

    return semicircles * (180 / 2**31)

In [None]:
# Walk through directories and subdirectories of the export's activities folder,
# parsing each supported file into its own DataFrame (one list per format).

gpx_files = []
fit_files = []
#tcx_files = []

for root, _dirs, files in os.walk(f'{main_dir}/activities'):

    for file in files:
        file_path = os.path.join(root, file)

        if file.endswith('.fit.gz'):
            print(f'fit file! {file_path}')
            all_records = []
            # The .fit payload is gzip-compressed; iterate the messages while the
            # stream is still open.
            with gzip.open(file_path, 'rb') as f:
                fitfile = FitFile(f)
                for record in fitfile.get_messages('record'):
                    data = {d.name: d.value for d in record}
                    data['source_file'] = file  # Optional: track which file it came from
                    all_records.append(data)
            # Convert to DataFrame
            df = pd.DataFrame(all_records)
            # Derive degree columns only when the raw position column exists.
            # Coercing to numeric first turns missing (None) positions into NaN,
            # so a file without usable positions no longer needs a blanket
            # `except: pass` that would also hide unrelated errors.
            for raw_col, deg_col in (('position_lat', 'LAT'), ('position_long', 'LONG')):
                if raw_col in df.columns:
                    df[deg_col] = (
                        pd.to_numeric(df[raw_col], errors='coerce')
                        .apply(semicircles_to_degrees)
                    )
            fit_files.append(df)

        elif file.endswith('.gpx'):
            print(f'gpx file! {file_path}')
            with open(file_path, 'r') as gpx_file:
                gpx = gpxpy.parse(gpx_file)
            # Flatten tracks -> segments -> points into one row per point.
            points = [
                {
                    'ID': file,
                    'latitude': point.latitude,
                    'longitude': point.longitude,
                    'elevation': point.elevation,
                    'time': point.time,
                    'speed': point.speed,
                    # Real NaN (not the string 'NaN') so pandas treats the
                    # value as missing and the column stays numeric.
                    'heart_rate': float('nan'),
                }
                for track in gpx.tracks
                for segment in track.segments
                for point in segment.points
            ]
            df = pd.DataFrame(points)
            gpx_files.append(df)

        else:
            # Any other file type is only reported, not parsed.
            print(f'other file! {file_path}')

    

In [None]:
# Stack the per-file GPX frames into one frame. pd.concat raises ValueError on an
# empty list, so fall back to an empty frame when no .gpx files were collected.
gpx_df = pd.concat(gpx_files, ignore_index=True) if gpx_files else pd.DataFrame()

gpx_df

In [None]:
# Stack the per-file FIT frames into one frame. pd.concat raises ValueError on an
# empty list, so fall back to an empty frame when no .fit.gz files were collected.
fit_df = pd.concat(fit_files, ignore_index=True) if fit_files else pd.DataFrame()

fit_df

# GPX Files

In [None]:
# Load a GPX file
# NOTE(review): the directory walk elsewhere in this notebook reads .gpx files from
# f'{main_dir}/activities' — confirm this file really sits directly under main_dir.
with open(f'{main_dir}/261971610.gpx', 'r') as gpx_file:
    gpx = gpxpy.parse(gpx_file)

# Flatten tracks -> segments -> points into one row per point.
points = [
    {
        'latitude': point.latitude,
        'longitude': point.longitude,
        'elevation': point.elevation,
        'time': point.time,
        'speed': point.speed,
        # Real NaN (not the string 'NaN') so pandas treats the value as
        # missing and the column stays numeric.
        'heart_rate': float('nan'),
    }
    for track in gpx.tracks
    for segment in track.segments
    for point in segment.points
]

ride_gpx = pd.DataFrame(points)

# .fit Files

In [None]:
# Gather every 'record' message from all gzipped .fit files found anywhere
# under main_dir, tagging each row with the file it came from.
all_records = []

for folder, _subdirs, names in os.walk(main_dir):
    for name in names:
        if not name.endswith('.fit.gz'):
            continue
        file_path = os.path.join(folder, name)
        print(file_path)
        with gzip.open(file_path, 'rb') as stream:
            for message in FitFile(stream).get_messages('record'):
                row = {field.name: field.value for field in message}
                row['source_file'] = name  # remember the originating file
                all_records.append(row)

# One row per record message across all files
ride_fit = pd.DataFrame(all_records)

def semicircles_to_degrees(semicircles):
    """
    Converts lat/long from the fit format of semicircle --> normal lat/longs

    The scale maps 2**31 semicircles onto 180 degrees.

    NOTE(review): this helper is also defined in an earlier cell of this
    notebook; this definition shadows it once the cell runs.

    Parameters
    ----------
    semicircles : int | float | None
        Raw position value. ``None`` (e.g. a record carrying no position
        value) is passed through unchanged.

    Returns
    -------
    float | None
        The angle in decimal degrees, or ``None`` when the input is ``None``.
    """
    if semicircles is None:
        # A missing position used to raise TypeError on the multiplication;
        # propagate the missing value instead.
        return None

    return semicircles * (180 / 2**31)

# Add degree-based LAT / LONG columns derived from the raw semicircle positions.
for deg_col, raw_col in (('LAT', 'position_lat'), ('LONG', 'position_long')):
    ride_fit[deg_col] = ride_fit[raw_col].apply(semicircles_to_degrees)
# Optional export of the combined frame (left disabled):
#ride_fit.to_csv('GPS_route.csv')