

<!-- Slimmed-down, notebook-friendly Strava header -->

<div style="
  display: flex; 
  align-items: center; 
  justify-content: space-between;
  #border-left: 6px solid #fc4c02;
  padding: 0px 0px 5px 10px; 
  background: linear-gradient(to left, #fc4c02,rgba(255, 255, 255, 0.95));
  border-radius: 6px; 
  box-shadow: 0 1px 3px rgba(0,0,0,0.05);
  max-width: 96%;
  overflow: hidden;
">

  <!-- Left: Strava logo -->
  <img src="../assets/strava_logo.png" alt="Strava Logo" style="height: 80px; max-width: 80px;">

  <!-- Center: Text -->
  <div style="flex-grow: 1; text-align: center; padding: 0 10px;">
    <h3 style="margin: 0; color: #ffffff; font-size: 20px;">Strava Activity Analysis</h3>
    <p style="margin: 0; font-size: 14px;">
      Cycling 🚴‍♂️ | Running 🏃‍♀️ | Walking 🚶‍♂️
    </p>
    <p style="margin: 3px 0 0; font-size: 12px; color: #666;">
      <em>Insights from my personal Strava data</em>
    </p>
  </div>

  <!-- Right: Log icon -->
  <img src="../assets/Black_BRAINDUMP.png" alt="Log Icon" style="height: 150px; max-width: 150px;">
</div>

---



- Point at directory of new export
- Pull in 'activities'
- Build a top-level activity dataframe
- For run, walk and ride create 3 lists of files
- For each activity: 
    - create a dataframe of the time, heartrate, speed, lat/long and output


### Using the csv dataset exportable from Strava
   - Explore the datasource:
      - contains the key 'activities' file: a high-level overview of all activities I've tracked in Strava
      - subfiles (see below) listing detailed tracking information for each activity
   - Subfile datasets contain tracking in 1 of 3 formats:
      - .gpx: basic gps-tracking file, with no bio-stat data
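      - .tcx.gz: gzip-compressed Training Center XML (TCX) file (**TODO**: describe what it contains)
      - .fit.gz: gzip-compressed FIT file, parsed below with `fitparse` (**TODO**: describe what it contains)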
      - The most interesting and relevant file type is XXX (**TODO**: name the format); this corresponds with when I started wearing an Apple Watch to track my training, and as such it has the most valuable information

# Preamble & Imports

In [31]:
import os
import pandas as pd
import gpxpy
import gzip

from fitparse import FitFile

Locate the main export directory as downloaded from Strava

In [32]:
# Root folder of the unpacked Strava export; the file paths used in later cells are built from it
main_dir = '../strava_data_dumps/STRAVA+export_8029714'

## Pull in the Main "Activities.csv" File & Explore

In [33]:
# Load the top-level activities index that ships with the Strava export
head_df = pd.read_csv(f'{main_dir}/activities.csv')

# Parse the activity timestamps using the export's
# "<Mon> <day>, <year>, <12h time> <AM/PM>" layout
head_df['Activity Date'] = pd.to_datetime(head_df['Activity Date'], format='%b %d, %Y, %I:%M:%S %p')

# Normalise column names: upper-case, with spaces replaced by underscores
head_df.columns = head_df.columns.str.upper().str.replace(' ', '_')
# Keep only the columns that have at least 1000 non-null values
head_df.dropna(thresh=1000, axis=1, inplace=True)

# Derive a file-format label from the last 6 characters of FILENAME by removing
# every "digit + following character" pair. The pattern is a raw string so that
# `\d` is not treated as an (invalid) string escape sequence.
# NOTE(review): this leaves '.gpx' with a leading dot but 'fit.gz' / 'tcx.gz'
# without one - confirm whether that asymmetry is intended.
head_df['FILE_SUFFIX'] = head_df['FILENAME'].str[-6:].str.replace(r'\d.', '', regex=True)


In [34]:
# Rows whose activity date is later than 2022-01-01.
# NOTE(review): the `_25` suffix does not match the 2022 cut-off - confirm which is intended.
is_after_cutoff = head_df['ACTIVITY_DATE'] > '2022-01-01'
head_df_25 = head_df.loc[is_after_cutoff]

In [35]:
# Columns carried forward into the analysis, in display order.
# The '.1' suffixed names are presumably pandas' labels for duplicated CSV headers - verify.
columns_to_keep = [
    # identification
    'ACTIVITY_ID',
    'FILENAME',
    'FILE_SUFFIX',
    'ACTIVITY_DATE',
    'ACTIVITY_NAME',
    'ACTIVITY_TYPE',
    # effort
    'CALORIES',
    'AVERAGE_HEART_RATE',
    'MAX_HEART_RATE',
    # time, distance and speed
    'ELAPSED_TIME.1',
    'MOVING_TIME',
    'DISTANCE.1',
    'MAX_SPEED',
    'AVERAGE_SPEED',
    'AVERAGE_ELAPSED_SPEED',
    # elevation and gradient
    'ELEVATION_GAIN',
    'ELEVATION_LOSS',
    'ELEVATION_LOW',
    'ELEVATION_HIGH',
    'MAX_GRADE',
    'AVERAGE_GRADE',
]
head_df = head_df[columns_to_keep]

In [36]:
# Restrict the index to the three activity types covered by this notebook
tracked_types = ['Ride', 'Walk', 'Run']
is_tracked_type = head_df['ACTIVITY_TYPE'].isin(tracked_types)
head_df = head_df.loc[is_tracked_type].copy()

# Number of distinct activities for each (file format, activity type) pair
head_df.groupby(['FILE_SUFFIX', 'ACTIVITY_TYPE'])['ACTIVITY_ID'].nunique()

FILE_SUFFIX  ACTIVITY_TYPE
.gpx         Ride             187
             Run               48
             Walk              90
fit.gz       Ride             115
             Run               26
             Walk             544
tcx.gz       Ride             171
             Run               17
             Walk             103
Name: ACTIVITY_ID, dtype: int64

In [37]:
# List the source filenames for every (file format, activity type) combination
for (suffix, activity_type), group in head_df.groupby(['FILE_SUFFIX', 'ACTIVITY_TYPE']):
    print(suffix, '--', activity_type)
    print(group['FILENAME'])

.gpx -- Ride
0         activities/261971610.gpx
2         activities/300397733.gpx
5         activities/510172119.gpx
7         activities/556158192.gpx
8         activities/575489495.gpx
                   ...            
715      activities/8656624411.gpx
716      activities/8700364349.gpx
717      activities/8733245910.gpx
1201    activities/10685697605.gpx
1334    activities/11561166175.gpx
Name: FILENAME, Length: 187, dtype: object
.gpx -- Run
3       activities/338178959.gpx
4       activities/338194381.gpx
6       activities/556084248.gpx
9       activities/588291107.gpx
10      activities/588292372.gpx
11      activities/650790613.gpx
338    activities/3646721779.gpx
340    activities/3656938870.gpx
341    activities/3667435866.gpx
343    activities/3681524574.gpx
345    activities/3691197201.gpx
348    activities/3705970303.gpx
351    activities/3769280034.gpx
352    activities/3803936148.gpx
353    activities/3809931147.gpx
354    activities/3814138735.gpx
355    activities/3

In [38]:
def semicircles_to_degrees(semicircles):
    """
    Convert a FIT-format coordinate expressed in semicircles to degrees.

    2**31 semicircles correspond to 180 degrees, so the input is scaled by
    180 / 2**31. Anything that supports multiplication by a float is accepted.
    """
    degrees_per_semicircle = 180 / 2**31
    return semicircles * degrees_per_semicircle

In [39]:
# Walk the export's activities folder and parse every gzipped FIT file into its
# own DataFrame (one frame per file); the frames are collected in `fit_files`.

#gpx_files = []
fit_files = []
#tcx_files = []

for root, dirs, files in os.walk(f'{main_dir}/activities'):

    for file in files:

        if file.endswith('.fit.gz'):
            file_path = os.path.join(root, file)
            print(f'fit file! {file_path}')
            all_records = []
            with gzip.open(file_path, 'rb') as f:
                fitfile = FitFile(f)
                for record in fitfile.get_messages('record'):
                    data = {d.name: d.value for d in record}
                    # Drop the 7-character '.fit.gz' suffix to leave the activity id
                    data['activity_id'] = str(file)[:-7]
                    all_records.append(data)
            # Convert to DataFrame
            df = pd.DataFrame(all_records)
            # A file whose records carry no position fields yields a frame without
            # these columns, so check for them explicitly. The previous bare
            # `except: pass` hid every other failure too and also trapped
            # KeyboardInterrupt, making the loop hard to stop.
            if 'position_lat' in df.columns and 'position_long' in df.columns:
                # Coerce to numeric first so missing values (None) become NaN
                # instead of breaking the multiplication.
                df['lat'] = semicircles_to_degrees(pd.to_numeric(df['position_lat'], errors='coerce'))
                df['long'] = semicircles_to_degrees(pd.to_numeric(df['position_long'], errors='coerce'))
            fit_files.append(df)

        # elif file.endswith('.gpx'):
        #     file_path = os.path.join(root, file)
        #     print(f'gpx file! {file_path}')
        #     with open(file_path, 'r') as gpx_file:
        #         gpx = gpxpy.parse(gpx_file)
        #         points = []
        #         for track in gpx.tracks:
        #             for segment in track.segments:
        #                 for point in segment.points:
        #                     points.append({
        #                         'ID':file,
        #                         'latitude': point.latitude,
        #                         'longitude': point.longitude,
        #                         'elevation': point.elevation,
        #                         'time': point.time,
        #                         'speed': point.speed,
        #                         'heart_rate': 'NaN'
        #                     })
        #     df = pd.DataFrame(points)
        #     gpx_files.append(df)

        else:
            # Anything that is not a gzipped FIT file is only reported, not parsed
            file_path = os.path.join(root, file)
            print(f'other file! {file_path}')

    

fit file! ../strava_data_dumps/STRAVA+export_8029714/activities/10778596247.fit.gz
fit file! ../strava_data_dumps/STRAVA+export_8029714/activities/11061016795.fit.gz
other file! ../strava_data_dumps/STRAVA+export_8029714/activities/2727704737.tcx.gz
fit file! ../strava_data_dumps/STRAVA+export_8029714/activities/12759412255.fit.gz
other file! ../strava_data_dumps/STRAVA+export_8029714/activities/4171869986.tcx.gz
fit file! ../strava_data_dumps/STRAVA+export_8029714/activities/12852566380.fit.gz
fit file! ../strava_data_dumps/STRAVA+export_8029714/activities/13282848920.fit.gz
fit file! ../strava_data_dumps/STRAVA+export_8029714/activities/8920586120.fit.gz
fit file! ../strava_data_dumps/STRAVA+export_8029714/activities/9950417558.fit.gz
other file! ../strava_data_dumps/STRAVA+export_8029714/activities/5650748315.gpx
fit file! ../strava_data_dumps/STRAVA+export_8029714/activities/10751797773.fit.gz


KeyboardInterrupt: 

In [29]:
# Stack the per-file FIT frames into one table with a fresh 0..n-1 index
fit_df = pd.concat(fit_files, ignore_index=True)

In [30]:
# Show the combined FIT records as this cell's output
fit_df

Unnamed: 0,distance,timestamp,activity_id,enhanced_altitude,enhanced_speed,gps_accuracy,position_lat,position_long,speed,heart_rate,lat,long
0,0.00,2023-10-18 11:21:02,10778596247,,,,,,,,,
1,,2023-10-18 11:21:02,10778596247,41.8,1.354,3.0,653310815.0,-18523787.0,1.354,,54.759880,-1.552646
2,,2023-10-18 11:21:03,10778596247,41.8,1.438,3.0,653310656.0,-18523762.0,1.438,,54.759867,-1.552644
3,,2023-10-18 11:21:04,10778596247,41.8,1.478,2.0,653310490.0,-18523729.0,1.478,105.0,54.759853,-1.552641
4,,2023-10-18 11:21:05,10778596247,41.8,1.495,2.0,653310319.0,-18523706.0,1.495,,54.759838,-1.552639
...,...,...,...,...,...,...,...,...,...,...,...,...
9373,,2023-06-16 11:29:59,9950417558,45.6,0.292,9.0,653313757.0,-18507767.0,0.292,,54.760127,-1.551303
9374,1711.92,2023-06-16 11:30:00,9950417558,45.6,0.263,10.0,653313727.0,-18507770.0,0.263,,54.760124,-1.551303
9375,,2023-06-16 11:30:01,9950417558,45.6,0.233,10.0,653313701.0,-18507772.0,0.233,,54.760122,-1.551304
9376,,2023-06-16 11:30:02,9950417558,45.6,0.211,11.0,653313677.0,-18507774.0,0.211,,54.760120,-1.551304


In [None]:
# Stack the per-file GPX frames into one table with a fresh 0..n-1 index.
# NOTE(review): `gpx_files` is only created and filled by code that is currently
# commented out in the directory-walk cell, so this cell raises NameError on a
# fresh kernel until that branch is re-enabled.
gpx_df = pd.concat(gpx_files, ignore_index=True)

# Show the combined GPX points as this cell's output
gpx_df

# GPX Files

In [None]:
# Load a single GPX file into a DataFrame with one row per track point.
# Activity files sit in the export's 'activities/' subfolder (see the FILENAME
# column of the activities index), so that folder must be part of the path.
with open(f'{main_dir}/activities/261971610.gpx', 'r') as gpx_file:
    gpx = gpxpy.parse(gpx_file)

# Flatten tracks -> segments -> points into a list of plain dicts
points = [
    {
        'latitude': point.latitude,
        'longitude': point.longitude,
        'elevation': point.elevation,
        'time': point.time,
        'speed': point.speed,
        # No heart-rate value is read from the GPX file; store a real missing
        # value (float NaN) rather than the string 'NaN', so the column stays
        # numeric and is recognised as missing by pandas.
        'heart_rate': float('nan'),
    }
    for track in gpx.tracks
    for segment in track.segments
    for point in segment.points
]

ride_gpx = pd.DataFrame(points)

# .fit Files

In [None]:
# Gather every 'record' message from all gzipped FIT files found anywhere
# under main_dir, tagging each row with the file it came from.
all_records = []

for folder, _subfolders, filenames in os.walk(main_dir):
    for filename in filenames:
        if not filename.endswith('.fit.gz'):
            continue
        fit_path = os.path.join(folder, filename)
        print(fit_path)
        with gzip.open(fit_path, 'rb') as handle:
            parser = FitFile(handle)
            for message in parser.get_messages('record'):
                row = {field.name: field.value for field in message}
                row['source_file'] = filename  # Optional: track which file it came from
                all_records.append(row)

# One row per record message, across all files
ride_fit = pd.DataFrame(all_records)

# NOTE(review): this re-defines a helper with the same name and logic as one
# defined in an earlier cell of this notebook.
def semicircles_to_degrees(semicircles):
    """
    Convert a FIT-format coordinate expressed in semicircles to degrees.

    2**31 semicircles correspond to 180 degrees, so the input is scaled by
    180 / 2**31. Anything that supports multiplication by a float is accepted.
    """
    degrees_per_semicircle = 180 / 2**31
    return semicircles * degrees_per_semicircle

# Add degree-valued coordinate columns next to the raw semicircle columns
for degrees_col, semicircle_col in (('LAT', 'position_lat'), ('LONG', 'position_long')):
    ride_fit[degrees_col] = ride_fit[semicircle_col].apply(semicircles_to_degrees)
#ride_fit.to_csv('GPS_route.csv')