In [9]:
# import packages

import numpy as np
import pandas as pd
from datetime import datetime
# lift the column display limit so wide frames are rendered in full
pd.options.display.max_columns = None

In [10]:
# CSV export of a published Google Sheet holding the activity data.
link = 'https://docs.google.com/spreadsheets/d/e/2PACX-1vTk6t3NoeAIDC82RnfQO9As7eyuvVRx-Y8uwHoeJqefuXNvMhiyFdD0TizGwu1ktsnOeWGm9J0LahpR/pub?output=csv'
strava_data = pd.read_csv(link, header=0)

# Keep only the rows whose type is 'Run'. The explicit .copy() makes runs_df an
# independent frame, so the column assignments in the following cells do not
# trigger SettingWithCopyWarning on a slice of strava_data.
runs_df = strava_data.loc[strava_data['type'] == 'Run'].copy()
runs_df.tail()

Unnamed: 0,name,distance_m,moving_time_s,elapsed_time_s,elevation_change_m,type,date,average_cadence,average_heartrate,max_heartrate,pace_per_mile,distance_mi,id
168,Evening Run,2445,908,930,7.1,Run,6/18/2024,82.6,131.6,147,9:58,1.52,
169,Evening Run,4921,1818,1831,17.1,Run,2024-06-20 20:07:24,83.0,150.2,164,9:55,3.06,
170,Evening Run,5989,2153,2170,16.5,Run,2024-06-27 19:25:37,82.8,137.0,151,9:38,3.72,11756600000.0
171,Morning Run,4392,1459,1499,16.6,Run,2024-06-29 10:02:16,83.5,143.5,157,8:55,2.73,11769990000.0
172,Morning Run,11289,3765,3774,43.2,Run,2024-06-30 10:22:11,85.2,148.9,166,8:57,7.01,11777170000.0


In [22]:
# Build a new frame with .assign instead of writing columns onto the filtered
# slice (which raises SettingWithCopyWarning):
#   - date: parsed with format='mixed' because the sheet holds more than one date layout
#   - moving_time_s: seconds converted to a Timedelta
#   - pace_per_mile: recomputed as moving time divided by distance in miles
# .assign evaluates its keyword arguments in order, so pace_per_mile sees the
# Timedelta version of moving_time_s.
runs_df = runs_df.assign(
    date=lambda df: pd.to_datetime(df['date'], format='mixed'),
    moving_time_s=lambda df: pd.to_timedelta(df['moving_time_s'], unit='s'),
    pace_per_mile=lambda df: df['moving_time_s'] / df['distance_mi'],
)
runs_df.sort_values(by=['date']).tail(10)

Unnamed: 0,name,distance_m,moving_time_s,elapsed_time_s,elevation_change_m,type,date,average_cadence,average_heartrate,max_heartrate,pace_per_mile,distance_mi,id,time_of_day,average_zone,max_zone,ratio_avg_hr_to_max_hr
163,Evening Run,2811,0 days 00:15:54,981,-73.0,Run,2024-06-04,84.3,146.3,160,0 days 00:09:05.142857142,1.75,,evening,3,4,0.914375
164,Evening Run,4819,0 days 00:30:40,1957,-28.4,Run,2024-06-06,83.1,135.8,145,0 days 00:10:13.333333333,3.0,,evening,3,3,0.936552
165,Morning Run,3287,0 days 00:14:55,1527,-15.6,Run,2024-06-08,88.8,152.0,181,0 days 00:07:18.725490196,2.04,,morning,3,5,0.839779
166,Morning Run,10027,0 days 00:54:53,3307,19.2,Run,2024-06-09,85.5,156.9,170,0 days 00:08:48.571428571,6.23,,morning,4,4,0.922941
167,Morning Run,3993,0 days 00:20:26,1229,0.0,Run,2024-06-16,83.2,139.2,154,0 days 00:08:14.354838709,2.48,,morning,3,4,0.903896
168,Evening Run,2445,0 days 00:15:08,930,7.1,Run,2024-06-18,82.6,131.6,147,0 days 00:09:57.368421052,1.52,,evening,2,3,0.895238
169,Evening Run,4921,0 days 00:30:18,1831,17.1,Run,2024-06-20,83.0,150.2,164,0 days 00:09:54.117647058,3.06,,evening,3,4,0.915854
170,Evening Run,5989,0 days 00:35:53,2170,16.5,Run,2024-06-27,82.8,137.0,151,0 days 00:09:38.763440860,3.72,11756600000.0,evening,3,3,0.907285
171,Morning Run,4392,0 days 00:24:19,1499,16.6,Run,2024-06-29,83.5,143.5,157,0 days 00:08:54.432234432,2.73,11769990000.0,morning,3,4,0.914013
172,Morning Run,11289,0 days 01:02:45,3774,43.2,Run,2024-06-30,85.2,148.9,166,0 days 00:08:57.089871611,7.01,11777170000.0,morning,3,4,0.896988


In [12]:
# tally the missing entries in each column
runs_df.isnull().sum(axis=0)

name                    0
distance_m              0
moving_time_s           0
elapsed_time_s          0
elevation_change_m      0
type                    0
date                    0
average_cadence         0
average_heartrate       0
max_heartrate           0
pace_per_mile           0
distance_mi             0
id                    170
dtype: int64

In [13]:
# normalise every header: trim whitespace, lower-case, spaces -> underscores,
# and drop any '.1' occurrence
def _snake_case(label):
    """Return the cleaned, snake_case form of a single column label."""
    return label.strip().lower().replace(' ', '_').replace('.1', '')

column_names = {label: _snake_case(label) for label in runs_df.columns}
runs_df = runs_df.rename(columns=column_names)

In [14]:
# derive a time-of-day category from the first word of the activity name,
# folding 'lunch' into 'afternoon'
first_word = runs_df['name'].str.lower().str.split(' ').str[0]
runs_df['time_of_day'] = first_word.replace({'lunch': 'afternoon'}).astype('category')
runs_df['time_of_day']

0        evening
1        morning
2        evening
3      afternoon
4      afternoon
         ...    
168      evening
169      evening
170      evening
171      morning
172      morning
Name: time_of_day, Length: 173, dtype: category
Categories (4, object): ['afternoon', 'evening', 'morning', 'night']

In [15]:
# determine average running zone based on average heart rate

AGE = 29
MAX_HEART_RATE = 220 - AGE  # maximum heart rate estimated from age

def find_zone(avg_heart_rate, max_heart_rate=MAX_HEART_RATE):
    """Return the heart-rate zone (1-5) for a single heart-rate reading.

    Zones are bands of the maximum heart rate: below 60% is zone 1, below 70%
    zone 2, below 80% zone 3, below 90% zone 4, and anything else zone 5.

    Parameters
    ----------
    avg_heart_rate : float
        Heart-rate reading to classify.
    max_heart_rate : float, optional
        Maximum heart rate the bands are measured against. Defaults to the
        module-level MAX_HEART_RATE.

    Returns
    -------
    int or float
        The zone number, or NaN when the reading is missing.
    """
    # A missing reading fails every `<` comparison and would otherwise fall
    # through to zone 5, so report it as missing instead.
    if pd.isna(avg_heart_rate):
        return float('nan')

    # Upper bound of zones 1-4 as a fraction of the maximum heart rate.
    for zone, upper_fraction in enumerate((0.6, 0.7, 0.8, 0.9), start=1):
        if avg_heart_rate < upper_fraction * max_heart_rate:
            return zone
    return 5

In [16]:
# classify both heart-rate columns into zones, then relate the average to the peak
avg_hr = runs_df['average_heartrate']
peak_hr = runs_df['max_heartrate']
runs_df['average_zone'] = avg_hr.map(find_zone)
runs_df['max_zone'] = peak_hr.map(find_zone)
runs_df['ratio_avg_hr_to_max_hr'] = avg_hr.div(peak_hr)

In [17]:
# write the enriched runs table to run_data.csv, leaving the index out
runs_df.to_csv(path_or_buf='run_data.csv', index=False)

In [18]:
def load_df():
    """Return the notebook-level runs_df DataFrame (the same object, not a copy)."""
    frame = runs_df
    return frame