In [16]:
# import packages

import numpy as np
import pandas as pd
from datetime import datetime
# show every column when a DataFrame is rendered (None disables the column limit)
pd.options.display.max_columns = None

In [17]:
# CSV export of a published Google Sheets document holding the activity data
link = 'https://docs.google.com/spreadsheets/d/e/2PACX-1vTk6t3NoeAIDC82RnfQO9As7eyuvVRx-Y8uwHoeJqefuXNvMhiyFdD0TizGwu1ktsnOeWGm9J0LahpR/pub?output=csv'
strava_data = pd.read_csv(link, header=0)
# Keep only the rows whose type is 'Run'. The explicit .copy() makes runs_df an
# independent frame, so the column assignments in later cells modify runs_df
# itself rather than a slice of strava_data (which triggers pandas'
# SettingWithCopyWarning).
runs_df = strava_data.loc[strava_data['type'] == 'Run'].copy()
runs_df.tail()

Unnamed: 0,name,distance_m,moving_time_s,elapsed_time_s,elevation_change_m,type,date,average_cadence,average_heartrate,max_heartrate,pace_per_mile,distance_mi,id
173,Evening Run,3361,1301,1381,12.0,Run,2024-07-03 20:13:00,83.4,149.0,161,10:23,2.09,11803280000.0
174,Lunch Run,5455,1837,1841,12.8,Run,2024-07-04 12:28:25,84.7,145.7,165,9:02,3.39,11809430000.0
175,Morning Run,6643,2091,2098,17.7,Run,2024-07-06 10:32:56,85.9,154.9,172,8:27,4.13,11824170000.0
176,Afternoon Run,9989,3327,3327,33.8,Run,2024-07-07 17:38:54,85.7,148.3,166,8:56,6.21,11834140000.0
177,Evening Run,6974,2368,2399,21.3,Run,2024-07-09 19:11:55,83.7,146.6,160,9:06,4.33,11850750000.0


In [18]:
# Parse the date strings, turn the moving time (seconds) into a Timedelta and
# recompute the pace as moving time divided by distance in miles.
# DataFrame.assign builds a new frame instead of writing into runs_df, which at
# this point is a filtered slice of strava_data; that avoids pandas'
# SettingWithCopyWarning. The keyword arguments are evaluated in order, so
# pace_per_mile sees the already-converted moving_time_s column.
runs_df = runs_df.assign(
    date=lambda df: pd.to_datetime(df['date'], format='mixed'),
    moving_time_s=lambda df: pd.to_timedelta(df['moving_time_s'], unit='s'),
    pace_per_mile=lambda df: df['moving_time_s'] / df['distance_mi'],
)
runs_df.sort_values(by=['date']).tail(10)

Unnamed: 0,name,distance_m,moving_time_s,elapsed_time_s,elevation_change_m,type,date,average_cadence,average_heartrate,max_heartrate,pace_per_mile,distance_mi,id
168,Evening Run,2445,0 days 00:15:08,930,7.1,Run,2024-06-18 00:00:00,82.6,131.6,147,0 days 00:09:57.368421052,1.52,
169,Evening Run,4921,0 days 00:30:18,1831,17.1,Run,2024-06-20 20:07:24,83.0,150.2,164,0 days 00:09:54.117647058,3.06,
170,Evening Run,5989,0 days 00:35:53,2170,16.5,Run,2024-06-27 19:25:37,82.8,137.0,151,0 days 00:09:38.763440860,3.72,11756600000.0
171,Morning Run,4392,0 days 00:24:19,1499,16.6,Run,2024-06-29 10:02:16,83.5,143.5,157,0 days 00:08:54.432234432,2.73,11769990000.0
172,Morning Run,11289,0 days 01:02:45,3774,43.2,Run,2024-06-30 10:22:11,85.2,148.9,166,0 days 00:08:57.089871611,7.01,11777170000.0
173,Evening Run,3361,0 days 00:21:41,1381,12.0,Run,2024-07-03 20:13:00,83.4,149.0,161,0 days 00:10:22.488038277,2.09,11803280000.0
174,Lunch Run,5455,0 days 00:30:37,1841,12.8,Run,2024-07-04 12:28:25,84.7,145.7,165,0 days 00:09:01.887905604,3.39,11809430000.0
175,Morning Run,6643,0 days 00:34:51,2098,17.7,Run,2024-07-06 10:32:56,85.9,154.9,172,0 days 00:08:26.295399515,4.13,11824170000.0
176,Afternoon Run,9989,0 days 00:55:27,3327,33.8,Run,2024-07-07 17:38:54,85.7,148.3,166,0 days 00:08:55.748792270,6.21,11834140000.0
177,Evening Run,6974,0 days 00:39:28,2399,21.3,Run,2024-07-09 19:11:55,83.7,146.6,160,0 days 00:09:06.882217090,4.33,11850750000.0


In [19]:
# number of missing values in each column
runs_df.isna().sum(axis=0)

name                    0
distance_m              0
moving_time_s           0
elapsed_time_s          0
elevation_change_m      0
type                    0
date                    0
average_cadence         0
average_heartrate       0
max_heartrate           0
pace_per_mile           0
distance_mi             0
id                    170
dtype: int64

In [20]:
# normalise column labels: trim whitespace, lower-case, spaces -> underscores,
# and drop any '.1' (presumably the suffix pandas adds to duplicated headers)
column_names = {}
for column in runs_df.columns:
    column_names[column] = column.strip().lower().replace(' ', '_').replace('.1', '')
runs_df = runs_df.rename(column_names, axis='columns')

In [21]:
# derive a categorical time-of-day label from the first word of the run name;
# 'lunch' is folded into 'afternoon'
runs_df['time_of_day'] = (
    runs_df['name']
    .str.lower()
    .str.split(' ')
    .str[0]
    .replace({'lunch': 'afternoon'})
    .astype('category')
)
runs_df['time_of_day']

0        evening
1        morning
2        evening
3      afternoon
4      afternoon
         ...    
173      evening
174    afternoon
175      morning
176    afternoon
177      evening
Name: time_of_day, Length: 178, dtype: category
Categories (4, object): ['afternoon', 'evening', 'morning', 'night']

In [22]:
# determine average running zone based on average heart rate

AGE = 29
# estimated maximum heart rate, computed as 220 minus AGE
MAX_HEART_RATE = 220 - AGE

def find_zone(avg_heart_rate, max_heart_rate=MAX_HEART_RATE):
    """Map a heart rate to a training zone from 1 to 5.

    The zone boundaries are 60%, 70%, 80% and 90% of ``max_heart_rate``; a
    value below the first boundary is zone 1 and a value at or above the last
    boundary is zone 5.

    Parameters
    ----------
    avg_heart_rate : float
        Heart rate to classify, in the same unit as ``max_heart_rate``.
    max_heart_rate : float, optional
        Reference maximum heart rate. Defaults to ``MAX_HEART_RATE``.

    Returns
    -------
    int or float
        The zone number (1-5), or NaN when ``avg_heart_rate`` is missing.
    """
    # A missing value fails every "<" comparison below and would otherwise be
    # reported as zone 5, so it is passed through as NaN instead.
    if pd.isna(avg_heart_rate):
        return np.nan
    for zone, upper_fraction in enumerate((0.6, 0.7, 0.8, 0.9), start=1):
        if avg_heart_rate < upper_fraction * max_heart_rate:
            return zone
    return 5

In [23]:
# zone of the average and of the maximum heart rate, plus their ratio
avg_hr = runs_df['average_heartrate']
max_hr = runs_df['max_heartrate']
runs_df['average_zone'] = avg_hr.apply(find_zone)
runs_df['max_zone'] = max_hr.apply(find_zone)
runs_df['ratio_avg_hr_to_max_hr'] = avg_hr.div(max_hr)

In [24]:
# write the processed runs to disk; the index is not written
runs_df.to_csv(path_or_buf='run_data.csv', index=False)

In [25]:
def load_df() -> pd.DataFrame:
    """Return the notebook's module-level ``runs_df`` DataFrame.

    The frame is returned as-is (no copy is made), so changes a caller makes
    to the result are visible through ``runs_df`` as well.
    """
    return runs_df