In [98]:
import pandas as pd
import ast
from scipy import stats
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split

# Preprocessing

## 1. Merge Garmin and Strava Activity Datasets

In [101]:
garmin_df = pd.read_csv("erick-garmin-activities.csv")
strava_df = pd.read_csv("max-strava-activities.csv")

### 1.1 Filter for Only Runs

In [103]:
garmin_runs_df = garmin_df[garmin_df["Activity Type"] == "Running"]
strava_runs_df = strava_df[strava_df["type"] == "Run"]

### 1.2 Extract Average GAP from Strava DF

In [105]:
def compute_weighted_avg_grade_adjusted_speed(splits):
    try:
        splits = ast.literal_eval(splits)
        total_weighted_speed = sum(split['average_grade_adjusted_speed'] * split['distance'] for split in splits)
        total_distance = sum(split['distance'] for split in splits)
        weighted_avg_speed = total_weighted_speed / total_distance if total_distance != 0 else 0
        return weighted_avg_speed
    except Exception:
        return None # We will decide whether we want GAP or not later on (removes ~26 runs if not).

In [106]:
strava_runs_df["average_grade_adjusted_speed"] = strava_runs_df["splits_metric"].apply(compute_weighted_avg_grade_adjusted_speed)

### 1.3 Column Unit Conversions
| Garmin    | Garmin Units | Strava | Strava Units |
| --------- | ------------ | ------ | ------------ |
| Distance | km | distance | m |
| Avg Run Cadence | spm | average_cadence | half of garmin's value |
| Avg Pace | min/km | average_speed | m/s |
| Best Pace | min/km | max_speed | m/s |
| Avg GAP | min/km | average_grade_adjusted_speed | m/s |
| Elapsed Time | min:sec | elapsed_time | sec |
| Moving Time | min:sec | moving_time | sec |

In [108]:
def pace_to_mps(pace_str):
    if pace_str == "--":
        return None
    mins, secs = map(int, pace_str.split(":"))
    minutes_per_km = mins + secs / 60
    meters_per_second = 1000 / (minutes_per_km * 60)
    return meters_per_second

In [109]:
def time_str_to_sec(time):
    hrs, mins, secs = map(float, time.split(":"))
    return (hrs * 3600) + (mins * 60) + secs

In [110]:
# Convert Garmin units to Strava units (except cadence, which we will follow Garmin's units)
garmin_runs_df_converted = garmin_runs_df.copy()
strava_runs_df_converted = strava_runs_df.copy()
garmin_runs_df_converted["Distance"] *= 1000
# strava_runs_df_converted["average_cadence"] *= 2 # Apparently Apple Watch doesn't record cadence.
garmin_runs_df_converted["Avg Pace"] = garmin_runs_df_converted["Avg Pace"].apply(pace_to_mps)
garmin_runs_df_converted["Best Pace"] = garmin_runs_df_converted["Best Pace"].apply(pace_to_mps)
garmin_runs_df_converted["Avg GAP"] = garmin_runs_df_converted["Avg GAP"].apply(pace_to_mps)
garmin_runs_df_converted["Elapsed Time"] = garmin_runs_df_converted["Elapsed Time"].apply(time_str_to_sec)
garmin_runs_df_converted["Moving Time"] = garmin_runs_df_converted["Moving Time"].apply(time_str_to_sec)

### 1.4 Column Name Coversion & Merge

In [112]:
# Apple Watch doesn't seem to record power as well.
garmin_to_strava_cols = {
    "Distance": "distance",
    "Calories": "calories",
    "Avg HR": "average_heartrate",
    "Max HR": "max_heartrate",
    # "Avg Run Cadence": "average_cadence",
    "Avg Pace": "average_speed",
    "Best Pace": "max_speed",
    "Avg GAP": "average_grade_adjusted_speed",
    "Total Ascent": "total_elevation_gain",
    # "Avg Power": "average_watts",
    # "Max Power": "max_watts",
    "Elapsed Time": "elapsed_time",
    "Moving Time": "moving_time",
    "Min Elevation": "elev_low",
    "Max Elevation": "elev_high"
}

In [113]:
garmin_runs_df_converted = garmin_runs_df_converted.rename(columns=garmin_to_strava_cols)

desired_columns = list(garmin_to_strava_cols.values())
garmin_filtered_df = garmin_runs_df_converted[desired_columns]
strava_filtered_df = strava_runs_df_converted[desired_columns]

combined_df = pd.concat([garmin_filtered_df, strava_filtered_df], ignore_index=True)

In [114]:
# Dropped runs with null heartrate, grade adjusted speed, and elevation, reducing our data set by about 30 runs in total.
# figured this was better than replacing null values with averages
combined_df_null_removed = combined_df.dropna(subset=['average_heartrate', 'average_grade_adjusted_speed', 'elev_low', 'elev_high']).reset_index(drop=True)
combined_df_null_removed

Unnamed: 0,distance,calories,average_heartrate,max_heartrate,average_speed,max_speed,average_grade_adjusted_speed,total_elevation_gain,elapsed_time,moving_time,elev_low,elev_high
0,6510.0,351,158,175,3.311258,10.309278,3.311258,37,2014.0,1964.0,324,346
1,6180.0,317,149,171,3.322259,3.968254,3.300330,23,2005.0,1852.0,178,190
2,5040.0,269,156,170,3.267974,3.521127,3.257329,21,1580.0,1538.0,169,186
3,4930.0,259,157,169,3.378378,4.716981,3.355705,41,1492.0,1458.0,328,350
4,5510.0,297,156,171,3.194888,4.184100,3.154574,27,1778.0,1714.0,337,351
...,...,...,...,...,...,...,...,...,...,...,...,...
384,10473.1,530.0,143.8,166.0,3.035000,4.500000,3.027005,130.2,3847.0,3451.0,106.1,190.2
385,10309.2,536.0,150.6,172.0,2.652000,5.200000,2.737829,109.2,3941.0,3888.0,34.6,148.9
386,9971.0,518.0,142.6,175.0,2.912000,5.200000,3.174127,146.1,3458.0,3424.0,106.1,152.2
387,1836.2,91.0,112.0,112.0,3.680000,5.300000,3.598320,10.6,499.0,499.0,118.3,189.8


In [115]:
# Remove runs that have data that is above or below 3 z-scores of the average for its column, besides for distance and moving_time which can be highly variable
columns_to_include = [col for col in combined_df_null_removed.select_dtypes(include=['float64', 'int64']).columns if col not in ['distance', 'moving_time']]
z_scores = stats.zscore(combined_df_null_removed[columns_to_include])
z_score_threshold = 3
mask = (abs(z_scores) < z_score_threshold).all(axis=1)
combined_df_filtered = combined_df_null_removed[mask].reset_index(drop=True)

combined_df_filtered

Unnamed: 0,distance,calories,average_heartrate,max_heartrate,average_speed,max_speed,average_grade_adjusted_speed,total_elevation_gain,elapsed_time,moving_time,elev_low,elev_high
0,6510.0,351,158,175,3.311258,10.309278,3.311258,37,2014.0,1964.0,324,346
1,6180.0,317,149,171,3.322259,3.968254,3.300330,23,2005.0,1852.0,178,190
2,5040.0,269,156,170,3.267974,3.521127,3.257329,21,1580.0,1538.0,169,186
3,4930.0,259,157,169,3.378378,4.716981,3.355705,41,1492.0,1458.0,328,350
4,5510.0,297,156,171,3.194888,4.184100,3.154574,27,1778.0,1714.0,337,351
...,...,...,...,...,...,...,...,...,...,...,...,...
368,10473.1,530.0,143.8,166.0,3.035000,4.500000,3.027005,130.2,3847.0,3451.0,106.1,190.2
369,10309.2,536.0,150.6,172.0,2.652000,5.200000,2.737829,109.2,3941.0,3888.0,34.6,148.9
370,9971.0,518.0,142.6,175.0,2.912000,5.200000,3.174127,146.1,3458.0,3424.0,106.1,152.2
371,1836.2,91.0,112.0,112.0,3.680000,5.300000,3.598320,10.6,499.0,499.0,118.3,189.8


In [116]:
# Convert object columns to be numeric
combined_df_filtered['calories'] = pd.to_numeric(combined_df_filtered['calories'], errors='coerce')
combined_df_filtered['average_heartrate'] = pd.to_numeric(combined_df_filtered['average_heartrate'], errors='coerce')
combined_df_filtered['max_heartrate'] = pd.to_numeric(combined_df_filtered['max_heartrate'], errors='coerce')
combined_df_filtered['total_elevation_gain'] = pd.to_numeric(combined_df_filtered['total_elevation_gain'], errors='coerce')
combined_df_filtered['elev_low'] = pd.to_numeric(combined_df_filtered['elev_low'], errors='coerce')
combined_df_filtered['elev_high'] = pd.to_numeric(combined_df_filtered['elev_high'], errors='coerce')

In [117]:
# Standardize data to have a mean of 0 and a standard deviation of 1
columns_to_scale = combined_df_filtered.select_dtypes(include=['float64', 'int64']).columns
scaler = StandardScaler()
combined_df_scaled = combined_df_filtered.copy()  # Create a copy to keep the original intact
combined_df_scaled[columns_to_scale] = scaler.fit_transform(combined_df_scaled[columns_to_scale])
combined_df_scaled

Unnamed: 0,distance,calories,average_heartrate,max_heartrate,average_speed,max_speed,average_grade_adjusted_speed,total_elevation_gain,elapsed_time,moving_time,elev_low,elev_high
0,0.624165,0.655087,1.116701,0.776841,0.660822,3.558670,0.473422,-0.572274,0.218892,0.431056,1.510506,1.548365
1,0.446882,0.310736,0.174966,0.386458,0.703884,-0.876152,0.431843,-0.943081,0.204640,0.243666,0.016445,-0.427371
2,-0.165550,-0.175407,0.907427,0.288862,0.491389,-1.188867,0.268236,-0.996054,-0.468382,-0.281695,-0.075655,-0.478031
3,-0.224644,-0.276686,1.012064,0.191267,0.923559,-0.352503,0.642528,-0.466329,-0.607737,-0.415545,1.551440,1.599025
4,0.086944,0.108176,0.907427,0.386458,0.205301,-0.725192,-0.122718,-0.837136,-0.154833,0.012775,1.643539,1.611690
...,...,...,...,...,...,...,...,...,...,...,...,...
368,2.753226,2.467993,-0.369148,-0.101521,-0.420569,-0.504257,-0.608085,1.896246,3.121595,2.918991,-0.719329,-0.424838
369,2.665175,2.528760,0.342385,0.484054,-1.919791,-0.014687,-1.708317,1.340035,3.270451,3.650146,-1.451010,-0.947902
370,2.483487,2.346457,-0.494713,0.776841,-0.902042,-0.014687,-0.048326,2.317378,2.505582,2.873817,-0.719329,-0.906108
371,-1.886699,-1.978185,-3.696613,-5.371691,2.104232,0.055252,1.565614,-1.271511,-2.180232,-2.020070,-0.594483,-0.429904
