#### Importing packages and functions

In [368]:
# Importing functions from other files
from source.authorize import get_token
from source.get_data import get_activity_data

# Importing packages
import pandas as pd
import numpy as np
from datetime import datetime
import math

#### Setting intial variables

In [369]:
# Setting initial variables
ACTIVITIES_PER_PAGE = 200
page = 1
next_page = True

#### Getting the data using the strava api

In [370]:
# Getting the token to access the api
token = get_token()

# The maximum number of activities that can be extracted from one page is 200
# Therefore we extract 200 activities per page, until there are no more activities on further pages
# A boolean variable determines if there are entries in the current and the next pages
while next_page:
    
    # Setting the parameters for the get request
    params = {'per_page': ACTIVITIES_PER_PAGE, 'page': page}

    # Extracting the activity data
    data = get_activity_data(token, params=params)

    # Saving the activity data in a data frame and setting index
    df_data = pd.json_normalize(data)
    df_data.set_index(np.arange((page - 1) * ACTIVITIES_PER_PAGE, (page - 1) * ACTIVITIES_PER_PAGE + len(data)), inplace=True)

    # Check if there are entries for the current page and if not set the boolean variable accordingly
    if len(df_data) == 0:
        next_page = False
    else:

        # Adding the current data to the final activity data frame
        if page == 1:
            df = df_data
        else:
            df = df.append(df_data)
    
    # Moving to the next page
    page += 1

# Determing the filename and filepath and saving the data as csv
timestamp = datetime.now().strftime('%d%m%Y')
filepath = f'data/{timestamp}_activitydata.csv'
df.to_csv(filepath, index=False)   



Sucess
Sucess


  df = df.append(df_data)


Sucess


  df = df.append(df_data)


Sucess


#### First exploration and cleaning of the data

In [371]:
df.head()

Unnamed: 0,resource_state,name,distance,moving_time,elapsed_time,total_elevation_gain,type,sport_type,id,start_date,...,athlete.id,athlete.resource_state,map.id,map.summary_polyline,map.resource_state,workout_type,average_cadence,average_watts,kilojoules,device_watts
0,2,Evening Swim,3150.0,5420,6328,0.0,Swim,Swim,11166733535,2024-04-12T18:04:35Z,...,43791935,1,a11166733535,,2,,,,,
1,2,Evening Workout,0.0,3882,3882,0.0,Workout,Workout,11152338412,2024-04-10T18:53:14Z,...,43791935,1,a11152338412,,2,,,,,
2,2,Afternoon Run,5370.4,1864,1867,22.0,Run,Run,11142753885,2024-04-09T15:47:51Z,...,43791935,1,a11142753885,et_mH{hsr@HLD?Tc@n@c@lDm@P?PEDEzAm@FId@Cj@SJNZ...,2,,77.8,,,
3,2,Afternoon Ride,22456.8,3526,3526,268.0,Ride,Ride,11071374180,2024-03-30T15:51:00Z,...,43791935,1,a11071374180,giihHypwx@rEoD`Bw@xAa@lBADcDCgEDs@Ew@PwBJWJGdC...,2,,,185.3,653.5,False
4,2,Afternoon Swim,2500.0,3315,3365,0.0,Swim,Swim,11071369686,2024-03-29T14:09:37Z,...,43791935,1,a11071369686,,2,,,,,


In [372]:
df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 579 entries, 0 to 578
Data columns (total 56 columns):
 #   Column                         Non-Null Count  Dtype  
---  ------                         --------------  -----  
 0   resource_state                 579 non-null    int64  
 1   name                           579 non-null    object 
 2   distance                       579 non-null    float64
 3   moving_time                    579 non-null    int64  
 4   elapsed_time                   579 non-null    int64  
 5   total_elevation_gain           579 non-null    float64
 6   type                           579 non-null    object 
 7   sport_type                     579 non-null    object 
 8   id                             579 non-null    int64  
 9   start_date                     579 non-null    object 
 10  start_date_local               579 non-null    object 
 11  timezone                       579 non-null    object 
 12  utc_offset                     579 non-null    flo

From the info we can extract the first findings:
- We have 56 from which only some are interesting for us
- The datatype of the colums seem to be correct
- There are several columns with missing values
- The two heartrate colums have missing values, since some workouts were recorded with my phone which does not provide heartrate data and some with my sportswatch

The next step is to extract the colums we actually need

In [373]:
df = df[['name', 'distance', 'moving_time', 'elapsed_time', 'total_elevation_gain', 'sport_type', 'start_date', 'start_date_local', 'timezone', 'achievement_count', 'start_latlng', 'end_latlng', 'average_speed', 'max_speed', 'has_heartrate', 'average_heartrate', 'max_heartrate', 'average_cadence', 'average_watts']]
print(df.shape)

(579, 19)


Now we reduced the number of columns to 21

In [374]:
df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 579 entries, 0 to 578
Data columns (total 19 columns):
 #   Column                Non-Null Count  Dtype  
---  ------                --------------  -----  
 0   name                  579 non-null    object 
 1   distance              579 non-null    float64
 2   moving_time           579 non-null    int64  
 3   elapsed_time          579 non-null    int64  
 4   total_elevation_gain  579 non-null    float64
 5   sport_type            579 non-null    object 
 6   start_date            579 non-null    object 
 7   start_date_local      579 non-null    object 
 8   timezone              579 non-null    object 
 9   achievement_count     579 non-null    int64  
 10  start_latlng          579 non-null    object 
 11  end_latlng            579 non-null    object 
 12  average_speed         579 non-null    float64
 13  max_speed             579 non-null    float64
 14  has_heartrate         579 non-null    bool   
 15  average_heartrate     5

The next step is to deal with the missing value in the heartrate columns and in the colums average_cadence and average_watts

Lets check, if there are just cadence values for running activities and only watt values for cycling activities

In [375]:
# Filtering for all activities that are not runs and have a value for average cadence
df_no_run_cadence = df.loc[(df.sport_type != 'Run') & (df.average_cadence > 0)]
num_cadence = len(df_no_run_cadence)

print(f'There are {num_cadence} non-running activities with a cadence value')

There are 1 non-running activities with a cadence value


In [376]:
df_no_run_cadence

Unnamed: 0,name,distance,moving_time,elapsed_time,total_elevation_gain,sport_type,start_date,start_date_local,timezone,achievement_count,start_latlng,end_latlng,average_speed,max_speed,has_heartrate,average_heartrate,max_heartrate,average_cadence,average_watts
284,Lunch Walk,6615.1,5866,9183,85.0,Walk,2022-04-21T08:21:12Z,2022-04-21T11:21:12Z,(GMT+02:00) Europe/Tallinn,0,"[59.46718909777701, 25.648776507005095]","[59.470855593681335, 25.637509049847722]",1.128,3.678,True,78.8,101.0,47.3,


There is one walking activitly with a value for average cadence. But there are no cycling or similar activities with a cadence value

In [377]:
# Creating a data frame with all running activities
df_run = df.loc[df.sport_type == 'Run']

# Counting all nan values
nan_values = df_run.average_cadence.isna().sum()

print(f'There are {nan_values} nan values for cadence for running activities')

There are 23 nan values for cadence for running activities


In [378]:
# Getting the earlies run with average cadence values and the latest run without average cadence values
min_date = min(df_run.loc[df_run.average_cadence > 0].start_date_local)
max_date = max(df_run.loc[df_run.average_cadence.isna()].start_date_local)

# State if the earliest run with cadence value is later than the later than the latest run without cadence value
if min_date > max_date:
    print(f'all early running activities do not provide average cadence value')

all early running activities do not provide average cadence value


In [379]:
# Filtering entries with a positive cadence value
df_positive_cadence = df_run.loc[df_run['average_cadence'] > 0]

# Calculating the coefficient of correlation
correlation_coef = np.corrcoef(df_positive_cadence.average_speed, df_positive_cadence.average_cadence)[1,0]

print(f'The correlation coefficient between the speed and cadence is {round(correlation_coef, 4)}')

The correlation coefficient between the speed and cadence is 0.7478


In [380]:
df.loc[df.sport_type == 'Run']

Unnamed: 0,name,distance,moving_time,elapsed_time,total_elevation_gain,sport_type,start_date,start_date_local,timezone,achievement_count,start_latlng,end_latlng,average_speed,max_speed,has_heartrate,average_heartrate,max_heartrate,average_cadence,average_watts
2,Afternoon Run,5370.4,1864,1867,22.0,Run,2024-04-09T15:47:51Z,2024-04-09T17:47:51Z,(GMT+01:00) Europe/Berlin,0,"[49.48442188091576, 8.463334869593382]","[49.48433294892311, 8.462995318695903]",2.881,4.270,True,160.6,173.0,77.8,
57,Lunch Run,5026.5,1626,1633,8.0,Run,2023-09-16T10:49:02Z,2023-09-16T12:49:02Z,(GMT+01:00) Europe/Berlin,12,[],[],3.091,5.786,True,171.3,188.0,79.3,
61,Afternoon Run,4565.7,1598,1598,0.0,Run,2023-08-31T15:08:52Z,2023-08-31T17:08:52Z,(GMT+02:00) Africa/Blantyre,0,[],[],2.857,3.023,True,168.3,180.0,79.3,
79,Evening Run,3166.6,1080,1080,48.0,Run,2023-05-24T17:41:28Z,2023-05-24T19:41:28Z,(GMT+01:00) Europe/Berlin,0,"[48.71102141216397, 9.457216691225767]","[48.71141946874559, 9.458307009190321]",2.932,4.046,True,149.5,161.0,78.0,
81,Morning Run,8223.9,2357,2357,36.0,Run,2023-05-21T07:43:51Z,2023-05-21T09:43:51Z,(GMT+01:00) Europe/Berlin,0,"[48.81544089876115, 9.227658156305552]","[48.815808445215225, 9.227486411109567]",3.489,4.832,True,178.2,193.0,82.0,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
566,Afternoon Run,5853.5,1923,1930,114.8,Run,2020-11-21T14:44:21Z,2020-11-21T15:44:21Z,(GMT+01:00) Europe/Berlin,0,"[48.716844, 9.466551]","[48.738663, 9.465051]",3.044,5.900,True,159.7,177.0,,
567,Lunch Run,10015.4,2824,2824,54.5,Run,2020-11-17T11:00:09Z,2020-11-17T12:00:09Z,(GMT+01:00) Europe/Berlin,8,"[48.718209, 9.464089]","[48.717987, 9.463723]",3.547,4.800,True,170.5,183.0,,
570,Lunch Run,8369.2,2480,2535,50.4,Run,2020-11-09T11:51:00Z,2020-11-09T12:51:00Z,(GMT+01:00) Europe/Berlin,8,"[48.71829, 9.463682]","[48.715654, 9.463648]",3.375,6.600,True,164.7,191.0,,
573,Lunch Run,6802.6,2090,2090,116.4,Run,2020-11-02T11:52:14Z,2020-11-02T12:52:14Z,(GMT+01:00) Europe/Berlin,6,"[48.716132, 9.464053]","[48.717844, 9.465299]",3.255,7.700,True,160.5,182.0,,


In [455]:
num_bins = 5
upper_bound_speed = 10

# Only correct values if there are nan values in the average cadence column for running activities
if df.loc[df.sport_type == 'Run'].average_cadence.isna().sum() > 0:

    # Extracting 5 bins for average speed values with equal size and saing the information in an extra colum
    df_positive_cadence.loc[:,'bins speed'] , bins_speed = pd.cut(df_positive_cadence.average_speed, num_bins, retbins=True)

    # Saving the bins in a list
    bins_speed = [[bins_speed[index-1], bins_speed[index]] if index > 0 else [0, bins_speed[index]] for index in range(num_bins)]
    bins_speed[num_bins-1][1] = upper_bound_speed # Correction the last bin to include high average speeds

    # Grouping after the bins and calculation the average cadence for each bin
    df_speed_grouped = df_positive_cadence.groupby(['bins speed']).mean()['average_cadence']

    # Round the average cadences
    average_cadences = [round(val, 1) for val in df_speed_grouped.values]

    # Defining the conditions when to assign which average cadence to which bin
    condition_list = [(df.average_speed > bins_speed[i][0]) & (df.average_speed < bins_speed[i][1]) & (df.sport_type == 'Run') & (df.average_cadence.isna()) for i in range(num_bins)]

    # Saving the corresponding average cadence to the bin in the dataframe
    df.average_cadence = np.select(condition_list, average_cadences, df.average_cadence)


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_positive_cadence.loc[:,'bins speed'] , bins_speed = pd.cut(df_positive_cadence.average_speed, num_bins, retbins=True)


In [457]:
df.loc[df.sport_type == 'Run'].isna().sum()

name                     0
distance                 0
moving_time              0
elapsed_time             0
total_elevation_gain     0
sport_type               0
start_date               0
start_date_local         0
timezone                 0
achievement_count        0
start_latlng             0
end_latlng               0
average_speed            0
max_speed                0
has_heartrate            0
average_heartrate        0
max_heartrate            0
average_cadence          0
average_watts           82
dtype: int64