#### Importing packages and functions

In [104]:
# Importing functions from other files
from source.authorize import get_token
from source.get_data import get_activity_data

# Importing packages
import pandas as pd
import numpy as np
from datetime import datetime
import math

#### Setting intial variables

In [None]:
# Setting initial variables
ACTIVITIES_PER_PAGE = 200
page = 1
next_page = True

#### Getting the data using the strava api

In [34]:
# Getting the token to access the api
token = get_token()

# The maximum number of activities that can be extracted from one page is 200
# Therefore we extract 200 activities per page, until there are no more activities on further pages
# A boolean variable determines if there are entries in the current and the next pages
while next_page:
    
    # Setting the parameters for the get request
    params = {'per_page': ACTIVITIES_PER_PAGE, 'page': page}

    # Extracting the activity data
    data = get_activity_data(token, params=params)

    # Saving the activity data in a data frame and setting index
    df = pd.json_normalize(data)
    df.set_index(np.arange((page - 1) * ACTIVITIES_PER_PAGE, (page - 1) * ACTIVITIES_PER_PAGE + len(df)), inplace=True)

    # Check if there are entries for the current page and if not set the boolean variable accordingly
    if len(df) == 0:
        next_page = False
    else:

        # Adding the current data to the final activity data frame
        if page == 1:
            activities = df
        else:
            activities = activities.append(df)
    
    # Moving to the next page
    page += 1

# Determing the filename and filepath and saving the data as csv
timestamp = datetime.now().strftime('%d%m%Y')
filepath = f'data/{timestamp}_activitydata.csv'
activities.to_csv(filepath, index=False)   



Sucess
Sucess


  activities = activities.append(df)


Sucess


  activities = activities.append(df)


Sucess


#### First exploration and cleaning of the data

In [36]:
activities.head()

Unnamed: 0,resource_state,name,distance,moving_time,elapsed_time,total_elevation_gain,type,sport_type,id,start_date,...,athlete.id,athlete.resource_state,map.id,map.summary_polyline,map.resource_state,workout_type,average_cadence,average_watts,kilojoules,device_watts
0,2,Evening Swim,3150.0,5420,6328,0.0,Swim,Swim,11166733535,2024-04-12T18:04:35Z,...,43791935,1,a11166733535,,2,,,,,
1,2,Evening Workout,0.0,3882,3882,0.0,Workout,Workout,11152338412,2024-04-10T18:53:14Z,...,43791935,1,a11152338412,,2,,,,,
2,2,Afternoon Run,5370.4,1864,1867,22.0,Run,Run,11142753885,2024-04-09T15:47:51Z,...,43791935,1,a11142753885,et_mH{hsr@HLD?Tc@n@c@lDm@P?PEDEzAm@FId@Cj@SJNZ...,2,,77.8,,,
3,2,Afternoon Ride,22456.8,3526,3526,268.0,Ride,Ride,11071374180,2024-03-30T15:51:00Z,...,43791935,1,a11071374180,giihHypwx@rEoD`Bw@xAa@lBADcDCgEDs@Ew@PwBJWJGdC...,2,,,185.3,653.5,False
4,2,Afternoon Swim,2500.0,3315,3365,0.0,Swim,Swim,11071369686,2024-03-29T14:09:37Z,...,43791935,1,a11071369686,,2,,,,,


In [40]:
activities.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 579 entries, 0 to 578
Data columns (total 56 columns):
 #   Column                         Non-Null Count  Dtype  
---  ------                         --------------  -----  
 0   resource_state                 579 non-null    int64  
 1   name                           579 non-null    object 
 2   distance                       579 non-null    float64
 3   moving_time                    579 non-null    int64  
 4   elapsed_time                   579 non-null    int64  
 5   total_elevation_gain           579 non-null    float64
 6   type                           579 non-null    object 
 7   sport_type                     579 non-null    object 
 8   id                             579 non-null    int64  
 9   start_date                     579 non-null    object 
 10  start_date_local               579 non-null    object 
 11  timezone                       579 non-null    object 
 12  utc_offset                     579 non-null    flo

From the info we can extract the first findings:
- We have 56 from which only some are interesting for us
- The datatype of the colums seem to be correct
- There are several columns with missing values
- The two heartrate colums have missing values, since some workouts were recorded with my phone which does not provide heartrate data and some with my sportswatch

The next step is to extract the colums we actually need

In [58]:
result = activities[['name', 'distance', 'moving_time', 'elapsed_time', 'total_elevation_gain', 'sport_type', 'start_date', 'start_date_local', 'timezone', 'achievement_count', 'start_latlng', 'end_latlng', 'average_speed', 'max_speed', 'has_heartrate', 'average_heartrate', 'max_heartrate', 'elev_high', 'elev_low', 'average_cadence', 'average_watts']]
print(result.shape)

(579, 21)


Now we reduced the number of columns to 21

In [59]:
result.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 579 entries, 0 to 578
Data columns (total 21 columns):
 #   Column                Non-Null Count  Dtype  
---  ------                --------------  -----  
 0   name                  579 non-null    object 
 1   distance              579 non-null    float64
 2   moving_time           579 non-null    int64  
 3   elapsed_time          579 non-null    int64  
 4   total_elevation_gain  579 non-null    float64
 5   sport_type            579 non-null    object 
 6   start_date            579 non-null    object 
 7   start_date_local      579 non-null    object 
 8   timezone              579 non-null    object 
 9   achievement_count     579 non-null    int64  
 10  start_latlng          579 non-null    object 
 11  end_latlng            579 non-null    object 
 12  average_speed         579 non-null    float64
 13  max_speed             579 non-null    float64
 14  has_heartrate         579 non-null    bool   
 15  average_heartrate     5

The next step is to deal with the missing value in the heartrate columns and in the colums average_cadence and average_watts

Lets check, if there are just cadence values for running activities and only watt values for cycling activities

In [87]:
df_no_run = result.loc[result['sport_type'] != 'Run']
num_cadence = df_no_run.loc[df_no_run['average_cadence'] > 0].shape[0]
print(f'There are {num_cadence} non-running activities with a cadence')

There are 1 non-running activities with a cadence


In [85]:
df_no_run.loc[df_no_run['average_cadence'] > 0]

Unnamed: 0,name,distance,moving_time,elapsed_time,total_elevation_gain,sport_type,start_date,start_date_local,timezone,achievement_count,...,end_latlng,average_speed,max_speed,has_heartrate,average_heartrate,max_heartrate,elev_high,elev_low,average_cadence,average_watts
284,Lunch Walk,6615.1,5866,9183,85.0,Walk,2022-04-21T08:21:12Z,2022-04-21T11:21:12Z,(GMT+02:00) Europe/Tallinn,0,...,"[59.470855593681335, 25.637509049847722]",1.128,3.678,True,78.8,101.0,70.0,58.4,47.3,


There is on walking activitly with a value for average cadence. But there are no cycling or similar activities with a cadence value

In [92]:
df_run = result.loc[result['sport_type'] == 'Run']
nan_values = df_run.average_cadence.isna().sum()
print(f'There are {nan_values} nan values for cadence for running activities')

There are 23 nan values for cadence for running activities


In [103]:
print(f'Mean cadence: {np.mean(df_run.average_cadence)}')
print(f'Stadard deviation: {np.std(df_run.average_cadence)}')

Mean cadence: 79.94406779661014
Stadard deviation: 1.3041356883808923


In [109]:
nan_index = df_run.loc[df_run.average_cadence.isna()].index
replace_value = math.ceil(np.mean(df_run.average_cadence))
result.loc[nan_index, 'average_cadence'] = replace_value
result.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 579 entries, 0 to 578
Data columns (total 21 columns):
 #   Column                Non-Null Count  Dtype  
---  ------                --------------  -----  
 0   name                  579 non-null    object 
 1   distance              579 non-null    float64
 2   moving_time           579 non-null    int64  
 3   elapsed_time          579 non-null    int64  
 4   total_elevation_gain  579 non-null    float64
 5   sport_type            579 non-null    object 
 6   start_date            579 non-null    object 
 7   start_date_local      579 non-null    object 
 8   timezone              579 non-null    object 
 9   achievement_count     579 non-null    int64  
 10  start_latlng          579 non-null    object 
 11  end_latlng            579 non-null    object 
 12  average_speed         579 non-null    float64
 13  max_speed             579 non-null    float64
 14  has_heartrate         579 non-null    bool   
 15  average_heartrate     5