In [84]:
import numpy as np
import pandas as pd
import os
from datetime import time, datetime, timedelta

Project of predicting the time of a running activity

# Load data

In [54]:
# Extract project data
PROJECT_URL = 'https://raw.githubusercontent.com/AdrianSzymczyk/running-ml-project/main/data/activity_log.csv'
data = pd.read_csv(PROJECT_URL)
data.head()

Unnamed: 0,Activity Type,Date,Title,Distance,Calories,Time,Avg HR,Max HR,Avg Run Cadence,Max Run Cadence,Avg Pace,Best Pace,Elev Gain,Elev Loss,Avg Stride Length,Best Lap Time,Number of Laps
0,Running,7/15/20 9:41,Cherry Hill Running,6.0,530,0:43:55,141,160,176,182,7:19,6:20,169,173,1.26,00:02.3,7
1,Running,7/14/20 17:45,Cherry Hill Running,6.5,587,0:47:04,144,160,172,182,7:14,6:35,183,187,1.29,03:32.7,7
2,Running,7/13/20 18:57,Cherry Hill Running,5.01,392,0:40:29,128,151,170,180,8:05,5:49,124,124,1.17,00:04.1,6
3,Running,7/12/20 18:44,Cherry Hill Running,7.01,633,0:52:55,142,157,172,180,7:33,5:00,215,219,1.24,00:05.1,8
4,Running,7/11/20 19:35,Cherry Hill Running,5.19,419,0:41:35,129,143,170,178,8:01,6:48,76,80,1.18,01:27.1,6


In [55]:
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 689 entries, 0 to 688
Data columns (total 17 columns):
 #   Column             Non-Null Count  Dtype  
---  ------             --------------  -----  
 0   Activity Type      689 non-null    object 
 1   Date               689 non-null    object 
 2   Title              689 non-null    object 
 3   Distance           689 non-null    float64
 4   Calories           689 non-null    object 
 5   Time               689 non-null    object 
 6   Avg HR             689 non-null    object 
 7   Max HR             689 non-null    object 
 8   Avg Run Cadence    689 non-null    object 
 9   Max Run Cadence    689 non-null    object 
 10  Avg Pace           689 non-null    object 
 11  Best Pace          689 non-null    object 
 12  Elev Gain          689 non-null    object 
 13  Elev Loss          689 non-null    object 
 14  Avg Stride Length  689 non-null    float64
 15  Best Lap Time      689 non-null    object 
 16  Number of Laps     689 non

# Preprocessing

### Delete rows with missing values 

In [None]:
data = data[~data[['Distance', 'Avg HR', 'Max HR', 'Avg Pace']].isin(['--']).any(axis=1)]

In [98]:
data[~data['Time'].apply(lambda x: x[-3] == ':')]

Unnamed: 0,Activity Type,Date,Title,Distance,Calories,Time,Avg HR,Max HR,Avg Run Cadence,Max Run Cadence,Avg Pace,Best Pace,Elev Gain,Elev Loss,Avg Stride Length,Best Lap Time,Number of Laps
91,Running,3/18/20 17:11,Cherry Hill Running,1.93,58,09:18.1,102,140,170,178,290,213,19,3,1.21,01:38.3,2
123,Running,8/24/19 19:48,Cherry Hill Running,1.26,63,05:57.7,129,150,170,212,286,149,7,7,1.23,00:00.0,1
124,Running,8/24/19 19:27,Cherry Hill Running,1.19,74,07:51.0,121,128,162,168,395,321,12,12,0.94,00:00.0,1
157,Running,4/30/19 17:20,Baltimore Running,1.27,81,06:31.1,145,152,168,208,307,277,43,46,1.16,00:00.0,1
158,Running,4/30/19 17:13,Baltimore Running,1.66,73,05:14.4,163,181,188,208,190,143,37,43,1.67,00:06.4,2
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
583,Running,7/23/18 18:11,Baltimore Running,0.45,22,02:12.7,117,134,176,182,291,72,35,--,1.17,00:00.0,1
591,Running,7/18/18 18:57,Baltimore Running,1.63,63,05:15.2,134,143,186,194,194,183,10,5,1.66,00:00.0,2
601,Running,7/10/18 18:46,Baltimore Running,0.53,31,02:21.8,141,154,174,186,271,227,--,--,1.27,00:00.0,1
611,Running,7/3/18 21:05,Baltimore Running,1.61,78,05:22.2,160,175,184,190,201,183,22,34,1.63,00:00.0,1


## Delete other workouts that running

In [57]:
data.drop(data[data['Activity Type'] != 'Running'].index, inplace=True)

In [58]:
data = data.reset_index(drop=True)

### Convert data units from Miles to Kilometers

In [59]:
mile = 1.60934 

In [66]:
data.head(2)

Unnamed: 0,Activity Type,Date,Title,Distance,Calories,Time,Avg HR,Max HR,Avg Run Cadence,Max Run Cadence,Avg Pace,Best Pace,Elev Gain,Elev Loss,Avg Stride Length,Best Lap Time,Number of Laps
0,Running,7/15/20 9:41,Cherry Hill Running,9.66,530,0:43:55,141,160,176,182,273,236,169,173,1.26,00:02.3,7
1,Running,7/14/20 17:45,Cherry Hill Running,10.46,587,0:47:04,144,160,172,182,270,245,183,187,1.29,03:32.7,7


#### Distance conversion

In [61]:
data['Distance'] = data['Distance'].apply(lambda x: round(x * mile, 2))

#### Pace conversion 

In [62]:
def pace_to_km(pace):
    mile = 1.60934 
    minutes, seconds = pace.split(':')
    pace_in_seconds = int(minutes) * 60 + int(seconds)
    pace_per_km = round(pace_in_seconds / mile)
    return pace_per_km

#### Avg Pace conversion

In [63]:
data['Avg Pace'] = data['Avg Pace'].apply(pace_to_km)

#### Best Pace conversion

In [64]:
data['Best Pace'] = data['Best Pace'].apply(pace_to_km)

### Convert Time into numeric representation as total number of seconds

In [82]:
data['Time'][0]

'0:43:55'

In [86]:
data['Time'].apply(lambda x: datetime.strptime(x,'%H:%M:%S').time())

ValueError: time data '09:18.1' does not match format '%H:%M:%S'

In [73]:
pd.to_timedelta(data['Time'].astype(str)).dt.total_seconds()

ValueError: expected hh:mm:ss format before .