In [91]:
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
import datetime
from scipy import stats



In [79]:
# Columns to keep when loading the activities export.
# NOTE(review): the names ending in ".1" are presumably pandas' de-duplicated
# labels for headers that appear twice in the CSV -- verify against the file.
cols = [
    # activity identity and description
    'Activity ID', 'Activity Date', 'Activity Name', 'Activity Type',
    'Activity Description',
    # headline summary fields
    'Elapsed Time', 'Distance', 'Max Heart Rate', 'Relative Effort',
    'Activity Gear',
    # time, distance and speed
    'Elapsed Time.1', 'Moving Time', 'Distance.1', 'Max Speed',
    'Average Speed',
    # elevation
    'Elevation Gain', 'Elevation Loss', 'Elevation Low', 'Elevation High',
    # grade
    'Max Grade', 'Average Grade', 'Average Positive Grade',
    'Average Negative Grade',
    # cadence and heart rate
    'Max Cadence', 'Average Cadence', 'Max Heart Rate.1',
    'Average Heart Rate',
    # energy, temperature and effort
    'Calories', 'Max Temperature', 'Average Temperature',
    'Relative Effort.1',
    # remaining fields
    'Number of Runs', 'Uphill Time', 'Downhill Time', 'Other Time', 'Gear',
]

# Load only the selected columns from the CSV.
df = pd.read_csv('data/activities.csv', usecols=cols)

In [80]:
# Preview the first five rows as a quick check of the loaded frame.
df.head(n=5)

Unnamed: 0,Activity ID,Activity Date,Activity Name,Activity Type,Activity Description,Elapsed Time,Distance,Max Heart Rate,Relative Effort,Activity Gear,...,Average Heart Rate,Calories,Max Temperature,Average Temperature,Relative Effort.1,Number of Runs,Uphill Time,Downhill Time,Other Time,Gear
0,1927719,"Sep 28, 2011, 4:37:41 AM","09/28/2011 Istres, Provence-Alpes-Côte d'Azur,...",Run,,1253,3.54,,,,...,,371.595459,,,,,,,,
1,1927721,"Sep 30, 2011, 4:35:32 AM","09/30/2011 Istres, Provence-Alpes-Côte d'Azur,...",Run,,746,2.44,,,,...,,259.392212,,,,,,,,
2,1927722,"Sep 26, 2011, 4:34:33 AM","09/26/2011 Istres, Provence-Alpes-Côte d'Azur,...",Run,,1252,3.52,,,,...,,373.191833,,,,,,,,
3,1927723,"Oct 10, 2011, 6:42:14 AM","10/10/2011 Cambridge, Cambridgeshire, United K...",Run,,2305,6.7,,,,...,,693.651062,,,,,,,,
4,1927724,"Sep 23, 2011, 4:35:12 AM","09/23/2011 Istres, Provence-Alpes-Côte d'Azur,...",Run,,1334,3.63,,,,...,,379.204193,,,,,,,,


In [81]:
# Rescale the 'Distance' column from kilometres to miles.
# Caution: this overwrites the column, so running the cell a second time
# applies the factor again.
KM_TO_MILES = 0.621371
df['Distance'] = df['Distance'].mul(KM_TO_MILES)

In [82]:
# Round distances to two decimal places, then drop the erroneous activities
# whose distance is nearly zero.
# (The former bare `df.sort_values(...)` call here was removed: its result was
# never assigned or displayed, so it had no effect.)
df['Distance'] = df['Distance'].round(2)

# Hard-coded row labels of the erroneous activities.
# NOTE(review): these labels depend on the row order of the loaded CSV --
# re-check them if the export is regenerated.
NEAR_ZERO_DISTANCE_ROWS = [5, 551, 554, 576, 52]

# errors='ignore' keeps this cell re-runnable: once the rows have been removed,
# a second run would otherwise raise KeyError for the missing labels.
df = df.drop(NEAR_ZERO_DISTANCE_ROWS, errors='ignore')

In [83]:
# Display the frame ordered from shortest to longest distance.
df.sort_values('Distance')


Unnamed: 0,Activity ID,Activity Date,Activity Name,Activity Type,Activity Description,Elapsed Time,Distance,Max Heart Rate,Relative Effort,Activity Gear,...,Average Heart Rate,Calories,Max Temperature,Average Temperature,Relative Effort.1,Number of Runs,Uphill Time,Downhill Time,Other Time,Gear
624,484170825,"Feb 2, 2016, 12:28:18 PM",Morning Run,Run,,142,0.25,140.0,0.0,,...,129.654922,44.203156,,,0.0,,,,,
501,216674441,"Oct 23, 2014, 12:35:11 PM",10/23/2014,Run,,188,0.38,,,,...,,63.585976,,,,,,,,
1267,2919881466,"Dec 8, 2019, 12:35:57 PM",Morning Run,Run,,206,0.39,146.0,2.0,Second black pair,...,137.000000,49.000000,,21.0,2.0,,,,,5001432.0
546,284388925,"Apr 2, 2015, 12:38:39 PM","04/02/2015 San Antonio, TX",Run,,231,0.40,,,,...,,67.666794,,,,,,,,
888,1772747527,"Aug 14, 2018, 11:48:38 PM",Warm up,Run,,267,0.50,148.0,1.0,,...,136.296585,57.000000,,33.0,1.0,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1823,6534195503,"Jan 16, 2022, 1:02:03 PM",2022 Houston Marathon,Run,3:09:36 chip time. Felt like I had the potenti...,11380,26.54,185.0,470.0,,...,166.256088,3275.000000,,8.0,470.0,,,,,10291309.0
1742,6098139337,"Oct 11, 2021, 1:28:47 PM",Boston marathon 2021,Run,First Boston in the books and it was everythin...,11574,26.62,184.0,387.0,,...,160.285385,3018.000000,,20.0,387.0,,,,,9301265.0
198,23899471,"Sep 30, 2012, 7:04:17 AM","09/30/2012 Berlin, Berlin, Germany",Run,,15618,26.64,,,,...,,4573.556641,,,,,,,,
1009,2092243354,"Jan 20, 2019, 1:01:34 PM",Houston Marathon 2019 - 37 minute PR!,Run,Felt really strong and had to keep my pace in ...,12081,26.66,186.0,448.0,,...,163.055252,3396.000000,,6.0,448.0,,,,,4072040.0


In [100]:
# convert time from seconds to HH:MM:SS
# NOTE(review): this module-level `secs` is not read by secs_to_time (the
# parameter of the same name shadows it); it looks like a leftover test
# value -- confirm nothing else uses it before removing.
secs = 600
def secs_to_time(secs):
    """Format a duration in seconds as a clock-style string.

    Parameters
    ----------
    secs : int or float
        Duration in seconds. Built-in ints/floats and NumPy integer scalars
        (e.g. ``np.int64``) are accepted.

    Returns
    -------
    str
        ``str(datetime.timedelta(seconds=secs))``: ``'H:MM:SS'`` for durations
        under one day (e.g. ``600 -> '0:10:00'``), prefixed with
        ``'N day(s), '`` for longer ones, and with a ``.ffffff`` microsecond
        suffix when ``secs`` has a fractional part.

    Raises
    ------
    TypeError
        If ``secs`` is not numeric (for example an already-formatted string).
    """
    # timedelta only accepts genuine int/float objects. NumPy integer scalars
    # are not subclasses of int and would raise TypeError, so coerce anything
    # index-like to a built-in int first. Floats are passed through untouched.
    if hasattr(secs, "__index__"):
        secs = int(secs)
    return str(datetime.timedelta(seconds=secs))
    
# Replace the numeric seconds in 'Elapsed Time' with formatted time strings.
# Convert only while the column is still numeric, so re-running this cell is a
# no-op instead of raising TypeError on the already-formatted strings.
if pd.api.types.is_numeric_dtype(df['Elapsed Time']):
    df['Elapsed Time'] = df['Elapsed Time'].apply(secs_to_time)

# Display the frame ordered from shortest to longest distance.
df.sort_values(by=['Distance'], ascending=True)


Unnamed: 0,Activity ID,Activity Date,Activity Name,Activity Type,Activity Description,Elapsed Time,Distance,Max Heart Rate,Relative Effort,Activity Gear,...,Average Heart Rate,Calories,Max Temperature,Average Temperature,Relative Effort.1,Number of Runs,Uphill Time,Downhill Time,Other Time,Gear
624,484170825,"Feb 2, 2016, 12:28:18 PM",Morning Run,Run,,0:02:22,0.25,140.0,0.0,,...,129.654922,44.203156,,,0.0,,,,,
501,216674441,"Oct 23, 2014, 12:35:11 PM",10/23/2014,Run,,0:03:08,0.38,,,,...,,63.585976,,,,,,,,
1267,2919881466,"Dec 8, 2019, 12:35:57 PM",Morning Run,Run,,0:03:26,0.39,146.0,2.0,Second black pair,...,137.000000,49.000000,,21.0,2.0,,,,,5001432.0
546,284388925,"Apr 2, 2015, 12:38:39 PM","04/02/2015 San Antonio, TX",Run,,0:03:51,0.40,,,,...,,67.666794,,,,,,,,
888,1772747527,"Aug 14, 2018, 11:48:38 PM",Warm up,Run,,0:04:27,0.50,148.0,1.0,,...,136.296585,57.000000,,33.0,1.0,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1823,6534195503,"Jan 16, 2022, 1:02:03 PM",2022 Houston Marathon,Run,3:09:36 chip time. Felt like I had the potenti...,3:09:40,26.54,185.0,470.0,,...,166.256088,3275.000000,,8.0,470.0,,,,,10291309.0
1742,6098139337,"Oct 11, 2021, 1:28:47 PM",Boston marathon 2021,Run,First Boston in the books and it was everythin...,3:12:54,26.62,184.0,387.0,,...,160.285385,3018.000000,,20.0,387.0,,,,,9301265.0
198,23899471,"Sep 30, 2012, 7:04:17 AM","09/30/2012 Berlin, Berlin, Germany",Run,,4:20:18,26.64,,,,...,,4573.556641,,,,,,,,
1009,2092243354,"Jan 20, 2019, 1:01:34 PM",Houston Marathon 2019 - 37 minute PR!,Run,Felt really strong and had to keep my pace in ...,3:21:21,26.66,186.0,448.0,,...,163.055252,3396.000000,,6.0,448.0,,,,,4072040.0
