# Garmin Data Exploration and Wrangling

### Preliminary Work

Firstly, we import the required packages then import the dataset and view the first few rows.

In [2]:
import math
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
%matplotlib inline

Now, read the csv file and view the head of the data.

In [6]:
garmin = pd.read_csv("activities.csv")
garmin.head()

Unnamed: 0,Activity Type,Date,Favorite,Title,Distance,Calories,Time,Avg HR,Max HR,Aerobic TE,...,Min Temp,Surface Interval,Decompression,Best Lap Time,Number of Laps,Max Temp,Moving Time,Elapsed Time,Min Elevation,Max Elevation
0,Hiking,2022-12-21 10:40:56,False,Monte Castello di Vibio Hiking,6.3,472,01:32:43,99,122,1.0,...,21.0,0:00,No,00:03.93.8,3,0.0,01:22:39,01:32:43,-59,-27
1,Mountain Biking,2022-12-14 09:17:47,False,Teignbridge Mountain Biking,12.81,916,01:08:34,153,188,4.2,...,15.0,0:00,No,14:19.80.2,3,0.0,01:00:25,01:08:34,158,267
2,Running,2022-12-12 14:38:08,False,Exeter Running,9.73,747,01:03:58,138,156,3.2,...,9.0,0:00,No,04:43.60.8,10,0.0,01:02:58,01:03:58,6,40
3,Running,2022-12-07 15:48:15,False,Exeter Running,5.66,444,00:41:19,127,151,2.9,...,15.0,0:00,No,05:35.67.4,6,0.0,00:34:22,00:41:19,8,23
4,Running,2022-12-05 12:38:49,False,Exeter Running,5.11,397,00:36:00,129,150,2.7,...,18.0,0:00,No,00:49.93.8,6,0.0,00:33:52,00:36:00,4,17


Let's view the general info of the dataset

In [14]:
print("Number of columns in dataset:", len(garmin))
print()
print(garmin.info())

Number of columns in dataset: 160

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 160 entries, 0 to 159
Data columns (total 43 columns):
 #   Column              Non-Null Count  Dtype  
---  ------              --------------  -----  
 0   activity            160 non-null    object 
 1   date                160 non-null    object 
 2   fav                 160 non-null    bool   
 3   title               160 non-null    object 
 4   distance            160 non-null    object 
 5   calories            160 non-null    object 
 6   time                160 non-null    object 
 7   avg_hr              160 non-null    int64  
 8   max_hr              160 non-null    int64  
 9   aerobic_te          160 non-null    object 
 10  avg_run_cadence     160 non-null    int64  
 11  max_run_cadence     160 non-null    int64  
 12  avg_pace            160 non-null    object 
 13  best_pace           160 non-null    object 
 14  total_ascent        160 non-null    object 
 15  total_descent       16

### Change Column Names

View column names and change them to suitable, easy to manage names.

In [8]:
garmin.columns = ['activity', 'date', 'fav', 'title', 'distance', 'calories', 
                  'time', 'avg_hr', 'max_hr', 'aerobic_te', 'avg_run_cadence',
                  'max_run_cadence', 'avg_pace', 'best_pace', 'total_ascent',
                  'total_descent', 'avg_stride', 'avg_vert_ratio', 
                  'avg_vert_osc', 'avg_ground_contact',
                  'avg_run_cadence1', 'max_run_cadence1', 'training_stress', 'max_avg_power20',
                  'avg_power', 'max_power', 'grit', 'flow', 'total_strokes', 'avg_swolf',
                  'avg_stroke_rate', 'total_reps', 'dive_time', 'min_temp',
                  'surface_interval', 'decomp', 'best_lap', 'number_laps',
                  'max_temp', 'moving_time', 'elapsed_time', 'min_elav', 'max_elav' ]
garmin.columns

Index(['activity', 'date', 'fav', 'title', 'distance', 'calories', 'time',
       'avg_hr', 'max_hr', 'aerobic_te', 'avg_run_cadence', 'max_run_cadence',
       'avg_pace', 'best_pace', 'total_ascent', 'total_descent', 'avg_stride',
       'avg_vert_ratio', 'avg_vert_osc', 'avg_ground_contact',
       'avg_run_cadence1', 'max_run_cadence1', 'training_stress',
       'max_avg_power20', 'avg_power', 'max_power', 'grit', 'flow',
       'total_strokes', 'avg_swolf', 'avg_stroke_rate', 'total_reps',
       'dive_time', 'min_temp', 'surface_interval', 'decomp', 'best_lap',
       'number_laps', 'max_temp', 'moving_time', 'elapsed_time', 'min_elav',
       'max_elav'],
      dtype='object')

In [9]:
garmin.head()
garmin.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 160 entries, 0 to 159
Data columns (total 43 columns):
 #   Column              Non-Null Count  Dtype  
---  ------              --------------  -----  
 0   activity            160 non-null    object 
 1   date                160 non-null    object 
 2   fav                 160 non-null    bool   
 3   title               160 non-null    object 
 4   distance            160 non-null    object 
 5   calories            160 non-null    object 
 6   time                160 non-null    object 
 7   avg_hr              160 non-null    int64  
 8   max_hr              160 non-null    int64  
 9   aerobic_te          160 non-null    object 
 10  avg_run_cadence     160 non-null    int64  
 11  max_run_cadence     160 non-null    int64  
 12  avg_pace            160 non-null    object 
 13  best_pace           160 non-null    object 
 14  total_ascent        160 non-null    object 
 15  total_descent       160 non-null    object 
 16  avg_stri

### Create 2022 Dataframe

Convert date column to datetime type and create sub-df with only 2022 activites.

In [15]:
garmin["date"] = pd.to_datetime(garmin["date"])
#garmin.dtypes # Check each column type

In [16]:
garmin["year"] = garmin["date"].dt.year
garmin22 = garmin[garmin["year"] == 2022]
garmin22.head()

Unnamed: 0,activity,date,fav,title,distance,calories,time,avg_hr,max_hr,aerobic_te,...,surface_interval,decomp,best_lap,number_laps,max_temp,moving_time,elapsed_time,min_elav,max_elav,year
0,Hiking,2022-12-21 10:40:56,False,Monte Castello di Vibio Hiking,6.3,472,01:32:43,99,122,1.0,...,0:00,No,00:03.93.8,3,0.0,01:22:39,01:32:43,-59,-27,2022
1,Mountain Biking,2022-12-14 09:17:47,False,Teignbridge Mountain Biking,12.81,916,01:08:34,153,188,4.2,...,0:00,No,14:19.80.2,3,0.0,01:00:25,01:08:34,158,267,2022
2,Running,2022-12-12 14:38:08,False,Exeter Running,9.73,747,01:03:58,138,156,3.2,...,0:00,No,04:43.60.8,10,0.0,01:02:58,01:03:58,6,40,2022
3,Running,2022-12-07 15:48:15,False,Exeter Running,5.66,444,00:41:19,127,151,2.9,...,0:00,No,05:35.67.4,6,0.0,00:34:22,00:41:19,8,23,2022
4,Running,2022-12-05 12:38:49,False,Exeter Running,5.11,397,00:36:00,129,150,2.7,...,0:00,No,00:49.93.8,6,0.0,00:33:52,00:36:00,4,17,2022


### Obtain Counts of Activities

In [46]:
print("Number of activites in 2022:", garmin22.count()[0])
garmin22["activity"].value_counts()

Number of activites in 2022: 149


Running                       51
Pool Swimming                 38
Hiking                        13
Mountain Biking               11
Indoor Cycling                10
Treadmill Running              7
Resort Skiing/Snowboarding     7
Cardio                         6
Road Cycling                   5
Cycling                        1
Name: activity, dtype: int64