# Steps Analysis (02/17/17 - 10/24/18)

In [161]:
import pandas as pd
import seaborn as sns
from bs4 import BeautifulSoup
import ast
from math import ceil

In [162]:
# Empty frame carrying the four target column names, plus one accumulator
# list per column; the lists are filled from the parsed records further down.
steps_df = pd.DataFrame(
    columns=['start_date', 'end_date', 'num_steps', 'source'])
start_date_col, end_date_col, num_steps_col, source_col = [], [], [], []

In [163]:
# Parse the exported XML with Beautiful Soup's lxml-backed XML parser.
# The file handle is closed as soon as the `with` block exits; `soup` keeps
# the parsed tree.
with open('./data/export.xml', mode='r') as fp:
    soup = BeautifulSoup(fp, features='lxml-xml')

In [164]:
# Collect every <Record> element whose type is the step-count identifier.
# `find_all` is the supported spelling; `findAll` is a deprecated alias.
steps = soup.find_all('Record', {'type': 'HKQuantityTypeIdentifierStepCount'})

# Rebuild each column list from scratch instead of appending to it, so that
# re-running this cell does not duplicate every record.
start_date_col = [record['startDate'] for record in steps]
end_date_col = [record['endDate'] for record in steps]
num_steps_col = [record['value'] for record in steps]
source_col = [record['sourceName'] for record in steps]

In [165]:
# Attach the collected lists as the four columns of steps_df (one row per
# extracted record).
steps_df = steps_df.assign(
    start_date=start_date_col,
    end_date=end_date_col,
    num_steps=num_steps_col,
    source=source_col,
)

In [166]:
# The step counts were read from XML attributes, so cast them to integers
# before doing any arithmetic on them.
steps_df = steps_df.astype({'num_steps': int})

In [141]:
# Split each raw start timestamp on spaces into date, time-of-day and zone
# parts; the zone part is discarded.
sd_df = steps_df.start_date.str.split(' ', expand=True)
sd_df.columns = ['start_date', 'start_time', 'time_zone']
sd_df = sd_df.drop(columns=['time_zone'])
# NOTE(review): the date part is expected to be dash-separated (YYYY-MM-DD,
# e.g. 2017-02-17) -- confirm against the export. A slash-separated format
# string does not match such values and is rejected by strict parsers.
sd_df['start_date'] = pd.to_datetime(sd_df['start_date'], format='%Y-%m-%d')
# Time of day is kept as an offset from midnight so it can be added to the date.
sd_df['start_time'] = pd.to_timedelta(sd_df['start_time'])

In [142]:
# Split each raw end timestamp on spaces into date, time-of-day and zone
# parts; the zone part is discarded.
ed_df = steps_df.end_date.str.split(' ', expand=True)
ed_df.columns = ['end_date', 'end_time', 'time_zone']
ed_df = ed_df.drop(columns=['time_zone'])
# NOTE(review): the date part is expected to be dash-separated (YYYY-MM-DD,
# e.g. 2017-02-17) -- confirm against the export. A slash-separated format
# string does not match such values and is rejected by strict parsers.
ed_df['end_date'] = pd.to_datetime(ed_df['end_date'], format='%Y-%m-%d')
# Time of day is kept as an offset from midnight so it can be added to the date.
ed_df['end_time'] = pd.to_timedelta(ed_df['end_time'])

In [143]:
# The raw timestamp strings have been split into sd_df / ed_df, so remove
# them from steps_df.
steps_df = steps_df.drop(['start_date', 'end_date'], axis='columns')

In [178]:
# Place the start and end columns side by side, then derive how long each
# record lasted from the full start and end instants (date + time of day).
dur_df = pd.concat([sd_df, ed_df], axis=1)
dur_df['duration'] = (
    (dur_df['end_date'] + dur_df['end_time'])
    - (dur_df['start_date'] + dur_df['start_time'])
)
dur_df.head()

Unnamed: 0,start_date,start_time,end_date,end_time,duration
0,2017-02-17,21:23:24,2017-02-17,21:30:42,00:07:18
1,2017-02-17,21:36:28,2017-02-17,21:44:19,00:07:51
2,2017-02-17,21:44:19,2017-02-17,21:51:52,00:07:33
3,2017-02-17,21:52:34,2017-02-17,21:58:37,00:06:03
4,2017-02-17,21:58:37,2017-02-17,22:07:35,00:08:58


In [180]:
# An earlier cell already removes these raw timestamp columns, so on a
# top-to-bottom run they are no longer present here. errors='ignore' makes
# this a no-op in that case instead of raising KeyError.
steps_df = steps_df.drop(columns=['start_date', 'end_date'], errors='ignore')

In [196]:
# Put the step counts / source next to their timing columns (rows are
# aligned on the index) and show the resulting column order.
df = pd.concat((steps_df, dur_df), axis='columns')
df.columns

Index(['num_steps', 'source', 'start_date', 'start_time', 'end_date',
       'end_time', 'duration'],
      dtype='object')

In [197]:
# Reorder the columns: timing first, then the step count, its duration and
# the source that recorded it.
df = df.loc[:, ['start_date', 'start_time',
                'end_date', 'end_time',
                'num_steps', 'duration', 'source']]

In [198]:
# Order the records chronologically by start date, then start time of day.
# The original index labels are kept (no reset), so labels no longer follow
# row order after this step.
df = df.sort_values(by=['start_date', 'start_time'])
df.tail(20)

Unnamed: 0,start_date,start_time,end_date,end_time,num_steps,duration,source
27715,2018-10-24,13:18:52,2018-10-24,13:23:19,59,00:04:27,iPhone
27744,2018-10-24,14:00:59,2018-10-24,15:00:59,46,01:00:00,Connected
27745,2018-10-24,15:00:59,2018-10-24,16:00:59,164,01:00:00,Connected
27746,2018-10-24,16:00:59,2018-10-24,17:00:59,304,01:00:00,Connected
27716,2018-10-24,16:06:19,2018-10-24,16:15:56,153,00:09:37,iPhone
27717,2018-10-24,16:21:58,2018-10-24,16:22:29,76,00:00:31,iPhone
27747,2018-10-24,17:00:59,2018-10-24,17:49:59,790,00:49:00,Connected
27718,2018-10-24,17:02:07,2018-10-24,17:11:02,105,00:08:55,iPhone
27719,2018-10-24,17:17:57,2018-10-24,17:27:57,218,00:10:00,iPhone
27720,2018-10-24,17:28:38,2018-10-24,17:38:37,381,00:09:59,iPhone


In [199]:
# Express the three timedelta columns as float hours. Dividing by a one-hour
# Timedelta keeps the unit explicit (no hand-written nanosecond constant) and
# leaves missing values as NaN rather than turning NaT into a huge integer.
ONE_HOUR = pd.Timedelta(hours=1)
df['start_time'] = df['start_time'] / ONE_HOUR
df['end_time'] = df['end_time'] / ONE_HOUR
df['duration'] = df['duration'] / ONE_HOUR

In [200]:
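# NOTE(review): disabled draft. `duration` is already derived from the full
# start/end timestamps (date + time), so this midnight adjustment looks
# redundant -- confirm before re-enabling.
# for num in range(len(df)):
#     if df['end_date'][num] != df['start_date'][num]:
#         df['duration'][num] = df['end_time'][num]+24 - df['start_time'][num]
# df.head()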

In [207]:
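# NOTE(review): disabled draft for splitting a record that spans midnight
# between its start date and end date. Points to check before re-enabling:
#   * `duration` and `start_time` are in hours at this point, so
#     `steps_per_min` is really steps per hour and the `*60` factor
#     over-counts the steps assigned to the start date.
#   * `[num]` / `[num+1]` look rows up by index label, which no longer
#     matches row order once the frame has been sorted.
# for num in range(len(df)):
#     if df.start_date[num] != df.end_date[num]:
#         total_steps = df.num_steps[num]
#         steps_per_min = total_steps/df.duration[num]
#         time_left_in_start_date = 24-df.start_time[num]
#         steps_for_start_date = ceil(time_left_in_start_date*60*steps_per_min)
#         steps_for_end_date = total_steps-steps_for_start_date
#         df.num_steps[num] = steps_for_start_date
#         df.num_steps[num+1] += steps_for_end_date
# df[5050:5100]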

In [211]:
# Daily totals keyed by the date each record ended on.
# numeric_only=True restricts the sum to the numeric columns; newer pandas
# no longer drops the datetime `start_date` and string `source` columns
# silently when summing.
# Only `num_steps` and `duration` are meaningful as sums; the summed
# `start_time` / `end_time` values are by-products of aggregating every
# numeric column.
df.groupby('end_date').sum(numeric_only=True).head(60)

Unnamed: 0_level_0,start_time,end_time,num_steps,duration
end_date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
2017-02-17,176.142778,177.204167,812,1.061389
2017-02-18,475.579722,478.716944,2668,3.137222
2017-02-19,570.570556,574.096111,2991,3.525556
2017-02-20,629.371389,633.515833,2812,4.144444
2017-02-21,531.381667,535.065833,2819,3.684167
2017-02-22,450.330278,453.641944,2559,3.311667
2017-02-23,623.042778,628.317778,4396,5.275
2017-02-24,516.233333,520.597778,3060,4.364444
2017-02-25,852.039444,858.344444,5420,6.305
2017-02-26,519.848333,499.372222,1611,3.523889
