# Steps Analysis (02/17/17 - 10/24/18)

In [1]:
import pandas as pd
import seaborn as sns
from bs4 import BeautifulSoup
import ast
from math import ceil

In [2]:
steps_df = pd.DataFrame(columns=['start_date', 'end_date',
                                 'num_steps', 'source'])
start_date_col = []
end_date_col = []
num_steps_col = []
source_col = []

In [3]:
with open('./data/export.xml') as fp:
    soup = BeautifulSoup(fp, 'lxml-xml')

In [4]:
steps = soup.findAll('Record', {'type': 'HKQuantityTypeIdentifierStepCount'})
for num in steps:
    start_date_col.append(num['startDate'])
    end_date_col.append(num['endDate'])
    num_steps_col.append(num['value'])
    source_col.append(num['sourceName'])

In [5]:
steps_df['start_date'] = start_date_col
steps_df['end_date'] = end_date_col
steps_df['num_steps'] = num_steps_col
steps_df['source'] = source_col

In [6]:
steps_df['num_steps'] = steps_df.num_steps.astype(int)

In [7]:
sd_df = steps_df.start_date.str.split(' ', expand=True)
sd_df.columns = ['start_date', 'start_time', 'time_zone']
sd_df = sd_df.drop(columns=['time_zone'])
sd_df['start_date'] = pd.to_datetime(sd_df['start_date'], format='%Y/%m/%d')
sd_df['start_time'] = pd.to_timedelta(sd_df['start_time'])

In [8]:
ed_df = steps_df.end_date.str.split(' ', expand=True)
ed_df.columns = ['end_date', 'end_time', 'time_zone']
ed_df = ed_df.drop(columns=['time_zone'])
ed_df['end_date'] = pd.to_datetime(ed_df['end_date'], format='%Y/%m/%d')
ed_df['end_time'] = pd.to_timedelta(ed_df['end_time'])

In [9]:
steps_df = steps_df.drop(columns=['start_date', 'end_date'])

In [10]:
dur_df = pd.concat([sd_df, ed_df], axis=1)
dur_df['sdt'] = dur_df['start_date'] + dur_df['start_time']
dur_df['edt'] = dur_df['end_date'] + dur_df['end_time']
dur_df['duration'] = dur_df['edt'] - dur_df['sdt']
dur_df = dur_df.drop(columns=['sdt', 'edt'])

In [28]:
df = pd.concat([steps_df, dur_df], axis=1)

In [29]:
df = df[['start_date', 'start_time', 'end_date',
         'end_time', 'num_steps', 'duration', 'source']]

In [30]:
df.sort_values(by=['start_date', 'start_time'], inplace=True)

In [31]:
df['start_time'] = pd.to_numeric(df['start_time'])/3600000000000
df['end_time'] = pd.to_numeric(df['end_time'])/3600000000000
df['duration'] = pd.to_numeric(df['duration'])/3600000000000
print(len(df))
print(df.num_steps.sum())

27755
4952676


In [32]:
# i = 0
# while i < len(df):
#     if (((i in df.index) and (i+1 in df.index)) and (
#             df['source'][i] == df['source'][i+1])) and (
#             df['end_time'][i] == df['start_time'][i+1]):
#         df.loc[i, 'end_time'] = df.loc[(i+1), 'end_time']
#         df.loc[i, 'end_date'] = df.loc[(i+1), 'end_date']
#         df.loc[i, 'num_steps'] += df.loc[(i+1), 'num_steps']
#         df.loc[i, 'duration'] += df.loc[(i+1), 'duration']
#         df.drop(index=(i+1), inplace=True)
#         df.reset_index(inplace=True)
#         df.drop(columns=['index'], inplace=True)
#     else:
#         i += 1

# print(len(df))
# print(df.num_steps.sum())

In [34]:
i = 0
while i < len(df)-1:
    if ((df.source[i] != df.source[i+1]) and (
            df.start_time[i] <= df.start_time[i+1])) and (
            df.end_time[i] >= df.end_time[i+1]):
        df.drop(index=(i+1), inplace=True)
        df.reset_index(inplace=True)
        df.drop(columns=['index'], inplace=True)
    else:
        i += 1

print(len(df))
print(df.num_steps.sum())

25249
4625251


In [35]:
df['steps_per_hour'] = df['num_steps']/df['duration']

In [36]:
i = 0
while i < len(df)-1:
    if ((df.source[i] != df.source[i+1]) and (
            df.start_date[i] == df.start_date[i+1])) and (
            df.start_time[i+1] <= df.end_time[i]):

        steps_adjust = ceil((df.end_time[i] - df.start_time[i+1]
                             ) * df.steps_per_hour[i+1])

        df.loc[(i+1), 'num_steps'] -= steps_adjust
        df.loc[(i+1), 'start_time'] = df.loc[i, 'end_time']
        df.loc[(i+1), 'duration'] = df.loc[(i+1), 'end_time'] - df.loc[(i+1), 'start_time']
        df.loc[i, 'duration'] = df.loc[i, 'end_time'] - df.loc[i, 'start_time']
        i+=1
    else:
        i += 1
df.reset_index(inplace=True)
df.drop(columns=['index'], inplace=True)
print(len(df))
print(df.num_steps.sum())

25249
4341168


In [37]:
df = df[df['duration'] > 0].reset_index().drop(columns=['index'])
df = df[df['num_steps'] > 0].reset_index().drop(columns=['index'])
print(len(df))
print(df.num_steps.sum())

25146
4341127


In [44]:
df[24900:24950]

Unnamed: 0,start_date,start_time,end_date,end_time,num_steps,duration,source,steps_per_hour
24900,2018-10-20,6.005278,2018-10-20,7.005278,20,1.0,Connected,20.0
24901,2018-10-20,7.005278,2018-10-20,8.005278,30,1.0,Connected,30.0
24902,2018-10-20,8.005278,2018-10-20,9.005278,250,1.0,Connected,250.0
24903,2018-10-20,9.005278,2018-10-20,9.138611,226,0.133333,Connected,1695.0
24904,2018-10-20,9.138611,2018-10-20,9.155278,80,0.016667,Connected,4800.0
24905,2018-10-20,9.155278,2018-10-20,9.238611,198,0.083333,Connected,2376.0
24906,2018-10-20,9.238611,2018-10-20,9.255278,74,0.016667,Connected,4440.0
24907,2018-10-20,9.255278,2018-10-20,9.288611,44,0.033333,Connected,1320.0
24908,2018-10-20,9.288611,2018-10-20,9.305278,94,0.016667,Connected,5640.0
24909,2018-10-20,9.305278,2018-10-20,10.005278,366,0.7,Connected,522.857143


In [26]:
# i = 0
# while i < len(df)-2:
#     if ((df.source[i] != df.source[i+1]) and (
#             df.start_date[i] == df.start_date[i+1])) and ((
#             df.start_time[i+1] == df.end_time[i]) and (
#             df.end_time[i] == df.start_time[i+2])):

#         df.drop(index=(i+1), inplace=True)
#         i += 2
#     else:
#         i += 1
# df.reset_index(inplace=True)
# df.drop(columns=['index'], inplace=True)
# print(len(df))
# print(df.num_steps.sum())

In [45]:
df.groupby(by=['start_date', 'source']).sum()

Unnamed: 0_level_0,Unnamed: 1_level_0,start_time,end_time,num_steps,duration,steps_per_hour
start_date,source,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
2017-02-17,iPhone,176.142778,177.204167,812,1.061389,5784.295265
2017-02-18,iPhone,475.579722,478.716944,2668,3.137222,21503.906225
2017-02-19,iPhone,570.570556,574.096111,2991,3.525556,34185.015956
2017-02-20,iPhone,629.371389,633.515833,2812,4.144444,33756.375386
2017-02-21,iPhone,531.381667,535.065833,2819,3.684167,28411.874956
2017-02-22,iPhone,450.330278,453.641944,2559,3.311667,28819.028842
2017-02-23,iPhone,623.042778,628.317778,4396,5.275000,38945.083398
2017-02-24,iPhone,516.233333,520.597778,3060,4.364444,25794.488705
2017-02-25,iPhone,875.944722,858.391389,5448,6.446667,47151.327984
2017-02-26,iPhone,495.943056,499.325278,1583,3.382222,12358.233291
