# Steps Analysis (02/17/17 - 10/24/18)

In [280]:
import pandas as pd
import seaborn as sns
from bs4 import BeautifulSoup
import ast
from math import ceil

In [281]:
steps_df = pd.DataFrame(columns=['start_date', 'end_date',
                                 'num_steps', 'source'])
start_date_col = []
end_date_col = []
num_steps_col = []
source_col = []

In [282]:
with open('./data/export.xml') as fp:
    soup = BeautifulSoup(fp, 'lxml-xml')

In [283]:
steps = soup.findAll('Record', {'type': 'HKQuantityTypeIdentifierStepCount'})
for num in steps:
    start_date_col.append(num['startDate'])
    end_date_col.append(num['endDate'])
    num_steps_col.append(num['value'])
    source_col.append(num['sourceName'])

In [284]:
steps_df['start_date'] = start_date_col
steps_df['end_date'] = end_date_col
steps_df['num_steps'] = num_steps_col
steps_df['source'] = source_col

In [285]:
steps_df['num_steps'] = steps_df.num_steps.astype(int)

In [286]:
sd_df = steps_df.start_date.str.split(' ', expand=True)
sd_df.columns = ['start_date', 'start_time', 'time_zone']
sd_df = sd_df.drop(columns=['time_zone'])
sd_df['start_date'] = pd.to_datetime(sd_df['start_date'], format='%Y/%m/%d')
sd_df['start_time'] = pd.to_timedelta(sd_df['start_time'])

In [287]:
ed_df = steps_df.end_date.str.split(' ', expand=True)
ed_df.columns = ['end_date', 'end_time', 'time_zone']
ed_df = ed_df.drop(columns=['time_zone'])
ed_df['end_date'] = pd.to_datetime(ed_df['end_date'], format='%Y/%m/%d')
ed_df['end_time'] = pd.to_timedelta(ed_df['end_time'])

In [288]:
steps_df = steps_df.drop(columns=['start_date', 'end_date'])

In [289]:
dur_df = pd.concat([sd_df, ed_df], axis=1)
dur_df['sdt'] = dur_df['start_date'] + dur_df['start_time']
dur_df['edt'] = dur_df['end_date'] + dur_df['end_time']
dur_df['duration'] = dur_df['edt'] - dur_df['sdt']
dur_df = dur_df.drop(columns=['sdt', 'edt'])

In [290]:
df = pd.concat([steps_df, dur_df], axis=1)

In [291]:
df = df[['start_date', 'start_time', 'end_date',
         'end_time', 'num_steps', 'duration', 'source']]

In [292]:
df.sort_values(by=['start_date', 'start_time'], inplace=True)

In [293]:
df['start_time'] = pd.to_numeric(df['start_time'])/3600000000000
df['end_time'] = pd.to_numeric(df['end_time'])/3600000000000
df['duration'] = pd.to_numeric(df['duration'])/3600000000000
print(len(df))
print(df.num_steps.sum())

27755
4952676


In [327]:
i = 0
while i < len(df):
    if (((i in df.index) and (i+1 in df.index)) and (
            df['source'][i] == df['source'][i+1])) and (
            df['end_time'][i] == df['start_time'][i+1]):
        df.loc[i, 'end_time'] = df.loc[(i+1), 'end_time']
        df.loc[i, 'end_date'] = df.loc[(i+1), 'end_date']
        df.loc[i, 'num_steps'] += df.loc[(i+1), 'num_steps']
        df.loc[i, 'duration'] += df.loc[(i+1), 'duration']
        df.drop(index=(i+1), inplace=True)
        i += 2
    else:
        i += 1
df.reset_index(inplace=True)
df.drop(columns=['index'], inplace=True)
print(len(df))
print(df.num_steps.sum())

11853
4781548


In [328]:
i = 0
while i < len(df)-1:
    if ((df.source[i] != df.source[i+1]) and (
            df.start_time[i] <= df.start_time[i+1])) and (
            df.end_time[i] >= df.end_time[i+1]):
        df.drop(index=(i+1), inplace=True)
        i += 2
    else:
        i += 1
df.reset_index(inplace=True)
df.drop(columns=['index'], inplace=True)
print(len(df))
print(df.num_steps.sum())

11840
4765034


In [330]:
df['steps_per_hour'] = df['num_steps']/df['duration']

In [333]:
i = 0
while i < len(df)-1:
    if ((df.source[i] != df.source[i+1]) and (
            df.start_date[i] == df.start_date[i+1])) and (
            df.start_time[i+1] <= df.end_time[i]):

        steps_adjust = ceil((df.end_time[i] - df.start_time[i+1]
                             ) * df.steps_per_hour[i+1])

        df.loc[(i+1), 'num_steps'] -= steps_adjust
        df.loc[(i+1), 'start_time'] = df.loc[i, 'end_time']
        df.loc[(i+1), 'duration'] = df.loc[(i+1), 'end_time'] - df.loc[(i+1), 'start_time']
        df.loc[i, 'duration'] = df.loc[i, 'end_time'] - df.loc[i, 'start_time']
        i += 1
    else:
        i += 1
df.reset_index(inplace=True)
df.drop(columns=['index'], inplace=True)
print(len(df))
print(df.num_steps.sum())

11817
4323399


In [334]:
df = df[df['duration'] > 0].reset_index().drop(columns=['index'])
df = df[df['num_steps'] > 0].reset_index().drop(columns=['index'])
print(len(df))
print(df.num_steps.sum())

11817
4323399


In [279]:
# i = 0
# while i < len(df)-2:
#     if ((df.source[i] != df.source[i+1]) and (
#             df.start_date[i] == df.start_date[i+1])) and ((
#             df.start_time[i+1] == df.end_time[i]) and (
#             df.end_time[i] == df.start_time[i+2])):

#         df.drop(index=(i+1), inplace=True)
#         i += 2
#     else:
#         i += 1
# df.reset_index(inplace=True)
# df.drop(columns=['index'], inplace=True)
# print(len(df))
# print(df.num_steps.sum())

In [340]:
df.groupby(by=['start_date', 'end_date', 'source']).max()

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,start_time,end_time,num_steps,duration,steps_per_hour
start_date,end_date,source,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
2017-02-17,2017-02-17,iPhone,22.854722,22.986389,295,0.301111,979.704797
2017-02-18,2017-02-18,iPhone,23.703056,23.823889,1472,1.001389,1613.888889
2017-02-19,2017-02-19,iPhone,23.186944,23.314722,364,0.339722,1492.872570
2017-02-20,2017-02-20,iPhone,23.663333,23.909167,302,0.306944,4153.846154
2017-02-21,2017-02-21,iPhone,22.403333,23.323056,546,0.919722,2304.337632
2017-02-22,2017-02-22,iPhone,23.723333,23.829722,335,0.298889,2758.064516
2017-02-23,2017-02-23,iPhone,21.528889,21.628889,705,0.524722,2543.086172
2017-02-24,2017-02-24,iPhone,22.982222,23.126667,981,0.806944,1384.615385
2017-02-25,2017-02-25,iPhone,22.126667,22.239444,1639,0.923056,1775.624436
2017-02-25,2017-02-26,iPhone,23.905278,0.046944,28,0.141667,197.647059
