# Steps Analysis (02/17/17 - 10/24/18)

In [1]:
import pandas as pd
import seaborn as sns
from bs4 import BeautifulSoup
import ast
from math import ceil

In [2]:
# Per-field accumulators for the step-count records; a later cell populates
# them and copies them into `steps_df`.
start_date_col, end_date_col, num_steps_col, source_col = [], [], [], []

# Empty frame that fixes the column layout of the raw step records.
steps_df = pd.DataFrame(
    columns=['start_date', 'end_date', 'num_steps', 'source'],
)

In [3]:
# Open the export as bytes so the XML parser chooses the character encoding
# itself instead of relying on the platform's default text encoding.
with open('./data/export.xml', 'rb') as fp:
    soup = BeautifulSoup(fp, 'lxml-xml')

In [4]:
# Every <Record> element whose type marks it as a step count.
# `find_all` is the current spelling of the deprecated `findAll` alias.
steps = soup.find_all('Record', {'type': 'HKQuantityTypeIdentifierStepCount'})

# Rebuild the column lists from scratch rather than appending to them, so
# re-running this cell cannot duplicate every record.
start_date_col = [record['startDate'] for record in steps]
end_date_col = [record['endDate'] for record in steps]
num_steps_col = [record['value'] for record in steps]
source_col = [record['sourceName'] for record in steps]

In [5]:
# Load the collected per-record lists into the frame, one column each.
steps_df = steps_df.assign(
    start_date=start_date_col,
    end_date=end_date_col,
    num_steps=num_steps_col,
    source=source_col,
)

In [6]:
# Step counts must be integers before they can be summed or adjusted.
steps_df = steps_df.astype({'num_steps': int})

In [7]:
# Split each 'date time zone' string into its three space-separated parts;
# only the date and the time-of-day are kept.
sd_df = steps_df['start_date'].str.split(' ', expand=True)
sd_df.columns = ['start_date', 'start_time', 'time_zone']
sd_df = sd_df.drop(columns=['time_zone'])
# NOTE(review): the export appears to use ISO 'YYYY-MM-DD' dates, which the
# former format='%Y/%m/%d' does not match under strict format checking.
# Letting pandas infer the layout accepts either separator.
sd_df['start_date'] = pd.to_datetime(sd_df['start_date'])
# Time-of-day is stored as an offset from midnight.
sd_df['start_time'] = pd.to_timedelta(sd_df['start_time'])

In [8]:
# Split each 'date time zone' string into its three space-separated parts;
# only the date and the time-of-day are kept.
ed_df = steps_df['end_date'].str.split(' ', expand=True)
ed_df.columns = ['end_date', 'end_time', 'time_zone']
ed_df = ed_df.drop(columns=['time_zone'])
# NOTE(review): the export appears to use ISO 'YYYY-MM-DD' dates, which the
# former format='%Y/%m/%d' does not match under strict format checking.
# Letting pandas infer the layout accepts either separator.
ed_df['end_date'] = pd.to_datetime(ed_df['end_date'])
# Time-of-day is stored as an offset from midnight.
ed_df['end_time'] = pd.to_timedelta(ed_df['end_time'])

In [9]:
# The raw timestamp strings have been parsed into their own frames, so the
# original text columns are removed here.
steps_df = steps_df.drop(['start_date', 'end_date'], axis='columns')

In [10]:
# Put the parsed start and end columns side by side and derive how long each
# record lasted from the full (date + time-of-day) timestamps.
# NOTE(review): the time-zone part was dropped when the timestamps were
# split, so this assumes both ends of a record share the same offset.
dur_df = pd.concat([sd_df, ed_df], axis=1)
began_at = dur_df['start_date'] + dur_df['start_time']
ended_at = dur_df['end_date'] + dur_df['end_time']
dur_df['duration'] = ended_at - began_at

In [11]:
# Join the step counts/sources with the parsed dates, times and durations
# column-wise (rows are matched on the shared index).
df = pd.concat((steps_df, dur_df), axis='columns')

In [12]:
# Arrange the columns in reading order: when it started, when it ended,
# how many steps, how long, and which source reported it.
column_order = ['start_date', 'start_time', 'end_date', 'end_time',
                'num_steps', 'duration', 'source']
df = df.loc[:, column_order]

In [13]:
# Sort chronologically and renumber the rows 0..n-1.  The later passes look
# up neighbouring rows by index label (i and i+1), which only means
# "adjacent in time" when the labels follow the sorted order.
df = df.sort_values(by=['start_date', 'start_time']).reset_index(drop=True)

In [14]:
# Express the time-of-day offsets and the durations as fractional hours.
# Dividing by a one-hour Timedelta replaces the bare nanoseconds-per-hour
# constant, and it raises if this cell is re-run on already-converted floats
# instead of silently dividing them a second time.
one_hour = pd.Timedelta(hours=1)
for col in ('start_time', 'end_time', 'duration'):
    df[col] = df[col] / one_hour
print(len(df))
print(df.num_steps.sum())

27755
4952676


In [25]:
def merge_contiguous_records(frame):
    """Collapse back-to-back records from one source into single rows.

    Rows are visited in their current order.  A row is folded into the row
    kept before it when all of the following hold:

    * both rows come from the same source,
    * the kept row starts and ends on the same date,
    * the new row starts on that same date, and
    * the new row's start time equals the kept row's end time.

    The kept row then takes the new row's end date/time and accumulates its
    steps and duration, so the total step count is unchanged by merging.

    Returns a new frame with a fresh 0..n-1 index; ``frame`` is not modified.
    """
    merged = []
    # A single forward pass over plain dicts replaces the drop + reset_index
    # on every merge, which rebuilt the whole frame each time.
    for row in frame.to_dict('records'):
        last = merged[-1] if merged else None
        if (last is not None
                and last['source'] == row['source']
                and last['end_time'] == row['start_time']
                and last['start_date'] == last['end_date']
                # Equal clock times on different days are not contiguous.
                and row['start_date'] == last['end_date']):
            last['end_time'] = row['end_time']
            last['end_date'] = row['end_date']
            last['num_steps'] += row['num_steps']
            last['duration'] += row['duration']
        else:
            merged.append(row)
    if not merged:
        # Nothing to merge; keep the original columns and dtypes.
        return frame.reset_index(drop=True)
    return pd.DataFrame(merged, columns=frame.columns)


df = merge_contiguous_records(df)

print(len(df))
print(df.num_steps.sum())

11979
4362687


In [32]:
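# NOTE: disabled draft of the contiguous-record merge; unlike the live
# version above it does not require the record to start and end on the
# same date.
# i = 0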
# while i < len(df):
#     if (((i in df.index) and (i+1 in df.index)) and (
#             df['source'][i] == df['source'][i+1])) and (
#             df['end_time'][i] == df['start_time'][i+1]):
#         df.loc[i, 'end_time'] = df.loc[(i+1), 'end_time']
#         df.loc[i, 'end_date'] = df.loc[(i+1), 'end_date']
#         df.loc[i, 'num_steps'] += df.loc[(i+1), 'num_steps']
#         df.loc[i, 'duration'] += df.loc[(i+1), 'duration']
#         df.drop(index=(i+1), inplace=True)
#         df.reset_index(inplace=True)
#         df.drop(columns=['index'], inplace=True)
#     else:
#         i += 1

# print(len(df))
# print(df.num_steps.sum())

In [26]:
def drop_contained_records(frame):
    """Remove rows whose time span sits inside the previously kept row.

    Rows are visited in their current order.  A row is discarded when,
    compared with the most recently kept row, it comes from a different
    source, starts no earlier, ends no later, and ends on the same date.

    Returns a new frame with a fresh 0..n-1 index; ``frame`` is not modified.
    """
    source = frame['source'].values
    start_time = frame['start_time'].values
    end_time = frame['end_time'].values
    end_date = frame['end_date'].values

    kept = []
    # Positional scan: no dependence on the index labels and no
    # drop + reset_index of the whole frame for every removed row.
    for pos in range(len(frame)):
        if kept:
            ref = kept[-1]
            if (source[ref] != source[pos]
                    and start_time[ref] <= start_time[pos]
                    and end_time[ref] >= end_time[pos]
                    and end_date[ref] == end_date[pos]):
                continue
        kept.append(pos)
    return frame.iloc[kept].reset_index(drop=True)


df = drop_contained_records(df)

print(len(df))
print(df.num_steps.sum())

11979
4362687


In [27]:
# Step rate of each record: steps divided by the record's duration.
df['steps_per_hour'] = df['num_steps'].div(df['duration'])

In [28]:
# Where consecutive rows from different sources overlap on the same day,
# move the later row's start to the earlier row's end and remove the steps
# the later row would have logged (at its own rate) during the overlap.
for i in range(len(df) - 1):
    nxt = i + 1
    overlaps = ((df.source[i] != df.source[nxt]) and (
        df.start_date[i] == df.start_date[nxt]) and (
        df.start_time[nxt] <= df.end_time[i]) and (
        df.end_date[i] == df.end_date[nxt]))
    if not overlaps:
        continue

    overlap_hours = df.end_time[i] - df.start_time[nxt]
    steps_adjust = ceil(overlap_hours * df.steps_per_hour[nxt])

    df.loc[nxt, 'num_steps'] -= steps_adjust
    df.loc[nxt, 'start_time'] = df.loc[i, 'end_time']
    # Recompute both durations from the (possibly updated) clock times.
    df.loc[nxt, 'duration'] = (df.loc[nxt, 'end_time']
                               - df.loc[nxt, 'start_time'])
    df.loc[i, 'duration'] = df.loc[i, 'end_time'] - df.loc[i, 'start_time']

df.reset_index(drop=True, inplace=True)
print(len(df))
print(df.num_steps.sum())

11979
4362687


In [29]:
# Keep only rows that still have a positive duration and step count.
has_duration = df['duration'] > 0
has_steps = df['num_steps'] > 0
df = df[has_duration & has_steps].reset_index(drop=True)
print(len(df))
print(df.num_steps.sum())

11979
4362687


In [31]:
# Show the last 60 rows of the cleaned records.
df.iloc[-60:]

Unnamed: 0,start_date,start_time,end_date,end_time,num_steps,duration,source,steps_per_hour
11919,2018-10-22,17.835833,2018-10-22,17.951667,309,0.115833,Connected,2667.625899
11920,2018-10-22,17.951667,2018-10-22,18.004167,258,0.0525,iPhone,4914.285714
11921,2018-10-22,18.004167,2018-10-22,18.068333,144,0.064167,Connected,2244.155844
11922,2018-10-22,18.068333,2018-10-22,18.106389,160,0.038056,iPhone,4204.379562
11923,2018-10-22,18.106389,2018-10-22,18.235,135,0.128611,Connected,1049.676026
11924,2018-10-22,18.235,2018-10-22,18.376389,678,0.141389,iPhone,4795.284872
11925,2018-10-22,18.376389,2018-10-22,21.168333,1051,2.791944,Connected,376.440155
11926,2018-10-22,21.168333,2018-10-22,21.209167,134,0.040833,iPhone,3281.632653
11927,2018-10-22,21.209167,2018-10-22,21.335,278,0.125833,Connected,2209.271523
11928,2018-10-22,21.335,2018-10-22,21.381667,242,0.046667,iPhone,5185.714286


In [26]:
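# NOTE: disabled experiment. For rows i and i+1 from different sources on the
# same start date, it drops row i+1 when both it and row i+2 start exactly
# at row i's end time.
# i = 0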
# while i < len(df)-2:
#     if ((df.source[i] != df.source[i+1]) and (
#             df.start_date[i] == df.start_date[i+1])) and ((
#             df.start_time[i+1] == df.end_time[i]) and (
#             df.end_time[i] == df.start_time[i+2])):

#         df.drop(index=(i+1), inplace=True)
#         i += 2
#     else:
#         i += 1
# df.reset_index(inplace=True)
# df.drop(columns=['index'], inplace=True)
# print(len(df))
# print(df.num_steps.sum())

In [32]:
# Per-day, per-source totals.  Restrict the sum to numeric columns: the
# datetime `end_date` column cannot be summed and raises on pandas versions
# where `numeric_only` defaults to False.
# Only the `num_steps` and `duration` totals are meaningful; the summed clock
# times and step rates are not.
df.groupby(by=['start_date', 'source']).sum(numeric_only=True)

Unnamed: 0_level_0,Unnamed: 1_level_0,start_time,end_time,num_steps,duration,steps_per_hour
start_date,source,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
2017-02-17,iPhone,110.002222,111.063611,812,1.061389,3290.095197
2017-02-18,iPhone,232.108889,235.246111,2668,3.137222,6429.481186
2017-02-19,iPhone,363.651944,367.177500,2991,3.525556,16133.542046
2017-02-20,iPhone,406.206389,410.350833,2812,4.144444,19327.603802
2017-02-21,iPhone,248.269722,251.953889,2819,3.684167,10783.030254
2017-02-22,iPhone,335.235000,338.546667,2559,3.311667,16242.404669
2017-02-23,iPhone,307.116944,312.391944,4396,5.275000,15115.087407
2017-02-24,iPhone,305.357500,309.721944,3060,4.364444,11851.974289
2017-02-25,iPhone,387.877778,370.324444,5448,6.446667,12087.332980
2017-02-26,iPhone,298.405556,301.787778,1583,3.382222,5858.957948
