# Steps Analysis (02/17/17 - 10/24/18)

In [1]:
import pandas as pd
import seaborn as sns
from bs4 import BeautifulSoup
import ast
from math import ceil
from datetime import timedelta
import numpy as np

In [3]:
steps_df = pd.DataFrame(columns=['start_date', 'end_date',
                                 'num_steps', 'source'])
start_date_col = []
end_date_col = []
num_steps_col = []
source_col = []

In [4]:
with open('./data/export.xml') as fp:
    soup = BeautifulSoup(fp, 'lxml-xml')

In [5]:
steps = soup.findAll('Record', {'type': 'HKQuantityTypeIdentifierStepCount'})
for num in steps:
    start_date_col.append(num['startDate'])
    end_date_col.append(num['endDate'])
    num_steps_col.append(num['value'])
    source_col.append(num['sourceName'])

In [6]:
steps_df['start_date'] = start_date_col
steps_df['end_date'] = end_date_col
steps_df['num_steps'] = num_steps_col
steps_df['source'] = source_col

In [7]:
steps_df['num_steps'] = steps_df.num_steps.astype(int)

In [8]:
sd_df = steps_df.start_date.str.split(' ', expand=True)
sd_df.columns = ['start_date', 'start_time', 'time_zone']
sd_df = sd_df.drop(columns=['time_zone'])
sd_df['start_date'] = pd.to_datetime(sd_df['start_date'], format='%Y/%m/%d')
sd_df['start_time'] = pd.to_timedelta(sd_df['start_time'])

In [9]:
ed_df = steps_df.end_date.str.split(' ', expand=True)
ed_df.columns = ['end_date', 'end_time', 'time_zone']
ed_df = ed_df.drop(columns=['time_zone'])
ed_df['end_date'] = pd.to_datetime(ed_df['end_date'], format='%Y/%m/%d')
ed_df['end_time'] = pd.to_timedelta(ed_df['end_time'])

In [10]:
steps_df = steps_df.drop(columns=['start_date', 'end_date'])

In [11]:
dur_df = pd.concat([sd_df, ed_df], axis=1)
dur_df['sdt'] = dur_df['start_date'] + dur_df['start_time']
dur_df['edt'] = dur_df['end_date'] + dur_df['end_time']
dur_df['duration'] = dur_df['edt'] - dur_df['sdt']
dur_df = dur_df.drop(columns=['sdt', 'edt'])

In [104]:
df = pd.concat([steps_df, dur_df], axis=1)

In [105]:
df = df[['start_date', 'start_time', 'end_date',
         'end_time', 'num_steps', 'duration', 'source']]

In [106]:
df.sort_values(by=['start_date', 'start_time'], inplace=True)
print(len(df))
print(df.num_steps.sum())

27755
4952676


In [107]:
df['start_time'] = pd.to_numeric(df['start_time'])/3600000000000
df['end_time'] = pd.to_numeric(df['end_time'])/3600000000000
df['duration'] = pd.to_numeric(df['duration'])/3600000000000
df = df[df['num_steps']>0]
df.reset_index(inplace=True)
df.drop(columns=['index'], inplace=True)
print(len(df))
print(df.num_steps.sum())

27660
4952676


In [108]:
i = 0
while i < len(df)-2:
    if (df.source[i] != df.source[i+1]) and (
            df.start_time[i] <= df.start_time[i+1]) and (
            df.end_time[i] >= df.end_time[i+1]) and (
            df.end_date[i] == df.end_date[i+1]):

        df.drop(index=(i+1), inplace=True)
        df.reset_index(inplace=True)
        df.drop(columns=['index'], inplace=True)

    else:
        i += 1

print(len(df))
print(df.num_steps.sum())

25197
4651794


In [109]:
i = 0
while i < len(df)-1:
    if (df.source[i] != df.source[i+1]) and (
            df.start_date[i] == df.end_date[i]) and (
            df.start_date[i] == df.end_date[i+1]) and (
            df.start_time[i+1] < df.end_time[i]) and (
            df.end_time[i+1] > df.end_time[i]):

        steps_per_hour = df.num_steps[i+1] * \
            (df.end_time[i+1] - df.start_time[i+1])

        steps_adjust = (df.end_time[i] - df.start_time[i+1]) * \
            steps_per_hour

        steps_adjust = ceil(steps_adjust)

        df.loc[(i+1), 'num_steps'] = steps_adjust
        df.loc[(i+1), 'start_time'] = df.end_time[i]
        df.loc[(i+1), 'duration'] = df.end_time[i+1] - df.start_time[i+1]
        df.loc[i, 'duration'] = df.end_time[i] - df.start_time[i]

        if df.num_steps[i+1] <= -1.0e-2:
            df.drop(index=(i+1), inplace=True)
            df.reset_index(inplace=True)
            df.drop(columns=['index'], inplace=True)

        else:
            i += 1

    else:
        i += 1

df.reset_index(inplace=True)
df.drop(columns=['index'], inplace=True)

print(len(df))
print(df.num_steps.sum())

25197
3909848


In [110]:
df = df[df['duration']>0]
df.reset_index(inplace=True)
df.drop(columns=['index'], inplace=True)
print(len(df))
print(df.num_steps.sum())

25194
3909807


In [111]:
df['duration'] = np.where(df['start_date'] == df['end_date'],
                          (df.loc[i, 'end_time'] - df.loc[i, 'start_time']),
                          df['duration'])

KeyError: 'the label [25196] is not in the [index]'

In [99]:
i = 0

while i < len(df)-1:
    if (df.start_date[i] == df.end_date[i]) and (
            df.start_date[i] == df.end_date[i+1]) and (
            df.end_time[i] == df.start_time[i+1]):

        df.loc[i, 'end_time'] = df.end_time[i+1]
        df.loc[i, 'num_steps'] += df.num_steps[i+1]

        df.drop(index=(i+1), inplace=True)
        df.reset_index(inplace=True)
        df.drop(columns=['index'], inplace=True)

    else:
        i += 1

print(len(df))
print(df.num_steps.sum())

10083
3909807


In [101]:
df.columns

Index(['start_date', 'start_time', 'end_date', 'end_time', 'num_steps',
       'duration', 'source'],
      dtype='object')

In [102]:
i = 0
while i < len(df)-1:
    if (df.end_date[i] - pd.Timedelta(86400000000000) == df.start_date[i]):

        st_dt, st_tm, end_dt, end_tm, num_st, dur, sauce = df.iloc[i]

        steps_per_hour = num_st * (end_tm + (24.0 - st_tm))

        df.loc[i+.1] = [end_dt, 0.0, end_dt, end_tm,
                        ceil(steps_per_hour*end_tm), end_tm, sauce]

        df.loc[i] = [st_dt, st_tm, st_dt, 24.0,
                     ceil(steps_per_hour*(24.0-st_tm)),
                     (24.0-st_tm), sauce]
        
        df.reset_index(inplace=True)
        df.drop(columns=['index'], inplace=True)
        
        i += 1
    else:
        i += 1

In [None]:
#         st_dt, st_tm, end_dt, end_tm, num_st, dur, sauce = df.iloc[i]

#         steps_per_hour = num_st * (end_tm + (24.0 - st_tm))

#         df.loc[i+.1] = [end_dt, 0.0, end_dt, end_tm,
#                         ceil(steps_per_hour*end_tm), end_tm, sauce]

#         df.loc[i] = [st_dt, st_tm, st_dt, 24.0,
#                      ceil(steps_per_hour*(24.0-st_tm)),
#                      (24.0-st_tm), sauce]

In [103]:
print(len(df))
print(df.num_steps.sum())

10220
26398723


In [None]:
df['steps_per_hour'] = df['num_steps']/df['duration']

In [None]:
df[df['start_date']=='2018-10-19']

In [None]:
summed_df = df.groupby(by=['start_date', 'end_date']).sum()

In [None]:
summed_df.tail(60)

In [None]:
summed_df[summed_df['duration']>=24]

In [None]:
# i = 0
# while i < len(df):
#     if (((i in df.index) and (i+1 in df.index)) and (
#             df['source'][i] == df['source'][i+1])) and (
#             df['end_time'][i] == df['start_time'][i+1]):
#         df.loc[i, 'end_time'] = df.loc[(i+1), 'end_time']
#         df.loc[i, 'end_date'] = df.loc[(i+1), 'end_date']
#         df.loc[i, 'num_steps'] += df.loc[(i+1), 'num_steps']
#         df.loc[i, 'duration'] += df.loc[(i+1), 'duration']
#         df.drop(index=(i+1), inplace=True)
#         df.reset_index(inplace=True)
#         df.drop(columns=['index'], inplace=True)
#     else:
#         i += 1

# print(len(df))
# print(df.num_steps.sum())

In [None]:
# df = df[df['duration'] > 0].reset_index().drop(columns=['index'])
# df = df[df['num_steps'] > 0].reset_index().drop(columns=['index'])
# print(len(df))
# print(df.num_steps.sum())

In [None]:
# i = 0
# while i < len(df)-2:
#     if ((df.source[i] != df.source[i+1]) and (
#             df.start_date[i] == df.start_date[i+1])) and ((
#             df.start_time[i+1] == df.end_time[i]) and (
#             df.end_time[i] == df.start_time[i+2])):

#         df.drop(index=(i+1), inplace=True)
#         i += 2
#     else:
#         i += 1
# df.reset_index(inplace=True)
# df.drop(columns=['index'], inplace=True)
# print(len(df))
# print(df.num_steps.sum())