# Steps Analysis (02/17/17 - 10/24/18)

In [1]:
import pandas as pd
import seaborn as sns
from bs4 import BeautifulSoup
import ast
from math import ceil
from datetime import timedelta
import numpy as np

In [2]:
steps_df = pd.DataFrame(columns=['start_date', 'end_date',
                                 'num_steps', 'source'])
start_date_col = []
end_date_col = []
num_steps_col = []
source_col = []

In [3]:
with open('./data/export.xml') as fp:
    soup = BeautifulSoup(fp, 'lxml-xml')

In [4]:
steps = soup.findAll('Record', {'type': 'HKQuantityTypeIdentifierStepCount'})
for num in steps:
    start_date_col.append(num['startDate'])
    end_date_col.append(num['endDate'])
    num_steps_col.append(num['value'])
    source_col.append(num['sourceName'])

In [5]:
steps_df['start_date'] = start_date_col
steps_df['end_date'] = end_date_col
steps_df['num_steps'] = num_steps_col
steps_df['source'] = source_col

In [6]:
steps_df['num_steps'] = steps_df.num_steps.astype(int)

In [7]:
sd_df = steps_df.start_date.str.split(' ', expand=True)
sd_df.columns = ['start_date', 'start_time', 'time_zone']
sd_df = sd_df.drop(columns=['time_zone'])
sd_df['start_date'] = pd.to_datetime(sd_df['start_date'], format='%Y/%m/%d')
sd_df['start_time'] = pd.to_timedelta(sd_df['start_time'])

In [8]:
ed_df = steps_df.end_date.str.split(' ', expand=True)
ed_df.columns = ['end_date', 'end_time', 'time_zone']
ed_df = ed_df.drop(columns=['time_zone'])
ed_df['end_date'] = pd.to_datetime(ed_df['end_date'], format='%Y/%m/%d')
ed_df['end_time'] = pd.to_timedelta(ed_df['end_time'])

In [9]:
steps_df = steps_df.drop(columns=['start_date', 'end_date'])

In [10]:
dur_df = pd.concat([sd_df, ed_df], axis=1)
dur_df['sdt'] = dur_df['start_date'] + dur_df['start_time']
dur_df['edt'] = dur_df['end_date'] + dur_df['end_time']
dur_df['duration'] = dur_df['edt'] - dur_df['sdt']
dur_df = dur_df.drop(columns=['sdt', 'edt'])

In [35]:
df = pd.concat([steps_df, dur_df], axis=1)

In [36]:
df = df[['start_date', 'start_time', 'end_date',
         'end_time', 'num_steps', 'duration', 'source']]

In [37]:
df.sort_values(by=['start_date', 'start_time'], inplace=True)
print(f'Total Rows = {len(df)}')
print(f'Total Steps = {df.num_steps.sum()}')
print(f"start_date != end_date: {len(df[df['start_date']!=df['end_date']])}")

Total Rows = 27755
Total Steps = 4952676
start_date != end_date: 144


In [38]:
df['start_time'] = pd.to_numeric(df['start_time'])/3600000000000
df['end_time'] = pd.to_numeric(df['end_time'])/3600000000000
df['duration'] = pd.to_numeric(df['duration'])/3600000000000

In [39]:
df = df[df['num_steps'] > 0]
df.reset_index(inplace=True)
df.drop(columns=['index'], inplace=True)
print(f'Total Rows = {len(df)}')
print(f'Total Steps = {df.num_steps.sum()}')
print(f"start_date != end_date: {len(df[df['start_date']!=df['end_date']])}")

Total Rows = 27660
Total Steps = 4952676
start_date != end_date: 139


In [40]:
i = 0
while i < len(df)-1:
    if (df.start_time[i] <= df.start_time[i+1]) and (
            df.end_time[i] >= df.end_time[i+1]) and (
            df.start_date[i] == df.end_date[i+1]):

        df.drop(index=(i+1), inplace=True)
        df.reset_index(inplace=True)
        df.drop(columns=['index'], inplace=True)

    else:
        i += 1

print(f'Total Rows = {len(df)}')
print(f'Total Steps = {df.num_steps.sum()}')
print(f"start_date != end_date: {len(df[df['start_date']!=df['end_date']])}")

Total Rows = 25171
Total Steps = 4647445
start_date != end_date: 139


In [41]:
i = 0
while i < len(df)-1:
    if (df.start_date[i] == df.end_date[i]) and (
            df.start_date[i] == df.end_date[i+1]) and (
            df.start_time[i+1] < df.end_time[i]) and (
            df.end_time[i+1] > df.end_time[i]):

        steps_per_hour = df.num_steps[i+1] / (df.end_time[i+1] - df.start_time[i+1])

        steps_adjust = (df.end_time[i] - df.start_time[i+1]) * steps_per_hour

        steps_adjust = round(steps_adjust)

        df.loc[(i+1), 'num_steps'] = steps_adjust
        df.loc[(i+1), 'start_time'] = df.end_time[i]
        df.loc[(i+1), 'duration'] = df.end_time[i+1] - df.start_time[i+1]
        df.loc[i, 'duration'] = df.end_time[i] - df.start_time[i]

        if df.num_steps[i+1] <= -1.0e-2:
            df.drop(index=(i+1), inplace=True)
            df.reset_index(inplace=True)
            df.drop(columns=['index'], inplace=True)

        else:
            i += 1

    else:
        i += 1

df.reset_index(inplace=True)
df.drop(columns=['index'], inplace=True)

print(f'Total Rows = {len(df)}')
print(f'Total Steps = {df.num_steps.sum()}')
print(f"start_date != end_date: {len(df[df['start_date']!=df['end_date']])}")

Total Rows = 25171
Total Steps = 4159537.0
start_date != end_date: 139


In [42]:
df['duration'] = np.where(df['start_date'] == df['end_date'],
                          (df['end_time'] - df['start_time']),
                          df['duration'])

In [43]:
df = df[df['duration'] >= 1.0e-4]
df.reset_index(inplace=True)
df.drop(columns=['index'], inplace=True)
print(f'Total Rows = {len(df)}')
print(f'Total Steps = {df.num_steps.sum()}')
print(f"start_date != end_date: {len(df[df['start_date']!=df['end_date']])}")

Total Rows = 25170
Total Steps = 4159521.0
start_date != end_date: 139


In [46]:
i = 0

while i < len(df)-1:
    if (df.start_date[i] == df.end_date[i]) and (
            df.start_date[i] == df.end_date[i+1]) and (
            (df.end_time[i] - df.start_time[i+1]) <= 1.0e-5):

        df.loc[i, 'end_time'] = df.end_time[i+1]
        df.loc[i, 'num_steps'] = df.num_steps[i] + df.num_steps[i+1]

        df.drop(index=(i+1), inplace=True)
        df.reset_index(inplace=True)
        df.drop(columns=['index'], inplace=True)

    else:
        i += 1

print(f'Total Rows = {len(df)}')
print(f'Total Steps = {df.num_steps.sum()}')
print(f"start_date != end_date: {len(df[df['start_date']!=df['end_date']])}")

Total Rows = 659
Total Steps = 4159521.0
start_date != end_date: 0


In [None]:
df.columns

In [None]:
i = 0
while i < len(df)-1:
    if (df.end_date[i] - pd.Timedelta(86400000000000) == df.start_date[i]) and (
            df.duration[i] <= 1.0):

        st_dt, st_tm, end_dt, end_tm, num_st, dur, sauce = df.iloc[i]

        steps_per_hour = num_st / (end_tm + (24.0 - st_tm))

        dur_1 = round(steps_per_hour * (24.0 - st_tm))
        dur_2 = round(steps_per_hour * end_tm)

        df.loc[i+.1] = [end_dt, 0.0, end_dt, end_tm,
                        dur_1, end_tm, sauce]

        df.loc[i] = [st_dt, st_tm, st_dt, 24.0,
                     dur_2, (24.0 - st_tm), sauce]

        df.reset_index(inplace=True)
        df.drop(columns=['index'], inplace=True)

        i += 1
    else:
        i += 1

df.sort_values(by=['start_date', 'start_time'], inplace=True)
print(f'Total Rows = {len(df)}')
print(f'Total Steps = {df.num_steps.sum()}')
print(f"start_date != end_date: {len(df[df['start_date']!=df['end_date']])}")

In [47]:
i = 0
while i < len(df)-1:
    if (df.end_date[i] - pd.Timedelta(1, unit='D') == df.start_date[i]):

        st_dt, st_tm, end_dt, end_tm, num_st, dur, sauce = df.iloc[i]

        steps_per_hour = num_st / (end_tm + (24.0 - st_tm))

        dur_1 = round(steps_per_hour * (24.0 - st_tm))
        dur_2 = round(steps_per_hour * end_tm)

        df.loc[i+.1] = [end_dt, 0.0, end_dt, end_tm,
                        dur_1, end_tm, sauce]

        df.loc[i] = [st_dt, st_tm, st_dt, 24.0,
                     dur_2, (24.0 - st_tm), sauce]

        df.reset_index(inplace=True)
        df.drop(columns=['index'], inplace=True)

        i += 1
        
    elif (df.end_date[i] - pd.Timedelta(2, unit='D') == df.start_date[i]):

        st_dt, st_tm, end_dt, end_tm, num_st, dur, sauce = df.iloc[i]

        steps_per_hour = num_st / (end_tm + 24.0 + (24.0 - st_tm))

        dur_1 = round(steps_per_hour * (24.0 - st_tm))
        dur_2 = round(steps_per_hour * 24.0)
        dur_3 = round(steps_per_hour * end_tm)
        
        dt_2 = end_dt - pd.Timedelta(1, unit='D')

        df.loc[i+.1] = [dt_2, 0.0, dt_2, 24.0,
                        dur_2, 24.0, sauce]
        
        df.loc[i+.2] = [end_dt, 0.0, end_dt, end_tm,
                        dur_3, end_tm, sauce]

        df.loc[i] = [st_dt, st_tm, st_dt, 24.0,
                     dur_1, (24.0 - st_tm), sauce]

        df.reset_index(inplace=True)
        df.drop(columns=['index'], inplace=True)

        i += 1
    else:
        i += 1

df.sort_values(by=['start_date', 'start_time'], inplace=True)

df.reset_index(inplace=True)
df.drop(columns=['index'], inplace=True)
print(f'Total Rows = {len(df)}')
print(f'Total Steps = {df.num_steps.sum()}')
print(f"start_date != end_date: {len(df[df['start_date']!=df['end_date']])}")

Total Rows = 659
Total Steps = 4159521.0
start_date != end_date: 0


In [51]:
659-df.start_date.nunique()

45

In [None]:
df['steps_per_hour'] = df['num_steps']/df['duration']

In [None]:
df[df['start_date'] == '2018-10-19']

In [None]:
summed_df = df.groupby(by=['start_date', 'end_date']).sum()

In [None]:
summed_df.tail(60)

In [None]:
summed_df[summed_df['duration'] >= 24]