# Steps Analysis (02/17/17 - 10/24/18)

In [1]:
import pandas as pd
import seaborn as sns
from bs4 import BeautifulSoup
import ast
from math import ceil

In [2]:
steps_df = pd.DataFrame(columns=['start_date', 'end_date',
                                 'num_steps', 'source'])
start_date_col = []
end_date_col = []
num_steps_col = []
source_col = []

In [3]:
with open('./data/export.xml') as fp:
    soup = BeautifulSoup(fp, 'lxml-xml')

In [4]:
steps = soup.findAll('Record', {'type': 'HKQuantityTypeIdentifierStepCount'})
for num in steps:
    start_date_col.append(num['startDate'])
    end_date_col.append(num['endDate'])
    num_steps_col.append(num['value'])
    source_col.append(num['sourceName'])

In [5]:
steps_df['start_date'] = start_date_col
steps_df['end_date'] = end_date_col
steps_df['num_steps'] = num_steps_col
steps_df['source'] = source_col

In [6]:
steps_df['num_steps'] = steps_df.num_steps.astype(int)

In [103]:
steps_df['num_steps'].sum()

4952676

In [7]:
sd_df = steps_df.start_date.str.split(' ', expand=True)
sd_df.columns = ['start_date', 'start_time', 'time_zone']
sd_df = sd_df.drop(columns=['time_zone'])
sd_df['start_date'] = pd.to_datetime(sd_df['start_date'], format='%Y/%m/%d')
sd_df['start_time'] = pd.to_timedelta(sd_df['start_time'])

In [8]:
ed_df = steps_df.end_date.str.split(' ', expand=True)
ed_df.columns = ['end_date', 'end_time', 'time_zone']
ed_df = ed_df.drop(columns=['time_zone'])
ed_df['end_date'] = pd.to_datetime(ed_df['end_date'], format='%Y/%m/%d')
ed_df['end_time'] = pd.to_timedelta(ed_df['end_time'])

In [9]:
steps_df = steps_df.drop(columns=['start_date', 'end_date'])

In [10]:
dur_df = pd.concat([sd_df, ed_df], axis=1)
dur_df['sdt'] = dur_df['start_date']+dur_df['start_time']
dur_df['edt'] = dur_df['end_date']+dur_df['end_time']
dur_df['duration'] = dur_df['edt']-dur_df['sdt']
dur_df = dur_df.drop(columns=['sdt', 'edt'])
dur_df.head()

Unnamed: 0,start_date,start_time,end_date,end_time,duration
0,2017-02-17,21:23:24,2017-02-17,21:30:42,00:07:18
1,2017-02-17,21:36:28,2017-02-17,21:44:19,00:07:51
2,2017-02-17,21:44:19,2017-02-17,21:51:52,00:07:33
3,2017-02-17,21:52:34,2017-02-17,21:58:37,00:06:03
4,2017-02-17,21:58:37,2017-02-17,22:07:35,00:08:58


In [29]:
df = pd.concat([steps_df, dur_df], axis=1)
df.columns

Index(['num_steps', 'source', 'start_date', 'start_time', 'end_date',
       'end_time', 'duration'],
      dtype='object')

In [30]:
df = df[['start_date', 'start_time', 'end_date',
         'end_time', 'num_steps', 'duration', 'source']]

In [31]:
df.sort_values(by=['start_date', 'start_time'], inplace=True)
df.tail(20)

Unnamed: 0,start_date,start_time,end_date,end_time,num_steps,duration,source
27715,2018-10-24,13:18:52,2018-10-24,13:23:19,59,00:04:27,iPhone
27744,2018-10-24,14:00:59,2018-10-24,15:00:59,46,01:00:00,Connected
27745,2018-10-24,15:00:59,2018-10-24,16:00:59,164,01:00:00,Connected
27746,2018-10-24,16:00:59,2018-10-24,17:00:59,304,01:00:00,Connected
27716,2018-10-24,16:06:19,2018-10-24,16:15:56,153,00:09:37,iPhone
27717,2018-10-24,16:21:58,2018-10-24,16:22:29,76,00:00:31,iPhone
27747,2018-10-24,17:00:59,2018-10-24,17:49:59,790,00:49:00,Connected
27718,2018-10-24,17:02:07,2018-10-24,17:11:02,105,00:08:55,iPhone
27719,2018-10-24,17:17:57,2018-10-24,17:27:57,218,00:10:00,iPhone
27720,2018-10-24,17:28:38,2018-10-24,17:38:37,381,00:09:59,iPhone


In [32]:
df['start_time'] = pd.to_numeric(df['start_time'])/3600000000000
df['end_time'] = pd.to_numeric(df['end_time'])/3600000000000
df['duration'] = pd.to_numeric(df['duration'])/3600000000000

In [33]:
# for num in range(len(df)):
#     if df['end_date'][num] != df['start_date'][num]:
#         df['duration'][num] = df['end_time'][num]+24 - df['start_time'][num]
# df.head()

In [24]:
# for num in range(len(df)):
#     if df.start_date[num] != df.end_date[num]:
#         total_steps = df.num_steps[num]
#         steps_per_min = total_steps/df.duration[num]
#         time_left_in_start_date = 24-df.start_time[num]
#         steps_for_start_date = ceil(time_left_in_start_date*60*steps_per_min)
#         steps_for_end_date = total_steps-steps_for_start_date
#         df.num_steps[num] = steps_for_start_date
#         df.num_steps[num+1] += steps_for_end_date
# df[5050:5100]

In [97]:
i = 0
while i < len(df):
    if (i in df.index) and (i+1 in df.index):
        if df['source'][i] == df['source'][i+1]:
            if df['end_time'][i] == df['start_time'][i+1]:
                df.loc[i, 'end_time'] = df.loc[(i+1), 'end_time']
                df.loc[i, 'end_date'] = df.loc[(i+1), 'end_date']
                df.loc[i, 'num_steps'] += df.loc[(i+1), 'num_steps']
                df.loc[i, 'duration'] += df.loc[(i+1), 'duration']
                df.drop(index=(i+1), inplace=True)
                i += 2
            else:
                i += 1
        else:
            i += 1
    else:
        i += 1

In [98]:
df.reset_index(inplace=True)
df.drop(columns=['index'], inplace=True)

In [99]:
len(df)

13096

In [105]:
df.tail(60)

Unnamed: 0,start_date,start_time,end_date,end_time,num_steps,duration,source
13036,2018-10-23,18.001667,2018-10-23,18.151667,604,0.15,Connected
13037,2018-10-23,18.125556,2018-10-23,18.292778,751,0.167222,iPhone
13038,2018-10-23,18.151667,2018-10-23,18.318333,648,0.166667,Connected
13039,2018-10-23,18.292778,2018-10-23,18.451944,483,0.159167,iPhone
13040,2018-10-23,18.318333,2018-10-23,19.001667,714,0.683333,Connected
13041,2018-10-23,18.851111,2018-10-23,19.002778,164,0.151667,iPhone
13042,2018-10-23,19.001667,2018-10-23,20.001667,276,1.0,Connected
13043,2018-10-23,19.0625,2018-10-23,19.144167,62,0.081667,iPhone
13044,2018-10-23,19.241667,2018-10-23,19.533333,164,0.291667,iPhone
13045,2018-10-23,19.877778,2018-10-23,19.877778,4,0.0,iPhone


In [107]:
'num_steps', 'source', 'start_date', 'start_time', 'end_date',
       'end_time', 'duration'

IndentationError: unexpected indent (<ipython-input-107-d90e0401648c>, line 2)

In [129]:
i = 0
while i < len(df)-1:
    if df.source[i] != df.source[i+1]:
        if df.start_time[i] <= df.start_time[i+1]:
            if df.end_time[i] >= df.end_time[i+1]:
                df.drop(index=(i+1), inplace=True)
                i+=2
            else:
                i+=1
        else:
            i+=1
    else:
        i+=1

In [130]:
df.reset_index(inplace=True)
df.drop(columns=['index'], inplace=True)
len(df)

12253

In [131]:
df.tail(60)

Unnamed: 0,start_date,start_time,end_date,end_time,num_steps,duration,source
12193,2018-10-22,21.381667,2018-10-22,21.543056,817,0.161389,iPhone
12194,2018-10-22,21.385,2018-10-22,22.001667,936,0.616667,Connected
12195,2018-10-22,22.001667,2018-10-22,23.001667,334,1.0,Connected
12196,2018-10-22,23.001667,2018-10-23,4.985,160,5.983333,Connected
12197,2018-10-23,5.001389,2018-10-23,8.001389,348,3.0,Connected
12198,2018-10-23,7.786667,2018-10-23,8.120278,1159,0.333611,iPhone
12199,2018-10-23,8.001389,2018-10-23,8.134722,378,0.133333,Connected
12200,2018-10-23,8.120278,2018-10-23,8.273056,529,0.152778,iPhone
12201,2018-10-23,8.134722,2018-10-23,8.718056,644,0.583333,Connected
12202,2018-10-23,8.718056,2018-10-23,9.151389,206,0.433333,Connected
