# Steps Analysis (02/17/17 - 10/24/18)

In [18]:
import pandas as pd
import seaborn as sns
from bs4 import BeautifulSoup
import ast

In [19]:
steps_df = pd.DataFrame(columns=['start_date', 'end_date', 'num_steps', 'source'])

In [20]:
with open('./data/export.xml') as fp:
    soup = BeautifulSoup(fp, 'lxml-xml')

In [21]:
steps = soup.findAll('Record', {'type':'HKQuantityTypeIdentifierStepCount'})

In [22]:
start_date_col = []
end_date_col = []
num_steps_col = []
source_col = []

In [23]:
for num in steps:
    start_date_col.append(num['startDate'])
    end_date_col.append(num['endDate'])
    num_steps_col.append(num['value'])
    source_col.append(num['sourceName'])

In [24]:
steps_df['start_date'] = start_date_col
steps_df['end_date'] = end_date_col
steps_df['num_steps'] = num_steps_col
steps_df['source'] = source_col

In [25]:
steps_df['num_steps'] = steps_df.num_steps.astype(int)

In [26]:
steps_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 27755 entries, 0 to 27754
Data columns (total 4 columns):
start_date    27755 non-null object
end_date      27755 non-null object
num_steps     27755 non-null int64
source        27755 non-null object
dtypes: int64(1), object(3)
memory usage: 867.4+ KB


In [27]:
steps_df.head(20)

Unnamed: 0,start_date,end_date,num_steps,source
0,2017-02-17 21:23:24 -0700,2017-02-17 21:30:42 -0700,58,iPhone
1,2017-02-17 21:36:28 -0700,2017-02-17 21:44:19 -0700,176,iPhone
2,2017-02-17 21:44:19 -0700,2017-02-17 21:51:52 -0700,44,iPhone
3,2017-02-17 21:52:34 -0700,2017-02-17 21:58:37 -0700,18,iPhone
4,2017-02-17 21:58:37 -0700,2017-02-17 22:07:35 -0700,215,iPhone
5,2017-02-17 22:16:25 -0700,2017-02-17 22:25:30 -0700,277,iPhone
6,2017-02-17 22:25:30 -0700,2017-02-17 22:34:29 -0700,18,iPhone
7,2017-02-17 22:51:17 -0700,2017-02-17 22:59:11 -0700,6,iPhone
8,2017-02-18 09:55:09 -0700,2017-02-18 10:01:14 -0700,19,iPhone
9,2017-02-18 11:33:45 -0700,2017-02-18 11:40:05 -0700,16,iPhone


In [28]:
sd_df = steps_df.start_date.str.split(' ',expand=True)
sd_df.columns = ['start_date', 'start_time', 'time_zone']
sd_df = sd_df.drop(columns=['time_zone'])
sd_df['start_date'] = pd.to_datetime(sd_df['start_date'], format='%Y/%m/%d')
sd_df['start_time'] = pd.to_timedelta(sd_df['start_time'])

In [29]:
sd_df.head()

Unnamed: 0,start_date,start_time
0,2017-02-17,21:23:24
1,2017-02-17,21:36:28
2,2017-02-17,21:44:19
3,2017-02-17,21:52:34
4,2017-02-17,21:58:37


In [30]:
ed_df = steps_df.end_date.str.split(' ',expand=True)
ed_df.columns = ['end_date', 'end_time', 'time_zone']
ed_df = ed_df.drop(columns=['time_zone'])
ed_df['end_date'] = pd.to_datetime(ed_df['end_date'], format='%Y/%m/%d')
ed_df['end_time'] = pd.to_timedelta(ed_df['end_time'])
ed_df.head()

Unnamed: 0,end_date,end_time
0,2017-02-17,21:30:42
1,2017-02-17,21:44:19
2,2017-02-17,21:51:52
3,2017-02-17,21:58:37
4,2017-02-17,22:07:35


In [31]:
steps_df = steps_df.drop(columns=['start_date', 'end_date'])

In [37]:
df = pd.concat([steps_df, sd_df, ed_df], axis=1)
df.head()

Unnamed: 0,num_steps,source,start_date,start_time,end_date,end_time
0,58,iPhone,2017-02-17,21:23:24,2017-02-17,21:30:42
1,176,iPhone,2017-02-17,21:36:28,2017-02-17,21:44:19
2,44,iPhone,2017-02-17,21:44:19,2017-02-17,21:51:52
3,18,iPhone,2017-02-17,21:52:34,2017-02-17,21:58:37
4,215,iPhone,2017-02-17,21:58:37,2017-02-17,22:07:35


In [38]:
df = df[['start_date', 'start_time', 'end_date', 'end_time', 'num_steps', 'source']]

In [42]:
df['range'] = df['end_time'] - df['start_time']

In [56]:
df['range'] = pd.to_numeric(df['range'])/60000000000

In [97]:
for num in range(len(df)):
    if df.range[num] <0.00:
        df.range[num] = df.range[num]+1440

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  This is separate from the ipykernel package so we can avoid doing imports until


In [99]:
df.range.max()

1412.4

In [80]:
df['start_time'] = pd.to_numeric(df['start_time'])/3600000000000
df['end_time'] = pd.to_numeric(df['end_time'])/3600000000000

In [95]:
df[5050:5100]

Unnamed: 0,start_date,start_time,end_date,end_time,num_steps,source,range
5050,2017-07-07,23.194167,2017-07-07,23.347778,245,iPhone,9.216667
5051,2017-07-07,23.347778,2017-07-07,23.512222,110,iPhone,9.866667
5052,2017-07-07,23.512222,2017-07-07,23.678611,172,iPhone,9.983333
5053,2017-07-07,23.678611,2017-07-07,23.840833,88,iPhone,9.733333
5054,2017-07-07,23.840833,2017-07-08,0.003889,53,iPhone,-1430.216667
5055,2017-07-08,0.003889,2017-07-08,0.170278,141,iPhone,9.983333
5056,2017-07-08,0.170278,2017-07-08,0.276389,80,iPhone,6.366667
5057,2017-07-08,0.276389,2017-07-08,0.422222,109,iPhone,8.75
5058,2017-07-08,0.422222,2017-07-08,0.583056,42,iPhone,9.65
5059,2017-07-08,0.583056,2017-07-08,0.653611,39,iPhone,4.233333
