In [1]:
# To export: open the Health app, tap your profile icon, scroll to the bottom and tap "Export All Health Data".
# The export normally takes a couple of minutes.

In [2]:
import os
import sys
def getRepoPath():
    """Return the path of the ``dataImport`` directory that contains the cwd.

    The current working directory is split on its path delimiter (a backslash
    when the path contains one, otherwise a forward slash) and truncated
    right after the first component named ``dataImport``.

    Returns:
        str: the path up to and including ``dataImport``, terminated with the
        delimiter so further path fragments can be appended directly.

    Raises:
        ValueError: if no component of the working directory is named
        ``dataImport``.
    """
    cwd = os.getcwd()
    delimiter = "\\" if "\\" in cwd else "/"
    # split once and reuse the components instead of splitting twice
    parts = cwd.split(delimiter)
    if "dataImport" not in parts:
        # same exception type list.index() would raise, but naming the path searched
        raise ValueError(
            f"'dataImport' directory not found in working directory: {cwd}")
    repoPath = delimiter.join(parts[:parts.index("dataImport") + 1]) + delimiter
    return repoPath
# locate the dataImport directory and add it to sys.path so the utils module can be imported
repoPath = getRepoPath()
sys.path.append(repoPath)
from utils import exportsDataPath, writeWorkingHRDfParquet

# directory holding the Apple exports
# NOTE(review): plain string concatenation, so this assumes exportsDataPath ends with a path separator — confirm in utils
pathOfExport = exportsDataPath + "apple/"
# sub-path of the specific export to load
# NOTE: begins with "/", so concatenating it after pathOfExport produces a doubled separator ("apple//17-9-24/...")
individualExportPath = "/17-9-24/export/apple_health_export/"
# name of the XML file inside the export folder
xmlFileName = "export.xml"


In [3]:
#this took 22s with 568k samples
import xml.etree.ElementTree as ET 

# parse the whole export file and keep its root element for the record scan
tree = ET.parse(pathOfExport + individualExportPath + xmlFileName)
root = tree.getroot() 

# values identifying the records of interest (record type, device manufacturer, device model)
# NOTE(review): in the cells visible here, getAppleWatchInfo compares against string literals
# and does not read these three names — confirm whether they are used anywhere else
rType = "HKQuantityTypeIdentifierHeartRate"
manufaturerField = "Apple Inc."
modelField = "Watch"
# print(len(root.findall("./")))

In [4]:
# took 1m 20s to run with 568k samples
import pandas as pd

# running count of accepted records, and the rows collected for them;
# both are filled by the loop over the Record elements later in this cell
numRecords = 0
listRecords = []

def getAppleWatchInfo(record):
    """Return the parsed device fields of an Apple Watch heart-rate record.

    Args:
        record: mapping of a ``Record`` element's attributes; only the
            ``type`` and ``device`` entries are read.

    Returns:
        A dict mapping device field names to their raw string values when the
        record is a heart-rate sample whose device reports manufacturer
        ``Apple Inc.`` and model ``Watch``. ``None`` otherwise, including when
        the record has no ``device`` attribute or the device text has no
        manufacturer/model field (previously these cases raised ``KeyError``).
    """
    # check if the type of record is right
    if record.get("type") != "HKQuantityTypeIdentifierHeartRate":
        return None

    # some records carry no device description at all; those cannot be matched
    rawDevice = record.get("device")
    if not rawDevice:
        return None

    # parse the device text into a dictionary to make sure we have the right one
    # the raw text should look like this
    # '<<HKDevice: 0x999999999>, name:Apple Watch, manufacturer:' +
    # 'Apple Inc., model:Watch, hardware:Watch6,1, software:7.6>'
    #
    # NOTE: the text is split on ",", so a value that itself contains a comma is
    # cut short ('hardware:Watch6,1' is stored as 'Watch6'), and the closing '>'
    # stays attached to the value of the last field ('7.6>').
    device = {}
    for field in rawDevice.split(","):
        key, sep, value = field.partition(":")
        # keep only pieces of the form "key:value" with exactly one colon
        if sep and ":" not in value:
            device[key.strip()] = value

    if (device.get("manufacturer") == "Apple Inc." and
            device.get("model") == "Watch"):
        return device

    return None


# for every element tagged Record
# for every element tagged Record directly under the root
for r in root.findall("./Record"):
    # the data is in the attributes
    record = r.attrib
    device = getAppleWatchInfo(record)
    if device is None:
        # not an Apple Watch heart-rate sample
        continue

    numRecords += 1
    if numRecords % 100_000 == 0: print(numRecords)

    # the device text ends with '>', which is left hanging on the software
    # field; strip it only when it is really there so a clean version string
    # never loses its last character
    software = device["software"]
    if software.endswith(">"):
        software = software[:-1]

    # field order must match the column names used to build the DataFrame
    listRecords.append([pd.to_datetime(record["startDate"]),
                        device["hardware"],
                        software,
                        pd.to_datetime(record["creationDate"]),
                        float(record["value"])])


print(len(listRecords))
print(numRecords)

100000
200000
300000
400000
500000
568008
568008


In [5]:
# put the rows in ascending order of their first field (the sample start timestamp)
listRecords = list(listRecords)
listRecords.sort(key=lambda row: row[0])

In [6]:

# column labels, in the same order as the fields of each row in listRecords
ColumnNames = ["sampleDT", "hardware", "software", "creationDate", "value"]

# build the frame from the collected rows and index it by the sample timestamp
appleWatchHRDf = pd.DataFrame(listRecords, columns=ColumnNames).set_index("sampleDT")
appleWatchHRDf

Unnamed: 0_level_0,hardware,software,creationDate,value
sampleDT,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
2021-07-19 19:55:06-07:00,Watch6,7.6,2021-07-19 19:55:07-07:00,78.0
2021-07-19 19:55:11-07:00,Watch6,7.6,2021-07-19 19:55:12-07:00,79.0
2021-07-19 19:55:14-07:00,Watch6,7.6,2021-07-19 19:55:17-07:00,80.0
2021-07-19 19:55:21-07:00,Watch6,7.6,2021-07-19 19:55:22-07:00,82.0
2021-07-19 19:55:23-07:00,Watch6,7.6,2021-07-19 19:55:27-07:00,83.0
...,...,...,...,...
2024-09-17 12:53:00-07:00,Watch6,10.6.1,2024-09-17 12:53:06-07:00,70.0
2024-09-17 12:53:07-07:00,Watch6,10.6.1,2024-09-17 12:53:11-07:00,70.0
2024-09-17 12:53:14-07:00,Watch6,10.6.1,2024-09-17 12:53:16-07:00,72.0
2024-09-17 12:53:17-07:00,Watch6,10.6.1,2024-09-17 12:53:21-07:00,73.0


In [7]:
# show the dtype of each column (the sampleDT index is not part of .dtypes)
appleWatchHRDf.dtypes

hardware                                        object
software                                        object
creationDate    datetime64[ns, pytz.FixedOffset(-420)]
value                                          float64
dtype: object

In [8]:
# hand the frame to the project helper imported from utils; its implementation is not visible here
# (the recorded output suggests it splits the rows into gzip-compressed parquet files — confirm in utils)
writeWorkingHRDfParquet('apple', appleWatchHRDf)

the file size of all the data is about 6 MB
the total number of rows in the file is 568008
splitting into 2 files of about 5MB files with 284004 rows per file
saving rows 0 to 284003
hardware                           Watch6
software                              7.6
creationDate    2021-07-19 19:55:07-07:00
value                                  78
Name: 2021-07-19 19:55:06-07:00, dtype: object
to a file named 2021-07-19T195506-0700_2023-11-28T091908-0700.parquet.gzip
2021-07-19 19:55:06-07:00
saving rows 284004 to 568007
hardware                           Watch6
software                            9.6.3
creationDate    2023-11-28 09:26:35-07:00
value                                  63
Name: 2023-11-28 09:21:28-07:00, dtype: object
to a file named 2023-11-28T092128-0700_2024-09-17T125324-0700.parquet.gzip
2023-11-28 09:21:28-07:00
