In [1]:
# To export: open the Health app, tap your profile icon, scroll to the bottom, and tap "Export All Health Data".
# The export normally takes a couple of minutes to generate.

In [2]:
import xml.etree.ElementTree as ET 

# Destination of the cleaned heart-rate table written at the end of the notebook.
workingDataFile = "/home/chowder/Documents/workingData/apple/hr/appleWatchHRdf.parquet.gzip"

# Values describing an Apple Watch heart-rate record (record type plus the
# manufacturer / model entries of the record's device string).
rType = "HKQuantityTypeIdentifierHeartRate"
manufaturerField = "Apple Inc."
modelField = "Watch"

# Pieces of the export location: base folder, dated export folder, file name.
# They are glued together by plain concatenation; the middle piece starts with
# "/", so os.path.join would discard the base folder.
xmlFileName = "export.xml"
individualExportPath = "/27-8-24/export/apple_health_export/"
pathOfExport = "/home/chowder/Documents/dataExports/apple/"

# Parse the export and keep a handle on its top-level element.
tree = ET.parse(f"{pathOfExport}{individualExportPath}{xmlFileName}")
root = tree.getroot()


In [3]:
# Number of elements sitting directly under the export's root element.
print(len(root))

1334964


In [4]:
import pandas as pd

# Accumulators for the scan over the export: the collected rows and a running
# count of how many records matched.
listRecords = list()
numRecords = 0

def getAppleWatchInfo(record,
                      recordType="HKQuantityTypeIdentifierHeartRate",
                      manufacturer="Apple Inc.",
                      model="Watch"):
    """Return the parsed device fields of a matching record, or None.

    Parameters
    ----------
    record : mapping
        Attributes of one <Record> element. Only the "type" and "device"
        entries are read.
    recordType : str, optional
        Value record["type"] must equal for the record to be considered.
    manufacturer : str, optional
        Value the device's "manufacturer" field must equal.
    model : str, optional
        Value the device's "model" field must equal.

    Returns
    -------
    dict or None
        The device string parsed into a {field: value} dictionary when the
        record has the requested type and comes from the requested
        manufacturer/model. None otherwise, including when the record has no
        "type" or "device" entry or the device string lacks the
        "manufacturer" or "model" field.

    Notes
    -----
    The raw text of record["device"] should look like this:
        '<<HKDevice: 0x999999999>, name:Apple Watch, manufacturer:' +
        'Apple Inc., model:Watch, hardware:Watch6,1, software:7.6>'
    It is split on "," and each piece on ":"; only pieces with exactly one
    ":" are kept. Consequences callers should be aware of:
      * a value containing a comma is cut at the comma
        ('hardware:Watch6,1' is stored as 'Watch6');
      * values are not stripped, so the last field keeps the closing '>'
        of the device string (e.g. software == '7.6>').
    """
    # check if the type of record is right
    if record.get("type") != recordType:
        return None

    # some records carry no device description at all; they cannot match
    rawDevice = record.get("device")
    if not rawDevice:
        return None

    # parse the device to a dictionary to make sure we have the right one
    device = {}
    for fragment in rawDevice.split(","):
        parts = fragment.split(":")
        if len(parts) == 2:
            device[parts[0].strip()] = parts[1]

    # .get() so that a device string without these fields is simply rejected
    if (device.get("manufacturer") == manufacturer and
            device.get("model") == model):
        return device

    return None


# Visit every element tagged Record directly under the root and keep only the
# ones getAppleWatchInfo accepts. iterfind yields matches lazily, so no
# intermediate list of all Record elements is built.
for r in root.iterfind("./Record"):
    # the data is in the attributes
    record = r.attrib
    device = getAppleWatchInfo(record)
    if device is None:
        continue

    numRecords += 1
    # progress indicator: one line per 100,000 accepted records
    if numRecords % 100_000 == 0:
        print(numRecords)

    row = [pd.to_datetime(record["startDate"]),
           device["hardware"],
           # The closing '>' of the device string sticks to whichever field
           # comes last. Strip it only when it is actually there instead of
           # always dropping the final character, which would corrupt the
           # version whenever "software" is not the last field.
           device["software"].rstrip(">"),
           pd.to_datetime(record["creationDate"]),
           float(record["value"])]

    listRecords.append(row)


# both numbers should agree: one row is appended per accepted record
print(len(listRecords))
print(numRecords)

100000
200000
300000
371449
371449


In [5]:
# Put the rows in ascending order of their first field, the sample start time.
listRecords.sort(key=lambda row: row[0])

In [6]:

# Column labels, listed in the same order as the fields of each collected row.
ColumnNames = ["sampleDT", "hardware", "software", "creationDate", "value"]

# Build the table and make the sample timestamp its index in one expression.
appleWatchHRDf = (
    pd.DataFrame(listRecords, columns=ColumnNames)
    .set_index("sampleDT")
)
appleWatchHRDf

Unnamed: 0_level_0,hardware,software,creationDate,value
sampleDT,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
2021-07-19 19:55:06-07:00,Watch6,7.6,2021-07-19 19:55:07-07:00,78.0
2021-07-19 19:55:11-07:00,Watch6,7.6,2021-07-19 19:55:12-07:00,79.0
2021-07-19 19:55:14-07:00,Watch6,7.6,2021-07-19 19:55:17-07:00,80.0
2021-07-19 19:55:21-07:00,Watch6,7.6,2021-07-19 19:55:22-07:00,82.0
2021-07-19 19:55:23-07:00,Watch6,7.6,2021-07-19 19:55:27-07:00,83.0
...,...,...,...,...
2024-08-27 18:45:33-07:00,Watch6,10.6.1,2024-08-27 18:45:37-07:00,79.0
2024-08-27 18:45:41-07:00,Watch6,10.6.1,2024-08-27 18:45:42-07:00,76.0
2024-08-27 18:45:45-07:00,Watch6,10.6.1,2024-08-27 18:45:47-07:00,76.0
2024-08-27 18:45:48-07:00,Watch6,10.6.1,2024-08-27 18:45:52-07:00,77.0


In [7]:
# Show the dtype of every column of the table (sampleDT is the index, so it is
# not part of this listing).
appleWatchHRDf.dtypes

hardware                                        object
software                                        object
creationDate    datetime64[ns, pytz.FixedOffset(-420)]
value                                          float64
dtype: object

In [8]:
# Save the indexed table to the working-data location as gzip-compressed parquet.
appleWatchHRDf.to_parquet(workingDataFile, compression="gzip")