In [2]:
import xml.etree.ElementTree as ET
import pandas as pd
import datetime as dt
import matplotlib.pyplot as plt

# Apply the "fivethirtyeight" look to every matplotlib figure drawn from here on.
plt.style.use("fivethirtyeight")

# Parse the health-data XML export and keep a handle on its root element.
tree = ET.parse('data/carterhealthdata.xml')
root = tree.getroot()

# One attribute mapping per <Record> element found anywhere below the root.
record_list = [
    record.attrib
    for record in root.iter('Record')
]

In [3]:
# One row per <Record> element, one column per XML attribute.
record_data = pd.DataFrame(record_list)

# Parse the three timestamp columns from strings into datetimes.
# NOTE(review): if the export mixes UTC offsets (e.g. across a DST change),
# recent pandas versions warn about mixed time zones unless `utc=True` is
# passed -- confirm the resulting dtype is datetime64 and not object.
for col in ['creationDate', 'startDate', 'endDate']:
    record_data[col] = pd.to_datetime(record_data[col])

# value is numeric; anything that cannot be parsed becomes NaN first.
# Some records do not measure anything, they just count occurrences, so the
# NaNs are then filled with 1.0 (= one time) to make aggregation easier.
record_data['value'] = pd.to_numeric(record_data['value'], errors='coerce').fillna(1.0)

# Shorter observation names: drop the identifier namespace strings.
# `regex=False` pins plain literal matching; the default of `regex` differs
# between pandas 1.x (True) and pandas 2.x (False).
record_data['type'] = record_data['type'].str.replace('HKQuantityTypeIdentifier', '', regex=False)
record_data['type'] = record_data['type'].str.replace('HKCategoryTypeIdentifier', '', regex=False)

# Show the last few rows as a sanity check of the cleaned frame.
record_data.tail()

Unnamed: 0,type,sourceName,sourceVersion,unit,creationDate,startDate,endDate,value,device
2584192,HeartRateVariabilitySDNN,Carter’s Apple Watch,11.1,ms,2024-10-14 23:41:23-07:00,2024-10-14 23:40:22-07:00,2024-10-14 23:41:22-07:00,43.1274,"<<HKDevice: 0x302fcbc50>, name:Apple Watch, ma..."
2584193,HeartRateVariabilitySDNN,Carter’s Apple Watch,11.1,ms,2024-10-15 03:20:06-07:00,2024-10-15 03:19:06-07:00,2024-10-15 03:20:05-07:00,17.9547,"<<HKDevice: 0x302fcbc50>, name:Apple Watch, ma..."
2584194,HeartRateVariabilitySDNN,Carter’s Apple Watch,11.1,ms,2024-10-15 07:20:09-07:00,2024-10-15 07:19:08-07:00,2024-10-15 07:20:07-07:00,29.6792,"<<HKDevice: 0x302fcbc50>, name:Apple Watch, ma..."
2584195,HeartRateVariabilitySDNN,Carter’s Apple Watch,11.1,ms,2024-10-15 11:23:25-07:00,2024-10-15 11:22:24-07:00,2024-10-15 11:23:24-07:00,49.3019,"<<HKDevice: 0x302fcbc50>, name:Apple Watch, ma..."
2584196,HeartRateVariabilitySDNN,Carter’s Apple Watch,11.1,ms,2024-10-15 15:24:30-07:00,2024-10-15 15:23:29-07:00,2024-10-15 15:24:29-07:00,33.6185,"<<HKDevice: 0x302fcbc50>, name:Apple Watch, ma..."


In [4]:
# Distinct observation types present in the data, in order of first appearance.
list_of_information = list(record_data['type'].unique())

# Only the last expression of a cell is rendered automatically, so the list
# has to be displayed explicitly; a bare name on a non-final line is a no-op.
# `display` is provided by the IPython/Jupyter kernel.
display(list_of_information)

# Total number of records (rows) in the frame.
record_data.shape[0]

2584197

In [5]:
# Persist the cleaned records as CSV; the DataFrame index is not written out.
record_data.to_csv(
    path_or_buf='data/carter_health_data.csv',
    index=False,
)