In [None]:
# Extract and display all unique data types to understand the structure
import xml.etree.ElementTree as ET

# Location of the health export to inspect
input_file = "health-data.xml"

# Load the whole document and keep a handle on its root element
tree = ET.parse(input_file)
root = tree.getroot()

# Gather the distinct "type" attribute of every <Record> directly under the
# root; a record without that attribute contributes the empty string.
unique_types = {rec.get("type", "") for rec in root.findall("Record")}

# List the types alphabetically, one per line
print("Unique data types found in the XML file:")
for type_name in sorted(unique_types):
    print(type_name)


In [9]:
import xml.etree.ElementTree as ET
from collections import defaultdict
import os

# Source export and the root folder that receives the grouped output
input_file = "health-data.xml"
output_dir = "cleaned_grouped_health_data/"

# Category name -> list of record "type" values assigned to that category.
# Records whose type is not listed here are not picked up by the grouping step.
categories_mapping = dict(
    Headphone_Audio=[
        "HKCategoryTypeIdentifierHeadphoneAudioExposureEvent",
        "HKQuantityTypeIdentifierHeadphoneAudioExposure",
    ],
    Walking_Metrics=[
        "HKQuantityTypeIdentifierAppleWalkingSteadiness",
        "HKQuantityTypeIdentifierWalkingAsymmetryPercentage",
        "HKQuantityTypeIdentifierWalkingDoubleSupportPercentage",
        "HKQuantityTypeIdentifierWalkingSpeed",
        "HKQuantityTypeIdentifierWalkingStepLength",
    ],
    Physical_Activity=[
        "HKQuantityTypeIdentifierActiveEnergyBurned",
        "HKQuantityTypeIdentifierBasalEnergyBurned",
        "HKQuantityTypeIdentifierStepCount",
        "HKQuantityTypeIdentifierDistanceWalkingRunning",
        "HKQuantityTypeIdentifierFlightsClimbed",
    ],
    Sleep_Metrics=[
        "HKCategoryTypeIdentifierSleepAnalysis",
        "HKDataTypeSleepDurationGoal",
    ],
    Body_Stats=[
        "HKQuantityTypeIdentifierBodyMass",
        "HKQuantityTypeIdentifierHeight",
    ],
)

# Make sure the output root exists (no error if it is already there)
os.makedirs(output_dir, exist_ok=True)

# Parse the XML file (the whole document is loaded into memory)
tree = ET.parse(input_file)
root = tree.getroot()

# Attributes stripped from every record that is kept
unwanted_attributes = ("type", "sourceName", "sourceVersion", "device")

# Reverse index built once: record type -> categories that list it.
# This replaces a scan of every category list for every single record with
# one dictionary lookup per record. A type listed under several categories
# still lands in each of them, in categories_mapping order.
type_to_categories = {}
for main_category, subcategories in categories_mapping.items():
    for subcategory in subcategories:
        owners = type_to_categories.setdefault(subcategory, [])
        if main_category not in owners:
            owners.append(main_category)

# Extract records and group them as grouped_data[category][record type] -> [records]
grouped_data = defaultdict(lambda: defaultdict(list))

for record in root.findall("Record"):
    record_type = record.attrib.get("type", "")
    matching_categories = type_to_categories.get(record_type)
    if not matching_categories:
        # Type is not mapped to any category: leave the record out
        continue

    # Remove unwanted attributes. This edits the parsed element in place, so
    # the records reachable from `root` lose these attributes as well.
    for attr in unwanted_attributes:
        record.attrib.pop(attr, None)

    for main_category in matching_categories:
        grouped_data[main_category][record_type].append(record)

# Write one XML file per record type, placed in a folder named after its category
for main_category, subcategories in grouped_data.items():
    main_category_dir = os.path.join(output_dir, main_category)
    os.makedirs(main_category_dir, exist_ok=True)  # category folder, created on demand

    for subcategory, records in subcategories.items():
        subcategory_file = os.path.join(main_category_dir, f"{subcategory}.xml")

        # Fresh <HealthData> root that holds only this record type's elements
        subcategory_root = ET.Element("HealthData")
        subcategory_root.extend(records)

        # Serialize as UTF-8 with an XML declaration at the top of the file
        ET.ElementTree(subcategory_root).write(
            subcategory_file, encoding="utf-8", xml_declaration=True
        )
        print(f"Saved cleaned subcategory: {subcategory} to {subcategory_file}")

# Recap of what was grouped: each category followed by its record types
print("Data organized and cleaned into the following structure:")
for main_category, subcategories in grouped_data.items():
    print(f"- {main_category}:")
    # Iterating the inner mapping yields its keys, i.e. the record type names
    for subcategory in subcategories:
        print(f"  - {subcategory}")


Saved cleaned subcategory: HKQuantityTypeIdentifierHeight to cleaned_grouped_health_data/Body_Stats/HKQuantityTypeIdentifierHeight.xml
Saved cleaned subcategory: HKQuantityTypeIdentifierBodyMass to cleaned_grouped_health_data/Body_Stats/HKQuantityTypeIdentifierBodyMass.xml
Saved cleaned subcategory: HKQuantityTypeIdentifierStepCount to cleaned_grouped_health_data/Physical_Activity/HKQuantityTypeIdentifierStepCount.xml
Saved cleaned subcategory: HKQuantityTypeIdentifierDistanceWalkingRunning to cleaned_grouped_health_data/Physical_Activity/HKQuantityTypeIdentifierDistanceWalkingRunning.xml
Saved cleaned subcategory: HKQuantityTypeIdentifierBasalEnergyBurned to cleaned_grouped_health_data/Physical_Activity/HKQuantityTypeIdentifierBasalEnergyBurned.xml
Saved cleaned subcategory: HKQuantityTypeIdentifierActiveEnergyBurned to cleaned_grouped_health_data/Physical_Activity/HKQuantityTypeIdentifierActiveEnergyBurned.xml
Saved cleaned subcategory: HKQuantityTypeIdentifierFlightsClimbed to clean