In [1]:
import xml.etree.ElementTree as ET
import pandas as pd
from datetime import datetime
import json

# Function to process XML, calculate daily averages, and split into high/low exposure JSON
def process_and_save_json(file_path, threshold, last_x_days,
                          low_json_path=None, high_json_path=None,
                          min_daily_average=20):
    """Average the records of an XML export per day and split the days into two JSON files.

    Every ``Record`` element under the XML root contributes its ``value``
    attribute (0 when absent) to the calendar day taken from the first
    whitespace-separated token of its ``creationDate`` attribute.  Days whose
    mean is not above ``min_daily_average`` are discarded; the remaining days
    go to the "low" file when their mean is ``<= threshold`` and to the "high"
    file otherwise.

    Args:
        file_path: Path of the XML file to parse.
        threshold: Daily mean above which a day counts as high exposure.
        last_x_days: How many of the most recent entries to keep in *each*
            output file (the limit applies per file, not per calendar
            window).  ``None`` keeps everything; ``0`` or a negative number
            keeps nothing.
        low_json_path: Destination of the low-exposure file.  When omitted,
            a module-level ``low_json_path`` is used if one exists (this is
            how the function historically found its output path), otherwise
            ``'LowExposureDailyAverages.json'``.
        high_json_path: Same as ``low_json_path`` for the high-exposure file
            (fallback ``'HighExposureDailyAverages.json'``).
        min_daily_average: Days with a mean at or below this value are
            dropped before the split.
            NOTE(review): presumably a noise floor for the measurement;
            confirm the intended meaning of 20.

    Returns:
        Tuple ``(low_json_path, high_json_path)`` of the files written.
    """
    # Backward compatibility: earlier versions read the output paths from
    # module-level variables, so honour them when no explicit path is given.
    if low_json_path is None:
        low_json_path = globals().get('low_json_path', 'LowExposureDailyAverages.json')
    if high_json_path is None:
        high_json_path = globals().get('high_json_path', 'HighExposureDailyAverages.json')

    # Parse the XML file
    root = ET.parse(file_path).getroot()

    # Extract (day, value) pairs
    data = []
    for record in root.findall('Record'):
        creation_date = record.get('creationDate')
        if not creation_date:
            # A record without a timestamp cannot be assigned to a day.
            continue
        value = float(record.get('value', 0))
        date_only = datetime.strptime(creation_date.split()[0], "%Y-%m-%d").date()
        data.append((date_only, value))

    if data:
        df = pd.DataFrame(data, columns=['Date', 'Value'])

        # groupby sorts by date, so the tail of each list is the most recent day
        daily_averages = df.groupby('Date')['Value'].mean().reset_index()
        daily_averages.columns = ['Date', 'DailyAverage']

        # Convert Date column to string for JSON serialization
        daily_averages['Date'] = daily_averages['Date'].astype(str)

        valid_days = daily_averages[daily_averages['DailyAverage'] > min_daily_average]
        low_records = valid_days[valid_days['DailyAverage'] <= threshold].to_dict(orient='records')
        high_records = valid_days[valid_days['DailyAverage'] > threshold].to_dict(orient='records')
    else:
        # Nothing usable in the file: still produce two valid (empty) JSON files.
        low_records, high_records = [], []

    def _most_recent(records):
        # records[-0:] would return the whole list, so zero needs its own branch.
        if last_x_days is None:
            return records
        if last_x_days <= 0:
            return []
        return records[-last_x_days:]

    with open(low_json_path, 'w', encoding='utf-8') as low_file:
        json.dump(_most_recent(low_records), low_file, indent=4)

    with open(high_json_path, 'w', encoding='utf-8') as high_file:
        json.dump(_most_recent(high_records), high_file, indent=4)

    return low_json_path, high_json_path

# Define the XML file path and threshold
threshold = 80.0  # Daily averages above this go to the high-exposure file, the rest to the low-exposure file
last_x_days = 30  # Number of most recent entries kept in each output file

file_path = 'cleaned_grouped_health_data/Headphone_Audio/HKQuantityTypeIdentifierHeadphoneAudioExposure.xml'

# Output locations. process_and_save_json looks these two names up at module
# level, so they must be assigned before the call below.
low_json_path = 'LowExposureDailyAverages.json'   
high_json_path = 'HighExposureDailyAverages.json'

# Process the file and save to JSON (the function returns the two paths it wrote)
low_json_path, high_json_path = process_and_save_json(file_path, threshold, last_x_days)

print("Low Exposure JSON File:", low_json_path)
print("High Exposure JSON File:", high_json_path)


Low Exposure JSON File: LowExposureDailyAverages.json
High Exposure JSON File: HighExposureDailyAverages.json


In [2]:
# Define file paths for the active-energy step.
# Note: input_file_path / output_json_path are reassigned by later cells,
# so the cells must be run in order for each step to see its own paths.
input_file_path = 'cleaned_grouped_health_data/Physical_Activity/HKQuantityTypeIdentifierActiveEnergyBurned.xml'
output_json_path = 'ActiveEnergyBurnedDaily.json'

In [3]:
import xml.etree.ElementTree as ET
import pandas as pd
from datetime import datetime
import json

# Function to process active energy burned data and save per-day sums to JSON
def process_active_energy_burned(file_path, output_json_path):
    """Sum the ``value`` of every ``Record`` per day and write the result as JSON.

    The day of a record is the first whitespace-separated token of its
    ``creationDate`` attribute; a missing ``value`` counts as 0.  Although the
    output key is named ``DailyAverage``, the number stored under it is the
    per-day *sum* of the record values.

    Args:
        file_path: Path of the XML file to parse.
        output_json_path: Path of the JSON file to write.

    Returns:
        ``output_json_path``, unchanged.
    """
    def _to_row(rec):
        # Same evaluation order as a plain loop: read date text, parse value, parse date.
        stamp = rec.get('creationDate')
        amount = float(rec.get('value', 0))
        day = datetime.strptime(stamp.split()[0], "%Y-%m-%d").date()
        return (day, amount)

    xml_root = ET.parse(file_path).getroot()
    rows = [_to_row(rec) for rec in xml_root.findall('Record')]

    # One row per day, holding the sum of that day's values
    per_day = (
        pd.DataFrame(rows, columns=['Date', 'Value'])
        .groupby('Date')['Value']
        .sum()
        .rename('DailyAverage')
        .reset_index()
    )

    # Dates become plain strings so json can serialize them
    per_day['Date'] = per_day['Date'].astype(str)
    payload = per_day.to_dict(orient='records')

    with open(output_json_path, 'w', encoding='utf-8') as sink:
        json.dump(payload, sink, indent=4)

    return output_json_path

# Process the file and save to JSON.
# Relies on input_file_path / output_json_path as set by the preceding cell.
result_json_path = process_active_energy_burned(input_file_path, output_json_path)

print("Active Energy Burned JSON File:", result_json_path)


Active Energy Burned JSON File: ActiveEnergyBurnedDaily.json


In [4]:
# File paths for the walking-speed step (overwrites the names used by the previous step).
input_file_path = 'cleaned_grouped_health_data/Walking_Metrics/HKQuantityTypeIdentifierWalkingSpeed.xml'
output_json_path = 'WalkingSpeedDailyAverages.json'

In [5]:
import xml.etree.ElementTree as ET
import pandas as pd
from datetime import datetime
import json

# Function to process walking speed data and calculate weighted daily averages
def process_weighted_walking_speed(file_path, output_json_path):
    """Compute a duration-weighted daily walking speed and write it as JSON.

    For every ``Record`` the duration is ``endDate - startDate`` in seconds
    and the distance is ``value * duration``.  Records are grouped by the
    calendar date of ``startDate``; the daily figure is total distance divided
    by total duration.

    Days whose total duration is not positive (for example when every sample
    has ``startDate == endDate``) cannot be weighted: the division would give
    NaN, which ``json.dump`` writes as the non-standard token ``NaN``.  For
    those days the plain mean of the day's speed values is stored instead.

    Args:
        file_path: Path of the XML file to parse.
        output_json_path: Path of the JSON file to write.

    Returns:
        ``output_json_path``, unchanged.
    """
    timestamp_format = "%Y-%m-%d %H:%M:%S %z"

    # Parse the XML file
    root = ET.parse(file_path).getroot()

    # Extract (day, speed, distance, duration) per record
    data = []
    for record in root.findall('Record'):
        start_date = datetime.strptime(record.get('startDate'), timestamp_format)
        end_date = datetime.strptime(record.get('endDate'), timestamp_format)
        speed = float(record.get('value', 0))
        duration = (end_date - start_date).total_seconds()  # Duration in seconds
        distance = speed * duration  # Distance = Speed * Time
        data.append((start_date.date(), speed, distance, duration))

    if data:
        df = pd.DataFrame(data, columns=['Date', 'Speed', 'Distance', 'Duration'])
        per_day = df.groupby('Date')

        # Total distance and total duration for each day
        daily_totals = per_day[['Distance', 'Duration']].sum()
        daily_totals['MeanSpeed'] = per_day['Speed'].mean()

        # Weighted average where it is defined, unweighted mean otherwise
        has_duration = daily_totals['Duration'] > 0
        weighted = daily_totals['Distance'] / daily_totals['Duration']
        daily_totals['DailyWeightedAverageSpeed'] = weighted.where(
            has_duration, daily_totals['MeanSpeed']
        )

        daily_totals = daily_totals.reset_index()

        # Convert Date column to string for JSON serialization
        daily_totals['Date'] = daily_totals['Date'].astype(str)

        records = daily_totals[['Date', 'DailyWeightedAverageSpeed']].to_dict(orient='records')
    else:
        # No records at all: still emit a valid (empty) JSON list.
        records = []

    # Save results to a JSON file
    with open(output_json_path, 'w', encoding='utf-8') as json_file:
        json.dump(records, json_file, indent=4)

    return output_json_path



# Process the file and save to JSON.
# Relies on input_file_path / output_json_path as set by the preceding cell.
result_json_path = process_weighted_walking_speed(input_file_path, output_json_path)

print("Walking Speed Weighted Average JSON File:", result_json_path)


Walking Speed Weighted Average JSON File: WalkingSpeedDailyAverages.json


In [6]:
# Define file paths for the step-count step (overwrites the names used by the previous step).
input_file_path = 'cleaned_grouped_health_data/Physical_Activity/HKQuantityTypeIdentifierStepCount.xml'
output_json_path = 'DailyStepCount.json'

In [7]:
import xml.etree.ElementTree as ET
import pandas as pd
from datetime import datetime
import json

# Function to process step count data and save daily totals to JSON
def process_daily_step_count(file_path, output_json_path):
    """Add up the step counts of an XML export per day and write them as JSON.

    Each ``Record`` contributes ``int(value)`` (0 when the attribute is
    absent) to the calendar date of its ``startDate``.  The output is a list
    of ``{"Date": ..., "TotalSteps": ...}`` objects ordered by date.

    Args:
        file_path: Path of the XML file to parse.
        output_json_path: Path of the JSON file to write.

    Returns:
        ``output_json_path``, unchanged.
    """
    stamp_format = "%Y-%m-%d %H:%M:%S %z"
    xml_root = ET.parse(file_path).getroot()

    # (day, steps) for every record in document order
    rows = [
        (
            datetime.strptime(rec.get('startDate'), stamp_format).date(),
            int(rec.get('value', 0)),
        )
        for rec in xml_root.findall('Record')
    ]

    # Per-day totals
    per_day = (
        pd.DataFrame(rows, columns=['Date', 'StepCount'])
        .groupby('Date')['StepCount']
        .sum()
        .rename('TotalSteps')
        .reset_index()
    )

    # Dates become plain strings so json can serialize them
    per_day['Date'] = per_day['Date'].astype(str)
    payload = per_day.to_dict(orient='records')

    with open(output_json_path, 'w', encoding='utf-8') as sink:
        json.dump(payload, sink, indent=4)

    return output_json_path



# Process the file and save to JSON.
# Relies on input_file_path / output_json_path as set by the preceding cell.
result_json_path = process_daily_step_count(input_file_path, output_json_path)

print("Daily Step Count JSON File:", result_json_path)


Daily Step Count JSON File: DailyStepCount.json


In [8]:
# Define file paths for the basal-energy step (overwrites the names used by the previous step).
input_file_path = 'cleaned_grouped_health_data/Physical_Activity/HKQuantityTypeIdentifierBasalEnergyBurned.xml'
output_json_path = 'BasalEnergyBurnedDaily.json'

In [9]:
import xml.etree.ElementTree as ET
import pandas as pd
from datetime import datetime
import json

# Function to process basal energy burned data and save daily totals to JSON
def process_basal_energy_burned(file_path, output_json_path):
    """Aggregate basal-energy records per day and write the totals as JSON.

    For each ``Record`` the contribution is ``value`` multiplied by the
    record's length in hours (``endDate - startDate``); contributions are
    summed per calendar date of ``startDate`` and stored under the key
    ``TotalBasalEnergyBurned``.

    NOTE(review): multiplying by the duration treats ``value`` as a per-hour
    rate.  HealthKit documents basal energy as an energy quantity, and the
    active-energy step in this notebook sums ``value`` directly, so please
    confirm the per-hour interpretation is intended for this data.

    Args:
        file_path: Path of the XML file to parse.
        output_json_path: Path of the JSON file to write.

    Returns:
        ``output_json_path``, unchanged.
    """
    stamp_format = "%Y-%m-%d %H:%M:%S %z"

    def _to_row(rec):
        # Order matters for which error surfaces first: start, end, then value.
        began = datetime.strptime(rec.get('startDate'), stamp_format)
        ended = datetime.strptime(rec.get('endDate'), stamp_format)
        rate = float(rec.get('value', 0))
        hours = (ended - began).total_seconds() / 3600.0
        return (began.date(), rate * hours)

    xml_root = ET.parse(file_path).getroot()
    rows = [_to_row(rec) for rec in xml_root.findall('Record')]

    frame = pd.DataFrame(rows, columns=['Date', 'TotalEnergy'])

    # Per-day totals, then rename the value column for the output schema
    per_day = frame.groupby('Date').sum().reset_index()
    per_day.columns = ['Date', 'TotalBasalEnergyBurned']

    # Dates become plain strings so json can serialize them
    per_day['Date'] = per_day['Date'].astype(str)
    payload = per_day.to_dict(orient='records')

    with open(output_json_path, 'w', encoding='utf-8') as sink:
        json.dump(payload, sink, indent=4)

    return output_json_path



# Process the file and save to JSON.
# Relies on input_file_path / output_json_path as set by the preceding cell.
result_json_path = process_basal_energy_burned(input_file_path, output_json_path)

print("Basal Energy Burned JSON File:", result_json_path)


Basal Energy Burned JSON File: BasalEnergyBurnedDaily.json


In [10]:
import json
from datetime import datetime
from statistics import mean

# Load JSON data
def load_json(file_path):
    """Read the JSON document stored at ``file_path`` and return the decoded object.

    The file is opened as UTF-8 explicitly, matching how the per-metric JSON
    files in this notebook are written, instead of depending on the platform's
    default text encoding.
    """
    with open(file_path, 'r', encoding='utf-8') as f:
        return json.load(f)

# Calculate mean for a given key in a dataset
def calculate_mean(data, key):
    """Return the mean of ``entry[key]`` over the entries of ``data``.

    Entries that lack ``key`` or hold ``None`` under it are ignored.  When no
    usable value remains, ``None`` is returned instead of letting
    ``statistics.mean`` raise ``StatisticsError``, so one empty dataset does
    not abort a merge that may never need its mean.
    """
    values = [entry[key] for entry in data if key in entry and entry[key] is not None]
    return mean(values) if values else None

# Merge data based on exposure type
def merge_data(exposure_data, other_data, exposure_label):
    """Attach the four activity metrics to every exposure record, matched by date.

    Args:
        exposure_data: List of dicts with keys ``"Date"`` and ``"DailyAverage"``.
        other_data: Dict with the keys ``"step_count"``, ``"active_energy"``,
            ``"basal_energy"`` and ``"walking_speed"``, each a list of dicts
            carrying ``"Date"`` plus that dataset's value key.
        exposure_label: Accepted for call compatibility; not used here.

    Returns:
        One dict per exposure record with the keys ``"date"``, ``"exposure"``,
        ``"step count"``, ``"active energy burned"``, ``"basal energy burned"``
        and ``"walking speed"``.  A metric with no record for the date is
        filled with the mean of that metric over its whole dataset.
    """
    # (output key, dataset key in other_data, value key inside each record)
    metrics = (
        ("step count", "step_count", "TotalSteps"),
        ("active energy burned", "active_energy", "DailyAverage"),
        ("basal energy burned", "basal_energy", "TotalBasalEnergyBurned"),
        ("walking speed", "walking_speed", "DailyWeightedAverageSpeed"),
    )

    # Calculate mean values for missing data
    mean_values = {
        label: calculate_mean(other_data[dataset], value_key)
        for label, dataset, value_key in metrics
    }

    # Index every dataset by date once, instead of scanning it for each
    # exposure record.  setdefault keeps the first record of a date, which is
    # the one a front-to-back scan would have found.
    records_by_date = {}
    for label, dataset, _ in metrics:
        index = {}
        for record in other_data[dataset]:
            index.setdefault(record["Date"], record)
        records_by_date[label] = index

    merged_data = []
    for entry in exposure_data:
        date = entry["Date"]
        row = {"date": date, "exposure": entry["DailyAverage"]}
        for label, _, value_key in metrics:
            match = records_by_date[label].get(date)
            row[label] = mean_values[label] if match is None else match[value_key]
        merged_data.append(row)

    return merged_data

# Main function
def main():
    """Load the per-metric JSON files, merge them onto both exposure lists and save the result.

    Reads the low/high exposure files and the four activity files from the
    working directory, merges each exposure list with ``merge_data`` and
    writes ``MergedExposureData.json`` with the keys ``"low_exposure"`` and
    ``"high_exposure"``.
    """
    # Exposure lists first, then the activity datasets
    low_exposure = load_json("LowExposureDailyAverages.json")
    high_exposure = load_json("HighExposureDailyAverages.json")

    metric_sources = (
        ("step_count", "DailyStepCount.json"),
        ("active_energy", "ActiveEnergyBurnedDaily.json"),
        ("basal_energy", "BasalEnergyBurnedDaily.json"),
        ("walking_speed", "WalkingSpeedDailyAverages.json"),
    )
    other_data = {}
    for dataset_name, json_name in metric_sources:
        other_data[dataset_name] = load_json(json_name)

    # Merge low before high, then bundle both under one object
    combined_data = {
        "low_exposure": merge_data(low_exposure, other_data, "low"),
        "high_exposure": merge_data(high_exposure, other_data, "high"),
    }

    with open("MergedExposureData.json", "w") as out_file:
        json.dump(combined_data, out_file, indent=4)
        print("Exposure dates are merged.")

# Run the merge when this code is executed directly, which is the case when the cell runs.
if __name__ == "__main__":
    main()


Exposure dates are merged.


In [11]:
import json
from datetime import datetime
from statistics import mean

# Load JSON data
def load_json(file_path):
    """Decode and return the JSON content of ``file_path``.

    UTF-8 is requested explicitly so reading does not depend on the
    platform's default text encoding; the per-metric JSON files in this
    notebook are written as UTF-8.
    """
    with open(file_path, 'r', encoding='utf-8') as f:
        return json.load(f)

# Calculate mean for a given key in a dataset
def calculate_mean(data, key):
    """Average the values stored under ``key`` across ``data``.

    Only entries that contain ``key`` with a non-``None`` value take part.
    If none do, the function returns ``None`` rather than propagating the
    ``StatisticsError`` that ``statistics.mean`` raises for empty input.
    """
    usable = [entry[key] for entry in data if key in entry and entry[key] is not None]
    if not usable:
        return None
    return mean(usable)

# Merge data based on exposure type
def merge_data(exposure_data, other_data, exposure_label):
    """Join the activity metrics onto each exposure record and flag imputed values.

    Args:
        exposure_data: List of dicts with keys ``"Date"`` and ``"DailyAverage"``.
        other_data: Dict with the keys ``"step_count"``, ``"active_energy"``,
            ``"basal_energy"`` and ``"walking_speed"``, each a list of dicts
            carrying ``"Date"`` plus that dataset's value key.
        exposure_label: Accepted for call compatibility; not used here.

    Returns:
        One dict per exposure record.  Besides ``"date"`` and ``"exposure"``
        it holds, for each metric, the value under ``"<metric>"`` and a
        boolean under ``"<metric> imputed"``.  The flag is ``True`` when no
        record existed for that date and the dataset-wide mean was used.
    """
    # (output key, dataset key in other_data, value key inside each record)
    metrics = (
        ("step count", "step_count", "TotalSteps"),
        ("active energy burned", "active_energy", "DailyAverage"),
        ("basal energy burned", "basal_energy", "TotalBasalEnergyBurned"),
        ("walking speed", "walking_speed", "DailyWeightedAverageSpeed"),
    )

    # Calculate mean values for missing data
    mean_values = {
        label: calculate_mean(other_data[dataset], value_key)
        for label, dataset, value_key in metrics
    }

    # Build one date -> record index per dataset so each lookup is a dict
    # access rather than a scan of the whole list.  setdefault keeps the first
    # record of a date, matching what a front-to-back search returns.
    records_by_date = {}
    for label, dataset, _ in metrics:
        index = {}
        for record in other_data[dataset]:
            index.setdefault(record["Date"], record)
        records_by_date[label] = index

    merged_data = []
    for entry in exposure_data:
        date = entry["Date"]
        row = {"date": date, "exposure": entry["DailyAverage"]}
        for label, _, value_key in metrics:
            match = records_by_date[label].get(date)
            imputed = match is None
            # Value first, flag second: keeps the key order of the output stable.
            row[label] = mean_values[label] if imputed else match[value_key]
            row[label + " imputed"] = imputed
        merged_data.append(row)

    return merged_data

# Main function
def main():
    """Build ``MergedExposureData.json`` from the six per-metric JSON files.

    Loads the low/high exposure lists and the step-count, active-energy,
    basal-energy and walking-speed lists from the working directory, runs
    ``merge_data`` on each exposure list and stores both results under the
    keys ``"low_exposure"`` and ``"high_exposure"``.
    """
    # Inputs: the two exposure lists ...
    exposure_low = load_json("LowExposureDailyAverages.json")
    exposure_high = load_json("HighExposureDailyAverages.json")

    # ... and the activity datasets they are merged with
    other_data = dict(
        step_count=load_json("DailyStepCount.json"),
        active_energy=load_json("ActiveEnergyBurnedDaily.json"),
        basal_energy=load_json("BasalEnergyBurnedDaily.json"),
        walking_speed=load_json("WalkingSpeedDailyAverages.json"),
    )

    merged_low = merge_data(exposure_low, other_data, "low")
    merged_high = merge_data(exposure_high, other_data, "high")

    combined_data = {}
    combined_data["low_exposure"] = merged_low
    combined_data["high_exposure"] = merged_high

    # Write the combined document
    with open("MergedExposureData.json", "w") as out_file:
        json.dump(combined_data, out_file, indent=4)
        print("Exposure data merged with imputation flags.")

# Run the merge when this code is executed directly. This calls the main()
# defined in this cell, which replaces the one from the previous cell.
if __name__ == "__main__":
    main()


Exposure data merged with imputation flags.
