In [3]:
import os
import requests
import xml.etree.ElementTree as ET
import pandas as pd
from bs4 import BeautifulSoup

# GitHub repo where XML files are stored
GITHUB_REPO_URL = "https://github.com/Az0202/Blood-Glucose-Level-Prediction/tree/main/DATA/xml_files"
RAW_BASE_URL = "https://raw.githubusercontent.com/Az0202/Blood-Glucose-Level-Prediction/main/DATA/xml_files"

# Create a directory to store XML files
XML_DIR = "xml_data"
os.makedirs(XML_DIR, exist_ok=True)

# Get list of XML files from GitHub
def get_xml_file_list():
    response = requests.get(GITHUB_REPO_URL)
    if response.status_code != 200:
        print("Failed to retrieve XML file list.")
        return []

    soup = BeautifulSoup(response.text, "html.parser")
    xml_files = [a.text for a in soup.find_all("a") if a.text.endswith(".xml")]
    return xml_files

# Download XML files from GitHub
def download_xml_files():
    xml_files = get_xml_file_list()

    if not xml_files:
        print("No XML files found.")
        return

    for file_name in xml_files:
        file_url = f"{RAW_BASE_URL}/{file_name}"
        response = requests.get(file_url)

        if response.status_code == 200:
            file_path = os.path.join(XML_DIR, file_name)
            with open(file_path, "wb") as f:
                f.write(response.content)
            print(f"Downloaded: {file_name}")
        else:
            print(f"Failed to download: {file_name}")

# Parse XML data
def parse_xml(file_path):
    tree = ET.parse(file_path)
    root = tree.getroot()

    patient_id = root.attrib.get("id")
    weight = root.attrib.get("weight")
    insulin_type = root.attrib.get("insulin_type")

    # Extract glucose levels
    glucose_data = []
    for event in root.find("glucose_level"):
        glucose_data.append({
            "patient_id": patient_id,
            "timestamp": event.attrib["ts"],
            "glucose_level": event.attrib["value"],
            "weight": weight,
            "insulin_type": insulin_type
        })

    # Extract step counts
    step_data = []
    basis_steps = root.find("basis_steps")
    if basis_steps is not None:
        for event in basis_steps.findall("event"):
            step_data.append({
                "patient_id": patient_id,
                "timestamp": event.attrib["ts"],
                "steps": event.attrib["value"]
            })

    # Extract sleep data
    sleep_data = []
    basis_sleep = root.find("basis_sleep")
    if basis_sleep is not None:
        for event in basis_sleep.findall("event"):
            sleep_data.append({
                "patient_id": patient_id,
                "start_time": event.attrib["tbegin"],
                "end_time": event.attrib["tend"],
                "quality": event.attrib["quality"]
            })

    return glucose_data, step_data, sleep_data

# Process all XML files
def process_xml_files():
    glucose_list, step_list, sleep_list = [], [], []

    for filename in os.listdir(XML_DIR):
        if filename.endswith(".xml"):
            file_path = os.path.join(XML_DIR, filename)
            glucose_data, step_data, sleep_data = parse_xml(file_path)
            glucose_list.extend(glucose_data)
            step_list.extend(step_data)
            sleep_list.extend(sleep_data)

    # Convert lists to DataFrames
    df_glucose = pd.DataFrame(glucose_list)
    df_steps = pd.DataFrame(step_list)
    df_sleep = pd.DataFrame(sleep_list)

    # Save data to CSV
    df_glucose.to_csv("glucose_levels.csv", index=False)
    df_steps.to_csv("step_counts.csv", index=False)
    df_sleep.to_csv("sleep_data.csv", index=False)

    print("CSV files saved successfully.")

    return df_glucose, df_steps, df_sleep

# Run the script
download_xml_files()
df_glucose, df_steps, df_sleep = process_xml_files()

# Display DataFrames in Google Colab
from IPython.display import display

print("Glucose Levels Data:")
display(df_glucose.head())  # Show first few rows

print("Step Counts Data:")
display(df_steps.head())  # Show first few rows

print("Sleep Data:")
display(df_sleep.head())  # Show first few r


Downloaded: 559-ws-testing.xml
Downloaded: 559-ws-testing.xml
Downloaded: 559-ws-training.xml
Downloaded: 559-ws-training.xml
Downloaded: 563-ws-testing.xml
Downloaded: 563-ws-testing.xml
Downloaded: 563-ws-training.xml
Downloaded: 563-ws-training.xml
Downloaded: 570-ws-testing.xml
Downloaded: 570-ws-testing.xml
Downloaded: 570-ws-training.xml
Downloaded: 570-ws-training.xml
Downloaded: 575-ws-testing.xml
Downloaded: 575-ws-testing.xml
Downloaded: 575-ws-training.xml
Downloaded: 575-ws-training.xml
Downloaded: 588-ws-testing.xml
Downloaded: 588-ws-testing.xml
Downloaded: 588-ws-training.xml
Downloaded: 588-ws-training.xml
Downloaded: 591-ws-testing.xml
Downloaded: 591-ws-testing.xml
Downloaded: 591-ws-training.xml
Downloaded: 591-ws-training.xml
CSV files saved successfully.
Glucose Levels Data:


Unnamed: 0,patient_id,timestamp,glucose_level,weight,insulin_type
0,570,07-12-2021 16:29:00,101,99,Humalog
1,570,07-12-2021 16:34:00,100,99,Humalog
2,570,07-12-2021 16:39:00,100,99,Humalog
3,570,07-12-2021 16:44:00,99,99,Humalog
4,570,07-12-2021 16:49:00,98,99,Humalog


Step Counts Data:


Unnamed: 0,patient_id,timestamp,steps
0,570,07-12-2021 14:51:00,0
1,570,07-12-2021 14:56:00,0
2,570,07-12-2021 15:01:00,0
3,570,07-12-2021 15:06:00,0
4,570,07-12-2021 15:11:00,58


Sleep Data:


Unnamed: 0,patient_id,start_time,end_time,quality
0,570,07-12-2021 22:57:00,07-12-2021 22:59:00,89
1,570,07-12-2021 22:59:00,07-12-2021 23:11:00,89
2,570,07-12-2021 23:11:00,07-12-2021 23:14:00,89
3,570,07-12-2021 23:14:00,07-12-2021 23:21:00,89
4,570,07-12-2021 23:21:00,07-12-2021 23:24:00,89
