In [None]:
import os
import re
import glob
import csv
from bs4 import BeautifulSoup
from datetime import datetime, timedelta

In [51]:
def load_and_parse_html_files(directory):
    """
    Load and parse (using BeautifulSoup) all HTML files in the specified directory.

    Files that cannot be read or parsed are reported on stdout and skipped. Both
    returned lists only contain the files that were parsed successfully, so
    html_files[i] always belongs to soups[i].

    Args:
        directory (str): The path to the directory containing HTML files.
    Returns:
        tuple: A tuple containing:
            - html_files (list): Paths of the HTML files that were parsed, in sorted order.
            - soups (list): The BeautifulSoup objects for those files, in the same order.
    Raises:
        FileNotFoundError: If the directory contains no '*.html' files.
    """
    # glob returns files in arbitrary order; sort so runs are reproducible
    candidate_paths = sorted(glob.glob(os.path.join(directory, '*.html')))
    if not candidate_paths:
        raise FileNotFoundError(f"No HTML files found in directory: {directory}")

    print(f"Found {len(candidate_paths)} HTML files in directory: {directory}")

    html_files = []
    soups = []
    for file_path in candidate_paths:
        try:
            with open(file_path, 'r', encoding='utf-8') as file:
                content = file.read()
            soup = BeautifulSoup(content, 'html.parser')
        except Exception as e:
            # Best effort: report the failure and continue with the remaining files
            print(f"Error processing file {file_path}: {e}")
            continue

        # Append the path and the soup together so both lists stay aligned
        html_files.append(file_path)
        soups.append(soup)
        print(f"Processed file: {file_path}")

    return html_files, soups

In [52]:
def find_begin_and_end_timestamps(soup):
    """
    Find the first and last timestamps of the graph (see README.md) in the HTML soup object.

    Args:
        soup: The BeautifulSoup object containing the HTML content.
    Returns:
        list: A two-element list [start, end] of datetime.datetime objects. The end is
        moved to the next day when its time of day is earlier than the start's.
    Raises:
        ValueError: If the date picker element is missing, if the page does not contain
            exactly two time elements, or if a date/time text cannot be parsed.
    """
    # The start day is the text of the last <span> inside the date picker container
    date_container = soup.find("span", class_="NavigationComponent_datePickerBoxContainer__q-LPO")
    date_spans = date_container.find_all("span") if date_container is not None else []
    if not date_spans:
        raise ValueError("Could not find the date picker element that holds the start day.")
    # Convert the start day text (e.g. Apr 9, 2022) to a datetime object
    start_day = datetime.strptime(date_spans[-1].text.strip(), "%b %d, %Y")

    # Example of div with start and end time: <div class="    flexboxgrid_colXs__2BUM1      flexboxgrid_collapse__v6Hdm  "><span>12:33 AM</span><span class="SleepTimeEditors_pencilIcon__ejpUi"><i class="icon-pencil undefined" style="font-size: 14px;"></i></span></div>
    time_texts = [div.text.strip() for div in soup.find_all("div", class_="flexboxgrid_colXs__2BUM1")]

    # Validate before indexing, so a malformed page yields this error instead of an IndexError
    if len(time_texts) != 2:
        raise ValueError(f"Found {len(time_texts)} time elements, expected exactly two (i.e. start and stop time).")

    # Convert the text strings (e.g. "12:33 AM") to datetime.time objects
    time_elements = [datetime.strptime(time_text, "%I:%M %p").time() for time_text in time_texts]

    # Combine the start day with the times to create complete datetime.datetime objects
    datetimes = [datetime.combine(start_day, time_element) for time_element in time_elements]

    # An end that falls before the start means the sleep crossed midnight (e.g. 22:00 -> 07:00),
    # so the end belongs to the next day. This includes the evening-start/morning-end case.
    if datetimes[1] < datetimes[0]:
        datetimes[1] = datetimes[1] + timedelta(days=1)

    return datetimes

# Test the functions thus far: parse every HTML file in the 'html' directory and
# print the detected start and end time of day for each one.
# The file paths returned by the loader are not needed here, hence the `_`.
_, soups = load_and_parse_html_files('html')
for soup in soups:
    start_time, end_time = find_begin_and_end_timestamps(soup)
    print(f"Start time: {start_time.time()}, End time: {end_time.time()}")

Found 3 HTML files in directory: html
Processed file: html\09_04_2025.html
Processed file: html\10_04_2025.html
Processed file: html\13_04_2025.html
Start time: 23:51:00, End time: 08:24:00
Start time: 00:33:00, End time: 08:24:00
Start time: 22:29:00, End time: 07:42:00


In [53]:
def get_timestamps_for_each_sleepcycle(soup, start_time, end_time):
    """
    Get the timestamps for each sleep cycle from the parsed HTML soup.

    This is done by assessing the relative x positions of the coloured rectangles in the
    graph: an x coordinate of 0 maps to start_time and the full graph width maps to end_time.

    Args:
        soup: The BeautifulSoup object containing the HTML content.
        start_time (datetime.datetime): The start of the sleep.
        end_time (datetime.datetime): The end of the sleep.
    Returns:
        list: A list of [start, label] lists, one per rectangle in document order, where start
        is a datetime.datetime and label the sleep cycle label. A final
        [end_time, "End of Sleep"] entry closes the list.
    Raises:
        ValueError: If the plot background, the rectangles, or a rectangle's <path> cannot be
            found, or if a rectangle has a fill colour with no known sleep label.
    """
    # The width of the graph is used to calculate the relative positions of the sleep cycles (as it is equal to the relative end time).
    # A <rect> element with class="highcharts-plot-background" and fill="none" holds the width of the graph in a width attribute.
    plot_background = soup.find("rect", class_="highcharts-plot-background", fill="none")
    if plot_background is None:
        raise ValueError("No plot background found in the graph. Please check the HTML structure or class names.")
    graph_width = float(plot_background.get("width"))

    # The rectangles of a graph are <g> elements, each with multiple <path> elements inside.
    # Finding those <g> elements using class names such as: ["highcharts-series highcharts-series-0 highcharts-area-series", "highcharts-series highcharts-series-1 highcharts-area-series"]
    pattern = re.compile(r"highcharts-series highcharts-series-\d+ highcharts-area-series")
    rectangles = soup.find_all("g", class_=pattern)
    if not rectangles:
        raise ValueError("No rectangles found in the graph. Please check the HTML structure or class names.")

    # The colour (fill) of the first <path> child of each <g> indicates the type of sleep cycle.
    sleep_labels = {
        "#3b97f3": "Light Sleep",
        "#1976d2": "Deep Sleep",
        "#d42fc2": "REM Sleep",
        "#e55ecb": "Awake",
        "transparent": "Unmeasurable"
    }

    sleep_duration = end_time - start_time  # Absolute time difference between start and end time

    sleep_cycles = []
    for rectangle in rectangles:
        path = rectangle.find("path")  # Contains the coordinates and fill colour of the rectangle
        if path is None:
            raise ValueError("Found a rectangle without a <path> element. Please check the HTML structure.")

        # Hex colours are case-insensitive, so normalise before looking up the label
        fill = (path.get("fill") or "").strip().lower()
        if fill not in sleep_labels:
            raise ValueError(f"Unknown fill colour {path.get('fill')!r}, cannot determine the sleep cycle label.")
        sleep_cycle_label = sleep_labels[fill]

        # In the "d" attribute, the number after the first "L" token is the starting x coordinate
        # of the rectangle (and thus the relative start time of that sleep cycle).
        tokens = path.get("d").split()
        start_coordinate = float(tokens[tokens.index('L') + 1])

        # Normalise the coordinate (0 to 1) using the graph width, then scale the sleep duration by it
        relative_start = start_coordinate / graph_width
        sleep_cycles.append([start_time + sleep_duration * relative_start, sleep_cycle_label])

    sleep_cycles.append([end_time, "End of Sleep"])  # Append the end time of the sleep

    return sleep_cycles

# Test the sleep cycle extraction on a single file.
# NOTE: this cell reads `soups`, which it does not define itself; it must already be
# bound by a previously executed cell.
for soup in soups[:1]: # Limit to first soup for testing
    start_time, end_time = find_begin_and_end_timestamps(soup)
    print(f"Start time: {start_time}, End time: {end_time}, Time delta: {end_time - start_time}")
    sleep_cycles = get_timestamps_for_each_sleepcycle(soup, start_time, end_time)

    # Print one [start, label] entry per line
    for sleep_cycle in sleep_cycles:
        print(sleep_cycle)



Start time: 2022-04-09 23:51:00, End time: 2022-04-10 08:24:00, Time delta: 8:33:00
[datetime.datetime(2022, 4, 9, 23, 51), 'Light Sleep']
[datetime.datetime(2022, 4, 9, 23, 59), 'Deep Sleep']
[datetime.datetime(2022, 4, 10, 0, 45), 'Light Sleep']
[datetime.datetime(2022, 4, 10, 0, 58), 'Deep Sleep']
[datetime.datetime(2022, 4, 10, 1, 15), 'Light Sleep']
[datetime.datetime(2022, 4, 10, 1, 24), 'Deep Sleep']
[datetime.datetime(2022, 4, 10, 1, 32), 'Light Sleep']
[datetime.datetime(2022, 4, 10, 1, 49), 'REM Sleep']
[datetime.datetime(2022, 4, 10, 2, 8), 'Light Sleep']
[datetime.datetime(2022, 4, 10, 2, 22), 'REM Sleep']
[datetime.datetime(2022, 4, 10, 2, 23), 'Light Sleep']
[datetime.datetime(2022, 4, 10, 2, 24), 'REM Sleep']
[datetime.datetime(2022, 4, 10, 2, 25), 'Light Sleep']
[datetime.datetime(2022, 4, 10, 3, 49), 'Deep Sleep']
[datetime.datetime(2022, 4, 10, 4, 1), 'Light Sleep']
[datetime.datetime(2022, 4, 10, 4, 16), 'REM Sleep']
[datetime.datetime(2022, 4, 10, 5, 10), 'Light Sle

In [54]:
# Save the sleep cycles to a CSV file
def save_sleep_cycles_to_csv(sleep_cycles, output_file, directory='csv'):
    """
    Write the sleep cycles to a CSV file with a 'Start Time,Sleep Cycle' header.

    Args:
        sleep_cycles (list): Sequences whose first item is the start timestamp of a sleep
            cycle and whose second item is its label; only these two items are written.
        output_file (str): Name of the CSV file, joined onto `directory`.
        directory (str): The directory where the CSV file will be saved (default is 'csv').
            It is created when it does not exist yet.
    """
    file_path = os.path.join(directory, output_file)
    os.makedirs(directory, exist_ok=True)  # The target directory may not exist yet

    header = ['Start Time', 'Sleep Cycle']
    with open(file_path, 'w', newline='', encoding='utf-8') as csvfile:
        writer = csv.writer(csvfile)
        writer.writerow(header)
        # One row per cycle: start timestamp first, label second
        writer.writerows([entry[0], entry[1]] for entry in sleep_cycles)

    print(f"Saved to {file_path}")

In [55]:
# The full pipeline: parse every HTML file in 'html', extract the sleep cycles of each
# night and save them to a CSV file named after the source HTML file.
file_names, soups = load_and_parse_html_files('html')
for soup, file_name in zip(soups, file_names):
    start_time, end_time = find_begin_and_end_timestamps(soup)
    print(f"Start time: {start_time}, End time: {end_time}, Time delta: {end_time - start_time}")
    sleep_cycles = get_timestamps_for_each_sleepcycle(soup, start_time, end_time)
    # splitext swaps only the trailing extension (whatever its letter case); str.replace would
    # also rewrite a '.html' occurring earlier in the name and leave e.g. '.HTML' untouched
    save_sleep_cycles_to_csv(sleep_cycles, os.path.splitext(os.path.basename(file_name))[0] + '.csv')


Found 3 HTML files in directory: html
Processed file: html\09_04_2025.html
Processed file: html\10_04_2025.html
Processed file: html\13_04_2025.html
Start time: 2022-04-09 23:51:00, End time: 2022-04-10 08:24:00, Time delta: 8:33:00
Saved to csv\09_04_2025.csv
Start time: 2022-04-10 00:33:00, End time: 2022-04-10 08:24:00, Time delta: 7:51:00
Saved to csv\10_04_2025.csv
Start time: 2022-04-13 22:29:00, End time: 2022-04-14 07:42:00, Time delta: 9:13:00
Saved to csv\13_04_2025.csv
