In [211]:
import os
import re
import glob
import csv
from bs4 import BeautifulSoup
from datetime import datetime, timedelta

In [None]:
def load_and_parse_html_files(directory):
    """
    Load and parse (using BeautifulSoup) all HTML files in the specified directory.

    Files that cannot be read or parsed are reported on stdout and skipped. A skipped
    file is left out of BOTH returned lists, so ``html_files[i]`` always belongs to
    ``soups[i]`` and the two lists can safely be zipped together.
    Args:
        directory (str): The path to the directory containing HTML files.
    Returns:
        tuple: A tuple containing:
            - html_files (list): Paths of the HTML files that were parsed successfully (sorted).
            - soups (list): The BeautifulSoup objects for those files, in the same order.
    Raises:
        FileNotFoundError: If the directory contains no ``*.html`` files.
    """
    # glob returns files in arbitrary order; sort so repeated runs process them identically
    candidate_files = sorted(glob.glob(os.path.join(directory, '*.html')))
    if not candidate_files:
        raise FileNotFoundError(f"No HTML files found in directory: {directory}")

    print(f"Found {len(candidate_files)} HTML files in directory: {directory}")

    html_files = []
    soups = []
    for file_path in candidate_files:
        try:
            with open(file_path, 'r', encoding='utf-8') as file:
                content = file.read()
            soup = BeautifulSoup(content, 'html.parser')
        except Exception as e:
            # Best effort: report the problem and continue with the remaining files
            print(f"Error processing file {file_path}: {e}")
            continue

        # Only record the path once parsing succeeded, keeping both lists aligned
        html_files.append(file_path)
        soups.append(soup)
        print(f"Processed file: {file_path}")

    return html_files, soups

In [None]:
def find_begin_and_end_timestamps(soup):
    """
    Find the first and last timestamps of the graph (see README.md) in the HTML soup object.

    The page only shows times of day (e.g. "12:33 AM"), so the parsed values carry
    strptime's default date (1900-01-01). When the end time lies before the start time
    the sleep crossed midnight, and the end time is moved to the next day so that
    ``end - start`` is the real sleep duration.
    Args:
        soup: The BeautifulSoup object containing the HTML content.
    Returns:
        list: ``[start, end]`` as datetime.datetime objects (can be unpacked like a tuple).
    Raises:
        ValueError: If the page does not contain exactly two time elements, or if a
            time string does not match the "%I:%M %p" format.
    """

    # Example of div with start and end time: <div class="    flexboxgrid_colXs__2BUM1      flexboxgrid_collapse__v6Hdm  "><span>12:33 AM</span><span class="SleepTimeEditors_pencilIcon__ejpUi"><i class="icon-pencil undefined" style="font-size: 14px;"></i></span></div>
    time_divs = soup.find_all("div", class_="flexboxgrid_colXs__2BUM1")
    time_texts = [time_div.text.strip() for time_div in time_divs]

    # Validate the count BEFORE indexing, so a page with too few matches gives a clear error
    if len(time_texts) != 2:
        raise ValueError(f"Found {len(time_texts)} time elements, expected exactly two (i.e. start and stop time).")

    # Convert the text strings (e.g. "12:33 AM") to datetime.datetime objects
    start_time, end_time = (datetime.strptime(time_text, "%I:%M %p") for time_text in time_texts)

    # Fix the circular issue: going to bed at 22:00 and waking up at 07:00 would otherwise
    # give a negative duration. Any end time before the start time must be on the next day.
    if end_time < start_time:
        end_time += timedelta(days=1)

    return [start_time, end_time]

# Test the functions so far: parse every HTML file in 'html' and print the sleep window of each.
# Only the time of day is printed here (.time()), so the date part of the datetimes is not shown.
_, soups = load_and_parse_html_files('html')
for soup in soups:
    start_time, end_time = find_begin_and_end_timestamps(soup)
    print(f"Start time: {start_time.time()}, End time: {end_time.time()}")

Found 3 HTML files in directory: html
Processed file: html\09_04_2025.html
Processed file: html\10_04_2025.html
Processed file: html\13_04_2025.html
Start time: 23:51:00, End time: 08:24:00
Start time: 00:33:00, End time: 08:24:00
Start time: 22:29:00, End time: 07:42:00


In [None]:
def get_timestamps_for_each_sleepcycle(soup, start_time, end_time):
    """
    Get the start timestamp and label of each sleep cycle from a parsed HTML soup.

    This is done by assessing the relative horizontal locations of the coloured rectangles
    in the graph: the x-coordinate at which a rectangle starts is divided by the distance
    between the two vertical dashed lines, and that fraction of the total sleep duration
    is added to ``start_time``.
    Args:
        soup: The BeautifulSoup object containing the HTML content.
        start_time (datetime.datetime): The start time of the sleep.
        end_time (datetime.datetime): The end time of the sleep (must be after start_time).
    Returns:
        list: A list of ``[timestamp, label]`` lists, one per coloured rectangle, where
            ``timestamp`` is the datetime.datetime at which that sleep cycle starts.
            A final ``[end_time, "End of Sleep"]`` entry closes the list.
    Raises:
        ValueError: If the plot-line group, the two vertical lines, the series rectangles
            or a rectangle's <path> cannot be found, or if the two lines coincide.
        KeyError: If a rectangle has a fill colour that is not a known sleep cycle colour.
    """
    # The chart is dynamic, based on the size of the screen, so lets use the vertical dashed lines to find the (relative) horizontal start and end position of the graph.
    # These lines are found in a <g> element with class="highcharts-plot-lines-5", with two <path> elements inside with class="highcharts-plot-line ", each with a "d" attribute containing the coordinates of the line.
    plot_line_group = soup.find("g", class_="highcharts-plot-lines-5")
    if plot_line_group is None:
        raise ValueError("No plot-line group (class 'highcharts-plot-lines-5') found in the graph. Please check the HTML structure or class names.")
    vertical_lines = plot_line_group.find_all("path", class_="highcharts-plot-line")
    if len(vertical_lines) != 2:
        raise ValueError(f"Expected exactly two vertical lines in the graph, found: {len(vertical_lines)}")

    # The second-to-last token of each "d" attribute is the x-coordinate of the line.
    # The first line marks the start of the sleep, the second line marks the end.
    line_start_x = float(vertical_lines[0].get("d").split()[-2])
    line_end_x = float(vertical_lines[1].get("d").split()[-2])

    # Horizontal distance between the two lines; this corresponds to the full sleep duration
    graph_width = line_end_x - line_start_x
    if graph_width == 0:
        raise ValueError("The two vertical lines in the graph share the same x-coordinate, cannot derive a time scale.")

    # The rectangles of a graph are <g></g> elements, each with multiple <path></path> elements inside. Finding the <g></g> elements using class names such as: ["highcharts-series highcharts-series-0 highcharts-area-series", "highcharts-series highcharts-series-1 highcharts-area-series"]
    pattern = re.compile(r"highcharts-series highcharts-series-\d+ highcharts-area-series")
    rectangles = soup.find_all("g", class_=pattern)
    if not rectangles:
        raise ValueError("No rectangles found in the graph. Please check the HTML structure or class names.")

    # In the first child element (<path></path>) of each <g></g>, the "d" attribute contains the coordinates of the rectangle. The number after the first L is the starting x coordinate of the rectangle.
    # Moreover, the colour (fill) of that <path></path> indicates the type of sleep cycle.
    sleep_labels = {
        "#3b97f3": "Light Sleep",
        "#1976d2": "Deep Sleep",
        "#d42fc2": "REM Sleep",
        "#e55ecb": "Awake",
        "transparent": "Unmeasurable"
    }

    # Named sleep_duration (not "timedelta") so the imported timedelta class is not shadowed
    sleep_duration = end_time - start_time

    sleep_cycles = []
    for rectangle in rectangles:
        path = rectangle.find("path")  # Contains the coordinates and fill colour of the rectangle
        if path is None:
            raise ValueError("Found a rectangle group without a <path> element. Please check the HTML structure.")

        # Split the "d" attribute into tokens; the token after the first "L" is the start x-coordinate
        tokens = path.get("d").split()
        start_coordinate = float(tokens[tokens.index('L') + 1])

        # Get the sleep cycle label based on the fill colour (unknown colours raise KeyError)
        label = sleep_labels[path.get("fill")]

        # NOTE(review): the rectangle x-coordinate is used as-is (the x of the first vertical
        # line is not subtracted) — presumably the rectangle paths are already expressed
        # relative to the start of the plot area; confirm against the HTML.
        fraction_of_sleep = start_coordinate / graph_width
        sleep_cycles.append([start_time + sleep_duration * fraction_of_sleep, label])

    sleep_cycles.append([end_time, "End of Sleep"])  # Append the end time of the sleep

    return sleep_cycles

# Test: run the extraction on the first parsed file only (soups[:1]) and print every entry.
# Relies on `soups` created by the load_and_parse_html_files('html') call in an earlier cell.
for soup in soups[:1]:
    start_time, end_time = find_begin_and_end_timestamps(soup)
    print(f"Start time: {start_time}, End time: {end_time}, Time delta: {end_time - start_time}")
    sleep_cycles = get_timestamps_for_each_sleepcycle(soup, start_time, end_time)

    for sleep_cycle in sleep_cycles:
        print(sleep_cycle)




Start time: 1900-01-01 23:51:00, End time: 1900-01-02 08:24:00, Time delta: 8:33:00
[datetime.datetime(1900, 1, 1, 23, 51), 'Light Sleep']
[datetime.datetime(1900, 1, 1, 23, 59), 'Deep Sleep']
[datetime.datetime(1900, 1, 2, 0, 45), 'Light Sleep']
[datetime.datetime(1900, 1, 2, 0, 58), 'Deep Sleep']
[datetime.datetime(1900, 1, 2, 1, 15), 'Light Sleep']
[datetime.datetime(1900, 1, 2, 1, 24), 'Deep Sleep']
[datetime.datetime(1900, 1, 2, 1, 32), 'Light Sleep']
[datetime.datetime(1900, 1, 2, 1, 49), 'REM Sleep']
[datetime.datetime(1900, 1, 2, 2, 8), 'Light Sleep']
[datetime.datetime(1900, 1, 2, 2, 22), 'REM Sleep']
[datetime.datetime(1900, 1, 2, 2, 23), 'Light Sleep']
[datetime.datetime(1900, 1, 2, 2, 24), 'REM Sleep']
[datetime.datetime(1900, 1, 2, 2, 25), 'Light Sleep']
[datetime.datetime(1900, 1, 2, 3, 49), 'Deep Sleep']
[datetime.datetime(1900, 1, 2, 4, 1), 'Light Sleep']
[datetime.datetime(1900, 1, 2, 4, 16), 'REM Sleep']
[datetime.datetime(1900, 1, 2, 5, 10), 'Light Sleep']
[datetime.

In [215]:
# Save the sleep cycles to a CSV file
def save_sleep_cycles_to_csv(sleep_cycles, output_file, directory='csv'):
    """
    Write sleep cycle entries to a CSV file inside ``directory``.

    The file gets a 'Start Time,Sleep Cycle' header followed by one row per entry.
    Args:
        sleep_cycles (list): Entries whose first item is the start timestamp and whose
            second item is the sleep cycle label; only these two items are written.
        output_file (str): The file name of the CSV file (joined onto ``directory``).
        directory (str): The directory where the CSV file will be saved; created if missing.
    """
    target_path = os.path.join(directory, output_file)
    # Make sure the destination directory is there (no error when it already exists)
    os.makedirs(directory, exist_ok=True)

    header_row = ['Start Time', 'Sleep Cycle']
    with open(target_path, 'w', newline='', encoding='utf-8') as handle:
        csv_writer = csv.writer(handle)
        csv_writer.writerow(header_row)
        # One row per entry: timestamp first, label second
        csv_writer.writerows([entry[0], entry[1]] for entry in sleep_cycles)

    print(f"Saved to {target_path}")

In [216]:
# The full thing: parse every HTML file in 'html', derive the sleep cycles and write one CSV per file
file_names, soups = load_and_parse_html_files('html')
for soup, file_name in zip(soups, file_names):
    start_time, end_time = find_begin_and_end_timestamps(soup)
    print(f"Start time: {start_time}, End time: {end_time}, Time delta: {end_time - start_time}")
    sleep_cycles = get_timestamps_for_each_sleepcycle(soup, start_time, end_time)
    # Swap only the final extension for '.csv'. str.replace would rewrite every '.html'
    # inside the name and would leave a differently-cased extension (e.g. '.HTML') untouched.
    save_sleep_cycles_to_csv(sleep_cycles, os.path.splitext(os.path.basename(file_name))[0] + '.csv')


Found 3 HTML files in directory: html
Processed file: html\09_04_2025.html
Processed file: html\10_04_2025.html
Processed file: html\13_04_2025.html
Start time: 1900-01-01 23:51:00, End time: 1900-01-02 08:24:00, Time delta: 8:33:00
Saved to csv\09_04_2025.csv
Start time: 1900-01-01 00:33:00, End time: 1900-01-01 08:24:00, Time delta: 7:51:00
Saved to csv\10_04_2025.csv
Start time: 1900-01-01 22:29:00, End time: 1900-01-02 07:42:00, Time delta: 9:13:00
Saved to csv\13_04_2025.csv
