In [20]:
import os
import glob
import numpy as np
from bs4 import BeautifulSoup
from datetime import datetime

In [None]:
def load_and_parse_html_files(directory):
    """
    Load and parse all HTML files in the specified directory.

    Files are visited in sorted path order so that the returned list has a
    stable, reproducible order (glob itself returns matches in arbitrary order).
    Args:
        directory (str): The path to the directory containing HTML files.
    Returns:
        list: A list of BeautifulSoup objects, one per successfully parsed file,
            in sorted path order. Files that fail to load are reported and skipped.
    Raises:
        FileNotFoundError: If the directory contains no '*.html' files.
    """
    # Sort the matches: positional access such as soups[0] must not depend on
    # whatever order the file system happens to list the directory in.
    html_files = sorted(glob.glob(os.path.join(directory, '*.html')))
    if not html_files:
        raise FileNotFoundError(f"No HTML files found in directory: {directory}")
    print(f"Found {len(html_files)} HTML files in directory: {directory}")

    soups = []
    for file_path in html_files:
        # Best effort: a file that cannot be read or parsed is reported and
        # skipped, so one bad export does not abort the whole batch.
        try:
            with open(file_path, 'r', encoding='utf-8') as file:
                content = file.read()
            # Parse after the file handle has been closed; only the text is needed.
            soups.append(BeautifulSoup(content, 'html.parser'))
            print(f"Processed file: {file_path}")
        except Exception as e:
            print(f"Error processing file {file_path}: {e}")

    return soups

In [None]:
def find_begin_and_end_timestamps(soup):
    """
    Find the first and last timestamps of the graph (see README.md) in the HTML soup object.
    Args:
        soup (BeautifulSoup): The BeautifulSoup object containing the HTML content.
    Returns:
        list: A two-element list [start_time, end_time] of datetime.time objects,
            in the order the time elements are found in the page.
    Raises:
        ValueError: If the page does not contain exactly two time elements, or if
            one of them is not formatted like "12:33 AM".
    """

    # Example of div with (start) time: <div class="    flexboxgrid_colXs__2BUM1      flexboxgrid_collapse__v6Hdm  "><span>12:33 AM</span><span class="SleepTimeEditors_pencilIcon__ejpUi"><i class="icon-pencil undefined" style="font-size: 14px;"></i></span></div>
    time_divs = soup.find_all("div", class_="flexboxgrid_colXs__2BUM1")

    # Check the count BEFORE parsing: if the page layout changes and extra divs
    # match, the caller gets this explicit message instead of a strptime error
    # about some unrelated text.
    if len(time_divs) != 2:
        raise ValueError(f"Found {len(time_divs)} time elements, expected exactly two (i.e. start and stop time).")

    # Convert each text string (e.g. "12:33 AM") to a datetime.time object.
    # Only the time of day is available here; no date is attached.
    return [datetime.strptime(time_div.text.strip(), "%I:%M %p").time() for time_div in time_divs]

# Test the functions: parse every HTML file in the 'html' directory and print
# the start/end time extracted from each one.
# NOTE: `soups`, `soup`, `start_time` and `end_time` are notebook-level names;
# once the loop has finished, `start_time`/`end_time` hold the values of the
# LAST file processed, not of soups[0].
soups = load_and_parse_html_files('html')
for soup in soups:
    start_time, end_time = find_begin_and_end_timestamps(soup)
    print(f"Start time: {start_time}, End time: {end_time}")

Found 3 HTML files in directory: html
Processed file: html\09_04_2025.html
Processed file: html\10_04_2025.html
Processed file: html\13_04_2025.html
Start time: 23:51:00, End time: 08:24:00
Start time: 00:33:00, End time: 08:24:00
Start time: 22:29:00, End time: 07:42:00


In [None]:
def get_timestamps_for_each_sleepcycle(soup, start_time, end_time):
    """
    Get the timestamps for each sleep cycle from the parsed HTML soup.

    The intended approach is to assess the relative locations of the coloured
    rectangles in the sleep graph. Work in progress: the function currently only
    locates the per-series rectangle groups and prints how many series it found;
    `start_time` and `end_time` are not used yet.
    Args:
        soup: BeautifulSoup: The BeautifulSoup object containing the HTML content.
        start_time: datetime.time: The start time of the sleep.
        end_time: datetime.time: The end time of the sleep.
    Returns:
        None: Placeholder. The planned return value is a list of tuples, each
            containing the start and end timestamps and label of a sleep cycle.
    Raises:
        ValueError: If the page does not contain exactly two
            "highcharts-series-group" elements.
    """
    # Each rectangle is a <g></g> with multiple <g></g> elements, each with multiple <path></path> elements inside it.
    # First find the enclosing <g></g> elements with class="highcharts-series-group".
    series_groups = soup.find_all("g", class_="highcharts-series-group")

    if len(series_groups) != 2:
        raise ValueError(f"Found {len(series_groups)} graphs in the HTML soup, expected exactly two (i.e. the pie chart at the top of the page, and the sleep graph of interest).")

    graph = series_groups[1]  # The second group is the sleep graph (of interest)

    # In that graph, collect the <g></g> elements that make up the rectangles of the sleep cycles, series by series,
    # e.g. class "highcharts-series highcharts-series-0 highcharts-area-series", then "...-series-1 ...", and so on.
    # NOTE(review): a multi-class string passed to class_ is compared against the whole class attribute value,
    # so elements carrying additional classes would not match. Confirm against the saved pages.
    rectangles = []
    series_index = 0
    while True:
        series_class = f"highcharts-series highcharts-series-{series_index} highcharts-area-series"
        series_elements = graph.find_all("g", class_=series_class)
        if not series_elements:
            # The first series index without a match ends the scan.
            break
        rectangles.append(series_elements)
        series_index += 1

    print(len(rectangles), "rectangles found in the graph.")

    return None  # Placeholder for the actual implementation

# Try the sleep-cycle extraction on the first file. Recompute the start/end time
# from that same file: the `start_time`/`end_time` left over from the earlier
# loop belong to the last file processed, not to soups[0].
start_time, end_time = find_begin_and_end_timestamps(soups[0])
get_timestamps_for_each_sleepcycle(soups[0], start_time, end_time)

2 graphs found in the HTML soup.


ValueError: Found 2 graphs HTML soup, expected exactly one (i.e. the sleep graph).