In [None]:
from src.data.data_loader import load_data
from src.data.data_cleaner import clean_data
import pandas as pd
import numpy as np
import datetime
from datetime import datetime  # noqa: F811

In [64]:
# Load the dataset via the project loader and pass it through the project cleaner.
# NOTE(review): `data_source_name="kaggle_brisT1D"` presumably selects
# source-specific cleaning rules — confirm in src.data.data_cleaner.
data = clean_data(data=load_data(), data_source_name="kaggle_brisT1D")

  return pd.read_csv(file_path, usecols=keep_columns)


In [None]:
def calculate_physiological_sleep_score(row: pd.Series) -> float:
    """
    Score how sleep-like a single reading is, using only physiological signals.

    Reads the current-interval heart rate ("hr-0:00"), step count
    ("steps-0:00") and calories ("cals-0:00") from ``row``. Each signal is
    mapped through an exponential decay so that lower values score closer to
    1, and the three scores are blended with fixed weights (steps 0.6,
    heart rate 0.3, calories 0.1). No time-of-day assumption and no
    per-patient baseline is used here.

    Returns the weighted score clipped to the range [0, 1]. Missing (NaN)
    inputs propagate to a NaN result.
    """
    heart_rate, step_count, calories = (
        row[column] for column in ("hr-0:00", "steps-0:00", "cals-0:00")
    )

    # Heart rate: fixed decay scale of 40, so a lower HR gives a higher score.
    hr_component = np.exp(-heart_rate / 40)

    # Steps: no movement at all is the strongest sleep indicator; otherwise
    # the score falls off quickly (decay scale of 10 steps).
    if step_count == 0:
        steps_component = 1.0
    else:
        steps_component = np.exp(-step_count / 10)

    # Calories: a low burn rate suggests sleep, but this signal is the least
    # reliable of the three, hence the smallest weight below.
    cal_component = np.exp(-calories / 40)

    # Blend the components, weighted by how reliable each signal is.
    weighted = 0.6 * steps_component + 0.3 * hr_component + 0.1 * cal_component

    return np.clip(weighted, 0, 1)


def detect_sleep_clusters(data: pd.DataFrame, patient_id: int) -> pd.DataFrame:
    """
    Summarise one patient's activity by hour of day, without assuming
    specific sleep times.

    Args:
        data: Frame containing at least the columns "p_num", "time",
            "steps-0:00", "hr-0:00" and "cals-0:00".
        patient_id: Value matched against the "p_num" column.

    Returns:
        A DataFrame indexed by "hour" (the hour component of the parsed
        "time" column) holding the mean of "steps-0:00", "hr-0:00" and
        "cals-0:00" for that hour. Only hours that occur in the patient's
        rows are present. The input frame is not modified.
    """
    stat_columns = ["steps-0:00", "hr-0:00", "cals-0:00"]

    # Slice just the columns we need for this patient instead of copying the
    # whole frame only to attach a temporary "hour" column.
    patient_rows = data.loc[data["p_num"] == patient_id, ["time", *stat_columns]]

    # Hour-of-day for each reading; naming the Series "hour" names the
    # resulting index.
    hours = pd.to_datetime(patient_rows["time"]).dt.hour.rename("hour")

    # Average activity levels per hour.
    return patient_rows.groupby(hours)[stat_columns].mean()


def calculate_sleep_probability(
    row: pd.Series, patient_patterns: "pd.DataFrame | None" = None
) -> float:
    """
    Calculates final sleep probability using both physiological indicators
    and learned patient patterns (if available).

    Args:
        row: A single reading. Must carry the columns read by
            ``calculate_physiological_sleep_score``; when ``patient_patterns``
            is given it must also carry "time" as an "HH:MM:SS" string.
        patient_patterns: Per-hour means of "steps-0:00", "hr-0:00" and
            "cals-0:00" indexed by hour, as returned by
            ``detect_sleep_clusters``. ``None`` means no learned patterns.

    Returns:
        A score clipped to [0, 1]. With usable pattern data the result is
        ``0.85 * physiological + 0.15 * pattern``; otherwise it is the
        physiological score alone.

    Raises:
        ValueError: If pattern data is supplied and ``row["time"]`` does not
            match "%H:%M:%S".
    """
    # calculate probability from physiological indicators
    phys_score = calculate_physiological_sleep_score(row)

    if patient_patterns is None:
        return np.clip(phys_score, 0, 1)

    hour = datetime.strptime(row["time"], "%H:%M:%S").hour

    # The learned table only contains hours that actually occurred in the
    # patient's history. For an unseen hour there is nothing to blend in, so
    # fall back to the physiological score instead of raising KeyError.
    if hour not in patient_patterns.index:
        return np.clip(phys_score, 0, 1)

    hour_stats = patient_patterns.loc[hour]

    # Low typical activity for this hour -> pattern score close to 1.
    pattern_score = np.exp(
        -(
            hour_stats["steps-0:00"] / 100
            + hour_stats["hr-0:00"] / 100
            + hour_stats["cals-0:00"] / 100
        )
    )

    # Less weight on patient patterns since they don't always dictate what
    # the patient is doing (e.g. they could be out later than usual).
    final_score = 0.85 * phys_score + 0.15 * pattern_score

    return np.clip(final_score, 0, 1)


def analyze_sleep_patterns(data: pd.DataFrame) -> pd.DataFrame:
    """
    Score every row of ``data`` for sleep likelihood, patient by patient.

    First builds the hourly activity table for each distinct "p_num" via
    ``detect_sleep_clusters``, then evaluates ``calculate_sleep_probability``
    on each row with that patient's table. The result is written to a new
    "sleep_probability" column on ``data`` itself (the input is modified in
    place) and the same frame is returned.
    """
    # Learn each patient's hourly patterns.
    patterns_by_patient = {
        patient: detect_sleep_clusters(data, patient)
        for patient in data["p_num"].unique()
    }

    def _score(record: pd.Series) -> float:
        # Look up this row's patient; .get yields None for an unknown id.
        return calculate_sleep_probability(
            record, patterns_by_patient.get(record["p_num"])
        )

    # Row-wise scoring.
    data["sleep_probability"] = data.apply(_score, axis=1)

    return data


def main(data: pd.DataFrame) -> pd.DataFrame:
    """
    Run the full workflow: clean the input, then score sleep probability.

    Returns the frame produced by ``analyze_sleep_patterns`` on the cleaned
    data.
    """
    # NOTE(review): the notebook cell above calls clean_data with
    # data_source_name="kaggle_brisT1D"; here it is called without it.
    # Confirm the default in src.data.data_cleaner is what is intended.
    return analyze_sleep_patterns(clean_data(data))

1.0