In [3]:
from src.data.kaggle_brisT1D.data_loader import BrisT1DDataLoader
from src.tuning.benchmark import impute_missing_values

# Patient to analyse; use p02, p03 or p04.
patient = "p02"


# Load the BrisT1D data and keep only the selected patient's rows.
loader = BrisT1DDataLoader(use_cached=True)

train_df = loader.train_data[loader.train_data["p_num"] == patient]
# Note: the held-out frame is taken from the loader's validation split.
test_df = loader.validation_data[loader.validation_data["p_num"] == patient]

# Sampling interval in whole minutes, taken from the first two training rows.
# total_seconds() is used rather than `.components.minutes`, because the latter
# is only the minutes *field* of the delta (a 1 h 05 min gap would read as 5).
TIME_STEP_SIZE = int(
    (train_df["datetime"].iloc[1] - train_df["datetime"].iloc[0]).total_seconds()
    // 60
)


if TIME_STEP_SIZE not in (5, 15):
    error = """
    First time step is not 5 or 15 minutes. Look at the most common time step size.
    """
    # The message used to be built but never raised, so the check was a no-op.
    raise ValueError(error.strip())


def reduce_features(df):
    """Select the modelling columns from ``df`` and impute their missing values.

    The frame is narrowed to the predictor columns followed by the target
    column, then ``impute_missing_values`` is applied with its default
    settings, first to the predictors and then to the target.

    Args:
        df: DataFrame containing at least the predictor columns listed below
            and the target column ``"bg-0:00"``.

    Returns:
        A ``(y, X)`` tuple: ``y`` is a single-column DataFrame holding the
        target, ``X`` is a DataFrame holding the predictors.
    """
    target_cols = ["bg-0:00"]
    predictor_cols = [
        # "hr-0:00" is left out because it has NaNs
        "steps-0:00",
        "cals-0:00",
        "carbs-0:00",
        "cob",
        "carb_availability",
        "insulin_availability",
        "iob",
    ]

    # No re-indexing or resampling happens here; the frame is only narrowed
    # to the predictors followed by the target.
    subset = df.iloc[:][predictor_cols + target_cols]

    # Impute with default methods: predictors first, then the target.
    for cols in (predictor_cols, target_cols):
        subset = impute_missing_values(subset, columns=cols)

    return subset[target_cols], subset[predictor_cols]


# Disabled scratch checks left over from exploration:
#   train_df
#   y_train, X_train = reduce_features(train_df)
#   y_train
#   train_df["carbs-0:00"].sample(1000)
#   X_train["carbs-0:00"].sample(1000)

# Display the first 20 rows of the loader's raw_data frame.
loader.raw_data.head(n=20)



Unnamed: 0,datetime,id,p_num,time,bg-0:00,insulin-0:00,carbs-0:00,hr-0:00,steps-0:00,cals-0:00,activity-0:00,cob,carb_availability,insulin_availability,iob
0,2025-01-01 06:10:00,p01_0,p01,06:10:00,15.1,0.0417,,,,,,0.0,0.0,0.0,0.4028
1,2025-01-01 06:25:00,p01_1,p01,06:25:00,14.4,0.0417,,,,,,0.0,0.0,0.003428,0.872082
2,2025-01-01 06:40:00,p01_2,p01,06:40:00,13.9,0.0417,,,,,,0.0,0.0,0.012039,1.385682
3,2025-01-01 06:55:00,p01_3,p01,06:55:00,13.8,0.0417,,,,,,0.0,0.0,0.024747,1.838095
4,2025-01-01 07:10:00,p01_4,p01,07:10:00,13.4,0.0417,,,,,,0.0,0.0,0.040416,2.203691
5,2025-01-01 07:25:00,p01_5,p01,07:25:00,12.8,0.0417,,,,,,0.0,0.0,0.057786,2.513159
6,2025-01-01 07:40:00,p01_6,p01,07:40:00,15.5,0.0417,20.0,,,,,16.0,0.0,0.075693,2.789246
7,2025-01-01 07:55:00,p01_7,p01,07:55:00,14.8,0.0417,,,,,,15.254426,4.029471,0.093394,3.053946
8,2025-01-01 08:10:00,p01_8,p01,08:10:00,12.7,0.0583,,,,,,13.387459,5.709357,0.110462,3.325296
9,2025-01-01 08:25:00,p01_9,p01,08:25:00,11.4,0.0583,,,,,,11.177291,5.925311,0.126989,3.630463


In [4]:
import pandas as pd

# Columns to load from the Gluroo export.
keep_cols = [
    "date",
    "bgl",
    "msg_type",  # may be named "type" in the file instead (handled below)
    "dose_units",  # We can convert this to iob
    "food_g",  # We can convert this to cob
    # 'food_glycemic_index',
    # 'affects_fob',
    # 'affects_iob',
    # 'trend',
]

path = "../../src/data/gluroo/2024/500030_2024-07-01_2024-09-30.csv"

# Passing the list itself as `usecols` can never load a column called "type",
# so the rename below was unreachable, and a file that uses "type" would make
# read_csv raise.  A callable keeps whichever of the wanted names are present.
wanted_cols = set(keep_cols) | {"type"}
patient_df = pd.read_csv(path, usecols=lambda name: name in wanted_cols)

# Normalise the message-type column name to msg_type.
if "type" in patient_df.columns:
    if "msg_type" in patient_df.columns:
        # Both spellings present: keep the requested one only.
        patient_df = patient_df.drop(columns="type")
    else:
        patient_df = patient_df.rename(columns={"type": "msg_type"})

# A callable `usecols` does not complain about absent columns, so check
# explicitly and fail with ValueError, as the list form would have.
missing_cols = [col for col in keep_cols if col not in patient_df.columns]
if missing_cols:
    raise ValueError(f"Columns missing from {path}: {missing_cols}")

patient_df.head()

Unnamed: 0,date,bgl,msg_type,dose_units,food_g
0,2024-07-01 00:02:39-05:00,98.0,,,
1,2024-07-01 00:07:39-05:00,100.0,,,
2,2024-07-01 00:12:39-05:00,98.0,,,
3,2024-07-01 00:17:39-05:00,94.0,,,
4,2024-07-01 00:22:40-05:00,94.0,,,


In [17]:
from src.data.gluroo.data_cleaner import (
    erase_meal_overlap_fn,
    keep_top_n_carb_meals,
    erase_consecutive_nan_values,
    coerce_time_fn,
    ensure_datetime_index,
    data_translation,
)

# --- Cleaning parameters ---
INTERVAL = 5  # minutes; only used for the message printed below
coerse_time_interval = pd.Timedelta(minutes=5)
day_start_time = pd.Timedelta(hours=0)
meal_length = pd.Timedelta(hours=2)
min_carbs = 5
n_top_carb_meals = 3
max_consecutive_nan_values_per_day = 36

max_gap_minutes = max_consecutive_nan_values_per_day * INTERVAL
print(f"Drop the entire day if {max_gap_minutes} mins")

# --- Index preparation ---
df = ensure_datetime_index(patient_df)
# Remove timezone info from index to make dates more concise
df.index = df.index.tz_localize(None)

df = coerce_time_fn(data=df, coerse_time_interval=coerse_time_interval)

# Calendar date of each row after subtracting the day-start offset.
shifted_index = df.index - day_start_time
df["day_start_shift"] = shifted_index.date

# --- Drop data with long NaN runs, reporting the row count either side ---
rows_before = len(df)
print("before erasing consecutive nan values", rows_before)
df = erase_consecutive_nan_values(df, max_consecutive_nan_values_per_day)
rows_after = len(df)
print("after erasing consecutive nan values", rows_after)

# --- Meal filtering, then translation to the final column names ---
meals_df = erase_meal_overlap_fn(df, meal_length, min_carbs)
top_meals_df = keep_top_n_carb_meals(meals_df, n_top_carb_meals=n_top_carb_meals)
df = data_translation(top_meals_df)

df.head(n=20)

Drop the entire day if 180 mins
Columns after coercing time: ['bgl', 'msg_type', 'dose_units', 'food_g', 'food_g_keep']
before erasing consecutive nan values 26496
after erasing consecutive nan values 24704


Unnamed: 0_level_0,bg-0:00,dose_units,carbs-0:00,food_g_keep,day_start_shift
date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
2024-07-01 00:00:00,5.44,,,,2024-07-01
2024-07-01 00:05:00,5.56,,,,2024-07-01
2024-07-01 00:10:00,5.44,,,,2024-07-01
2024-07-01 00:15:00,5.22,,,,2024-07-01
2024-07-01 00:20:00,5.22,,,,2024-07-01
2024-07-01 00:25:00,5.5,,,,2024-07-01
2024-07-01 00:30:00,5.94,,,,2024-07-01
2024-07-01 00:35:00,6.33,,,,2024-07-01
2024-07-01 00:40:00,6.56,,,,2024-07-01
2024-07-01 00:45:00,6.61,,,,2024-07-01


In [2]:
from src.data.gluroo.gluroo import Gluroo
import pandas as pd

# Columns to load from the Gluroo export.
keep_cols = [
    "date",
    "bgl",
    "msg_type",  # I think this is called msg_type?
    "dose_units",  # We can convert this to iob
    "food_g",  # We can convert this to cob
]

# Cleaning settings passed to Gluroo.
# NOTE(review): day_start_time is 4 h here but 0 h in the manual pipeline cell;
# confirm which one is intended.
config = {
    "max_consecutive_nan_values_per_day": 36,
    "coerse_time_interval": pd.Timedelta(minutes=5),
    "day_start_time": pd.Timedelta(hours=4),
    "min_carbs": 5,
    "meal_length": pd.Timedelta(hours=2),
    "n_top_carb_meals": 3,
}

file_path = "../../src/data/gluroo/2024/500030_2024-07-01_2024-09-30.csv"

# NOTE: this rebinds `loader`, which the first cell binds to a BrisT1DDataLoader.
loader = Gluroo(
    file_path=file_path,
    config=config,
    keep_columns=keep_cols,
)

Columns after coercing time: ['bgl', 'msg_type', 'dose_units', 'food_g', 'food_g_keep']


In [7]:
# Fixed seed so the same 20 rows are displayed on every run of the notebook.
loader.processed_data.sample(20, random_state=42)

Unnamed: 0_level_0,bg-0:00,msg_type,dose_units,carbs-0:00,food_g_keep,day_start_shift
date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
2024-07-19 04:45:00,3.22,,,,,2024-07-19
2024-09-15 19:40:00,4.94,,,0.0,,2024-09-15
2024-07-15 12:35:00,7.44,,,,,2024-07-15
2024-09-08 00:55:00,5.39,,,,,2024-09-07
2024-08-15 10:10:00,10.11,,,,,2024-08-15
2024-09-03 10:30:00,6.5,,,,,2024-09-03
2024-07-04 17:50:00,7.39,,,0.0,,2024-07-04
2024-07-17 15:35:00,11.28,,,0.0,,2024-07-17
2024-09-16 09:45:00,4.78,,,,,2024-09-16
2024-07-06 19:05:00,7.67,,,0.0,,2024-07-06
