In [None]:
from src.data.kaggle_brisT1D.data_loader import BrisT1DDataLoader
from src.tuning.benchmark import impute_missing_values

# Patient to analyse (the original note suggests using 02, 03 or 04).
patient = "p02"


# Load the BrisT1D data (cached) and keep only this patient's rows from the
# train and validation splits.
loader = BrisT1DDataLoader(use_cached=True)

train_df = loader.train_data[loader.train_data["p_num"] == patient]
test_df = loader.validation_data[loader.validation_data["p_num"] == patient]

# Sampling interval in whole minutes, measured between the first two rows.
# total_seconds() is used instead of `.components.minutes` because the latter
# only returns the minutes *component* of the delta: a 1h05m gap would read
# as 5 and a 1h gap as 0.
TIME_STEP_SIZE = int(
    (train_df["datetime"].iloc[1] - train_df["datetime"].iloc[0]).total_seconds()
    // 60
)


# Fail loudly on an unexpected interval. The message used to be assigned to a
# variable and never raised, so the check had no effect.
if TIME_STEP_SIZE not in (5, 15):
    raise ValueError(
        f"First time step is {TIME_STEP_SIZE} minutes, not 5 or 15 minutes. "
        "Look at the most common time step size."
    )


def reduce_features(df):
    """Keep only the modelling columns of ``df``, impute them, and split target from predictors.

    Args:
        df: Frame containing at least the "bg-0:00" target column and the
            predictor columns listed below.

    Returns:
        tuple: ``(y, X)`` where ``y`` is a one-column frame holding "bg-0:00"
        and ``X`` holds the predictor columns, both taken after imputation.
    """
    target_cols = ["bg-0:00"]
    # "hr-0:00" is deliberately left out because it has NaNs.
    predictor_cols = [
        "steps-0:00",
        "cals-0:00",
        "carbs-0:00",
        "cob",
        "carb_availability",
        "insulin_availability",
        "iob",
    ]

    selected = df[predictor_cols + target_cols]

    # Impute with the helper's default methods: predictors first, then target.
    for column_group in (predictor_cols, target_cols):
        selected = impute_missing_values(selected, columns=column_group)

    return selected[target_cols], selected[predictor_cols]


# Disabled scratch checks that were used to inspect reduce_features() and the
# carbs column before/after imputation:
# train_df
# y_train, X_train = reduce_features(train_df)
# y_train
# train_df["carbs-0:00"].sample(1000)
# X_train["carbs-0:00"].sample(1000)

# Show the first 20 rows of the loader's raw data (not filtered by `patient`).
loader.raw_data.head(20)



Unnamed: 0,datetime,id,p_num,time,bg-0:00,insulin-0:00,carbs-0:00,hr-0:00,steps-0:00,cals-0:00,activity-0:00,cob,carb_availability,insulin_availability,iob
0,2025-01-01 06:10:00,p01_0,p01,06:10:00,15.1,0.0417,,,,,,0.0,0.0,0.0,0.4028
1,2025-01-01 06:25:00,p01_1,p01,06:25:00,14.4,0.0417,,,,,,0.0,0.0,0.003428,0.872082
2,2025-01-01 06:40:00,p01_2,p01,06:40:00,13.9,0.0417,,,,,,0.0,0.0,0.012039,1.385682
3,2025-01-01 06:55:00,p01_3,p01,06:55:00,13.8,0.0417,,,,,,0.0,0.0,0.024747,1.838095
4,2025-01-01 07:10:00,p01_4,p01,07:10:00,13.4,0.0417,,,,,,0.0,0.0,0.040416,2.203691
5,2025-01-01 07:25:00,p01_5,p01,07:25:00,12.8,0.0417,,,,,,0.0,0.0,0.057786,2.513159
6,2025-01-01 07:40:00,p01_6,p01,07:40:00,15.5,0.0417,20.0,,,,,16.0,0.0,0.075693,2.789246
7,2025-01-01 07:55:00,p01_7,p01,07:55:00,14.8,0.0417,,,,,,15.254426,4.029471,0.093394,3.053946
8,2025-01-01 08:10:00,p01_8,p01,08:10:00,12.7,0.0583,,,,,,13.387459,5.709357,0.110462,3.325296
9,2025-01-01 08:25:00,p01_9,p01,08:25:00,11.4,0.0583,,,,,,11.177291,5.925311,0.126989,3.630463


In [4]:
import pandas as pd

# Columns to load from the Gluroo export.
keep_cols = [
    "date",
    "bgl",
    "msg_type",  # I think this is called msg_type?
    "dose_units",  # We can convert this to iob
    "food_g",  # We can convert this to cob
    # 'food_glycemic_index',
    # 'affects_fob',
    # 'affects_iob',
    # 'trend',
]

path = "../../src/data/gluroo/2024/500030_2024-07-01_2024-09-30.csv"

# The message-type column may be called "type" instead of "msg_type".
# Passing "msg_type" in `usecols` makes read_csv raise when that column is
# absent, so the rename below could never run. Peek at the header first
# (nrows=0 reads no data rows) and request whichever name the file has.
header_cols = pd.read_csv(path, nrows=0).columns
if "msg_type" not in header_cols and "type" in header_cols:
    use_cols = ["type" if col == "msg_type" else col for col in keep_cols]
else:
    use_cols = keep_cols

# A list `usecols` still validates that every requested column exists.
patient_df = pd.read_csv(path, usecols=use_cols)

# Rename type column to msg_type (no-op when the file already uses msg_type)
patient_df = patient_df.rename(columns={"type": "msg_type"})

patient_df.head()

Unnamed: 0,date,bgl,msg_type,dose_units,food_g
0,2024-07-01 00:02:39-05:00,98.0,,,
1,2024-07-01 00:07:39-05:00,100.0,,,
2,2024-07-01 00:12:39-05:00,98.0,,,
3,2024-07-01 00:17:39-05:00,94.0,,,
4,2024-07-01 00:22:40-05:00,94.0,,,


In [17]:
from src.data.gluroo.data_cleaner import (
    erase_meal_overlap_fn,
    keep_top_n_carb_meals,
    erase_consecutive_nan_values,
    coerce_time_fn,
    ensure_datetime_index,
    data_translation,
)

# Cleaning parameters for the Gluroo pipeline below.
coerse_time_interval = pd.Timedelta(minutes=5)
# Offset subtracted from each timestamp before taking its date; zero here, so
# "day_start_shift" is simply the calendar date of the row.
day_start_time = pd.Timedelta(hours=0)
min_carbs = 5
meal_length = pd.Timedelta(hours=2)
n_top_carb_meals = 3
# Minutes per row; only used to express the NaN threshold in minutes.
# NOTE(review): duplicates coerse_time_interval (5 minutes) — keep in sync.
INTERVAL = 5

# NaN threshold passed to erase_consecutive_nan_values, in rows
# (36 rows * 5 min = 180 min, as printed below).
max_consecutive_nan_values_per_day = 36
print(f"Drop the entire day if {max_consecutive_nan_values_per_day*INTERVAL} mins")

# NOTE(review): presumably returns a frame indexed by the "date" column — confirm in data_cleaner.
df = ensure_datetime_index(patient_df)

# Remove timezone info from index to make dates more concise
df.index = df.index.tz_localize(None)


# Align rows to the 5-minute grid; per the printed output this step also adds a "food_g_keep" column.
df = coerce_time_fn(data=df, coerse_time_interval=coerse_time_interval)


# Date each row belongs to after shifting the index back by day_start_time.
df["day_start_shift"] = (df.index - day_start_time).date

# Row counts are printed before and after so the effect of the NaN filter is visible.
print("before erasing consecutive nan values", len(df))
df = erase_consecutive_nan_values(df, max_consecutive_nan_values_per_day)
print("after erasing consecutive nan values", len(df))

# Meal filtering, then translation to the project's column names
# (the displayed result uses "bg-0:00" and "carbs-0:00").
# NOTE(review): exact semantics of these helpers live in data_cleaner — confirm there.
df = erase_meal_overlap_fn(df, meal_length, min_carbs)
df = keep_top_n_carb_meals(df, n_top_carb_meals=n_top_carb_meals)
df = data_translation(df)

df.head(20)

Drop the entire day if 180 mins
Columns after coercing time: ['bgl', 'msg_type', 'dose_units', 'food_g', 'food_g_keep']
before erasing consecutive nan values 26496
after erasing consecutive nan values 24704


Unnamed: 0_level_0,bg-0:00,dose_units,carbs-0:00,food_g_keep,day_start_shift
date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
2024-07-01 00:00:00,5.44,,,,2024-07-01
2024-07-01 00:05:00,5.56,,,,2024-07-01
2024-07-01 00:10:00,5.44,,,,2024-07-01
2024-07-01 00:15:00,5.22,,,,2024-07-01
2024-07-01 00:20:00,5.22,,,,2024-07-01
2024-07-01 00:25:00,5.5,,,,2024-07-01
2024-07-01 00:30:00,5.94,,,,2024-07-01
2024-07-01 00:35:00,6.33,,,,2024-07-01
2024-07-01 00:40:00,6.56,,,,2024-07-01
2024-07-01 00:45:00,6.61,,,,2024-07-01


In [1]:
from src.data.gluroo.gluroo import Gluroo
import pandas as pd

file_path = "../../src/data/gluroo/2024/500030_2024-07-01_2024-09-30.csv"

# Raw export columns handed to the loader.
keep_cols = [
    "date",
    "bgl",
    "msg_type",  # I think this is called msg_type?
    "dose_units",  # We can convert this to iob
    "food_g",  # We can convert this to cob
]

# Cleaning parameters handed to the loader.
config = dict(
    max_consecutive_nan_values_per_day=36,
    coerse_time_interval=pd.Timedelta(minutes=5),
    day_start_time=pd.Timedelta(hours=4),
    min_carbs=5,
    meal_length=pd.Timedelta(hours=2),
    n_top_carb_meals=3,
)

loader = Gluroo(keep_columns=keep_cols, file_path=file_path, config=config)

Columns after coercing time: ['bgl', 'msg_type', 'dose_units', 'food_g', 'food_g_keep']


In [2]:
# Spot-check 20 random processed rows; the fixed seed keeps the displayed
# sample identical across re-runs.
loader.processed_data.sample(20, random_state=42)

Unnamed: 0_level_0,bg-0:00,msg_type,dose_units,carbs-0:00,food_g_keep,day_start_shift,datetime
date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
2024-09-16 22:20:00,6.28,,,,,2024-09-16,2024-09-16 22:20:00
2024-09-18 15:20:00,3.83,0,,0.0,10.0,2024-09-18,2024-09-18 15:20:00
2024-09-21 14:15:00,5.89,,,0.0,,2024-09-21,2024-09-21 14:15:00
2024-09-02 14:50:00,5.94,,,0.0,,2024-09-02,2024-09-02 14:50:00
2024-07-13 15:25:00,2.89,,,0.0,,2024-07-13,2024-07-13 15:25:00
2024-07-03 05:00:00,4.94,,,,,2024-07-03,2024-07-03 05:00:00
2024-07-13 07:30:00,10.78,,,,,2024-07-13,2024-07-13 07:30:00
2024-07-20 06:00:00,7.72,,,,,2024-07-20,2024-07-20 06:00:00
2024-09-14 21:55:00,6.83,,,,,2024-09-14,2024-09-14 21:55:00
2024-08-22 16:30:00,10.44,,,,,2024-08-22,2024-08-22 16:30:00


In [2]:
# Display the loader's validation split (pandas elides the middle rows of long frames).
loader.validation_data

Unnamed: 0_level_0,bg-0:00,msg_type,dose_units,carbs-0:00,food_g_keep,day_start_shift,datetime,p_num
date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
2024-09-10 06:55:00,5.83,,,,,2024-09-10,2024-09-10 06:55:00,glu001
2024-09-10 07:00:00,5.44,,,,,2024-09-10,2024-09-10 07:00:00,glu001
2024-09-10 07:05:00,5.67,,,,,2024-09-10,2024-09-10 07:05:00,glu001
2024-09-10 07:10:00,5.94,,,,,2024-09-10,2024-09-10 07:10:00,glu001
2024-09-10 07:15:00,5.83,,,,,2024-09-10,2024-09-10 07:15:00,glu001
...,...,...,...,...,...,...,...,...
2024-09-30 06:35:00,5.06,,,,,2024-09-30,2024-09-30 06:35:00,glu001
2024-09-30 06:40:00,4.78,,,,,2024-09-30,2024-09-30 06:40:00,glu001
2024-09-30 06:45:00,4.67,,,,,2024-09-30,2024-09-30 06:45:00,glu001
2024-09-30 06:50:00,4.94,,,,,2024-09-30,2024-09-30 06:50:00,glu001


In [1]:
import json
import pandas as pd

path = "../../src/data/gluroo/chris/data.json"

# Read the whole export and parse it into a dict.
with open(path) as json_file:
    data = json.loads(json_file.read())

# One row per reading, ordered chronologically by the "date" field.
df = pd.DataFrame(data["readingsForGroupId"])
df = df.sort_values(by="date")

In [2]:
def integrate_message_data(df: pd.DataFrame, messages: list) -> pd.DataFrame:
    """
    Integrates message data into the dataframe by matching to the closest timestamp.

    Each message is written onto the row whose "date" value is nearest to the
    message's own date. If several messages map to the same row, the later
    message in ``messages`` overwrites the earlier one.

    Args:
        df (pd.DataFrame): Input dataframe with a datetime-typed "date" column
        messages (list): List of message dictionaries containing type, foodG, doseUnits, and date

    Returns:
        pd.DataFrame: Copy of ``df`` with "type", "foodG" and "doseUnits" columns filled in
        for the matched rows. Messages that cannot be parsed or matched are
        reported with a printed warning and skipped.
    """
    # Create a copy to avoid modifying the original
    result_df = df.copy()

    # Initialize new columns if they don't exist
    for column in ("type", "foodG", "doseUnits"):
        if column not in result_df.columns:
            result_df[column] = None

    # Parse each message's date and keep it paired with its message. Building
    # a separate date list and zipping it with `messages` afterwards would
    # shift every message after a parse failure onto the wrong date.
    dated_messages = []
    for msg in messages:
        try:
            dated_messages.append((msg, pd.to_datetime(msg["date"])))
        except (KeyError, TypeError, ValueError) as e:
            print(f"Warning: Could not parse date for message: {msg}. Error: {e}")

    # For each message, find the closest timestamp in the dataframe
    for msg, msg_date in dated_messages:
        try:
            # Find the closest timestamp by comparing with the date column
            time_diffs = (result_df["date"] - msg_date).abs()
            closest_idx = time_diffs.idxmin()

            # Update the values at the closest timestamp
            result_df.loc[closest_idx, "type"] = msg.get("type")
            result_df.loc[closest_idx, "foodG"] = msg.get("foodG")
            result_df.loc[closest_idx, "doseUnits"] = msg.get("doseUnits")
        except Exception as e:
            # Best effort: one bad message (e.g. tz-aware vs tz-naive dates)
            # must not abort the whole integration.
            print(f"Warning: Could not process message: {msg}. Error: {e}")

    return result_df


# Usage: merge the exported messages into a copy of the readings frame.
msg = data["messages"]
df_cpy = df.copy()
df_cpy["date"] = pd.to_datetime(df_cpy["date"])
# Attach messages, rename the message column, and drop the "millis" column.
df_cpy = (
    integrate_message_data(df_cpy, msg)
    .rename(columns={"type": "msg_type"})
    .drop(columns=["millis"])
)

In [4]:
# Rows whose attached message is an insulin dose.
is_insulin_dose = df_cpy["msg_type"] == "DOSE_INSULIN"
ins_df = df_cpy.loc[is_insulin_dose]

In [3]:
# Persist the merged readings/messages frame to "gluroo_data.csv" in the current working directory.
df_cpy.to_csv("gluroo_data.csv")

In [62]:
import dateutil.parser

# Skeleton of the payload that is later written to data.json. "messages",
# "readingsForGroupId" and "rxDetails" start empty and are filled in below
# and in later cells.
prop = {
    "messages": [],
    "readingsForGroupId": [],
    "rxDetails": {},
    "dtNow": "2025-03-14T17:18:56.974+00:00",
    "dtUntil": "2025-03-14T17:18:56.969+00:00",
    "bgl": 69,
    "trend": "FORTYFIVE_DOWN",
    "offsetMin": 0,
    "isLooping": True,
    "timezone": "America/Los_Angeles",
    # NOTE(review): a negative duration looks unusual — confirm -5 is intended.
    "rapidInsulinDiaHours": -5,
}

# Prescription entries, one per time-of-day segment. The same five entries
# appear both under "rxfd" and at the top level of "rxDetails", so they are
# written once here and copied where needed.
rx_entries = [
    {
        "offsetIndex": 0,
        "startOffsetTt": 14400,
        "name": "Early morn",
        "carbRatio": 13.5,
        "isf": 85,
        "insulinResistance": 1,
        "target": 100,
        "dateCreated": "2025-01-30T00:25:47.118865+00:00",
        "groupId": 10,
    },
    {
        "offsetIndex": 1,
        "startOffsetTt": 23400,
        "name": "Breakfast",
        "carbRatio": 11.5,
        "isf": 85,
        "insulinResistance": 1,
        "target": 105,
        "dateCreated": "2025-01-30T00:25:47.118865+00:00",
        "groupId": 10,
    },
    {
        "offsetIndex": 2,
        "startOffsetTt": 37800,
        "name": "Lunch",
        "carbRatio": 13.5,
        # NOTE(review): every other segment uses isf 85 — confirm 18 is intended.
        "isf": 18,
        "insulinResistance": 1,
        "target": 110,
        "dateCreated": "2025-01-30T00:25:47.118865+00:00",
        "groupId": 10,
    },
    {
        "offsetIndex": 3,
        "startOffsetTt": 52200,
        "name": "Aft&Dinner",
        "carbRatio": 13.5,
        "isf": 85,
        "insulinResistance": 1,
        "target": 110,
        "dateCreated": "2025-01-30T00:25:47.118865+00:00",
        "groupId": 10,
    },
    {
        "offsetIndex": 4,
        "startOffsetTt": 79200,
        "name": "Night",
        "carbRatio": 13.5,
        "isf": 85,
        "insulinResistance": 1,
        "target": 110,
        "dateCreated": "2025-01-30T00:25:47.118865+00:00",
        "groupId": 10,
    },
]

# Assign a plain dict. The previous form wrapped the dict in parentheses with
# a trailing comma, which made it a one-element tuple and serialised
# "rxDetails" as a JSON array instead of an object.
prop["rxDetails"] = {
    "rxfd": {
        "groupId": 10,
        "dateCreated": "2025-01-30T00:25:47.118865+00:00",
        "splitOffsets": None,
        # Independent copies so the two lists never alias each other.
        "rxEntriesByRxfdIdList": [dict(entry) for entry in rx_entries],
        "rxBasalsByRxfdIdList": [],
    },
    "rxEntries": rx_entries,
    "rxBasals": [],
}

In [None]:
# Load the full export, then keep only the rows typed as "Reading".
raw_data = pd.read_csv("./500030_2025-01-29_2025-04-29 - 500030_2025-01-29_2025-04-29.csv")
is_reading = raw_data["__typename"] == "Reading"
readings = raw_data.loc[is_reading]
readings.columns

Index(['date', 'sender_id', 'bgl', 'bgl_date_millis', 'text', 'template',
       'msg_type', 'affects_fob', 'affects_iob', 'dose_units', 'food_g',
       'food_glycemic_index', 'dose_automatic', 'fp_bgl',
       'message_basal_change', '__typename', 'trend'],
      dtype='object')

In [38]:
# Turn every reading row into the record layout stored under
# prop["readingsForGroupId"].
# NaN is not valid JSON (json.dump would emit a bare NaN token), so missing
# "bgl"/"trend" values are replaced by None, which serialises as null. This
# matches the NaN handling applied to the message records.
data = [
    {
        "bgl": None if pd.isna(row["bgl"]) else row["bgl"],
        # Epoch milliseconds derived from the row's timestamp string.
        "millis": int(dateutil.parser.parse(row["date"]).timestamp() * 1000),
        "date": row["date"],
        "trend": None if pd.isna(row["trend"]) else row["trend"],
    }
    for _, row in readings.iterrows()
]
prop["readingsForGroupId"] = data

In [48]:
# Message rows are the ones that carry a message type.
has_msg_type = raw_data["msg_type"].notna()
messages_df = raw_data.loc[has_msg_type]
messages_df

Unnamed: 0,date,sender_id,bgl,bgl_date_millis,text,template,msg_type,affects_fob,affects_iob,dose_units,food_g,food_glycemic_index,dose_automatic,fp_bgl,message_basal_change,__typename,trend
276,2025-01-29 09:36:06.845000-06:00,1057.0,118.0,1.738165e+12,Dosed 2u,,DOSE_INSULIN,False,True,2.0,,,False,,,Message,
367,2025-01-29 12:47:40.596000-06:00,1057.0,92.0,1.738176e+12,Dosed 5u,,DOSE_INSULIN,False,True,5.0,,,False,,,Message,
378,2025-01-29 13:04:37.193000-06:00,1057.0,85.0,1.738177e+12,"45g chicken salad, wasa, mandarin, mixed nuts,...",,ANNOUNCE_MEAL,True,False,,45.0,0.5,False,,,Message,
417,2025-01-29 14:19:04.741000-06:00,1057.0,196.0,1.738182e+12,15m medium walk,,ANNOUNCE_EXERCISE,False,True,,,,False,,,Message,
463,2025-01-29 15:49:50.586000-06:00,1057.0,112.0,1.738187e+12,*`Updated Prescription`*:\n • _`Morning`_ car...,,TEXT,False,False,,,,False,,,Message,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
49046,2025-04-29 19:13:11.074000-05:00,1057.0,86.0,1.745972e+12,"50g pizza, salad",,ANNOUNCE_MEAL,True,False,,50.0,0.5,False,,,Message,
49074,2025-04-29 20:16:58.176000-05:00,1057.0,78.0,1.745976e+12,12g m&ms,,ANNOUNCE_MEAL,True,False,,12.0,0.5,False,,,Message,
49100,2025-04-29 21:22:47.983000-05:00,1057.0,161.0,1.745980e+12,Basal of 22u (Tresiba),,DOSE_BASAL_INSULIN,False,True,22.0,,,False,,,Message,
49101,2025-04-29 21:22:54.117000-05:00,1057.0,161.0,1.745980e+12,3.5u,,DOSE_INSULIN,False,True,3.5,,,False,,,Message,


In [None]:
"""
`originalDate` is the timestamp assigned to the message when it was first entered (iff the message later had its date/time changed).  
`date` is the timestamp that the message is currently assigned to and is the one used for predictions.
"""

# Build one record per message row. JSON has no NaN, so every missing value
# is replaced by None as each record is created.
messages = []
for _, row in messages_df.iterrows():
    record = {
        "bgl": row["bgl"],
        "bglDate": row["date"],  # Not used in the model
        "fpBgl": row["fp_bgl"],  # Not used in the model
        "type": row["msg_type"],
        "date": row["date"],
        "originalDate": row["date"],  # Not used in the model
        "actionMins": None,  # Not used in the model
        "exerciseMins": None,  # Not used in the model
        "exerciseLevel": None,  # Not used in the model
        "foodG": row["food_g"],
        # 1 for intervention snacks (pure sugar) and 0.5 for everything else.
        "foodGlycemicIndex": None,
        "foodFat": None,  # Not enough data, so 0 can be assumed
        "foodFiber": None,
        "foodProtein": None,
        # row["dose_type"] is probably the right source, but the current csv lacks it
        "doseType": "HUMALOG",
        "doseUnits": row["dose_units"],
        "doseAutomatic": False,  # A column for this?
        "affectsFob": row["affects_fob"],
        "affectsIob": row["affects_iob"],
        "cancelledDate": None,  # A column for this?
    }
    messages.append(
        {key: (None if pd.isna(value) else value) for key, value in record.items()}
    )

prop["messages"] = messages

In [61]:
import json

# Serialise before opening the file, so a value that cannot be encoded
# raises without truncating an existing data.json.
serialized = json.dumps(prop, indent=4)
with open("./data.json", "w", encoding="utf-8") as f:
    f.write(serialized)