In [4]:
import json
import pandas as pd

## Example JSON input accepted by the bio model

In [2]:
# Load one example payload in the JSON format the bio model takes.
path = "../../src/data/gluroo/chris/data.json"
# Pin the encoding: without it, open() decodes with the platform's locale
# default, which is not guaranteed to be UTF-8.
with open(path, "r", encoding="utf-8") as f:
    data = json.load(f)

# One row per entry of "readingsForGroupId", sorted ascending by "date".
df = pd.DataFrame(data["readingsForGroupId"]).sort_values("date", ascending=True)

## CONVERT RAW TO CACHED

In [1]:
from src.data.gluroo.gluroo import Gluroo
import pandas as pd

# Raw CSV to convert (path is relative to the notebook's working directory).
file_path = "../../src/data/gluroo/2024/---.csv"

# Options passed to the Gluroo loader as `config`.
config = dict(
    max_consecutive_nan_values_per_day=36,
    coerse_time_interval=pd.Timedelta(minutes=5),  # Model is set to 5 min intervals
    day_start_time=pd.Timedelta(hours=4),
    min_carbs=5,
    meal_length=pd.Timedelta(hours=2),
    n_top_carb_meals=5,
)

# Columns passed to the Gluroo loader as `keep_columns`.
keep_cols = [
    # Reading / message basics
    "date", "bgl", "msg_type",
    # Dose and food amounts
    "dose_units", "food_g",
    # "dose_type", # WE NEED THIS
    "affects_fob", "affects_iob", "food_protein", "dose_automatic",
    # Remaining fields
    "fp_bgl", "trend", "rx_entries_json", "event_type",
]

In [12]:
# Load and process the raw data (cache disabled). Per the original note,
# this processing step also adds iob and cob.
loader = Gluroo(
    file_path=file_path,
    keep_columns=keep_cols,
    config=config,
    use_cached=False,
)

NameError: name 'keep_cols' is not defined

## START CONVERTING FROM CACHED TO JSON

In [13]:
from src.data.gluroo.gluroo import Gluroo
import pandas as pd

# Build the loader with use_cached=True; no file path, column list or config
# is passed in this mode.
loader = Gluroo(use_cached=True)

# Keep a handle on the processed frame; the bare name on the last line makes
# the notebook render it as the cell output.
cached_df = loader.processed_data
cached_df

Unnamed: 0.1,Unnamed: 0,datetime,bg-0:00,msg_type,dose_units,carbs-0:00,affects_fob,affects_iob,food_protein,dose_automatic,fp_bgl,trend,rx_entries_json,event_type,food_g_keep,day_start_shift,p_num,id,cob,carb_availability
0,0,2025-02-06 08:00:00+00:00,12.83,TEXT,,,False,False,,False,,FLAT,"[{""offsetIndex"": 0, ""startOffsetTt"": 14400, ""n...",rx_at_start,,2025-02-06,glu001,glu001_0,0.000000,0.000000
1,1,2025-02-06 08:05:00+00:00,12.72,ANNOUNCE_MEAL,1.05,12.0,False,True,,False,,FLAT,,cgm_reading,12.0,2025-02-06,glu001,glu001_1,9.600000,0.000000
2,2,2025-02-06 08:10:00+00:00,12.39,,,0.0,,,,,,FLAT,,cgm_reading,,2025-02-06,glu001,glu001_2,9.565189,0.889785
3,3,2025-02-06 08:15:00+00:00,11.56,,,0.0,,,,,,FORTYFIVE_DOWN,,cgm_reading,,2025-02-06,glu001,glu001_3,9.407829,1.763968
4,4,2025-02-06 08:20:00+00:00,10.89,,,0.0,,,,,,FORTYFIVE_DOWN,,cgm_reading,,2025-02-06,glu001,glu001_4,9.152655,2.417683
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
25903,25903,2025-05-07 06:35:00+00:00,6.50,,,,,,,,,FLAT,,cgm_reading,,2025-05-07,glu001,glu001_25903,0.000000,0.000000
25904,25904,2025-05-07 06:40:00+00:00,6.17,,,,,,,,,FLAT,,cgm_reading,,2025-05-07,glu001,glu001_25904,0.000000,0.000000
25905,25905,2025-05-07 06:45:00+00:00,6.11,,,,,,,,,FLAT,,cgm_reading,,2025-05-07,glu001,glu001_25905,0.000000,0.000000
25906,25906,2025-05-07 06:50:00+00:00,5.72,,,,,,,,,FLAT,,cgm_reading,,2025-05-07,glu001,glu001_25906,0.000000,0.000000


In [14]:
import dateutil.parser
import json

# Template for the payload passed to the model. The values below are placeholders:
# readingsForGroupId (40 readings), rxDetails, dtNow, dtUntil, bgl and trend
# will need to be set dynamically when passing to the model.
prop = {
    "messages": [],
    "readingsForGroupId": [],
    # A list, not a dict: this notebook later stores a list of rxDetail records
    # under this key, so the empty placeholder uses the same type.
    "rxDetails": [],
    "dtNow": "2025-03-14T17:18:56.974+00:00",
    "dtUntil": "2025-03-14T17:18:56.969+00:00",
    "bgl": 69,
    "trend": "FORTYFIVE_DOWN",
    "offsetMin": 0,
    "isLooping": True,
    "timezone": "GMT",  # Data is converted to UTC in the data cleaner so need to set this to GMT from Los Angeles time
    "rapidInsulinDiaHours": -4,  # It's -4 in the database
}

# rxDetail
# rxDetail need to be set dynamically

### Convert each row to readings

In [15]:
# Work on a copy so `cached_df` keeps its original values and re-running this
# cell does not multiply the glucose column by 18 a second time.
processed_df = cached_df.copy()
# mmol/L -> mg/dL (factor 18), rounded to whole numbers; the bio model uses mg/dL.
processed_df["bg-0:00"] = (processed_df["bg-0:00"] * 18).round()

# NOTE(review): NaN values in "bg-0:00" or "trend" are passed through unchanged
# here, whereas the messages cell replaces NaN with None -- confirm which the
# model expects for readings.
data = []
for index, row in processed_df.iterrows():
    data.append(
        {
            "bgl": row["bg-0:00"],  # already converted to mg/dL above
            # Epoch milliseconds taken directly from the timestamp. Formatting
            # it to a string without its UTC offset and re-parsing produces a
            # naive datetime, whose .timestamp() is interpreted in the
            # machine's local timezone and is therefore wrong off-UTC.
            "millis": int(row["datetime"].timestamp() * 1000),
            "date": row["datetime"].isoformat(),
            "trend": row["trend"],
        }
    )
prop["readingsForGroupId"] = data

In [16]:
"""
`originalDate` is the timestamp assigned to the message when it was first entered (iff the message later had its date/time changed).  
`date` is the timestamp that the message is currently assigned to and is the one used for predictions.
"""
# Only rows whose msg_type is one of these are exported as messages.
message_types = ["ANNOUNCE_MEAL", "DOSE_INSULIN", "INTERVENTION_SNACK"]
messages_df = processed_df[processed_df["msg_type"].isin(message_types)]

messages = []
for index, row in messages_df.iterrows():
    # 1 for intervention snacks (pure sugar) and 0.5 for everything else.
    if row["msg_type"] == "INTERVENTION_SNACK":
        glycemic_index = 1
    else:
        glycemic_index = 0.5

    message = {
        "bgl": row["bg-0:00"],
        "bglDate": None,  # Not used in the model
        "fpBgl": row["fp_bgl"],  # Not used in the model
        "type": row["msg_type"],
        "date": row["datetime"].isoformat(),
        "originalDate": None,  # Not used in the model
        "actionMins": None,  # Not used in the model
        "exerciseMins": None,  # Not used in the model
        "exerciseLevel": None,  # Not used in the model
        "foodG": row["carbs-0:00"],
        "foodGlycemicIndex": glycemic_index,
        "foodFat": None,  # Not used in the model
        "foodFiber": None,  # Not used in the model
        "foodProtein": row["food_protein"],
        "doseType": "HUMALOG",  # I think row['dose_type'] is the one to use, but it is not in the current csv
        "doseUnits": row["dose_units"],
        "doseAutomatic": row["dose_automatic"],
        "affectsFob": row["affects_fob"],
        "affectsIob": row["affects_iob"],
        "cancelledDate": None,  # A column for this?
    }

    # JSON has no NaN literal, so every value pd.isna() flags is stored as
    # None (serialised as null) before the message is collected.
    messages.append(
        {key: (None if pd.isna(value) else value) for key, value in message.items()}
    )

prop["messages"] = messages

### Process the rxDetails

In [17]:
# Only rows that carry an rx_entries_json value contribute an rxDetail record.
rx_details = processed_df.loc[processed_df["rx_entries_json"].notna()]
rx_detail_list = []

# Walk the rows last-to-first: the first entry must be the latest one, which
# is the last row in the df.
for index, row in reversed(list(rx_details.iterrows())):
    entries = json.loads(row["rx_entries_json"])
    rxfd = {
        "groupId": 10,
        "dateCreated": row["datetime"].isoformat(),
        "splitOffsets": None,
        "rxEntriesByRxfdIdList": entries,
        "rxBasalsByRxfdIdList": [],
    }
    # The parsed entries appear both inside "rxfd" and at the top level.
    rx_detail_list.append({"rxfd": rxfd, "rxEntries": entries, "rxBasals": []})

prop["rxDetails"] = rx_detail_list

In [18]:
import json

# Use a dedicated name for the output path. Reusing `file_path` would overwrite
# the raw-CSV path that the "CONVERT RAW TO CACHED" loader cell passes to
# Gluroo, so re-running that cell afterwards (without re-running the config
# cell first) would point the loader at this JSON file.
output_path = "../../src/data/gluroo/data.json"
with open(output_path, "w") as f:
    json.dump(prop, f, indent=4)

## JSON TO CSV (NOT NEEDED AS WE ARE DOING THE OTHER WAY AROUND)

In [None]:
# def integrate_message_data(df: pd.DataFrame, messages: list) -> pd.DataFrame:
#     """
#     Integrates message data into the dataframe by matching to the closest timestamp.

#     Args:
#         df (pd.DataFrame): The input dataframe with datetime index
#         messages (list): List of message dictionaries containing type, foodG, doseUnits, and date

#     Returns:
#         pd.DataFrame: Updated dataframe with new message columns
#     """
#     # Create a copy to avoid modifying the original
#     result_df = df.copy()

#     # Initialize new columns if they don't exist
#     if 'type' not in result_df.columns:
#         result_df['type'] = None
#     if 'foodG' not in result_df.columns:
#         result_df['foodG'] = None
#     if 'doseUnits' not in result_df.columns:
#         result_df['doseUnits'] = None

#     # Convert message dates to datetime
#     message_dates = []
#     for msg in messages:
#         try:
#             date = pd.to_datetime(msg['date'])
#             message_dates.append(date)
#         except (KeyError, ValueError) as e:
#             print(f"Warning: Could not parse date for message: {msg}. Error: {e}")
#             continue

#     # For each message, find the closest timestamp in the dataframe
#     for msg, msg_date in zip(messages, message_dates):
#         try:
#             # Find the closest timestamp by comparing with the date column
#             time_diffs = abs(result_df['date'] - msg_date)
#             closest_idx = time_diffs.idxmin()

#             # Update the values at the closest timestamp
#             result_df.loc[closest_idx, 'type'] = msg.get('type')
#             result_df.loc[closest_idx, 'foodG'] = msg.get('foodG')
#             result_df.loc[closest_idx, 'doseUnits'] = msg.get('doseUnits')
#         except Exception as e:
#             print(f"Warning: Could not process message: {msg}. Error: {e}")
#             continue

#     return result_df

# # Usage:
# msg = data['messages']
# df_cpy = df.copy()
# df_cpy['date'] = pd.to_datetime(df_cpy['date'])
# df_cpy = integrate_message_data(df_cpy, msg)
# df_cpy = df_cpy.rename(columns={'type': 'msg_type'})
# df_cpy.drop(columns=['millis'], inplace=True)
# ins_df = df_cpy[df_cpy['msg_type'] == 'DOSE_INSULIN']
# df_cpy.to_csv('gluroo_data.csv')